* [dpdk-dev] [PATCH v3 2/5] event/octeontx2: use opposite bucket to store current chunk
2019-11-22 15:44 [dpdk-dev] [PATCH v3 1/5] event/octeontx2: fix TIM HW race condition pbhagavatula
@ 2019-11-22 15:44 ` pbhagavatula
2019-11-22 15:44 ` [dpdk-dev] [PATCH v3 3/5] event/octeontx2: improve chunk pool performance pbhagavatula
` (3 subsequent siblings)
4 siblings, 0 replies; 7+ messages in thread
From: pbhagavatula @ 2019-11-22 15:44 UTC (permalink / raw)
To: jerinj, Pavan Nikhilesh; +Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Since TIM buckets are always aligned to 32B and our cache line size being
128B, we will always have a cache miss when reading current_chunk pointer.
Avoid the cache miss by storing the current_chunk pointer in the bucket
opposite to the current bucket.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
drivers/event/octeontx2/otx2_tim_worker.h | 69 ++++++++++++++---------
1 file changed, 41 insertions(+), 28 deletions(-)
diff --git a/drivers/event/octeontx2/otx2_tim_worker.h b/drivers/event/octeontx2/otx2_tim_worker.h
index c896b5433..7b771fbca 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.h
+++ b/drivers/event/octeontx2/otx2_tim_worker.h
@@ -115,20 +115,29 @@ tim_bkt_clr_nent(struct otx2_tim_bkt *bktp)
return __atomic_and_fetch(&bktp->w1, v, __ATOMIC_ACQ_REL);
}
-static __rte_always_inline struct otx2_tim_bkt *
+static __rte_always_inline void
tim_get_target_bucket(struct otx2_tim_ring * const tim_ring,
- const uint32_t rel_bkt, const uint8_t flag)
+ const uint32_t rel_bkt, struct otx2_tim_bkt **bkt,
+ struct otx2_tim_bkt **mirr_bkt, const uint8_t flag)
{
const uint64_t bkt_cyc = rte_rdtsc() - tim_ring->ring_start_cyc;
uint32_t bucket = rte_reciprocal_divide_u64(bkt_cyc,
&tim_ring->fast_div) + rel_bkt;
+ uint32_t mirr_bucket = 0;
- if (flag & OTX2_TIM_BKT_MOD)
+ if (flag & OTX2_TIM_BKT_MOD) {
bucket = bucket % tim_ring->nb_bkts;
- if (flag & OTX2_TIM_BKT_AND)
+ mirr_bucket = (bucket + (tim_ring->nb_bkts >> 1)) %
+ tim_ring->nb_bkts;
+ }
+ if (flag & OTX2_TIM_BKT_AND) {
bucket = bucket & (tim_ring->nb_bkts - 1);
+ mirr_bucket = (bucket + (tim_ring->nb_bkts >> 1)) &
+ (tim_ring->nb_bkts - 1);
+ }
- return &tim_ring->bkt[bucket];
+ *bkt = &tim_ring->bkt[bucket];
+ *mirr_bkt = &tim_ring->bkt[mirr_bucket];
}
static struct otx2_tim_ent *
@@ -153,6 +162,7 @@ tim_clr_bkt(struct otx2_tim_ring * const tim_ring,
static struct otx2_tim_ent *
tim_refill_chunk(struct otx2_tim_bkt * const bkt,
+ struct otx2_tim_bkt * const mirr_bkt,
struct otx2_tim_ring * const tim_ring)
{
struct otx2_tim_ent *chunk;
@@ -162,8 +172,8 @@ tim_refill_chunk(struct otx2_tim_bkt * const bkt,
(void **)&chunk)))
return NULL;
if (bkt->nb_entry) {
- *(uint64_t *)(((struct otx2_tim_ent *)(uintptr_t)
- bkt->current_chunk) +
+ *(uint64_t *)(((struct otx2_tim_ent *)
+ mirr_bkt->current_chunk) +
tim_ring->nb_chunk_slots) =
(uintptr_t)chunk;
} else {
@@ -180,6 +190,7 @@ tim_refill_chunk(struct otx2_tim_bkt * const bkt,
static struct otx2_tim_ent *
tim_insert_chunk(struct otx2_tim_bkt * const bkt,
+ struct otx2_tim_bkt * const mirr_bkt,
struct otx2_tim_ring * const tim_ring)
{
struct otx2_tim_ent *chunk;
@@ -190,7 +201,7 @@ tim_insert_chunk(struct otx2_tim_bkt * const bkt,
*(uint64_t *)(chunk + tim_ring->nb_chunk_slots) = 0;
if (bkt->nb_entry) {
*(uint64_t *)(((struct otx2_tim_ent *)(uintptr_t)
- bkt->current_chunk) +
+ mirr_bkt->current_chunk) +
tim_ring->nb_chunk_slots) = (uintptr_t)chunk;
} else {
bkt->first_chunk = (uintptr_t)chunk;
@@ -205,14 +216,15 @@ tim_add_entry_sp(struct otx2_tim_ring * const tim_ring,
const struct otx2_tim_ent * const pent,
const uint8_t flags)
{
+ struct otx2_tim_bkt *mirr_bkt;
struct otx2_tim_ent *chunk;
struct otx2_tim_bkt *bkt;
uint64_t lock_sema;
int16_t rem;
- bkt = tim_get_target_bucket(tim_ring, rel_bkt, flags);
-
__retry:
+ tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt, flags);
+
/* Get Bucket sema*/
lock_sema = tim_bkt_fetch_sema_lock(bkt);
@@ -232,7 +244,7 @@ tim_add_entry_sp(struct otx2_tim_ring * const tim_ring,
: [hbt] "=&r" (hbt_state)
: [w1] "r" ((&bkt->w1))
: "memory"
- );
+ );
#else
do {
hbt_state = __atomic_load_n(&bkt->w1,
@@ -246,15 +258,14 @@ tim_add_entry_sp(struct otx2_tim_ring * const tim_ring,
}
}
}
-
/* Insert the work. */
rem = tim_bkt_fetch_rem(lock_sema);
if (!rem) {
if (flags & OTX2_TIM_ENA_FB)
- chunk = tim_refill_chunk(bkt, tim_ring);
+ chunk = tim_refill_chunk(bkt, mirr_bkt, tim_ring);
if (flags & OTX2_TIM_ENA_DFB)
- chunk = tim_insert_chunk(bkt, tim_ring);
+ chunk = tim_insert_chunk(bkt, mirr_bkt, tim_ring);
if (unlikely(chunk == NULL)) {
bkt->chunk_remainder = 0;
@@ -264,10 +275,10 @@ tim_add_entry_sp(struct otx2_tim_ring * const tim_ring,
tim->state = RTE_EVENT_TIMER_ERROR;
return -ENOMEM;
}
- bkt->current_chunk = (uintptr_t)chunk;
+ mirr_bkt->current_chunk = (uintptr_t)chunk;
bkt->chunk_remainder = tim_ring->nb_chunk_slots - 1;
} else {
- chunk = (struct otx2_tim_ent *)(uintptr_t)bkt->current_chunk;
+ chunk = (struct otx2_tim_ent *)mirr_bkt->current_chunk;
chunk += tim_ring->nb_chunk_slots - rem;
}
@@ -291,13 +302,14 @@ tim_add_entry_mp(struct otx2_tim_ring * const tim_ring,
const struct otx2_tim_ent * const pent,
const uint8_t flags)
{
+ struct otx2_tim_bkt *mirr_bkt;
struct otx2_tim_ent *chunk;
struct otx2_tim_bkt *bkt;
uint64_t lock_sema;
int16_t rem;
__retry:
- bkt = tim_get_target_bucket(tim_ring, rel_bkt, flags);
+ tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt, flags);
/* Get Bucket sema*/
lock_sema = tim_bkt_fetch_sema_lock(bkt);
@@ -317,7 +329,7 @@ tim_add_entry_mp(struct otx2_tim_ring * const tim_ring,
: [hbt] "=&r" (hbt_state)
: [w1] "r" ((&bkt->w1))
: "memory"
- );
+ );
#else
do {
hbt_state = __atomic_load_n(&bkt->w1,
@@ -358,9 +370,9 @@ tim_add_entry_mp(struct otx2_tim_ring * const tim_ring,
} else if (!rem) {
/* Only one thread can be here*/
if (flags & OTX2_TIM_ENA_FB)
- chunk = tim_refill_chunk(bkt, tim_ring);
+ chunk = tim_refill_chunk(bkt, mirr_bkt, tim_ring);
if (flags & OTX2_TIM_ENA_DFB)
- chunk = tim_insert_chunk(bkt, tim_ring);
+ chunk = tim_insert_chunk(bkt, mirr_bkt, tim_ring);
if (unlikely(chunk == NULL)) {
tim_bkt_set_rem(bkt, 0);
@@ -375,11 +387,11 @@ tim_add_entry_mp(struct otx2_tim_ring * const tim_ring,
(-tim_bkt_fetch_rem(lock_sema)))
lock_sema = __atomic_load_n(&bkt->w1, __ATOMIC_ACQUIRE);
- bkt->current_chunk = (uintptr_t)chunk;
+ mirr_bkt->current_chunk = (uintptr_t)chunk;
__atomic_store_n(&bkt->chunk_remainder,
tim_ring->nb_chunk_slots - 1, __ATOMIC_RELEASE);
} else {
- chunk = (struct otx2_tim_ent *)bkt->current_chunk;
+ chunk = (struct otx2_tim_ent *)mirr_bkt->current_chunk;
chunk += tim_ring->nb_chunk_slots - rem;
*chunk = *pent;
}
@@ -420,6 +432,7 @@ tim_add_entry_brst(struct otx2_tim_ring * const tim_ring,
const uint16_t nb_timers, const uint8_t flags)
{
struct otx2_tim_ent *chunk = NULL;
+ struct otx2_tim_bkt *mirr_bkt;
struct otx2_tim_bkt *bkt;
uint16_t chunk_remainder;
uint16_t index = 0;
@@ -428,7 +441,7 @@ tim_add_entry_brst(struct otx2_tim_ring * const tim_ring,
uint8_t lock_cnt;
__retry:
- bkt = tim_get_target_bucket(tim_ring, rel_bkt, flags);
+ tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt, flags);
/* Only one thread beyond this. */
lock_sema = tim_bkt_inc_lock(bkt);
@@ -477,7 +490,7 @@ tim_add_entry_brst(struct otx2_tim_ring * const tim_ring,
crem = tim_ring->nb_chunk_slots - chunk_remainder;
if (chunk_remainder && crem) {
chunk = ((struct otx2_tim_ent *)
- (uintptr_t)bkt->current_chunk) + crem;
+ mirr_bkt->current_chunk) + crem;
index = tim_cpy_wrk(index, chunk_remainder, chunk, tim,
ents, bkt);
@@ -486,9 +499,9 @@ tim_add_entry_brst(struct otx2_tim_ring * const tim_ring,
}
if (flags & OTX2_TIM_ENA_FB)
- chunk = tim_refill_chunk(bkt, tim_ring);
+ chunk = tim_refill_chunk(bkt, mirr_bkt, tim_ring);
if (flags & OTX2_TIM_ENA_DFB)
- chunk = tim_insert_chunk(bkt, tim_ring);
+ chunk = tim_insert_chunk(bkt, mirr_bkt, tim_ring);
if (unlikely(chunk == NULL)) {
tim_bkt_dec_lock(bkt);
@@ -497,14 +510,14 @@ tim_add_entry_brst(struct otx2_tim_ring * const tim_ring,
return crem;
}
*(uint64_t *)(chunk + tim_ring->nb_chunk_slots) = 0;
- bkt->current_chunk = (uintptr_t)chunk;
+ mirr_bkt->current_chunk = (uintptr_t)chunk;
tim_cpy_wrk(index, nb_timers, chunk, tim, ents, bkt);
rem = nb_timers - chunk_remainder;
tim_bkt_set_rem(bkt, tim_ring->nb_chunk_slots - rem);
tim_bkt_add_nent(bkt, rem);
} else {
- chunk = (struct otx2_tim_ent *)(uintptr_t)bkt->current_chunk;
+ chunk = (struct otx2_tim_ent *)mirr_bkt->current_chunk;
chunk += (tim_ring->nb_chunk_slots - chunk_remainder);
tim_cpy_wrk(index, nb_timers, chunk, tim, ents, bkt);
--
2.17.1
^ permalink raw reply [flat|nested] 7+ messages in thread
* [dpdk-dev] [PATCH v3 3/5] event/octeontx2: improve chunk pool performance
2019-11-22 15:44 [dpdk-dev] [PATCH v3 1/5] event/octeontx2: fix TIM HW race condition pbhagavatula
2019-11-22 15:44 ` [dpdk-dev] [PATCH v3 2/5] event/octeontx2: use opposite bucket to store current chunk pbhagavatula
@ 2019-11-22 15:44 ` pbhagavatula
2019-11-22 15:44 ` [dpdk-dev] [PATCH v3 4/5] event/octeontx2: update SSO buffers based on timer count pbhagavatula
` (2 subsequent siblings)
4 siblings, 0 replies; 7+ messages in thread
From: pbhagavatula @ 2019-11-22 15:44 UTC (permalink / raw)
To: jerinj, Pavan Nikhilesh; +Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Enable mempool cache for internal mempool to improve alloc performance.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
drivers/event/octeontx2/otx2_tim_evdev.c | 4 ++--
drivers/event/octeontx2/otx2_tim_worker.h | 15 ++++++++++++++-
2 files changed, 16 insertions(+), 3 deletions(-)
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.c b/drivers/event/octeontx2/otx2_tim_evdev.c
index e8316a6c7..206ed4331 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.c
+++ b/drivers/event/octeontx2/otx2_tim_evdev.c
@@ -124,6 +124,7 @@ tim_chnk_pool_create(struct otx2_tim_ring *tim_ring,
char pool_name[25];
int rc;
+ cache_sz /= rte_lcore_count();
/* Create chunk pool. */
if (rcfg->flags & RTE_EVENT_TIMER_ADAPTER_F_SP_PUT) {
mp_flags = MEMPOOL_F_SP_PUT | MEMPOOL_F_SC_GET;
@@ -138,10 +139,9 @@ tim_chnk_pool_create(struct otx2_tim_ring *tim_ring,
cache_sz = RTE_MEMPOOL_CACHE_MAX_SIZE;
if (!tim_ring->disable_npa) {
- /* NPA need not have cache as free is not visible to SW */
tim_ring->chunk_pool = rte_mempool_create_empty(pool_name,
tim_ring->nb_chunks, tim_ring->chunk_sz,
- 0, 0, rte_socket_id(), mp_flags);
+ cache_sz, 0, rte_socket_id(), mp_flags);
if (tim_ring->chunk_pool == NULL) {
otx2_err("Unable to create chunkpool.");
diff --git a/drivers/event/octeontx2/otx2_tim_worker.h b/drivers/event/octeontx2/otx2_tim_worker.h
index 7b771fbca..af2f864d7 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.h
+++ b/drivers/event/octeontx2/otx2_tim_worker.h
@@ -144,8 +144,12 @@ static struct otx2_tim_ent *
tim_clr_bkt(struct otx2_tim_ring * const tim_ring,
struct otx2_tim_bkt * const bkt)
{
+#define TIM_MAX_OUTSTANDING_OBJ 64
+ void *pend_chunks[TIM_MAX_OUTSTANDING_OBJ];
struct otx2_tim_ent *chunk;
struct otx2_tim_ent *pnext;
+ uint8_t objs = 0;
+
chunk = ((struct otx2_tim_ent *)(uintptr_t)bkt->first_chunk);
chunk = (struct otx2_tim_ent *)(uintptr_t)(chunk +
@@ -153,10 +157,19 @@ tim_clr_bkt(struct otx2_tim_ring * const tim_ring,
while (chunk) {
pnext = (struct otx2_tim_ent *)(uintptr_t)
((chunk + tim_ring->nb_chunk_slots)->w0);
- rte_mempool_put(tim_ring->chunk_pool, chunk);
+ if (objs == TIM_MAX_OUTSTANDING_OBJ) {
+ rte_mempool_put_bulk(tim_ring->chunk_pool, pend_chunks,
+ objs);
+ objs = 0;
+ }
+ pend_chunks[objs++] = chunk;
chunk = pnext;
}
+ if (objs)
+ rte_mempool_put_bulk(tim_ring->chunk_pool, pend_chunks,
+ objs);
+
return (struct otx2_tim_ent *)(uintptr_t)bkt->first_chunk;
}
--
2.17.1
^ permalink raw reply [flat|nested] 7+ messages in thread
* [dpdk-dev] [PATCH v3 4/5] event/octeontx2: update SSO buffers based on timer count
2019-11-22 15:44 [dpdk-dev] [PATCH v3 1/5] event/octeontx2: fix TIM HW race condition pbhagavatula
2019-11-22 15:44 ` [dpdk-dev] [PATCH v3 2/5] event/octeontx2: use opposite bucket to store current chunk pbhagavatula
2019-11-22 15:44 ` [dpdk-dev] [PATCH v3 3/5] event/octeontx2: improve chunk pool performance pbhagavatula
@ 2019-11-22 15:44 ` pbhagavatula
2019-11-22 15:44 ` [dpdk-dev] [PATCH v3 5/5] event/octeontx2: update start timestamp periodically pbhagavatula
2019-11-22 16:07 ` [dpdk-dev] [PATCH v3 1/5] event/octeontx2: fix TIM HW race condition Pavan Nikhilesh Bhagavatula
4 siblings, 0 replies; 7+ messages in thread
From: pbhagavatula @ 2019-11-22 15:44 UTC (permalink / raw)
To: jerinj, Pavan Nikhilesh; +Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Update SSO internal XAQ buffers based on number of timers in event timer
adapter.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
drivers/event/octeontx2/otx2_evdev.h | 6 +-
drivers/event/octeontx2/otx2_evdev_adptr.c | 84 +++++++++++++++++-----
drivers/event/octeontx2/otx2_tim_evdev.c | 7 +-
drivers/event/octeontx2/otx2_tim_evdev.h | 1 +
4 files changed, 74 insertions(+), 24 deletions(-)
diff --git a/drivers/event/octeontx2/otx2_evdev.h b/drivers/event/octeontx2/otx2_evdev.h
index 530060f81..231a12a52 100644
--- a/drivers/event/octeontx2/otx2_evdev.h
+++ b/drivers/event/octeontx2/otx2_evdev.h
@@ -14,6 +14,7 @@
#include "otx2_dev.h"
#include "otx2_ethdev.h"
#include "otx2_mempool.h"
+#include "otx2_tim_evdev.h"
#define EVENTDEV_NAME_OCTEONTX2_PMD otx2_eventdev
@@ -137,9 +138,12 @@ struct otx2_sso_evdev {
struct rte_mempool *xaq_pool;
uint64_t rx_offloads;
uint64_t tx_offloads;
+ uint64_t adptr_xae_cnt;
uint16_t rx_adptr_pool_cnt;
- uint32_t adptr_xae_cnt;
uint64_t *rx_adptr_pools;
+ uint16_t tim_adptr_ring_cnt;
+ uint16_t *timer_adptr_rings;
+ uint64_t *timer_adptr_sz;
/* Dev args */
uint8_t dual_ws;
uint8_t selftest;
diff --git a/drivers/event/octeontx2/otx2_evdev_adptr.c b/drivers/event/octeontx2/otx2_evdev_adptr.c
index d8a06a593..233cba2aa 100644
--- a/drivers/event/octeontx2/otx2_evdev_adptr.c
+++ b/drivers/event/octeontx2/otx2_evdev_adptr.c
@@ -199,41 +199,87 @@ sso_rxq_disable(struct otx2_eth_dev *dev, uint16_t qid)
void
sso_updt_xae_cnt(struct otx2_sso_evdev *dev, void *data, uint32_t event_type)
{
+ int i;
+
switch (event_type) {
case RTE_EVENT_TYPE_ETHDEV:
{
struct otx2_eth_rxq *rxq = data;
- int i, match = false;
uint64_t *old_ptr;
for (i = 0; i < dev->rx_adptr_pool_cnt; i++) {
if ((uint64_t)rxq->pool == dev->rx_adptr_pools[i])
- match = true;
- }
-
- if (!match) {
- dev->rx_adptr_pool_cnt++;
- old_ptr = dev->rx_adptr_pools;
- dev->rx_adptr_pools = rte_realloc(dev->rx_adptr_pools,
- sizeof(uint64_t) *
- dev->rx_adptr_pool_cnt
- , 0);
- if (dev->rx_adptr_pools == NULL) {
- dev->adptr_xae_cnt += rxq->pool->size;
- dev->rx_adptr_pools = old_ptr;
- dev->rx_adptr_pool_cnt--;
return;
- }
- dev->rx_adptr_pools[dev->rx_adptr_pool_cnt - 1] =
- (uint64_t)rxq->pool;
+ }
+ dev->rx_adptr_pool_cnt++;
+ old_ptr = dev->rx_adptr_pools;
+ dev->rx_adptr_pools = rte_realloc(dev->rx_adptr_pools,
+ sizeof(uint64_t) *
+ dev->rx_adptr_pool_cnt, 0);
+ if (dev->rx_adptr_pools == NULL) {
dev->adptr_xae_cnt += rxq->pool->size;
+ dev->rx_adptr_pools = old_ptr;
+ dev->rx_adptr_pool_cnt--;
+ return;
}
+ dev->rx_adptr_pools[dev->rx_adptr_pool_cnt - 1] =
+ (uint64_t)rxq->pool;
+
+ dev->adptr_xae_cnt += rxq->pool->size;
break;
}
case RTE_EVENT_TYPE_TIMER:
{
- dev->adptr_xae_cnt += (*(uint64_t *)data);
+ struct otx2_tim_ring *timr = data;
+ uint16_t *old_ring_ptr;
+ uint64_t *old_sz_ptr;
+
+ for (i = 0; i < dev->tim_adptr_ring_cnt; i++) {
+ if (timr->ring_id != dev->timer_adptr_rings[i])
+ continue;
+ if (timr->nb_timers == dev->timer_adptr_sz[i])
+ return;
+ dev->adptr_xae_cnt -= dev->timer_adptr_sz[i];
+ dev->adptr_xae_cnt += timr->nb_timers;
+ dev->timer_adptr_sz[i] = timr->nb_timers;
+
+ return;
+ }
+
+ dev->tim_adptr_ring_cnt++;
+ old_ring_ptr = dev->timer_adptr_rings;
+ old_sz_ptr = dev->timer_adptr_sz;
+
+ dev->timer_adptr_rings = rte_realloc(dev->timer_adptr_rings,
+ sizeof(uint16_t) *
+ dev->tim_adptr_ring_cnt,
+ 0);
+ if (dev->timer_adptr_rings == NULL) {
+ dev->adptr_xae_cnt += timr->nb_timers;
+ dev->timer_adptr_rings = old_ring_ptr;
+ dev->tim_adptr_ring_cnt--;
+ return;
+ }
+
+ dev->timer_adptr_sz = rte_realloc(dev->timer_adptr_sz,
+ sizeof(uint64_t) *
+ dev->tim_adptr_ring_cnt,
+ 0);
+
+ if (dev->timer_adptr_sz == NULL) {
+ dev->adptr_xae_cnt += timr->nb_timers;
+ dev->timer_adptr_sz = old_sz_ptr;
+ dev->tim_adptr_ring_cnt--;
+ return;
+ }
+
+ dev->timer_adptr_rings[dev->tim_adptr_ring_cnt - 1] =
+ timr->ring_id;
+ dev->timer_adptr_sz[dev->tim_adptr_ring_cnt - 1] =
+ timr->nb_timers;
+
+ dev->adptr_xae_cnt += timr->nb_timers;
break;
}
default:
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.c b/drivers/event/octeontx2/otx2_tim_evdev.c
index 206ed4331..5f0233f44 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.c
+++ b/drivers/event/octeontx2/otx2_tim_evdev.c
@@ -254,7 +254,6 @@ otx2_tim_ring_create(struct rte_event_timer_adapter *adptr)
struct tim_ring_req *free_req;
struct tim_lf_alloc_req *req;
struct tim_lf_alloc_rsp *rsp;
- uint64_t nb_timers;
int i, rc;
if (dev == NULL)
@@ -300,7 +299,7 @@ otx2_tim_ring_create(struct rte_event_timer_adapter *adptr)
tim_ring->max_tout = rcfg->max_tmo_ns;
tim_ring->nb_bkts = (tim_ring->max_tout / tim_ring->tck_nsec);
tim_ring->chunk_sz = dev->chunk_sz;
- nb_timers = rcfg->nb_timers;
+ tim_ring->nb_timers = rcfg->nb_timers;
tim_ring->disable_npa = dev->disable_npa;
tim_ring->enable_stats = dev->enable_stats;
@@ -316,7 +315,7 @@ otx2_tim_ring_create(struct rte_event_timer_adapter *adptr)
}
}
- tim_ring->nb_chunks = nb_timers / OTX2_TIM_NB_CHUNK_SLOTS(
+ tim_ring->nb_chunks = tim_ring->nb_timers / OTX2_TIM_NB_CHUNK_SLOTS(
tim_ring->chunk_sz);
tim_ring->nb_chunk_slots = OTX2_TIM_NB_CHUNK_SLOTS(tim_ring->chunk_sz);
@@ -373,7 +372,7 @@ otx2_tim_ring_create(struct rte_event_timer_adapter *adptr)
tim_set_fp_ops(tim_ring);
/* Update SSO xae count. */
- sso_updt_xae_cnt(sso_pmd_priv(dev->event_dev), (void *)&nb_timers,
+ sso_updt_xae_cnt(sso_pmd_priv(dev->event_dev), (void *)tim_ring,
RTE_EVENT_TYPE_TIMER);
sso_xae_reconfigure(dev->event_dev);
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.h b/drivers/event/octeontx2/otx2_tim_evdev.h
index eec0189c1..f3fe9697a 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.h
+++ b/drivers/event/octeontx2/otx2_tim_evdev.h
@@ -154,6 +154,7 @@ struct otx2_tim_ring {
uint8_t ena_dfb;
uint16_t ring_id;
uint32_t aura;
+ uint64_t nb_timers;
uint64_t tck_nsec;
uint64_t max_tout;
uint64_t nb_chunks;
--
2.17.1
^ permalink raw reply [flat|nested] 7+ messages in thread
* [dpdk-dev] [PATCH v3 5/5] event/octeontx2: update start timestamp periodically
2019-11-22 15:44 [dpdk-dev] [PATCH v3 1/5] event/octeontx2: fix TIM HW race condition pbhagavatula
` (2 preceding siblings ...)
2019-11-22 15:44 ` [dpdk-dev] [PATCH v3 4/5] event/octeontx2: update SSO buffers based on timer count pbhagavatula
@ 2019-11-22 15:44 ` pbhagavatula
2019-11-22 16:07 ` [dpdk-dev] [PATCH v3 1/5] event/octeontx2: fix TIM HW race condition Pavan Nikhilesh Bhagavatula
4 siblings, 0 replies; 7+ messages in thread
From: pbhagavatula @ 2019-11-22 15:44 UTC (permalink / raw)
To: jerinj, Pavan Nikhilesh; +Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Update start timestamp periodically to prevent drift.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
drivers/event/octeontx2/otx2_tim_evdev.c | 28 +++++++++++++++++++++++
drivers/event/octeontx2/otx2_tim_evdev.h | 7 ++++--
drivers/event/octeontx2/otx2_tim_worker.c | 19 +++++++++++++++
3 files changed, 52 insertions(+), 2 deletions(-)
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.c b/drivers/event/octeontx2/otx2_tim_evdev.c
index 5f0233f44..b275c6922 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.c
+++ b/drivers/event/octeontx2/otx2_tim_evdev.c
@@ -389,6 +389,31 @@ otx2_tim_ring_create(struct rte_event_timer_adapter *adptr)
return rc;
}
+static void
+otx2_tim_calibrate_start_tsc(struct otx2_tim_ring *tim_ring)
+{
+#define OTX2_TIM_CALIB_ITER 1E6
+ uint32_t real_bkt, bucket;
+ int icount, ecount = 0;
+ uint64_t bkt_cyc;
+
+ for (icount = 0; icount < OTX2_TIM_CALIB_ITER; icount++) {
+ real_bkt = otx2_read64(tim_ring->base + TIM_LF_RING_REL) >> 44;
+ bkt_cyc = rte_rdtsc();
+ bucket = (bkt_cyc - tim_ring->ring_start_cyc) /
+ tim_ring->tck_int;
+ bucket = bucket % (tim_ring->nb_bkts);
+ tim_ring->ring_start_cyc = bkt_cyc - (real_bkt *
+ tim_ring->tck_int);
+ if (bucket != real_bkt)
+ ecount++;
+ }
+ tim_ring->last_updt_cyc = bkt_cyc;
+ otx2_tim_dbg("Bucket mispredict %3.2f distance %d\n",
+ 100 - (((double)(icount - ecount) / (double)icount) * 100),
+ bucket - real_bkt);
+}
+
static int
otx2_tim_ring_start(const struct rte_event_timer_adapter *adptr)
{
@@ -423,8 +448,11 @@ otx2_tim_ring_start(const struct rte_event_timer_adapter *adptr)
tim_ring->ring_start_cyc = rsp->timestarted;
#endif
tim_ring->tck_int = NSEC2TICK(tim_ring->tck_nsec, rte_get_timer_hz());
+ tim_ring->tot_int = tim_ring->tck_int * tim_ring->nb_bkts;
tim_ring->fast_div = rte_reciprocal_value_u64(tim_ring->tck_int);
+ otx2_tim_calibrate_start_tsc(tim_ring);
+
fail:
return rc;
}
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.h b/drivers/event/octeontx2/otx2_tim_evdev.h
index f3fe9697a..56895dcbf 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.h
+++ b/drivers/event/octeontx2/otx2_tim_evdev.h
@@ -25,6 +25,7 @@
#define TIM_LF_RAS_INT_W1S (0x308)
#define TIM_LF_RAS_INT_ENA_W1S (0x310)
#define TIM_LF_RAS_INT_ENA_W1C (0x318)
+#define TIM_LF_RING_REL (0x400)
#define TIM_BUCKET_W1_S_CHUNK_REMAINDER (48)
#define TIM_BUCKET_W1_M_CHUNK_REMAINDER ((1ULL << (64 - \
@@ -139,13 +140,15 @@ struct otx2_tim_evdev {
struct otx2_tim_ring {
uintptr_t base;
- struct rte_reciprocal_u64 fast_div;
uint16_t nb_chunk_slots;
uint32_t nb_bkts;
+ uint64_t last_updt_cyc;
uint64_t ring_start_cyc;
+ uint64_t tck_int;
+ uint64_t tot_int;
struct otx2_tim_bkt *bkt;
struct rte_mempool *chunk_pool;
- uint64_t tck_int;
+ struct rte_reciprocal_u64 fast_div;
rte_atomic64_t arm_cnt;
uint8_t prod_type_sp;
uint8_t enable_stats;
diff --git a/drivers/event/octeontx2/otx2_tim_worker.c b/drivers/event/octeontx2/otx2_tim_worker.c
index feba61cd4..104674c79 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.c
+++ b/drivers/event/octeontx2/otx2_tim_worker.c
@@ -38,6 +38,23 @@ tim_format_event(const struct rte_event_timer * const tim,
entry->wqe = tim->ev.u64;
}
+static inline void
+tim_sync_start_cyc(struct otx2_tim_ring *tim_ring)
+{
+ uint64_t cur_cyc = rte_rdtsc();
+ uint32_t real_bkt;
+
+ if (cur_cyc - tim_ring->last_updt_cyc > tim_ring->tot_int) {
+ real_bkt = otx2_read64(tim_ring->base + TIM_LF_RING_REL) >> 44;
+ cur_cyc = rte_rdtsc();
+
+ tim_ring->ring_start_cyc = cur_cyc -
+ (real_bkt * tim_ring->tck_int);
+ tim_ring->last_updt_cyc = cur_cyc;
+ }
+
+}
+
static __rte_always_inline uint16_t
tim_timer_arm_burst(const struct rte_event_timer_adapter *adptr,
struct rte_event_timer **tim,
@@ -49,6 +66,7 @@ tim_timer_arm_burst(const struct rte_event_timer_adapter *adptr,
uint16_t index;
int ret;
+ tim_sync_start_cyc(tim_ring);
for (index = 0; index < nb_timers; index++) {
if (tim_arm_checks(tim_ring, tim[index]))
break;
@@ -99,6 +117,7 @@ tim_timer_arm_tmo_brst(const struct rte_event_timer_adapter *adptr,
return 0;
}
+ tim_sync_start_cyc(tim_ring);
while (arr_idx < nb_timers) {
for (idx = 0; idx < OTX2_TIM_MAX_BURST && (arr_idx < nb_timers);
idx++, arr_idx++) {
--
2.17.1
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [dpdk-dev] [PATCH v3 1/5] event/octeontx2: fix TIM HW race condition
2019-11-22 15:44 [dpdk-dev] [PATCH v3 1/5] event/octeontx2: fix TIM HW race condition pbhagavatula
` (3 preceding siblings ...)
2019-11-22 15:44 ` [dpdk-dev] [PATCH v3 5/5] event/octeontx2: update start timestamp periodically pbhagavatula
@ 2019-11-22 16:07 ` Pavan Nikhilesh Bhagavatula
2019-11-23 9:00 ` Jerin Jacob
4 siblings, 1 reply; 7+ messages in thread
From: Pavan Nikhilesh Bhagavatula @ 2019-11-22 16:07 UTC (permalink / raw)
To: Pavan Nikhilesh Bhagavatula, Jerin Jacob Kollanukkaran; +Cc: dev, dpdk stable
+Cc: stable@dpdk.org
>-----Original Message-----
>From: pbhagavatula@marvell.com <pbhagavatula@marvell.com>
>Sent: Friday, November 22, 2019 9:14 PM
>To: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Pavan Nikhilesh
>Bhagavatula <pbhagavatula@marvell.com>
>Cc: dev@dpdk.org
>Subject: [dpdk-dev] [PATCH v3 1/5] event/octeontx2: fix TIM HW race
>condition
>
>From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
>Fix HW race condition observed when timeout resolution is low (<5us).
>When HW traverses a given TIM bucket it will clear chunk_remainder,
>but since SW always decreases the chunk_remainder at the start of the
>arm routine it might cause a race where SW updates chunk_remainder
>after HW has cleared it that lead to nasty side effects.
>
>Fixes: 95e4e4ec7469 ("event/octeontx2: add timer arm timeout burst")
>
>Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
>---
> drivers/event/octeontx2/otx2_tim_worker.h | 141
>+++++++++++++++++++---
> 1 file changed, 124 insertions(+), 17 deletions(-)
>
>diff --git a/drivers/event/octeontx2/otx2_tim_worker.h
>b/drivers/event/octeontx2/otx2_tim_worker.h
>index 50db6543c..c896b5433 100644
>--- a/drivers/event/octeontx2/otx2_tim_worker.h
>+++ b/drivers/event/octeontx2/otx2_tim_worker.h
>@@ -7,6 +7,13 @@
>
> #include "otx2_tim_evdev.h"
>
>+static inline uint8_t
>+tim_bkt_fetch_lock(uint64_t w1)
>+{
>+ return (w1 >> TIM_BUCKET_W1_S_LOCK) &
>+ TIM_BUCKET_W1_M_LOCK;
>+}
>+
> static inline int16_t
> tim_bkt_fetch_rem(uint64_t w1)
> {
>@@ -188,7 +195,6 @@ tim_insert_chunk(struct otx2_tim_bkt * const
>bkt,
> } else {
> bkt->first_chunk = (uintptr_t)chunk;
> }
>-
> return chunk;
> }
>
>@@ -208,11 +214,38 @@ tim_add_entry_sp(struct otx2_tim_ring *
>const tim_ring,
>
> __retry:
> /* Get Bucket sema*/
>- lock_sema = tim_bkt_fetch_sema(bkt);
>+ lock_sema = tim_bkt_fetch_sema_lock(bkt);
>
> /* Bucket related checks. */
>- if (unlikely(tim_bkt_get_hbt(lock_sema)))
>- goto __retry;
>+ if (unlikely(tim_bkt_get_hbt(lock_sema))) {
>+ if (tim_bkt_get_nent(lock_sema) != 0) {
>+ uint64_t hbt_state;
>+#ifdef RTE_ARCH_ARM64
>+ asm volatile(
>+ " ldaxr %[hbt], [%[w1]]
> \n"
>+ " tbz %[hbt], 33, dne%=
> \n"
>+ " sevl
> \n"
>+ "rty%=: wfe
> \n"
>+ " ldaxr %[hbt], [%[w1]]
> \n"
>+ " tbnz %[hbt], 33, rty%=
> \n"
>+ "dne%=:
> \n"
>+ : [hbt] "=&r" (hbt_state)
>+ : [w1] "r" ((&bkt->w1))
>+ : "memory"
>+ );
>+#else
>+ do {
>+ hbt_state = __atomic_load_n(&bkt-
>>w1,
>+ __ATOMIC_ACQUIRE);
>+ } while (hbt_state & BIT_ULL(33));
>+#endif
>+
>+ if (!(hbt_state & BIT_ULL(34))) {
>+ tim_bkt_dec_lock(bkt);
>+ goto __retry;
>+ }
>+ }
>+ }
>
> /* Insert the work. */
> rem = tim_bkt_fetch_rem(lock_sema);
>@@ -224,14 +257,15 @@ tim_add_entry_sp(struct otx2_tim_ring *
>const tim_ring,
> chunk = tim_insert_chunk(bkt, tim_ring);
>
> if (unlikely(chunk == NULL)) {
>- tim_bkt_set_rem(bkt, 0);
>+ bkt->chunk_remainder = 0;
>+ tim_bkt_dec_lock(bkt);
> tim->impl_opaque[0] = 0;
> tim->impl_opaque[1] = 0;
> tim->state = RTE_EVENT_TIMER_ERROR;
> return -ENOMEM;
> }
> bkt->current_chunk = (uintptr_t)chunk;
>- tim_bkt_set_rem(bkt, tim_ring->nb_chunk_slots - 1);
>+ bkt->chunk_remainder = tim_ring->nb_chunk_slots - 1;
> } else {
> chunk = (struct otx2_tim_ent *)(uintptr_t)bkt-
>>current_chunk;
> chunk += tim_ring->nb_chunk_slots - rem;
>@@ -241,6 +275,7 @@ tim_add_entry_sp(struct otx2_tim_ring * const
>tim_ring,
> *chunk = *pent;
>
> tim_bkt_inc_nent(bkt);
>+ tim_bkt_dec_lock(bkt);
>
> tim->impl_opaque[0] = (uintptr_t)chunk;
> tim->impl_opaque[1] = (uintptr_t)bkt;
>@@ -263,19 +298,60 @@ tim_add_entry_mp(struct otx2_tim_ring *
>const tim_ring,
>
> __retry:
> bkt = tim_get_target_bucket(tim_ring, rel_bkt, flags);
>-
> /* Get Bucket sema*/
> lock_sema = tim_bkt_fetch_sema_lock(bkt);
>
> /* Bucket related checks. */
> if (unlikely(tim_bkt_get_hbt(lock_sema))) {
>- tim_bkt_dec_lock(bkt);
>- goto __retry;
>+ if (tim_bkt_get_nent(lock_sema) != 0) {
>+ uint64_t hbt_state;
>+#ifdef RTE_ARCH_ARM64
>+ asm volatile(
>+ " ldaxr %[hbt], [%[w1]]
> \n"
>+ " tbz %[hbt], 33, dne%=
> \n"
>+ " sevl
> \n"
>+ "rty%=: wfe
> \n"
>+ " ldaxr %[hbt], [%[w1]]
> \n"
>+ " tbnz %[hbt], 33, rty%=
> \n"
>+ "dne%=:
> \n"
>+ : [hbt] "=&r" (hbt_state)
>+ : [w1] "r" ((&bkt->w1))
>+ : "memory"
>+ );
>+#else
>+ do {
>+ hbt_state = __atomic_load_n(&bkt-
>>w1,
>+ __ATOMIC_ACQUIRE);
>+ } while (hbt_state & BIT_ULL(33));
>+#endif
>+
>+ if (!(hbt_state & BIT_ULL(34))) {
>+ tim_bkt_dec_lock(bkt);
>+ goto __retry;
>+ }
>+ }
> }
>
> rem = tim_bkt_fetch_rem(lock_sema);
>-
> if (rem < 0) {
>+#ifdef RTE_ARCH_ARM64
>+ asm volatile(
>+ " ldaxrh %w[rem], [%[crem]]
> \n"
>+ " tbz %w[rem], 15, dne%=
> \n"
>+ " sevl
> \n"
>+ "rty%=: wfe
> \n"
>+ " ldaxrh %w[rem], [%[crem]]
> \n"
>+ " tbnz %w[rem], 15, rty%=
> \n"
>+ "dne%=:
> \n"
>+ : [rem] "=&r" (rem)
>+ : [crem] "r" (&bkt->chunk_remainder)
>+ : "memory"
>+ );
>+#else
>+ while (__atomic_load_n(&bkt->chunk_remainder,
>+ __ATOMIC_ACQUIRE) < 0)
>+ ;
>+#endif
> /* Goto diff bucket. */
> tim_bkt_dec_lock(bkt);
> goto __retry;
>@@ -294,17 +370,23 @@ tim_add_entry_mp(struct otx2_tim_ring *
>const tim_ring,
> tim->state = RTE_EVENT_TIMER_ERROR;
> return -ENOMEM;
> }
>- bkt->current_chunk = (uintptr_t)chunk;
>- tim_bkt_set_rem(bkt, tim_ring->nb_chunk_slots - 1);
>+ *chunk = *pent;
>+ while (tim_bkt_fetch_lock(lock_sema) !=
>+ (-tim_bkt_fetch_rem(lock_sema)))
>+ lock_sema = __atomic_load_n(&bkt->w1,
>__ATOMIC_ACQUIRE);
>+
>+ bkt->current_chunk = (uintptr_t)chunk;
>+ __atomic_store_n(&bkt->chunk_remainder,
>+ tim_ring->nb_chunk_slots - 1,
>__ATOMIC_RELEASE);
> } else {
>- chunk = (struct otx2_tim_ent *)(uintptr_t)bkt-
>>current_chunk;
>+ chunk = (struct otx2_tim_ent *)bkt->current_chunk;
> chunk += tim_ring->nb_chunk_slots - rem;
>+ *chunk = *pent;
> }
>
> /* Copy work entry. */
>- *chunk = *pent;
>- tim_bkt_dec_lock(bkt);
> tim_bkt_inc_nent(bkt);
>+ tim_bkt_dec_lock(bkt);
> tim->impl_opaque[0] = (uintptr_t)chunk;
> tim->impl_opaque[1] = (uintptr_t)bkt;
> tim->state = RTE_EVENT_TIMER_ARMED;
>@@ -360,8 +442,33 @@ tim_add_entry_brst(struct otx2_tim_ring *
>const tim_ring,
>
> /* Bucket related checks. */
> if (unlikely(tim_bkt_get_hbt(lock_sema))) {
>- tim_bkt_dec_lock(bkt);
>- goto __retry;
>+ if (tim_bkt_get_nent(lock_sema) != 0) {
>+ uint64_t hbt_state;
>+#ifdef RTE_ARCH_ARM64
>+ asm volatile(
>+ " ldaxr %[hbt], [%[w1]]
> \n"
>+ " tbz %[hbt], 33, dne%=
> \n"
>+ " sevl
> \n"
>+ "rty%=: wfe
> \n"
>+ " ldaxr %[hbt], [%[w1]]
> \n"
>+ " tbnz %[hbt], 33, rty%=
> \n"
>+ "dne%=:
> \n"
>+ : [hbt] "=&r" (hbt_state)
>+ : [w1] "r" ((&bkt->w1))
>+ : "memory"
>+ );
>+#else
>+ do {
>+ hbt_state = __atomic_load_n(&bkt-
>>w1,
>+ __ATOMIC_ACQUIRE);
>+ } while (hbt_state & BIT_ULL(33));
>+#endif
>+
>+ if (!(hbt_state & BIT_ULL(34))) {
>+ tim_bkt_dec_lock(bkt);
>+ goto __retry;
>+ }
>+ }
> }
>
> chunk_remainder = tim_bkt_fetch_rem(lock_sema);
>--
>2.17.1
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [dpdk-dev] [PATCH v3 1/5] event/octeontx2: fix TIM HW race condition
2019-11-22 16:07 ` [dpdk-dev] [PATCH v3 1/5] event/octeontx2: fix TIM HW race condition Pavan Nikhilesh Bhagavatula
@ 2019-11-23 9:00 ` Jerin Jacob
0 siblings, 0 replies; 7+ messages in thread
From: Jerin Jacob @ 2019-11-23 9:00 UTC (permalink / raw)
To: Pavan Nikhilesh Bhagavatula; +Cc: Jerin Jacob Kollanukkaran, dev, dpdk stable
On Sat, Nov 23, 2019 at 1:07 AM Pavan Nikhilesh Bhagavatula
<pbhagavatula@marvell.com> wrote:
>
> +Cc: stable@dpdk.org
>
> >-----Original Message-----
> >From: pbhagavatula@marvell.com <pbhagavatula@marvell.com>
> >Sent: Friday, November 22, 2019 9:14 PM
> >To: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Pavan Nikhilesh
> >Bhagavatula <pbhagavatula@marvell.com>
> >Cc: dev@dpdk.org
> >Subject: [dpdk-dev] [PATCH v3 1/5] event/octeontx2: fix TIM HW race
> >condition
> >
> >From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> >
> >Fix HW race condition observed when timeout resolution is low (<5us).
> >When HW traverses a given TIM bucket it will clear chunk_remainder,
> >but since SW always decreases the chunk_remainder at the start of the
> >arm routine it might cause a race where SW updates chunk_remainder
> >after HW has cleared it that lead to nasty side effects.
> >
> >Fixes: 95e4e4ec7469 ("event/octeontx2: add timer arm timeout burst")
> >
> >Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Series applied to dpdk-next-eventdev/master. Thanks.
^ permalink raw reply [flat|nested] 7+ messages in thread