* [dpdk-dev] [PATCH 1/4] event/octeontx2: simplify timer bucket estimation
@ 2021-02-25 12:23 pbhagavatula
2021-02-25 12:23 ` [dpdk-dev] [PATCH 2/4] event/octeontx2: optimize timer arm routine pbhagavatula
` (3 more replies)
0 siblings, 4 replies; 17+ messages in thread
From: pbhagavatula @ 2021-02-25 12:23 UTC (permalink / raw)
To: jerinj, Pavan Nikhilesh; +Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Simplify timer bucket estimation: buckets need not be aligned to a
power of 2; instead, use reciprocal division to compute the modulo.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
drivers/event/octeontx2/otx2_tim_evdev.c | 78 ++++-----------------
drivers/event/octeontx2/otx2_tim_evdev.h | 84 ++++++++---------------
drivers/event/octeontx2/otx2_tim_worker.c | 4 +-
drivers/event/octeontx2/otx2_tim_worker.h | 40 +++++------
4 files changed, 64 insertions(+), 142 deletions(-)
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.c b/drivers/event/octeontx2/otx2_tim_evdev.c
index 4c24cc8a6..d1e967eb7 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.c
+++ b/drivers/event/octeontx2/otx2_tim_evdev.c
@@ -34,27 +34,25 @@ tim_set_fp_ops(struct otx2_tim_ring *tim_ring)
{
uint8_t prod_flag = !tim_ring->prod_type_sp;
- /* [MOD/AND] [DFB/FB] [SP][MP]*/
- const rte_event_timer_arm_burst_t arm_burst[2][2][2][2] = {
-#define FP(_name, _f4, _f3, _f2, _f1, flags) \
- [_f4][_f3][_f2][_f1] = otx2_tim_arm_burst_ ## _name,
-TIM_ARM_FASTPATH_MODES
+ /* [DFB/FB] [SP][MP]*/
+ const rte_event_timer_arm_burst_t arm_burst[2][2][2] = {
+#define FP(_name, _f3, _f2, _f1, flags) \
+ [_f3][_f2][_f1] = otx2_tim_arm_burst_##_name,
+ TIM_ARM_FASTPATH_MODES
#undef FP
};
- const rte_event_timer_arm_tmo_tick_burst_t arm_tmo_burst[2][2][2] = {
-#define FP(_name, _f3, _f2, _f1, flags) \
- [_f3][_f2][_f1] = otx2_tim_arm_tmo_tick_burst_ ## _name,
-TIM_ARM_TMO_FASTPATH_MODES
+ const rte_event_timer_arm_tmo_tick_burst_t arm_tmo_burst[2][2] = {
+#define FP(_name, _f2, _f1, flags) \
+ [_f2][_f1] = otx2_tim_arm_tmo_tick_burst_##_name,
+ TIM_ARM_TMO_FASTPATH_MODES
#undef FP
};
otx2_tim_ops.arm_burst =
- arm_burst[tim_ring->enable_stats][tim_ring->optimized]
- [tim_ring->ena_dfb][prod_flag];
+ arm_burst[tim_ring->enable_stats][tim_ring->ena_dfb][prod_flag];
otx2_tim_ops.arm_tmo_tick_burst =
- arm_tmo_burst[tim_ring->enable_stats][tim_ring->optimized]
- [tim_ring->ena_dfb];
+ arm_tmo_burst[tim_ring->enable_stats][tim_ring->ena_dfb];
otx2_tim_ops.cancel_burst = otx2_tim_timer_cancel_burst;
}
@@ -70,51 +68,6 @@ otx2_tim_ring_info_get(const struct rte_event_timer_adapter *adptr,
sizeof(struct rte_event_timer_adapter_conf));
}
-static void
-tim_optimze_bkt_param(struct otx2_tim_ring *tim_ring)
-{
- uint64_t tck_nsec;
- uint32_t hbkts;
- uint32_t lbkts;
-
- hbkts = rte_align32pow2(tim_ring->nb_bkts);
- tck_nsec = RTE_ALIGN_MUL_CEIL(tim_ring->max_tout / (hbkts - 1), 10);
-
- if ((tck_nsec < TICK2NSEC(OTX2_TIM_MIN_TMO_TKS,
- tim_ring->tenns_clk_freq) ||
- hbkts > OTX2_TIM_MAX_BUCKETS))
- hbkts = 0;
-
- lbkts = rte_align32prevpow2(tim_ring->nb_bkts);
- tck_nsec = RTE_ALIGN_MUL_CEIL((tim_ring->max_tout / (lbkts - 1)), 10);
-
- if ((tck_nsec < TICK2NSEC(OTX2_TIM_MIN_TMO_TKS,
- tim_ring->tenns_clk_freq) ||
- lbkts > OTX2_TIM_MAX_BUCKETS))
- lbkts = 0;
-
- if (!hbkts && !lbkts)
- return;
-
- if (!hbkts) {
- tim_ring->nb_bkts = lbkts;
- goto end;
- } else if (!lbkts) {
- tim_ring->nb_bkts = hbkts;
- goto end;
- }
-
- tim_ring->nb_bkts = (hbkts - tim_ring->nb_bkts) <
- (tim_ring->nb_bkts - lbkts) ? hbkts : lbkts;
-end:
- tim_ring->optimized = true;
- tim_ring->tck_nsec = RTE_ALIGN_MUL_CEIL((tim_ring->max_tout /
- (tim_ring->nb_bkts - 1)), 10);
- otx2_tim_dbg("Optimized configured values");
- otx2_tim_dbg("Nb_bkts : %" PRIu32 "", tim_ring->nb_bkts);
- otx2_tim_dbg("Tck_nsec : %" PRIu64 "", tim_ring->tck_nsec);
-}
-
static int
tim_chnk_pool_create(struct otx2_tim_ring *tim_ring,
struct rte_event_timer_adapter_conf *rcfg)
@@ -319,14 +272,6 @@ otx2_tim_ring_create(struct rte_event_timer_adapter *adptr)
tim_ring->chunk_sz);
tim_ring->nb_chunk_slots = OTX2_TIM_NB_CHUNK_SLOTS(tim_ring->chunk_sz);
- /* Try to optimize the bucket parameters. */
- if ((rcfg->flags & RTE_EVENT_TIMER_ADAPTER_F_ADJUST_RES)) {
- if (rte_is_power_of_2(tim_ring->nb_bkts))
- tim_ring->optimized = true;
- else
- tim_optimze_bkt_param(tim_ring);
- }
-
if (tim_ring->disable_npa)
tim_ring->nb_chunks = tim_ring->nb_chunks * tim_ring->nb_bkts;
else
@@ -459,6 +404,7 @@ otx2_tim_ring_start(const struct rte_event_timer_adapter *adptr)
tim_ring->tck_int = NSEC2TICK(tim_ring->tck_nsec, rte_get_timer_hz());
tim_ring->tot_int = tim_ring->tck_int * tim_ring->nb_bkts;
tim_ring->fast_div = rte_reciprocal_value_u64(tim_ring->tck_int);
+ tim_ring->fast_bkt = rte_reciprocal_value_u64(tim_ring->nb_bkts);
otx2_tim_calibrate_start_tsc(tim_ring);
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.h b/drivers/event/octeontx2/otx2_tim_evdev.h
index 44e3c7b51..bf89b85b0 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.h
+++ b/drivers/event/octeontx2/otx2_tim_evdev.h
@@ -76,8 +76,6 @@
#define OTX2_TIM_SP 0x1
#define OTX2_TIM_MP 0x2
-#define OTX2_TIM_BKT_AND 0x4
-#define OTX2_TIM_BKT_MOD 0x8
#define OTX2_TIM_ENA_FB 0x10
#define OTX2_TIM_ENA_DFB 0x20
#define OTX2_TIM_ENA_STATS 0x40
@@ -149,11 +147,11 @@ struct otx2_tim_ring {
struct otx2_tim_bkt *bkt;
struct rte_mempool *chunk_pool;
struct rte_reciprocal_u64 fast_div;
+ struct rte_reciprocal_u64 fast_bkt;
uint64_t arm_cnt;
uint8_t prod_type_sp;
uint8_t enable_stats;
uint8_t disable_npa;
- uint8_t optimized;
uint8_t ena_dfb;
uint16_t ring_id;
uint32_t aura;
@@ -178,60 +176,38 @@ tim_priv_get(void)
return mz->addr;
}
-#define TIM_ARM_FASTPATH_MODES \
-FP(mod_sp, 0, 0, 0, 0, OTX2_TIM_BKT_MOD | OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
-FP(mod_mp, 0, 0, 0, 1, OTX2_TIM_BKT_MOD | OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
-FP(mod_fb_sp, 0, 0, 1, 0, OTX2_TIM_BKT_MOD | OTX2_TIM_ENA_FB | OTX2_TIM_SP) \
-FP(mod_fb_mp, 0, 0, 1, 1, OTX2_TIM_BKT_MOD | OTX2_TIM_ENA_FB | OTX2_TIM_MP) \
-FP(and_sp, 0, 1, 0, 0, OTX2_TIM_BKT_AND | OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
-FP(and_mp, 0, 1, 0, 1, OTX2_TIM_BKT_AND | OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
-FP(and_fb_sp, 0, 1, 1, 0, OTX2_TIM_BKT_AND | OTX2_TIM_ENA_FB | OTX2_TIM_SP) \
-FP(and_fb_mp, 0, 1, 1, 1, OTX2_TIM_BKT_AND | OTX2_TIM_ENA_FB | OTX2_TIM_MP) \
-FP(stats_mod_sp, 1, 0, 0, 0, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_MOD | \
- OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
-FP(stats_mod_mp, 1, 0, 0, 1, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_MOD | \
- OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
-FP(stats_mod_fb_sp, 1, 0, 1, 0, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_MOD | \
- OTX2_TIM_ENA_FB | OTX2_TIM_SP) \
-FP(stats_mod_fb_mp, 1, 0, 1, 1, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_MOD | \
- OTX2_TIM_ENA_FB | OTX2_TIM_MP) \
-FP(stats_and_sp, 1, 1, 0, 0, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_AND | \
- OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
-FP(stats_and_mp, 1, 1, 0, 1, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_AND | \
- OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
-FP(stats_and_fb_sp, 1, 1, 1, 0, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_AND | \
- OTX2_TIM_ENA_FB | OTX2_TIM_SP) \
-FP(stats_and_fb_mp, 1, 1, 1, 1, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_AND | \
- OTX2_TIM_ENA_FB | OTX2_TIM_MP)
-
-#define TIM_ARM_TMO_FASTPATH_MODES \
-FP(mod, 0, 0, 0, OTX2_TIM_BKT_MOD | OTX2_TIM_ENA_DFB) \
-FP(mod_fb, 0, 0, 1, OTX2_TIM_BKT_MOD | OTX2_TIM_ENA_FB) \
-FP(and, 0, 1, 0, OTX2_TIM_BKT_AND | OTX2_TIM_ENA_DFB) \
-FP(and_fb, 0, 1, 1, OTX2_TIM_BKT_AND | OTX2_TIM_ENA_FB) \
-FP(stats_mod, 1, 0, 0, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_MOD | \
- OTX2_TIM_ENA_DFB) \
-FP(stats_mod_fb, 1, 0, 1, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_MOD | \
- OTX2_TIM_ENA_FB) \
-FP(stats_and, 1, 1, 0, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_AND | \
- OTX2_TIM_ENA_DFB) \
-FP(stats_and_fb, 1, 1, 1, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_AND | \
- OTX2_TIM_ENA_FB)
-
-#define FP(_name, _f4, _f3, _f2, _f1, flags) \
-uint16_t \
-otx2_tim_arm_burst_ ## _name(const struct rte_event_timer_adapter *adptr, \
- struct rte_event_timer **tim, \
- const uint16_t nb_timers);
+#define TIM_ARM_FASTPATH_MODES \
+ FP(sp, 0, 0, 0, OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
+ FP(mp, 0, 0, 1, OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
+ FP(fb_sp, 0, 1, 0, OTX2_TIM_ENA_FB | OTX2_TIM_SP) \
+ FP(fb_mp, 0, 1, 1, OTX2_TIM_ENA_FB | OTX2_TIM_MP) \
+ FP(stats_mod_sp, 1, 0, 0, \
+ OTX2_TIM_ENA_STATS | OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
+ FP(stats_mod_mp, 1, 0, 1, \
+ OTX2_TIM_ENA_STATS | OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
+ FP(stats_mod_fb_sp, 1, 1, 0, \
+ OTX2_TIM_ENA_STATS | OTX2_TIM_ENA_FB | OTX2_TIM_SP) \
+ FP(stats_mod_fb_mp, 1, 1, 1, \
+ OTX2_TIM_ENA_STATS | OTX2_TIM_ENA_FB | OTX2_TIM_MP)
+
+#define TIM_ARM_TMO_FASTPATH_MODES \
+ FP(dfb, 0, 0, OTX2_TIM_ENA_DFB) \
+ FP(fb, 0, 1, OTX2_TIM_ENA_FB) \
+ FP(stats_dfb, 1, 0, OTX2_TIM_ENA_STATS | OTX2_TIM_ENA_DFB) \
+ FP(stats_fb, 1, 1, OTX2_TIM_ENA_STATS | OTX2_TIM_ENA_FB)
+
+#define FP(_name, _f3, _f2, _f1, flags) \
+ uint16_t otx2_tim_arm_burst_##_name( \
+ const struct rte_event_timer_adapter *adptr, \
+ struct rte_event_timer **tim, const uint16_t nb_timers);
TIM_ARM_FASTPATH_MODES
#undef FP
-#define FP(_name, _f3, _f2, _f1, flags) \
-uint16_t \
-otx2_tim_arm_tmo_tick_burst_ ## _name( \
- const struct rte_event_timer_adapter *adptr, \
- struct rte_event_timer **tim, \
- const uint64_t timeout_tick, const uint16_t nb_timers);
+#define FP(_name, _f2, _f1, flags) \
+ uint16_t otx2_tim_arm_tmo_tick_burst_##_name( \
+ const struct rte_event_timer_adapter *adptr, \
+ struct rte_event_timer **tim, const uint64_t timeout_tick, \
+ const uint16_t nb_timers);
TIM_ARM_TMO_FASTPATH_MODES
#undef FP
diff --git a/drivers/event/octeontx2/otx2_tim_worker.c b/drivers/event/octeontx2/otx2_tim_worker.c
index 4b5cfdc72..eb901844d 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.c
+++ b/drivers/event/octeontx2/otx2_tim_worker.c
@@ -136,7 +136,7 @@ tim_timer_arm_tmo_brst(const struct rte_event_timer_adapter *adptr,
return set_timers;
}
-#define FP(_name, _f4, _f3, _f2, _f1, _flags) \
+#define FP(_name, _f3, _f2, _f1, _flags) \
uint16_t __rte_noinline \
otx2_tim_arm_burst_ ## _name(const struct rte_event_timer_adapter *adptr, \
struct rte_event_timer **tim, \
@@ -147,7 +147,7 @@ otx2_tim_arm_burst_ ## _name(const struct rte_event_timer_adapter *adptr, \
TIM_ARM_FASTPATH_MODES
#undef FP
-#define FP(_name, _f3, _f2, _f1, _flags) \
+#define FP(_name, _f2, _f1, _flags) \
uint16_t __rte_noinline \
otx2_tim_arm_tmo_tick_burst_ ## _name( \
const struct rte_event_timer_adapter *adptr, \
diff --git a/drivers/event/octeontx2/otx2_tim_worker.h b/drivers/event/octeontx2/otx2_tim_worker.h
index af2f864d7..f03912b81 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.h
+++ b/drivers/event/octeontx2/otx2_tim_worker.h
@@ -115,27 +115,27 @@ tim_bkt_clr_nent(struct otx2_tim_bkt *bktp)
return __atomic_and_fetch(&bktp->w1, v, __ATOMIC_ACQ_REL);
}
+static inline uint64_t
+tim_bkt_fast_mod(uint64_t n, uint64_t d, struct rte_reciprocal_u64 R)
+{
+ return (n - (d * rte_reciprocal_divide_u64(n, &R)));
+}
+
static __rte_always_inline void
-tim_get_target_bucket(struct otx2_tim_ring * const tim_ring,
+tim_get_target_bucket(struct otx2_tim_ring *const tim_ring,
const uint32_t rel_bkt, struct otx2_tim_bkt **bkt,
- struct otx2_tim_bkt **mirr_bkt, const uint8_t flag)
+ struct otx2_tim_bkt **mirr_bkt)
{
const uint64_t bkt_cyc = rte_rdtsc() - tim_ring->ring_start_cyc;
- uint32_t bucket = rte_reciprocal_divide_u64(bkt_cyc,
- &tim_ring->fast_div) + rel_bkt;
- uint32_t mirr_bucket = 0;
-
- if (flag & OTX2_TIM_BKT_MOD) {
- bucket = bucket % tim_ring->nb_bkts;
- mirr_bucket = (bucket + (tim_ring->nb_bkts >> 1)) %
- tim_ring->nb_bkts;
- }
- if (flag & OTX2_TIM_BKT_AND) {
- bucket = bucket & (tim_ring->nb_bkts - 1);
- mirr_bucket = (bucket + (tim_ring->nb_bkts >> 1)) &
- (tim_ring->nb_bkts - 1);
- }
-
+ uint64_t bucket =
+ rte_reciprocal_divide_u64(bkt_cyc, &tim_ring->fast_div) +
+ rel_bkt;
+ uint64_t mirr_bucket = 0;
+
+ bucket =
+ tim_bkt_fast_mod(bucket, tim_ring->nb_bkts, tim_ring->fast_bkt);
+ mirr_bucket = tim_bkt_fast_mod(bucket + (tim_ring->nb_bkts >> 1),
+ tim_ring->nb_bkts, tim_ring->fast_bkt);
*bkt = &tim_ring->bkt[bucket];
*mirr_bkt = &tim_ring->bkt[mirr_bucket];
}
@@ -236,7 +236,7 @@ tim_add_entry_sp(struct otx2_tim_ring * const tim_ring,
int16_t rem;
__retry:
- tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt, flags);
+ tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt);
/* Get Bucket sema*/
lock_sema = tim_bkt_fetch_sema_lock(bkt);
@@ -322,7 +322,7 @@ tim_add_entry_mp(struct otx2_tim_ring * const tim_ring,
int16_t rem;
__retry:
- tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt, flags);
+ tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt);
/* Get Bucket sema*/
lock_sema = tim_bkt_fetch_sema_lock(bkt);
@@ -454,7 +454,7 @@ tim_add_entry_brst(struct otx2_tim_ring * const tim_ring,
uint8_t lock_cnt;
__retry:
- tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt, flags);
+ tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt);
/* Only one thread beyond this. */
lock_sema = tim_bkt_inc_lock(bkt);
--
2.17.1
^ permalink raw reply [flat|nested] 17+ messages in thread
* [dpdk-dev] [PATCH 2/4] event/octeontx2: optimize timer arm routine
2021-02-25 12:23 [dpdk-dev] [PATCH 1/4] event/octeontx2: simplify timer bucket estimation pbhagavatula
@ 2021-02-25 12:23 ` pbhagavatula
2021-02-25 12:23 ` [dpdk-dev] [PATCH 3/4] event/octeontx2: reduce chunk pool memory usage pbhagavatula
` (2 subsequent siblings)
3 siblings, 0 replies; 17+ messages in thread
From: pbhagavatula @ 2021-02-25 12:23 UTC (permalink / raw)
To: jerinj, Pavan Nikhilesh; +Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Use relaxed load-exclusive (ldxr instead of ldaxr) when polling for
other threads or hardware to complete.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
drivers/event/octeontx2/otx2_tim_worker.c | 1 +
drivers/event/octeontx2/otx2_tim_worker.h | 160 ++++++++++++----------
2 files changed, 89 insertions(+), 72 deletions(-)
diff --git a/drivers/event/octeontx2/otx2_tim_worker.c b/drivers/event/octeontx2/otx2_tim_worker.c
index eb901844d..9946be733 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.c
+++ b/drivers/event/octeontx2/otx2_tim_worker.c
@@ -170,6 +170,7 @@ otx2_tim_timer_cancel_burst(const struct rte_event_timer_adapter *adptr,
int ret;
RTE_SET_USED(adptr);
+ __atomic_thread_fence(__ATOMIC_ACQUIRE);
for (index = 0; index < nb_timers; index++) {
if (tim[index]->state == RTE_EVENT_TIMER_CANCELED) {
rte_errno = EALREADY;
diff --git a/drivers/event/octeontx2/otx2_tim_worker.h b/drivers/event/octeontx2/otx2_tim_worker.h
index f03912b81..9d8bcb974 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.h
+++ b/drivers/event/octeontx2/otx2_tim_worker.h
@@ -84,7 +84,13 @@ tim_bkt_inc_lock(struct otx2_tim_bkt *bktp)
static inline void
tim_bkt_dec_lock(struct otx2_tim_bkt *bktp)
{
- __atomic_add_fetch(&bktp->lock, 0xff, __ATOMIC_RELEASE);
+ __atomic_fetch_sub(&bktp->lock, 1, __ATOMIC_RELEASE);
+}
+
+static inline void
+tim_bkt_dec_lock_relaxed(struct otx2_tim_bkt *bktp)
+{
+ __atomic_fetch_sub(&bktp->lock, 1, __ATOMIC_RELAXED);
}
static inline uint32_t
@@ -246,22 +252,20 @@ tim_add_entry_sp(struct otx2_tim_ring * const tim_ring,
if (tim_bkt_get_nent(lock_sema) != 0) {
uint64_t hbt_state;
#ifdef RTE_ARCH_ARM64
- asm volatile(
- " ldaxr %[hbt], [%[w1]] \n"
- " tbz %[hbt], 33, dne%= \n"
- " sevl \n"
- "rty%=: wfe \n"
- " ldaxr %[hbt], [%[w1]] \n"
- " tbnz %[hbt], 33, rty%= \n"
- "dne%=: \n"
- : [hbt] "=&r" (hbt_state)
- : [w1] "r" ((&bkt->w1))
- : "memory"
- );
+ asm volatile(" ldxr %[hbt], [%[w1]] \n"
+ " tbz %[hbt], 33, dne%= \n"
+ " sevl \n"
+ "rty%=: wfe \n"
+ " ldxr %[hbt], [%[w1]] \n"
+ " tbnz %[hbt], 33, rty%= \n"
+ "dne%=: \n"
+ : [hbt] "=&r"(hbt_state)
+ : [w1] "r"((&bkt->w1))
+ : "memory");
#else
do {
hbt_state = __atomic_load_n(&bkt->w1,
- __ATOMIC_ACQUIRE);
+ __ATOMIC_RELAXED);
} while (hbt_state & BIT_ULL(33));
#endif
@@ -282,10 +286,10 @@ tim_add_entry_sp(struct otx2_tim_ring * const tim_ring,
if (unlikely(chunk == NULL)) {
bkt->chunk_remainder = 0;
- tim_bkt_dec_lock(bkt);
tim->impl_opaque[0] = 0;
tim->impl_opaque[1] = 0;
tim->state = RTE_EVENT_TIMER_ERROR;
+ tim_bkt_dec_lock(bkt);
return -ENOMEM;
}
mirr_bkt->current_chunk = (uintptr_t)chunk;
@@ -298,12 +302,12 @@ tim_add_entry_sp(struct otx2_tim_ring * const tim_ring,
/* Copy work entry. */
*chunk = *pent;
- tim_bkt_inc_nent(bkt);
- tim_bkt_dec_lock(bkt);
-
tim->impl_opaque[0] = (uintptr_t)chunk;
tim->impl_opaque[1] = (uintptr_t)bkt;
tim->state = RTE_EVENT_TIMER_ARMED;
+ rte_smp_wmb();
+ tim_bkt_inc_nent(bkt);
+ tim_bkt_dec_lock_relaxed(bkt);
return 0;
}
@@ -331,22 +335,20 @@ tim_add_entry_mp(struct otx2_tim_ring * const tim_ring,
if (tim_bkt_get_nent(lock_sema) != 0) {
uint64_t hbt_state;
#ifdef RTE_ARCH_ARM64
- asm volatile(
- " ldaxr %[hbt], [%[w1]] \n"
- " tbz %[hbt], 33, dne%= \n"
- " sevl \n"
- "rty%=: wfe \n"
- " ldaxr %[hbt], [%[w1]] \n"
- " tbnz %[hbt], 33, rty%= \n"
- "dne%=: \n"
- : [hbt] "=&r" (hbt_state)
- : [w1] "r" ((&bkt->w1))
- : "memory"
- );
+ asm volatile(" ldxr %[hbt], [%[w1]] \n"
+ " tbz %[hbt], 33, dne%= \n"
+ " sevl \n"
+ "rty%=: wfe \n"
+ " ldxr %[hbt], [%[w1]] \n"
+ " tbnz %[hbt], 33, rty%= \n"
+ "dne%=: \n"
+ : [hbt] "=&r"(hbt_state)
+ : [w1] "r"((&bkt->w1))
+ : "memory");
#else
do {
hbt_state = __atomic_load_n(&bkt->w1,
- __ATOMIC_ACQUIRE);
+ __ATOMIC_RELAXED);
} while (hbt_state & BIT_ULL(33));
#endif
@@ -359,26 +361,23 @@ tim_add_entry_mp(struct otx2_tim_ring * const tim_ring,
rem = tim_bkt_fetch_rem(lock_sema);
if (rem < 0) {
+ tim_bkt_dec_lock(bkt);
#ifdef RTE_ARCH_ARM64
- asm volatile(
- " ldaxrh %w[rem], [%[crem]] \n"
- " tbz %w[rem], 15, dne%= \n"
- " sevl \n"
- "rty%=: wfe \n"
- " ldaxrh %w[rem], [%[crem]] \n"
- " tbnz %w[rem], 15, rty%= \n"
- "dne%=: \n"
- : [rem] "=&r" (rem)
- : [crem] "r" (&bkt->chunk_remainder)
- : "memory"
- );
+ asm volatile(" ldxr %[rem], [%[crem]] \n"
+ " tbz %[rem], 63, dne%= \n"
+ " sevl \n"
+ "rty%=: wfe \n"
+ " ldxr %[rem], [%[crem]] \n"
+ " tbnz %[rem], 63, rty%= \n"
+ "dne%=: \n"
+ : [rem] "=&r"(rem)
+ : [crem] "r"(&bkt->w1)
+ : "memory");
#else
- while (__atomic_load_n(&bkt->chunk_remainder,
- __ATOMIC_ACQUIRE) < 0)
+ while (__atomic_load_n((int64_t *)&bkt->w1, __ATOMIC_RELAXED) <
+ 0)
;
#endif
- /* Goto diff bucket. */
- tim_bkt_dec_lock(bkt);
goto __retry;
} else if (!rem) {
/* Only one thread can be here*/
@@ -388,18 +387,21 @@ tim_add_entry_mp(struct otx2_tim_ring * const tim_ring,
chunk = tim_insert_chunk(bkt, mirr_bkt, tim_ring);
if (unlikely(chunk == NULL)) {
- tim_bkt_set_rem(bkt, 0);
- tim_bkt_dec_lock(bkt);
tim->impl_opaque[0] = 0;
tim->impl_opaque[1] = 0;
tim->state = RTE_EVENT_TIMER_ERROR;
+ tim_bkt_set_rem(bkt, 0);
+ tim_bkt_dec_lock(bkt);
return -ENOMEM;
}
*chunk = *pent;
- while (tim_bkt_fetch_lock(lock_sema) !=
- (-tim_bkt_fetch_rem(lock_sema)))
- lock_sema = __atomic_load_n(&bkt->w1, __ATOMIC_ACQUIRE);
-
+ if (tim_bkt_fetch_lock(lock_sema)) {
+ do {
+ lock_sema = __atomic_load_n(&bkt->w1,
+ __ATOMIC_RELAXED);
+ } while (tim_bkt_fetch_lock(lock_sema) - 1);
+ }
+ rte_smp_rmb();
mirr_bkt->current_chunk = (uintptr_t)chunk;
__atomic_store_n(&bkt->chunk_remainder,
tim_ring->nb_chunk_slots - 1, __ATOMIC_RELEASE);
@@ -409,12 +411,12 @@ tim_add_entry_mp(struct otx2_tim_ring * const tim_ring,
*chunk = *pent;
}
- /* Copy work entry. */
- tim_bkt_inc_nent(bkt);
- tim_bkt_dec_lock(bkt);
tim->impl_opaque[0] = (uintptr_t)chunk;
tim->impl_opaque[1] = (uintptr_t)bkt;
tim->state = RTE_EVENT_TIMER_ARMED;
+ rte_smp_wmb();
+ tim_bkt_inc_nent(bkt);
+ tim_bkt_dec_lock_relaxed(bkt);
return 0;
}
@@ -463,6 +465,23 @@ tim_add_entry_brst(struct otx2_tim_ring * const tim_ring,
if (lock_cnt) {
tim_bkt_dec_lock(bkt);
+#ifdef RTE_ARCH_ARM64
+ asm volatile(" ldxrb %w[lock_cnt], [%[lock]] \n"
+ " tst %w[lock_cnt], 255 \n"
+ " beq dne%= \n"
+ " sevl \n"
+ "rty%=: wfe \n"
+ " ldxrb %w[lock_cnt], [%[lock]] \n"
+ " tst %w[lock_cnt], 255 \n"
+ " bne rty%= \n"
+ "dne%=: \n"
+ : [lock_cnt] "=&r"(lock_cnt)
+ : [lock] "r"(&bkt->lock)
+ : "memory");
+#else
+ while (__atomic_load_n(&bkt->lock, __ATOMIC_RELAXED))
+ ;
+#endif
goto __retry;
}
@@ -471,22 +490,20 @@ tim_add_entry_brst(struct otx2_tim_ring * const tim_ring,
if (tim_bkt_get_nent(lock_sema) != 0) {
uint64_t hbt_state;
#ifdef RTE_ARCH_ARM64
- asm volatile(
- " ldaxr %[hbt], [%[w1]] \n"
- " tbz %[hbt], 33, dne%= \n"
- " sevl \n"
- "rty%=: wfe \n"
- " ldaxr %[hbt], [%[w1]] \n"
- " tbnz %[hbt], 33, rty%= \n"
- "dne%=: \n"
- : [hbt] "=&r" (hbt_state)
- : [w1] "r" ((&bkt->w1))
- : "memory"
- );
+ asm volatile(" ldxr %[hbt], [%[w1]] \n"
+ " tbz %[hbt], 33, dne%= \n"
+ " sevl \n"
+ "rty%=: wfe \n"
+ " ldxr %[hbt], [%[w1]] \n"
+ " tbnz %[hbt], 33, rty%= \n"
+ "dne%=: \n"
+ : [hbt] "=&r"(hbt_state)
+ : [w1] "r"((&bkt->w1))
+ : "memory");
#else
do {
hbt_state = __atomic_load_n(&bkt->w1,
- __ATOMIC_ACQUIRE);
+ __ATOMIC_RELAXED);
} while (hbt_state & BIT_ULL(33));
#endif
@@ -563,19 +580,18 @@ tim_rm_entry(struct rte_event_timer *tim)
bkt = (struct otx2_tim_bkt *)(uintptr_t)tim->impl_opaque[1];
lock_sema = tim_bkt_inc_lock(bkt);
if (tim_bkt_get_hbt(lock_sema) || !tim_bkt_get_nent(lock_sema)) {
- tim_bkt_dec_lock(bkt);
tim->impl_opaque[0] = 0;
tim->impl_opaque[1] = 0;
+ tim_bkt_dec_lock(bkt);
return -ENOENT;
}
entry->w0 = 0;
entry->wqe = 0;
- tim_bkt_dec_lock(bkt);
-
tim->state = RTE_EVENT_TIMER_CANCELED;
tim->impl_opaque[0] = 0;
tim->impl_opaque[1] = 0;
+ tim_bkt_dec_lock(bkt);
return 0;
}
--
2.17.1
^ permalink raw reply [flat|nested] 17+ messages in thread
* [dpdk-dev] [PATCH 3/4] event/octeontx2: reduce chunk pool memory usage
2021-02-25 12:23 [dpdk-dev] [PATCH 1/4] event/octeontx2: simplify timer bucket estimation pbhagavatula
2021-02-25 12:23 ` [dpdk-dev] [PATCH 2/4] event/octeontx2: optimize timer arm routine pbhagavatula
@ 2021-02-25 12:23 ` pbhagavatula
2021-03-20 13:30 ` Jerin Jacob
2021-02-25 12:23 ` [dpdk-dev] [PATCH 4/4] event/octeontx2: timer always use virtual counter pbhagavatula
2021-03-21 8:49 ` [dpdk-dev] [PATCH v2 1/4] event/octeontx2: simplify timer bucket estimation pbhagavatula
3 siblings, 1 reply; 17+ messages in thread
From: pbhagavatula @ 2021-02-25 12:23 UTC (permalink / raw)
To: jerinj, Pavan Nikhilesh; +Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Reduce amount of memory used by chunk pool when the mempool used
is OCTEONTX2 NPA.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
drivers/event/octeontx2/otx2_tim_evdev.c | 19 ++++++++++---------
drivers/event/octeontx2/otx2_tim_evdev.h | 4 ++--
2 files changed, 12 insertions(+), 11 deletions(-)
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.c b/drivers/event/octeontx2/otx2_tim_evdev.c
index d1e967eb7..4fb002ddb 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.c
+++ b/drivers/event/octeontx2/otx2_tim_evdev.c
@@ -91,6 +91,8 @@ tim_chnk_pool_create(struct otx2_tim_ring *tim_ring,
if (cache_sz > RTE_MEMPOOL_CACHE_MAX_SIZE)
cache_sz = RTE_MEMPOOL_CACHE_MAX_SIZE;
+ cache_sz = cache_sz != 0 ? cache_sz : 2;
+ tim_ring->nb_chunks += (cache_sz * rte_lcore_count());
if (!tim_ring->disable_npa) {
tim_ring->chunk_pool = rte_mempool_create_empty(pool_name,
tim_ring->nb_chunks, tim_ring->chunk_sz,
@@ -268,16 +270,15 @@ otx2_tim_ring_create(struct rte_event_timer_adapter *adptr)
}
}
- tim_ring->nb_chunks = tim_ring->nb_timers / OTX2_TIM_NB_CHUNK_SLOTS(
- tim_ring->chunk_sz);
- tim_ring->nb_chunk_slots = OTX2_TIM_NB_CHUNK_SLOTS(tim_ring->chunk_sz);
-
- if (tim_ring->disable_npa)
+ if (tim_ring->disable_npa) {
+ tim_ring->nb_chunks =
+ tim_ring->nb_timers /
+ OTX2_TIM_NB_CHUNK_SLOTS(tim_ring->chunk_sz);
tim_ring->nb_chunks = tim_ring->nb_chunks * tim_ring->nb_bkts;
- else
- tim_ring->nb_chunks = tim_ring->nb_chunks + tim_ring->nb_bkts;
-
- /* Create buckets. */
+ } else {
+ tim_ring->nb_chunks = tim_ring->nb_timers;
+ }
+ tim_ring->nb_chunk_slots = OTX2_TIM_NB_CHUNK_SLOTS(tim_ring->chunk_sz);
tim_ring->bkt = rte_zmalloc("otx2_tim_bucket", (tim_ring->nb_bkts) *
sizeof(struct otx2_tim_bkt),
RTE_CACHE_LINE_SIZE);
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.h b/drivers/event/octeontx2/otx2_tim_evdev.h
index bf89b85b0..2a3b84a43 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.h
+++ b/drivers/event/octeontx2/otx2_tim_evdev.h
@@ -65,12 +65,12 @@
#define OTX2_MAX_TIM_RINGS (256)
#define OTX2_TIM_MAX_BUCKETS (0xFFFFF)
-#define OTX2_TIM_RING_DEF_CHUNK_SZ (4096)
+#define OTX2_TIM_RING_DEF_CHUNK_SZ (1024)
#define OTX2_TIM_CHUNK_ALIGNMENT (16)
#define OTX2_TIM_MAX_BURST (RTE_CACHE_LINE_SIZE / \
OTX2_TIM_CHUNK_ALIGNMENT)
#define OTX2_TIM_NB_CHUNK_SLOTS(sz) (((sz) / OTX2_TIM_CHUNK_ALIGNMENT) - 1)
-#define OTX2_TIM_MIN_CHUNK_SLOTS (0x1)
+#define OTX2_TIM_MIN_CHUNK_SLOTS (0x3F)
#define OTX2_TIM_MAX_CHUNK_SLOTS (0x1FFE)
#define OTX2_TIM_MIN_TMO_TKS (256)
--
2.17.1
^ permalink raw reply [flat|nested] 17+ messages in thread
* [dpdk-dev] [PATCH 4/4] event/octeontx2: timer always use virtual counter
2021-02-25 12:23 [dpdk-dev] [PATCH 1/4] event/octeontx2: simplify timer bucket estimation pbhagavatula
2021-02-25 12:23 ` [dpdk-dev] [PATCH 2/4] event/octeontx2: optimize timer arm routine pbhagavatula
2021-02-25 12:23 ` [dpdk-dev] [PATCH 3/4] event/octeontx2: reduce chunk pool memory usage pbhagavatula
@ 2021-02-25 12:23 ` pbhagavatula
2021-03-20 13:34 ` Jerin Jacob
2021-03-21 8:49 ` [dpdk-dev] [PATCH v2 1/4] event/octeontx2: simplify timer bucket estimation pbhagavatula
3 siblings, 1 reply; 17+ messages in thread
From: pbhagavatula @ 2021-02-25 12:23 UTC (permalink / raw)
To: jerinj, Pavan Nikhilesh; +Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Use virtual counter for estimating current bucket as PMU cannot be
reliably used to estimate time.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
drivers/event/octeontx2/otx2_tim_evdev.c | 19 +++-----------
drivers/event/octeontx2/otx2_tim_evdev.h | 32 +++++++++++++++++++++++
drivers/event/octeontx2/otx2_tim_worker.c | 4 +--
drivers/event/octeontx2/otx2_tim_worker.h | 2 +-
4 files changed, 38 insertions(+), 19 deletions(-)
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.c b/drivers/event/octeontx2/otx2_tim_evdev.c
index 4fb002ddb..926c2dce6 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.c
+++ b/drivers/event/octeontx2/otx2_tim_evdev.c
@@ -354,7 +354,7 @@ otx2_tim_calibrate_start_tsc(struct otx2_tim_ring *tim_ring)
for (icount = 0; icount < OTX2_TIM_CALIB_ITER; icount++) {
real_bkt = otx2_read64(tim_ring->base + TIM_LF_RING_REL) >> 44;
- bkt_cyc = rte_rdtsc();
+ bkt_cyc = tim_cntvct();
bucket = (bkt_cyc - tim_ring->ring_start_cyc) /
tim_ring->tck_int;
bucket = bucket % (tim_ring->nb_bkts);
@@ -389,20 +389,8 @@ otx2_tim_ring_start(const struct rte_event_timer_adapter *adptr)
tim_err_desc(rc);
goto fail;
}
-#ifdef RTE_ARM_EAL_RDTSC_USE_PMU
- uint64_t tenns_stmp, tenns_diff;
- uint64_t pmu_stmp;
-
- pmu_stmp = rte_rdtsc();
- asm volatile("mrs %0, cntvct_el0" : "=r" (tenns_stmp));
-
- tenns_diff = tenns_stmp - rsp->timestarted;
- pmu_stmp = pmu_stmp - (NSEC2TICK(tenns_diff * 10, rte_get_timer_hz()));
- tim_ring->ring_start_cyc = pmu_stmp;
-#else
tim_ring->ring_start_cyc = rsp->timestarted;
-#endif
- tim_ring->tck_int = NSEC2TICK(tim_ring->tck_nsec, rte_get_timer_hz());
+ tim_ring->tck_int = NSEC2TICK(tim_ring->tck_nsec, tim_cntfrq());
tim_ring->tot_int = tim_ring->tck_int * tim_ring->nb_bkts;
tim_ring->fast_div = rte_reciprocal_value_u64(tim_ring->tck_int);
tim_ring->fast_bkt = rte_reciprocal_value_u64(tim_ring->nb_bkts);
@@ -470,8 +458,7 @@ otx2_tim_stats_get(const struct rte_event_timer_adapter *adapter,
struct rte_event_timer_adapter_stats *stats)
{
struct otx2_tim_ring *tim_ring = adapter->data->adapter_priv;
- uint64_t bkt_cyc = rte_rdtsc() - tim_ring->ring_start_cyc;
-
+ uint64_t bkt_cyc = tim_cntvct() - tim_ring->ring_start_cyc;
stats->evtim_exp_count = __atomic_load_n(&tim_ring->arm_cnt,
__ATOMIC_RELAXED);
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.h b/drivers/event/octeontx2/otx2_tim_evdev.h
index 2a3b84a43..a890b41dc 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.h
+++ b/drivers/event/octeontx2/otx2_tim_evdev.h
@@ -176,6 +176,38 @@ tim_priv_get(void)
return mz->addr;
}
+#ifdef RTE_ARCH_ARM64
+static inline uint64_t
+tim_cntvct(void)
+{
+ uint64_t tsc;
+
+ asm volatile("mrs %0, cntvct_el0" : "=r"(tsc));
+ return tsc;
+}
+
+static inline uint64_t
+tim_cntfrq(void)
+{
+ uint64_t freq;
+
+ asm volatile("mrs %0, cntfrq_el0" : "=r"(freq));
+ return freq;
+}
+#else
+static inline uint64_t
+tim_cntvct(void)
+{
+ return 0;
+}
+
+static inline uint64_t
+tim_cntfrq(void)
+{
+ return 0;
+}
+#endif
+
#define TIM_ARM_FASTPATH_MODES \
FP(sp, 0, 0, 0, OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
FP(mp, 0, 0, 1, OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
diff --git a/drivers/event/octeontx2/otx2_tim_worker.c b/drivers/event/octeontx2/otx2_tim_worker.c
index 9946be733..b2b88cad0 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.c
+++ b/drivers/event/octeontx2/otx2_tim_worker.c
@@ -41,12 +41,12 @@ tim_format_event(const struct rte_event_timer * const tim,
static inline void
tim_sync_start_cyc(struct otx2_tim_ring *tim_ring)
{
- uint64_t cur_cyc = rte_rdtsc();
+ uint64_t cur_cyc = tim_cntvct();
uint32_t real_bkt;
if (cur_cyc - tim_ring->last_updt_cyc > tim_ring->tot_int) {
real_bkt = otx2_read64(tim_ring->base + TIM_LF_RING_REL) >> 44;
- cur_cyc = rte_rdtsc();
+ cur_cyc = tim_cntvct();
tim_ring->ring_start_cyc = cur_cyc -
(real_bkt * tim_ring->tck_int);
diff --git a/drivers/event/octeontx2/otx2_tim_worker.h b/drivers/event/octeontx2/otx2_tim_worker.h
index 9d8bcb974..7f6d0611e 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.h
+++ b/drivers/event/octeontx2/otx2_tim_worker.h
@@ -132,7 +132,7 @@ tim_get_target_bucket(struct otx2_tim_ring *const tim_ring,
const uint32_t rel_bkt, struct otx2_tim_bkt **bkt,
struct otx2_tim_bkt **mirr_bkt)
{
- const uint64_t bkt_cyc = rte_rdtsc() - tim_ring->ring_start_cyc;
+ const uint64_t bkt_cyc = tim_cntvct() - tim_ring->ring_start_cyc;
uint64_t bucket =
rte_reciprocal_divide_u64(bkt_cyc, &tim_ring->fast_div) +
rel_bkt;
--
2.17.1
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [dpdk-dev] [PATCH 3/4] event/octeontx2: reduce chunk pool memory usage
2021-02-25 12:23 ` [dpdk-dev] [PATCH 3/4] event/octeontx2: reduce chunk pool memory usage pbhagavatula
@ 2021-03-20 13:30 ` Jerin Jacob
0 siblings, 0 replies; 17+ messages in thread
From: Jerin Jacob @ 2021-03-20 13:30 UTC (permalink / raw)
To: Pavan Nikhilesh; +Cc: Jerin Jacob, dpdk-dev
On Thu, Feb 25, 2021 at 5:53 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Reduce amount of memory used by chunk pool when the mempool used
> is OCTEONTX2 NPA.
Please describe the existing and new memory allocation schemes.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
> drivers/event/octeontx2/otx2_tim_evdev.c | 19 ++++++++++---------
> drivers/event/octeontx2/otx2_tim_evdev.h | 4 ++--
> 2 files changed, 12 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/event/octeontx2/otx2_tim_evdev.c b/drivers/event/octeontx2/otx2_tim_evdev.c
> index d1e967eb7..4fb002ddb 100644
> --- a/drivers/event/octeontx2/otx2_tim_evdev.c
> +++ b/drivers/event/octeontx2/otx2_tim_evdev.c
> @@ -91,6 +91,8 @@ tim_chnk_pool_create(struct otx2_tim_ring *tim_ring,
> if (cache_sz > RTE_MEMPOOL_CACHE_MAX_SIZE)
> cache_sz = RTE_MEMPOOL_CACHE_MAX_SIZE;
>
> + cache_sz = cache_sz != 0 ? cache_sz : 2;
> + tim_ring->nb_chunks += (cache_sz * rte_lcore_count());
> if (!tim_ring->disable_npa) {
> tim_ring->chunk_pool = rte_mempool_create_empty(pool_name,
> tim_ring->nb_chunks, tim_ring->chunk_sz,
> @@ -268,16 +270,15 @@ otx2_tim_ring_create(struct rte_event_timer_adapter *adptr)
> }
> }
>
> - tim_ring->nb_chunks = tim_ring->nb_timers / OTX2_TIM_NB_CHUNK_SLOTS(
> - tim_ring->chunk_sz);
> - tim_ring->nb_chunk_slots = OTX2_TIM_NB_CHUNK_SLOTS(tim_ring->chunk_sz);
> -
> - if (tim_ring->disable_npa)
> + if (tim_ring->disable_npa) {
> + tim_ring->nb_chunks =
> + tim_ring->nb_timers /
> + OTX2_TIM_NB_CHUNK_SLOTS(tim_ring->chunk_sz);
> tim_ring->nb_chunks = tim_ring->nb_chunks * tim_ring->nb_bkts;
> - else
> - tim_ring->nb_chunks = tim_ring->nb_chunks + tim_ring->nb_bkts;
> -
> - /* Create buckets. */
> + } else {
> + tim_ring->nb_chunks = tim_ring->nb_timers;
> + }
> + tim_ring->nb_chunk_slots = OTX2_TIM_NB_CHUNK_SLOTS(tim_ring->chunk_sz);
> tim_ring->bkt = rte_zmalloc("otx2_tim_bucket", (tim_ring->nb_bkts) *
> sizeof(struct otx2_tim_bkt),
> RTE_CACHE_LINE_SIZE);
> diff --git a/drivers/event/octeontx2/otx2_tim_evdev.h b/drivers/event/octeontx2/otx2_tim_evdev.h
> index bf89b85b0..2a3b84a43 100644
> --- a/drivers/event/octeontx2/otx2_tim_evdev.h
> +++ b/drivers/event/octeontx2/otx2_tim_evdev.h
> @@ -65,12 +65,12 @@
>
> #define OTX2_MAX_TIM_RINGS (256)
> #define OTX2_TIM_MAX_BUCKETS (0xFFFFF)
> -#define OTX2_TIM_RING_DEF_CHUNK_SZ (4096)
> +#define OTX2_TIM_RING_DEF_CHUNK_SZ (1024)
> #define OTX2_TIM_CHUNK_ALIGNMENT (16)
> #define OTX2_TIM_MAX_BURST (RTE_CACHE_LINE_SIZE / \
> OTX2_TIM_CHUNK_ALIGNMENT)
> #define OTX2_TIM_NB_CHUNK_SLOTS(sz) (((sz) / OTX2_TIM_CHUNK_ALIGNMENT) - 1)
> -#define OTX2_TIM_MIN_CHUNK_SLOTS (0x1)
> +#define OTX2_TIM_MIN_CHUNK_SLOTS (0x3F)
> #define OTX2_TIM_MAX_CHUNK_SLOTS (0x1FFE)
> #define OTX2_TIM_MIN_TMO_TKS (256)
>
> --
> 2.17.1
>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [dpdk-dev] [PATCH 4/4] event/octeontx2: timer always use virtual counter
2021-02-25 12:23 ` [dpdk-dev] [PATCH 4/4] event/octeontx2: timer always use virtual counter pbhagavatula
@ 2021-03-20 13:34 ` Jerin Jacob
2021-03-21 7:11 ` [dpdk-dev] [EXT] " Pavan Nikhilesh Bhagavatula
0 siblings, 1 reply; 17+ messages in thread
From: Jerin Jacob @ 2021-03-20 13:34 UTC (permalink / raw)
To: Pavan Nikhilesh; +Cc: Jerin Jacob, dpdk-dev
On Thu, Feb 25, 2021 at 5:53 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Use virtual counter for estimating current bucket as PMU cannot be
> reliably used to estimate time.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> +#ifdef RTE_ARCH_ARM64
> +static inline uint64_t
> +tim_cntvct(void)
> +{
> + uint64_t tsc;
> +
> + asm volatile("mrs %0, cntvct_el0" : "=r"(tsc));
> + return tsc;
Reuse __rte_arm64_cntvct()
> +}
> +
> +static inline uint64_t
> +tim_cntfrq(void)
> +{
> + uint64_t freq;
> +
> + asm volatile("mrs %0, cntfrq_el0" : "=r"(freq));
> + return freq;
Reuse __rte_arm64_cntfrq()
Please fix the following checkpatch and check format errors too.
[1]
Wrong headline case:
"event/octeontx2: optimize timer arm routine":
arm --> Arm
Invalid patch(es) found - checked 4 patches
[2]
total: 0 errors, 47 warnings, 272 lines checked
Warning in drivers/event/octeontx2/otx2_tim_worker.h:
Using rte_smp_[r/w]mb
Warning in drivers/event/octeontx2/otx2_tim_worker.c:
Using __atomic_thread_fence
>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [dpdk-dev] [EXT] Re: [PATCH 4/4] event/octeontx2: timer always use virtual counter
2021-03-20 13:34 ` Jerin Jacob
@ 2021-03-21 7:11 ` Pavan Nikhilesh Bhagavatula
0 siblings, 0 replies; 17+ messages in thread
From: Pavan Nikhilesh Bhagavatula @ 2021-03-21 7:11 UTC (permalink / raw)
To: Jerin Jacob; +Cc: Jerin Jacob Kollanukkaran, dpdk-dev
>-----Original Message-----
>From: Jerin Jacob <jerinjacobk@gmail.com>
>Sent: Saturday, March 20, 2021 7:04 PM
>To: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>
>Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; dpdk-dev
><dev@dpdk.org>
>Subject: [EXT] Re: [dpdk-dev] [PATCH 4/4] event/octeontx2: timer
>always use virtual counter
>
>External Email
>
>----------------------------------------------------------------------
>On Thu, Feb 25, 2021 at 5:53 PM <pbhagavatula@marvell.com> wrote:
>>
>> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>>
>> Use virtual counter for estimating current bucket as PMU cannot be
>> reliably used to estimate time.
>>
>> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
>>
>> +#ifdef RTE_ARCH_ARM64
>> +static inline uint64_t
>> +tim_cntvct(void)
>> +{
>> + uint64_t tsc;
>> +
>> + asm volatile("mrs %0, cntvct_el0" : "=r"(tsc));
>> + return tsc;
>
>
>Reuse __rte_arm64_cntvct()
>
>> +}
>> +
>> +static inline uint64_t
>> +tim_cntfrq(void)
>> +{
>> + uint64_t freq;
>> +
>> + asm volatile("mrs %0, cntfrq_el0" : "=r"(freq));
>> + return freq;
>
>
>Reuse __rte_arm64_cntfrq()
Ack.
>
>
>Please fix the following checkpatch and check format errors too.
>
>
>[1]
>Wrong headline case:
> "event/octeontx2: optimize timer arm routine":
>arm --> Arm
I think checkpatch is interpreting this as Arm Inc., which I think we
can ignore in this context.
>
>Invalid patch(es) found - checked 4 patches
>
>
>[2]
>total: 0 errors, 47 warnings, 272 lines checked
>Warning in drivers/event/octeontx2/otx2_tim_worker.h:
>Using rte_smp_[r/w]mb
>Warning in drivers/event/octeontx2/otx2_tim_worker.c:
>Using __atomic_thread_fence
>
Ack.
>
>>
^ permalink raw reply [flat|nested] 17+ messages in thread
* [dpdk-dev] [PATCH v2 1/4] event/octeontx2: simplify timer bucket estimation
2021-02-25 12:23 [dpdk-dev] [PATCH 1/4] event/octeontx2: simplify timer bucket estimation pbhagavatula
` (2 preceding siblings ...)
2021-02-25 12:23 ` [dpdk-dev] [PATCH 4/4] event/octeontx2: timer always use virtual counter pbhagavatula
@ 2021-03-21 8:49 ` pbhagavatula
2021-03-21 8:49 ` [dpdk-dev] [PATCH v2 2/4] event/octeontx2: optimize timer arm routine pbhagavatula
` (3 more replies)
3 siblings, 4 replies; 17+ messages in thread
From: pbhagavatula @ 2021-03-21 8:49 UTC (permalink / raw)
To: jerinj, Pavan Nikhilesh; +Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Simplify timer bucket estimation: buckets need not be aligned to a
power of 2; instead, use reciprocal division to compute the mod.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
drivers/event/octeontx2/otx2_tim_evdev.c | 78 ++++-----------------
drivers/event/octeontx2/otx2_tim_evdev.h | 84 ++++++++---------------
drivers/event/octeontx2/otx2_tim_worker.c | 4 +-
drivers/event/octeontx2/otx2_tim_worker.h | 40 +++++------
4 files changed, 64 insertions(+), 142 deletions(-)
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.c b/drivers/event/octeontx2/otx2_tim_evdev.c
index 4c24cc8a6..d1e967eb7 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.c
+++ b/drivers/event/octeontx2/otx2_tim_evdev.c
@@ -34,27 +34,25 @@ tim_set_fp_ops(struct otx2_tim_ring *tim_ring)
{
uint8_t prod_flag = !tim_ring->prod_type_sp;
- /* [MOD/AND] [DFB/FB] [SP][MP]*/
- const rte_event_timer_arm_burst_t arm_burst[2][2][2][2] = {
-#define FP(_name, _f4, _f3, _f2, _f1, flags) \
- [_f4][_f3][_f2][_f1] = otx2_tim_arm_burst_ ## _name,
-TIM_ARM_FASTPATH_MODES
+ /* [DFB/FB] [SP][MP]*/
+ const rte_event_timer_arm_burst_t arm_burst[2][2][2] = {
+#define FP(_name, _f3, _f2, _f1, flags) \
+ [_f3][_f2][_f1] = otx2_tim_arm_burst_##_name,
+ TIM_ARM_FASTPATH_MODES
#undef FP
};
- const rte_event_timer_arm_tmo_tick_burst_t arm_tmo_burst[2][2][2] = {
-#define FP(_name, _f3, _f2, _f1, flags) \
- [_f3][_f2][_f1] = otx2_tim_arm_tmo_tick_burst_ ## _name,
-TIM_ARM_TMO_FASTPATH_MODES
+ const rte_event_timer_arm_tmo_tick_burst_t arm_tmo_burst[2][2] = {
+#define FP(_name, _f2, _f1, flags) \
+ [_f2][_f1] = otx2_tim_arm_tmo_tick_burst_##_name,
+ TIM_ARM_TMO_FASTPATH_MODES
#undef FP
};
otx2_tim_ops.arm_burst =
- arm_burst[tim_ring->enable_stats][tim_ring->optimized]
- [tim_ring->ena_dfb][prod_flag];
+ arm_burst[tim_ring->enable_stats][tim_ring->ena_dfb][prod_flag];
otx2_tim_ops.arm_tmo_tick_burst =
- arm_tmo_burst[tim_ring->enable_stats][tim_ring->optimized]
- [tim_ring->ena_dfb];
+ arm_tmo_burst[tim_ring->enable_stats][tim_ring->ena_dfb];
otx2_tim_ops.cancel_burst = otx2_tim_timer_cancel_burst;
}
@@ -70,51 +68,6 @@ otx2_tim_ring_info_get(const struct rte_event_timer_adapter *adptr,
sizeof(struct rte_event_timer_adapter_conf));
}
-static void
-tim_optimze_bkt_param(struct otx2_tim_ring *tim_ring)
-{
- uint64_t tck_nsec;
- uint32_t hbkts;
- uint32_t lbkts;
-
- hbkts = rte_align32pow2(tim_ring->nb_bkts);
- tck_nsec = RTE_ALIGN_MUL_CEIL(tim_ring->max_tout / (hbkts - 1), 10);
-
- if ((tck_nsec < TICK2NSEC(OTX2_TIM_MIN_TMO_TKS,
- tim_ring->tenns_clk_freq) ||
- hbkts > OTX2_TIM_MAX_BUCKETS))
- hbkts = 0;
-
- lbkts = rte_align32prevpow2(tim_ring->nb_bkts);
- tck_nsec = RTE_ALIGN_MUL_CEIL((tim_ring->max_tout / (lbkts - 1)), 10);
-
- if ((tck_nsec < TICK2NSEC(OTX2_TIM_MIN_TMO_TKS,
- tim_ring->tenns_clk_freq) ||
- lbkts > OTX2_TIM_MAX_BUCKETS))
- lbkts = 0;
-
- if (!hbkts && !lbkts)
- return;
-
- if (!hbkts) {
- tim_ring->nb_bkts = lbkts;
- goto end;
- } else if (!lbkts) {
- tim_ring->nb_bkts = hbkts;
- goto end;
- }
-
- tim_ring->nb_bkts = (hbkts - tim_ring->nb_bkts) <
- (tim_ring->nb_bkts - lbkts) ? hbkts : lbkts;
-end:
- tim_ring->optimized = true;
- tim_ring->tck_nsec = RTE_ALIGN_MUL_CEIL((tim_ring->max_tout /
- (tim_ring->nb_bkts - 1)), 10);
- otx2_tim_dbg("Optimized configured values");
- otx2_tim_dbg("Nb_bkts : %" PRIu32 "", tim_ring->nb_bkts);
- otx2_tim_dbg("Tck_nsec : %" PRIu64 "", tim_ring->tck_nsec);
-}
-
static int
tim_chnk_pool_create(struct otx2_tim_ring *tim_ring,
struct rte_event_timer_adapter_conf *rcfg)
@@ -319,14 +272,6 @@ otx2_tim_ring_create(struct rte_event_timer_adapter *adptr)
tim_ring->chunk_sz);
tim_ring->nb_chunk_slots = OTX2_TIM_NB_CHUNK_SLOTS(tim_ring->chunk_sz);
- /* Try to optimize the bucket parameters. */
- if ((rcfg->flags & RTE_EVENT_TIMER_ADAPTER_F_ADJUST_RES)) {
- if (rte_is_power_of_2(tim_ring->nb_bkts))
- tim_ring->optimized = true;
- else
- tim_optimze_bkt_param(tim_ring);
- }
-
if (tim_ring->disable_npa)
tim_ring->nb_chunks = tim_ring->nb_chunks * tim_ring->nb_bkts;
else
@@ -459,6 +404,7 @@ otx2_tim_ring_start(const struct rte_event_timer_adapter *adptr)
tim_ring->tck_int = NSEC2TICK(tim_ring->tck_nsec, rte_get_timer_hz());
tim_ring->tot_int = tim_ring->tck_int * tim_ring->nb_bkts;
tim_ring->fast_div = rte_reciprocal_value_u64(tim_ring->tck_int);
+ tim_ring->fast_bkt = rte_reciprocal_value_u64(tim_ring->nb_bkts);
otx2_tim_calibrate_start_tsc(tim_ring);
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.h b/drivers/event/octeontx2/otx2_tim_evdev.h
index 44e3c7b51..bf89b85b0 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.h
+++ b/drivers/event/octeontx2/otx2_tim_evdev.h
@@ -76,8 +76,6 @@
#define OTX2_TIM_SP 0x1
#define OTX2_TIM_MP 0x2
-#define OTX2_TIM_BKT_AND 0x4
-#define OTX2_TIM_BKT_MOD 0x8
#define OTX2_TIM_ENA_FB 0x10
#define OTX2_TIM_ENA_DFB 0x20
#define OTX2_TIM_ENA_STATS 0x40
@@ -149,11 +147,11 @@ struct otx2_tim_ring {
struct otx2_tim_bkt *bkt;
struct rte_mempool *chunk_pool;
struct rte_reciprocal_u64 fast_div;
+ struct rte_reciprocal_u64 fast_bkt;
uint64_t arm_cnt;
uint8_t prod_type_sp;
uint8_t enable_stats;
uint8_t disable_npa;
- uint8_t optimized;
uint8_t ena_dfb;
uint16_t ring_id;
uint32_t aura;
@@ -178,60 +176,38 @@ tim_priv_get(void)
return mz->addr;
}
-#define TIM_ARM_FASTPATH_MODES \
-FP(mod_sp, 0, 0, 0, 0, OTX2_TIM_BKT_MOD | OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
-FP(mod_mp, 0, 0, 0, 1, OTX2_TIM_BKT_MOD | OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
-FP(mod_fb_sp, 0, 0, 1, 0, OTX2_TIM_BKT_MOD | OTX2_TIM_ENA_FB | OTX2_TIM_SP) \
-FP(mod_fb_mp, 0, 0, 1, 1, OTX2_TIM_BKT_MOD | OTX2_TIM_ENA_FB | OTX2_TIM_MP) \
-FP(and_sp, 0, 1, 0, 0, OTX2_TIM_BKT_AND | OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
-FP(and_mp, 0, 1, 0, 1, OTX2_TIM_BKT_AND | OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
-FP(and_fb_sp, 0, 1, 1, 0, OTX2_TIM_BKT_AND | OTX2_TIM_ENA_FB | OTX2_TIM_SP) \
-FP(and_fb_mp, 0, 1, 1, 1, OTX2_TIM_BKT_AND | OTX2_TIM_ENA_FB | OTX2_TIM_MP) \
-FP(stats_mod_sp, 1, 0, 0, 0, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_MOD | \
- OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
-FP(stats_mod_mp, 1, 0, 0, 1, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_MOD | \
- OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
-FP(stats_mod_fb_sp, 1, 0, 1, 0, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_MOD | \
- OTX2_TIM_ENA_FB | OTX2_TIM_SP) \
-FP(stats_mod_fb_mp, 1, 0, 1, 1, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_MOD | \
- OTX2_TIM_ENA_FB | OTX2_TIM_MP) \
-FP(stats_and_sp, 1, 1, 0, 0, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_AND | \
- OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
-FP(stats_and_mp, 1, 1, 0, 1, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_AND | \
- OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
-FP(stats_and_fb_sp, 1, 1, 1, 0, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_AND | \
- OTX2_TIM_ENA_FB | OTX2_TIM_SP) \
-FP(stats_and_fb_mp, 1, 1, 1, 1, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_AND | \
- OTX2_TIM_ENA_FB | OTX2_TIM_MP)
-
-#define TIM_ARM_TMO_FASTPATH_MODES \
-FP(mod, 0, 0, 0, OTX2_TIM_BKT_MOD | OTX2_TIM_ENA_DFB) \
-FP(mod_fb, 0, 0, 1, OTX2_TIM_BKT_MOD | OTX2_TIM_ENA_FB) \
-FP(and, 0, 1, 0, OTX2_TIM_BKT_AND | OTX2_TIM_ENA_DFB) \
-FP(and_fb, 0, 1, 1, OTX2_TIM_BKT_AND | OTX2_TIM_ENA_FB) \
-FP(stats_mod, 1, 0, 0, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_MOD | \
- OTX2_TIM_ENA_DFB) \
-FP(stats_mod_fb, 1, 0, 1, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_MOD | \
- OTX2_TIM_ENA_FB) \
-FP(stats_and, 1, 1, 0, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_AND | \
- OTX2_TIM_ENA_DFB) \
-FP(stats_and_fb, 1, 1, 1, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_AND | \
- OTX2_TIM_ENA_FB)
-
-#define FP(_name, _f4, _f3, _f2, _f1, flags) \
-uint16_t \
-otx2_tim_arm_burst_ ## _name(const struct rte_event_timer_adapter *adptr, \
- struct rte_event_timer **tim, \
- const uint16_t nb_timers);
+#define TIM_ARM_FASTPATH_MODES \
+ FP(sp, 0, 0, 0, OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
+ FP(mp, 0, 0, 1, OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
+ FP(fb_sp, 0, 1, 0, OTX2_TIM_ENA_FB | OTX2_TIM_SP) \
+ FP(fb_mp, 0, 1, 1, OTX2_TIM_ENA_FB | OTX2_TIM_MP) \
+ FP(stats_mod_sp, 1, 0, 0, \
+ OTX2_TIM_ENA_STATS | OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
+ FP(stats_mod_mp, 1, 0, 1, \
+ OTX2_TIM_ENA_STATS | OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
+ FP(stats_mod_fb_sp, 1, 1, 0, \
+ OTX2_TIM_ENA_STATS | OTX2_TIM_ENA_FB | OTX2_TIM_SP) \
+ FP(stats_mod_fb_mp, 1, 1, 1, \
+ OTX2_TIM_ENA_STATS | OTX2_TIM_ENA_FB | OTX2_TIM_MP)
+
+#define TIM_ARM_TMO_FASTPATH_MODES \
+ FP(dfb, 0, 0, OTX2_TIM_ENA_DFB) \
+ FP(fb, 0, 1, OTX2_TIM_ENA_FB) \
+ FP(stats_dfb, 1, 0, OTX2_TIM_ENA_STATS | OTX2_TIM_ENA_DFB) \
+ FP(stats_fb, 1, 1, OTX2_TIM_ENA_STATS | OTX2_TIM_ENA_FB)
+
+#define FP(_name, _f3, _f2, _f1, flags) \
+ uint16_t otx2_tim_arm_burst_##_name( \
+ const struct rte_event_timer_adapter *adptr, \
+ struct rte_event_timer **tim, const uint16_t nb_timers);
TIM_ARM_FASTPATH_MODES
#undef FP
-#define FP(_name, _f3, _f2, _f1, flags) \
-uint16_t \
-otx2_tim_arm_tmo_tick_burst_ ## _name( \
- const struct rte_event_timer_adapter *adptr, \
- struct rte_event_timer **tim, \
- const uint64_t timeout_tick, const uint16_t nb_timers);
+#define FP(_name, _f2, _f1, flags) \
+ uint16_t otx2_tim_arm_tmo_tick_burst_##_name( \
+ const struct rte_event_timer_adapter *adptr, \
+ struct rte_event_timer **tim, const uint64_t timeout_tick, \
+ const uint16_t nb_timers);
TIM_ARM_TMO_FASTPATH_MODES
#undef FP
diff --git a/drivers/event/octeontx2/otx2_tim_worker.c b/drivers/event/octeontx2/otx2_tim_worker.c
index 4b5cfdc72..eb901844d 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.c
+++ b/drivers/event/octeontx2/otx2_tim_worker.c
@@ -136,7 +136,7 @@ tim_timer_arm_tmo_brst(const struct rte_event_timer_adapter *adptr,
return set_timers;
}
-#define FP(_name, _f4, _f3, _f2, _f1, _flags) \
+#define FP(_name, _f3, _f2, _f1, _flags) \
uint16_t __rte_noinline \
otx2_tim_arm_burst_ ## _name(const struct rte_event_timer_adapter *adptr, \
struct rte_event_timer **tim, \
@@ -147,7 +147,7 @@ otx2_tim_arm_burst_ ## _name(const struct rte_event_timer_adapter *adptr, \
TIM_ARM_FASTPATH_MODES
#undef FP
-#define FP(_name, _f3, _f2, _f1, _flags) \
+#define FP(_name, _f2, _f1, _flags) \
uint16_t __rte_noinline \
otx2_tim_arm_tmo_tick_burst_ ## _name( \
const struct rte_event_timer_adapter *adptr, \
diff --git a/drivers/event/octeontx2/otx2_tim_worker.h b/drivers/event/octeontx2/otx2_tim_worker.h
index af2f864d7..f03912b81 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.h
+++ b/drivers/event/octeontx2/otx2_tim_worker.h
@@ -115,27 +115,27 @@ tim_bkt_clr_nent(struct otx2_tim_bkt *bktp)
return __atomic_and_fetch(&bktp->w1, v, __ATOMIC_ACQ_REL);
}
+static inline uint64_t
+tim_bkt_fast_mod(uint64_t n, uint64_t d, struct rte_reciprocal_u64 R)
+{
+ return (n - (d * rte_reciprocal_divide_u64(n, &R)));
+}
+
static __rte_always_inline void
-tim_get_target_bucket(struct otx2_tim_ring * const tim_ring,
+tim_get_target_bucket(struct otx2_tim_ring *const tim_ring,
const uint32_t rel_bkt, struct otx2_tim_bkt **bkt,
- struct otx2_tim_bkt **mirr_bkt, const uint8_t flag)
+ struct otx2_tim_bkt **mirr_bkt)
{
const uint64_t bkt_cyc = rte_rdtsc() - tim_ring->ring_start_cyc;
- uint32_t bucket = rte_reciprocal_divide_u64(bkt_cyc,
- &tim_ring->fast_div) + rel_bkt;
- uint32_t mirr_bucket = 0;
-
- if (flag & OTX2_TIM_BKT_MOD) {
- bucket = bucket % tim_ring->nb_bkts;
- mirr_bucket = (bucket + (tim_ring->nb_bkts >> 1)) %
- tim_ring->nb_bkts;
- }
- if (flag & OTX2_TIM_BKT_AND) {
- bucket = bucket & (tim_ring->nb_bkts - 1);
- mirr_bucket = (bucket + (tim_ring->nb_bkts >> 1)) &
- (tim_ring->nb_bkts - 1);
- }
-
+ uint64_t bucket =
+ rte_reciprocal_divide_u64(bkt_cyc, &tim_ring->fast_div) +
+ rel_bkt;
+ uint64_t mirr_bucket = 0;
+
+ bucket =
+ tim_bkt_fast_mod(bucket, tim_ring->nb_bkts, tim_ring->fast_bkt);
+ mirr_bucket = tim_bkt_fast_mod(bucket + (tim_ring->nb_bkts >> 1),
+ tim_ring->nb_bkts, tim_ring->fast_bkt);
*bkt = &tim_ring->bkt[bucket];
*mirr_bkt = &tim_ring->bkt[mirr_bucket];
}
@@ -236,7 +236,7 @@ tim_add_entry_sp(struct otx2_tim_ring * const tim_ring,
int16_t rem;
__retry:
- tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt, flags);
+ tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt);
/* Get Bucket sema*/
lock_sema = tim_bkt_fetch_sema_lock(bkt);
@@ -322,7 +322,7 @@ tim_add_entry_mp(struct otx2_tim_ring * const tim_ring,
int16_t rem;
__retry:
- tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt, flags);
+ tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt);
/* Get Bucket sema*/
lock_sema = tim_bkt_fetch_sema_lock(bkt);
@@ -454,7 +454,7 @@ tim_add_entry_brst(struct otx2_tim_ring * const tim_ring,
uint8_t lock_cnt;
__retry:
- tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt, flags);
+ tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt);
/* Only one thread beyond this. */
lock_sema = tim_bkt_inc_lock(bkt);
--
2.17.1
^ permalink raw reply [flat|nested] 17+ messages in thread
* [dpdk-dev] [PATCH v2 2/4] event/octeontx2: optimize timer arm routine
2021-03-21 8:49 ` [dpdk-dev] [PATCH v2 1/4] event/octeontx2: simplify timer bucket estimation pbhagavatula
@ 2021-03-21 8:49 ` pbhagavatula
2021-03-21 8:49 ` [dpdk-dev] [PATCH v2 3/4] event/octeontx2: reduce chunk pool memory usage pbhagavatula
` (2 subsequent siblings)
3 siblings, 0 replies; 17+ messages in thread
From: pbhagavatula @ 2021-03-21 8:49 UTC (permalink / raw)
To: jerinj, Pavan Nikhilesh; +Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Use relaxed load exclusive when polling for other threads or
hardware to complete.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
drivers/event/octeontx2/otx2_tim_worker.c | 1 +
drivers/event/octeontx2/otx2_tim_worker.h | 162 ++++++++++++----------
2 files changed, 89 insertions(+), 74 deletions(-)
diff --git a/drivers/event/octeontx2/otx2_tim_worker.c b/drivers/event/octeontx2/otx2_tim_worker.c
index eb901844d..6a3511ec0 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.c
+++ b/drivers/event/octeontx2/otx2_tim_worker.c
@@ -170,6 +170,7 @@ otx2_tim_timer_cancel_burst(const struct rte_event_timer_adapter *adptr,
int ret;
RTE_SET_USED(adptr);
+ rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
for (index = 0; index < nb_timers; index++) {
if (tim[index]->state == RTE_EVENT_TIMER_CANCELED) {
rte_errno = EALREADY;
diff --git a/drivers/event/octeontx2/otx2_tim_worker.h b/drivers/event/octeontx2/otx2_tim_worker.h
index f03912b81..aa69da96c 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.h
+++ b/drivers/event/octeontx2/otx2_tim_worker.h
@@ -84,7 +84,13 @@ tim_bkt_inc_lock(struct otx2_tim_bkt *bktp)
static inline void
tim_bkt_dec_lock(struct otx2_tim_bkt *bktp)
{
- __atomic_add_fetch(&bktp->lock, 0xff, __ATOMIC_RELEASE);
+ __atomic_fetch_sub(&bktp->lock, 1, __ATOMIC_RELEASE);
+}
+
+static inline void
+tim_bkt_dec_lock_relaxed(struct otx2_tim_bkt *bktp)
+{
+ __atomic_fetch_sub(&bktp->lock, 1, __ATOMIC_RELAXED);
}
static inline uint32_t
@@ -246,22 +252,20 @@ tim_add_entry_sp(struct otx2_tim_ring * const tim_ring,
if (tim_bkt_get_nent(lock_sema) != 0) {
uint64_t hbt_state;
#ifdef RTE_ARCH_ARM64
- asm volatile(
- " ldaxr %[hbt], [%[w1]] \n"
- " tbz %[hbt], 33, dne%= \n"
- " sevl \n"
- "rty%=: wfe \n"
- " ldaxr %[hbt], [%[w1]] \n"
- " tbnz %[hbt], 33, rty%= \n"
- "dne%=: \n"
- : [hbt] "=&r" (hbt_state)
- : [w1] "r" ((&bkt->w1))
- : "memory"
- );
+ asm volatile(" ldxr %[hbt], [%[w1]] \n"
+ " tbz %[hbt], 33, dne%= \n"
+ " sevl \n"
+ "rty%=: wfe \n"
+ " ldxr %[hbt], [%[w1]] \n"
+ " tbnz %[hbt], 33, rty%= \n"
+ "dne%=: \n"
+ : [hbt] "=&r"(hbt_state)
+ : [w1] "r"((&bkt->w1))
+ : "memory");
#else
do {
hbt_state = __atomic_load_n(&bkt->w1,
- __ATOMIC_ACQUIRE);
+ __ATOMIC_RELAXED);
} while (hbt_state & BIT_ULL(33));
#endif
@@ -282,10 +286,10 @@ tim_add_entry_sp(struct otx2_tim_ring * const tim_ring,
if (unlikely(chunk == NULL)) {
bkt->chunk_remainder = 0;
- tim_bkt_dec_lock(bkt);
tim->impl_opaque[0] = 0;
tim->impl_opaque[1] = 0;
tim->state = RTE_EVENT_TIMER_ERROR;
+ tim_bkt_dec_lock(bkt);
return -ENOMEM;
}
mirr_bkt->current_chunk = (uintptr_t)chunk;
@@ -298,12 +302,11 @@ tim_add_entry_sp(struct otx2_tim_ring * const tim_ring,
/* Copy work entry. */
*chunk = *pent;
- tim_bkt_inc_nent(bkt);
- tim_bkt_dec_lock(bkt);
-
tim->impl_opaque[0] = (uintptr_t)chunk;
tim->impl_opaque[1] = (uintptr_t)bkt;
- tim->state = RTE_EVENT_TIMER_ARMED;
+ __atomic_store_n(&tim->state, RTE_EVENT_TIMER_ARMED, __ATOMIC_RELEASE);
+ tim_bkt_inc_nent(bkt);
+ tim_bkt_dec_lock_relaxed(bkt);
return 0;
}
@@ -331,22 +334,20 @@ tim_add_entry_mp(struct otx2_tim_ring * const tim_ring,
if (tim_bkt_get_nent(lock_sema) != 0) {
uint64_t hbt_state;
#ifdef RTE_ARCH_ARM64
- asm volatile(
- " ldaxr %[hbt], [%[w1]] \n"
- " tbz %[hbt], 33, dne%= \n"
- " sevl \n"
- "rty%=: wfe \n"
- " ldaxr %[hbt], [%[w1]] \n"
- " tbnz %[hbt], 33, rty%= \n"
- "dne%=: \n"
- : [hbt] "=&r" (hbt_state)
- : [w1] "r" ((&bkt->w1))
- : "memory"
- );
+ asm volatile(" ldxr %[hbt], [%[w1]] \n"
+ " tbz %[hbt], 33, dne%= \n"
+ " sevl \n"
+ "rty%=: wfe \n"
+ " ldxr %[hbt], [%[w1]] \n"
+ " tbnz %[hbt], 33, rty%= \n"
+ "dne%=: \n"
+ : [hbt] "=&r"(hbt_state)
+ : [w1] "r"((&bkt->w1))
+ : "memory");
#else
do {
hbt_state = __atomic_load_n(&bkt->w1,
- __ATOMIC_ACQUIRE);
+ __ATOMIC_RELAXED);
} while (hbt_state & BIT_ULL(33));
#endif
@@ -359,26 +360,23 @@ tim_add_entry_mp(struct otx2_tim_ring * const tim_ring,
rem = tim_bkt_fetch_rem(lock_sema);
if (rem < 0) {
+ tim_bkt_dec_lock(bkt);
#ifdef RTE_ARCH_ARM64
- asm volatile(
- " ldaxrh %w[rem], [%[crem]] \n"
- " tbz %w[rem], 15, dne%= \n"
- " sevl \n"
- "rty%=: wfe \n"
- " ldaxrh %w[rem], [%[crem]] \n"
- " tbnz %w[rem], 15, rty%= \n"
- "dne%=: \n"
- : [rem] "=&r" (rem)
- : [crem] "r" (&bkt->chunk_remainder)
- : "memory"
- );
+ asm volatile(" ldxr %[rem], [%[crem]] \n"
+ " tbz %[rem], 63, dne%= \n"
+ " sevl \n"
+ "rty%=: wfe \n"
+ " ldxr %[rem], [%[crem]] \n"
+ " tbnz %[rem], 63, rty%= \n"
+ "dne%=: \n"
+ : [rem] "=&r"(rem)
+ : [crem] "r"(&bkt->w1)
+ : "memory");
#else
- while (__atomic_load_n(&bkt->chunk_remainder,
- __ATOMIC_ACQUIRE) < 0)
+ while (__atomic_load_n((int64_t *)&bkt->w1, __ATOMIC_RELAXED) <
+ 0)
;
#endif
- /* Goto diff bucket. */
- tim_bkt_dec_lock(bkt);
goto __retry;
} else if (!rem) {
/* Only one thread can be here*/
@@ -388,18 +386,21 @@ tim_add_entry_mp(struct otx2_tim_ring * const tim_ring,
chunk = tim_insert_chunk(bkt, mirr_bkt, tim_ring);
if (unlikely(chunk == NULL)) {
- tim_bkt_set_rem(bkt, 0);
- tim_bkt_dec_lock(bkt);
tim->impl_opaque[0] = 0;
tim->impl_opaque[1] = 0;
tim->state = RTE_EVENT_TIMER_ERROR;
+ tim_bkt_set_rem(bkt, 0);
+ tim_bkt_dec_lock(bkt);
return -ENOMEM;
}
*chunk = *pent;
- while (tim_bkt_fetch_lock(lock_sema) !=
- (-tim_bkt_fetch_rem(lock_sema)))
- lock_sema = __atomic_load_n(&bkt->w1, __ATOMIC_ACQUIRE);
-
+ if (tim_bkt_fetch_lock(lock_sema)) {
+ do {
+ lock_sema = __atomic_load_n(&bkt->w1,
+ __ATOMIC_RELAXED);
+ } while (tim_bkt_fetch_lock(lock_sema) - 1);
+ rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
+ }
mirr_bkt->current_chunk = (uintptr_t)chunk;
__atomic_store_n(&bkt->chunk_remainder,
tim_ring->nb_chunk_slots - 1, __ATOMIC_RELEASE);
@@ -409,12 +410,11 @@ tim_add_entry_mp(struct otx2_tim_ring * const tim_ring,
*chunk = *pent;
}
- /* Copy work entry. */
- tim_bkt_inc_nent(bkt);
- tim_bkt_dec_lock(bkt);
tim->impl_opaque[0] = (uintptr_t)chunk;
tim->impl_opaque[1] = (uintptr_t)bkt;
- tim->state = RTE_EVENT_TIMER_ARMED;
+ __atomic_store_n(&tim->state, RTE_EVENT_TIMER_ARMED, __ATOMIC_RELEASE);
+ tim_bkt_inc_nent(bkt);
+ tim_bkt_dec_lock_relaxed(bkt);
return 0;
}
@@ -463,6 +463,23 @@ tim_add_entry_brst(struct otx2_tim_ring * const tim_ring,
if (lock_cnt) {
tim_bkt_dec_lock(bkt);
+#ifdef RTE_ARCH_ARM64
+ asm volatile(" ldxrb %w[lock_cnt], [%[lock]] \n"
+ " tst %w[lock_cnt], 255 \n"
+ " beq dne%= \n"
+ " sevl \n"
+ "rty%=: wfe \n"
+ " ldxrb %w[lock_cnt], [%[lock]] \n"
+ " tst %w[lock_cnt], 255 \n"
+ " bne rty%= \n"
+ "dne%=: \n"
+ : [lock_cnt] "=&r"(lock_cnt)
+ : [lock] "r"(&bkt->lock)
+ : "memory");
+#else
+ while (__atomic_load_n(&bkt->lock, __ATOMIC_RELAXED))
+ ;
+#endif
goto __retry;
}
@@ -471,22 +488,20 @@ tim_add_entry_brst(struct otx2_tim_ring * const tim_ring,
if (tim_bkt_get_nent(lock_sema) != 0) {
uint64_t hbt_state;
#ifdef RTE_ARCH_ARM64
- asm volatile(
- " ldaxr %[hbt], [%[w1]] \n"
- " tbz %[hbt], 33, dne%= \n"
- " sevl \n"
- "rty%=: wfe \n"
- " ldaxr %[hbt], [%[w1]] \n"
- " tbnz %[hbt], 33, rty%= \n"
- "dne%=: \n"
- : [hbt] "=&r" (hbt_state)
- : [w1] "r" ((&bkt->w1))
- : "memory"
- );
+ asm volatile(" ldxr %[hbt], [%[w1]] \n"
+ " tbz %[hbt], 33, dne%= \n"
+ " sevl \n"
+ "rty%=: wfe \n"
+ " ldxr %[hbt], [%[w1]] \n"
+ " tbnz %[hbt], 33, rty%= \n"
+ "dne%=: \n"
+ : [hbt] "=&r"(hbt_state)
+ : [w1] "r"((&bkt->w1))
+ : "memory");
#else
do {
hbt_state = __atomic_load_n(&bkt->w1,
- __ATOMIC_ACQUIRE);
+ __ATOMIC_RELAXED);
} while (hbt_state & BIT_ULL(33));
#endif
@@ -563,19 +578,18 @@ tim_rm_entry(struct rte_event_timer *tim)
bkt = (struct otx2_tim_bkt *)(uintptr_t)tim->impl_opaque[1];
lock_sema = tim_bkt_inc_lock(bkt);
if (tim_bkt_get_hbt(lock_sema) || !tim_bkt_get_nent(lock_sema)) {
- tim_bkt_dec_lock(bkt);
tim->impl_opaque[0] = 0;
tim->impl_opaque[1] = 0;
+ tim_bkt_dec_lock(bkt);
return -ENOENT;
}
entry->w0 = 0;
entry->wqe = 0;
- tim_bkt_dec_lock(bkt);
-
tim->state = RTE_EVENT_TIMER_CANCELED;
tim->impl_opaque[0] = 0;
tim->impl_opaque[1] = 0;
+ tim_bkt_dec_lock(bkt);
return 0;
}
--
2.17.1
^ permalink raw reply [flat|nested] 17+ messages in thread
* [dpdk-dev] [PATCH v2 3/4] event/octeontx2: reduce chunk pool memory usage
2021-03-21 8:49 ` [dpdk-dev] [PATCH v2 1/4] event/octeontx2: simplify timer bucket estimation pbhagavatula
2021-03-21 8:49 ` [dpdk-dev] [PATCH v2 2/4] event/octeontx2: optimize timer arm routine pbhagavatula
@ 2021-03-21 8:49 ` pbhagavatula
2021-03-21 8:49 ` [dpdk-dev] [PATCH v2 4/4] event/octeontx2: timer always use virtual counter pbhagavatula
2021-03-23 8:44 ` [dpdk-dev] [PATCH v3 1/4] event/octeontx2: simplify timer bucket estimation pbhagavatula
3 siblings, 0 replies; 17+ messages in thread
From: pbhagavatula @ 2021-03-21 8:49 UTC (permalink / raw)
To: jerinj, Pavan Nikhilesh; +Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Reduce amount of memory used by chunk pool when the mempool used
is OCTEONTX2 NPA.
Previously, the number of chunks configured when NPA is used was
equal to the number of timers requested plus the number of buckets,
and if the max timeout is long enough w.r.t. the resolution requested
there will be a large number of buckets, which would cause high memory
usage.
Reduce the number of chunks when NPA is used to the number of timers
requested, as the chunk lists of buckets that have been processed are
automatically freed.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
drivers/event/octeontx2/otx2_tim_evdev.c | 19 ++++++++++---------
drivers/event/octeontx2/otx2_tim_evdev.h | 2 +-
2 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.c b/drivers/event/octeontx2/otx2_tim_evdev.c
index d1e967eb7..4fb002ddb 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.c
+++ b/drivers/event/octeontx2/otx2_tim_evdev.c
@@ -91,6 +91,8 @@ tim_chnk_pool_create(struct otx2_tim_ring *tim_ring,
if (cache_sz > RTE_MEMPOOL_CACHE_MAX_SIZE)
cache_sz = RTE_MEMPOOL_CACHE_MAX_SIZE;
+ cache_sz = cache_sz != 0 ? cache_sz : 2;
+ tim_ring->nb_chunks += (cache_sz * rte_lcore_count());
if (!tim_ring->disable_npa) {
tim_ring->chunk_pool = rte_mempool_create_empty(pool_name,
tim_ring->nb_chunks, tim_ring->chunk_sz,
@@ -268,16 +270,15 @@ otx2_tim_ring_create(struct rte_event_timer_adapter *adptr)
}
}
- tim_ring->nb_chunks = tim_ring->nb_timers / OTX2_TIM_NB_CHUNK_SLOTS(
- tim_ring->chunk_sz);
- tim_ring->nb_chunk_slots = OTX2_TIM_NB_CHUNK_SLOTS(tim_ring->chunk_sz);
-
- if (tim_ring->disable_npa)
+ if (tim_ring->disable_npa) {
+ tim_ring->nb_chunks =
+ tim_ring->nb_timers /
+ OTX2_TIM_NB_CHUNK_SLOTS(tim_ring->chunk_sz);
tim_ring->nb_chunks = tim_ring->nb_chunks * tim_ring->nb_bkts;
- else
- tim_ring->nb_chunks = tim_ring->nb_chunks + tim_ring->nb_bkts;
-
- /* Create buckets. */
+ } else {
+ tim_ring->nb_chunks = tim_ring->nb_timers;
+ }
+ tim_ring->nb_chunk_slots = OTX2_TIM_NB_CHUNK_SLOTS(tim_ring->chunk_sz);
tim_ring->bkt = rte_zmalloc("otx2_tim_bucket", (tim_ring->nb_bkts) *
sizeof(struct otx2_tim_bkt),
RTE_CACHE_LINE_SIZE);
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.h b/drivers/event/octeontx2/otx2_tim_evdev.h
index bf89b85b0..0667d4576 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.h
+++ b/drivers/event/octeontx2/otx2_tim_evdev.h
@@ -70,7 +70,7 @@
#define OTX2_TIM_MAX_BURST (RTE_CACHE_LINE_SIZE / \
OTX2_TIM_CHUNK_ALIGNMENT)
#define OTX2_TIM_NB_CHUNK_SLOTS(sz) (((sz) / OTX2_TIM_CHUNK_ALIGNMENT) - 1)
-#define OTX2_TIM_MIN_CHUNK_SLOTS (0x1)
+#define OTX2_TIM_MIN_CHUNK_SLOTS (0x8)
#define OTX2_TIM_MAX_CHUNK_SLOTS (0x1FFE)
#define OTX2_TIM_MIN_TMO_TKS (256)
--
2.17.1
^ permalink raw reply [flat|nested] 17+ messages in thread
* [dpdk-dev] [PATCH v2 4/4] event/octeontx2: timer always use virtual counter
2021-03-21 8:49 ` [dpdk-dev] [PATCH v2 1/4] event/octeontx2: simplify timer bucket estimation pbhagavatula
2021-03-21 8:49 ` [dpdk-dev] [PATCH v2 2/4] event/octeontx2: optimize timer arm routine pbhagavatula
2021-03-21 8:49 ` [dpdk-dev] [PATCH v2 3/4] event/octeontx2: reduce chunk pool memory usage pbhagavatula
@ 2021-03-21 8:49 ` pbhagavatula
2021-03-22 16:13 ` Jerin Jacob
2021-03-23 8:44 ` [dpdk-dev] [PATCH v3 1/4] event/octeontx2: simplify timer bucket estimation pbhagavatula
3 siblings, 1 reply; 17+ messages in thread
From: pbhagavatula @ 2021-03-21 8:49 UTC (permalink / raw)
To: jerinj, Pavan Nikhilesh; +Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Use virtual counter for estimating current bucket as PMU cannot be
reliably used to estimate time.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
drivers/event/octeontx2/otx2_tim_evdev.c | 19 +++--------------
drivers/event/octeontx2/otx2_tim_evdev.h | 26 +++++++++++++++++++++++
drivers/event/octeontx2/otx2_tim_worker.c | 4 ++--
drivers/event/octeontx2/otx2_tim_worker.h | 2 +-
4 files changed, 32 insertions(+), 19 deletions(-)
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.c b/drivers/event/octeontx2/otx2_tim_evdev.c
index 4fb002ddb..926c2dce6 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.c
+++ b/drivers/event/octeontx2/otx2_tim_evdev.c
@@ -354,7 +354,7 @@ otx2_tim_calibrate_start_tsc(struct otx2_tim_ring *tim_ring)
for (icount = 0; icount < OTX2_TIM_CALIB_ITER; icount++) {
real_bkt = otx2_read64(tim_ring->base + TIM_LF_RING_REL) >> 44;
- bkt_cyc = rte_rdtsc();
+ bkt_cyc = tim_cntvct();
bucket = (bkt_cyc - tim_ring->ring_start_cyc) /
tim_ring->tck_int;
bucket = bucket % (tim_ring->nb_bkts);
@@ -389,20 +389,8 @@ otx2_tim_ring_start(const struct rte_event_timer_adapter *adptr)
tim_err_desc(rc);
goto fail;
}
-#ifdef RTE_ARM_EAL_RDTSC_USE_PMU
- uint64_t tenns_stmp, tenns_diff;
- uint64_t pmu_stmp;
-
- pmu_stmp = rte_rdtsc();
- asm volatile("mrs %0, cntvct_el0" : "=r" (tenns_stmp));
-
- tenns_diff = tenns_stmp - rsp->timestarted;
- pmu_stmp = pmu_stmp - (NSEC2TICK(tenns_diff * 10, rte_get_timer_hz()));
- tim_ring->ring_start_cyc = pmu_stmp;
-#else
tim_ring->ring_start_cyc = rsp->timestarted;
-#endif
- tim_ring->tck_int = NSEC2TICK(tim_ring->tck_nsec, rte_get_timer_hz());
+ tim_ring->tck_int = NSEC2TICK(tim_ring->tck_nsec, tim_cntfrq());
tim_ring->tot_int = tim_ring->tck_int * tim_ring->nb_bkts;
tim_ring->fast_div = rte_reciprocal_value_u64(tim_ring->tck_int);
tim_ring->fast_bkt = rte_reciprocal_value_u64(tim_ring->nb_bkts);
@@ -470,8 +458,7 @@ otx2_tim_stats_get(const struct rte_event_timer_adapter *adapter,
struct rte_event_timer_adapter_stats *stats)
{
struct otx2_tim_ring *tim_ring = adapter->data->adapter_priv;
- uint64_t bkt_cyc = rte_rdtsc() - tim_ring->ring_start_cyc;
-
+ uint64_t bkt_cyc = tim_cntvct() - tim_ring->ring_start_cyc;
stats->evtim_exp_count = __atomic_load_n(&tim_ring->arm_cnt,
__ATOMIC_RELAXED);
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.h b/drivers/event/octeontx2/otx2_tim_evdev.h
index 0667d4576..410880e14 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.h
+++ b/drivers/event/octeontx2/otx2_tim_evdev.h
@@ -176,6 +176,32 @@ tim_priv_get(void)
return mz->addr;
}
+#ifdef RTE_ARCH_ARM64
+static inline uint64_t
+tim_cntvct(void)
+{
+ return __rte_arm64_cntvct();
+}
+
+static inline uint64_t
+tim_cntfrq(void)
+{
+ return __rte_arm64_cntfrq();
+}
+#else
+static inline uint64_t
+tim_cntvct(void)
+{
+ return 0;
+}
+
+static inline uint64_t
+tim_cntfrq(void)
+{
+ return 0;
+}
+#endif
+
#define TIM_ARM_FASTPATH_MODES \
FP(sp, 0, 0, 0, OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
FP(mp, 0, 0, 1, OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
diff --git a/drivers/event/octeontx2/otx2_tim_worker.c b/drivers/event/octeontx2/otx2_tim_worker.c
index 6a3511ec0..9ee07958f 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.c
+++ b/drivers/event/octeontx2/otx2_tim_worker.c
@@ -41,12 +41,12 @@ tim_format_event(const struct rte_event_timer * const tim,
static inline void
tim_sync_start_cyc(struct otx2_tim_ring *tim_ring)
{
- uint64_t cur_cyc = rte_rdtsc();
+ uint64_t cur_cyc = tim_cntvct();
uint32_t real_bkt;
if (cur_cyc - tim_ring->last_updt_cyc > tim_ring->tot_int) {
real_bkt = otx2_read64(tim_ring->base + TIM_LF_RING_REL) >> 44;
- cur_cyc = rte_rdtsc();
+ cur_cyc = tim_cntvct();
tim_ring->ring_start_cyc = cur_cyc -
(real_bkt * tim_ring->tck_int);
diff --git a/drivers/event/octeontx2/otx2_tim_worker.h b/drivers/event/octeontx2/otx2_tim_worker.h
index aa69da96c..582b9c1b9 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.h
+++ b/drivers/event/octeontx2/otx2_tim_worker.h
@@ -132,7 +132,7 @@ tim_get_target_bucket(struct otx2_tim_ring *const tim_ring,
const uint32_t rel_bkt, struct otx2_tim_bkt **bkt,
struct otx2_tim_bkt **mirr_bkt)
{
- const uint64_t bkt_cyc = rte_rdtsc() - tim_ring->ring_start_cyc;
+ const uint64_t bkt_cyc = tim_cntvct() - tim_ring->ring_start_cyc;
uint64_t bucket =
rte_reciprocal_divide_u64(bkt_cyc, &tim_ring->fast_div) +
rel_bkt;
--
2.17.1
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [dpdk-dev] [PATCH v2 4/4] event/octeontx2: timer always use virtual counter
2021-03-21 8:49 ` [dpdk-dev] [PATCH v2 4/4] event/octeontx2: timer always use virtual counter pbhagavatula
@ 2021-03-22 16:13 ` Jerin Jacob
0 siblings, 0 replies; 17+ messages in thread
From: Jerin Jacob @ 2021-03-22 16:13 UTC (permalink / raw)
To: Pavan Nikhilesh; +Cc: Jerin Jacob, dpdk-dev
On Sun, Mar 21, 2021 at 2:20 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Use virtual counter for estimating current bucket as PMU cannot be
> reliably used to estimate time.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
There is a genuine Travis build issue with this patch. Please check
http://patches.dpdk.org/project/dpdk/patch/20210321084915.2649-4-pbhagavatula@marvell.com/
https://travis-ci.com/github/ovsrobot/dpdk/builds/220728709
> ---
> drivers/event/octeontx2/otx2_tim_evdev.c | 19 +++--------------
> drivers/event/octeontx2/otx2_tim_evdev.h | 26 +++++++++++++++++++++++
> drivers/event/octeontx2/otx2_tim_worker.c | 4 ++--
> drivers/event/octeontx2/otx2_tim_worker.h | 2 +-
> 4 files changed, 32 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/event/octeontx2/otx2_tim_evdev.c b/drivers/event/octeontx2/otx2_tim_evdev.c
> index 4fb002ddb..926c2dce6 100644
> --- a/drivers/event/octeontx2/otx2_tim_evdev.c
> +++ b/drivers/event/octeontx2/otx2_tim_evdev.c
> @@ -354,7 +354,7 @@ otx2_tim_calibrate_start_tsc(struct otx2_tim_ring *tim_ring)
>
> for (icount = 0; icount < OTX2_TIM_CALIB_ITER; icount++) {
> real_bkt = otx2_read64(tim_ring->base + TIM_LF_RING_REL) >> 44;
> - bkt_cyc = rte_rdtsc();
> + bkt_cyc = tim_cntvct();
> bucket = (bkt_cyc - tim_ring->ring_start_cyc) /
> tim_ring->tck_int;
> bucket = bucket % (tim_ring->nb_bkts);
> @@ -389,20 +389,8 @@ otx2_tim_ring_start(const struct rte_event_timer_adapter *adptr)
> tim_err_desc(rc);
> goto fail;
> }
> -#ifdef RTE_ARM_EAL_RDTSC_USE_PMU
> - uint64_t tenns_stmp, tenns_diff;
> - uint64_t pmu_stmp;
> -
> - pmu_stmp = rte_rdtsc();
> - asm volatile("mrs %0, cntvct_el0" : "=r" (tenns_stmp));
> -
> - tenns_diff = tenns_stmp - rsp->timestarted;
> - pmu_stmp = pmu_stmp - (NSEC2TICK(tenns_diff * 10, rte_get_timer_hz()));
> - tim_ring->ring_start_cyc = pmu_stmp;
> -#else
> tim_ring->ring_start_cyc = rsp->timestarted;
> -#endif
> - tim_ring->tck_int = NSEC2TICK(tim_ring->tck_nsec, rte_get_timer_hz());
> + tim_ring->tck_int = NSEC2TICK(tim_ring->tck_nsec, tim_cntfrq());
> tim_ring->tot_int = tim_ring->tck_int * tim_ring->nb_bkts;
> tim_ring->fast_div = rte_reciprocal_value_u64(tim_ring->tck_int);
> tim_ring->fast_bkt = rte_reciprocal_value_u64(tim_ring->nb_bkts);
> @@ -470,8 +458,7 @@ otx2_tim_stats_get(const struct rte_event_timer_adapter *adapter,
> struct rte_event_timer_adapter_stats *stats)
> {
> struct otx2_tim_ring *tim_ring = adapter->data->adapter_priv;
> - uint64_t bkt_cyc = rte_rdtsc() - tim_ring->ring_start_cyc;
> -
> + uint64_t bkt_cyc = tim_cntvct() - tim_ring->ring_start_cyc;
>
> stats->evtim_exp_count = __atomic_load_n(&tim_ring->arm_cnt,
> __ATOMIC_RELAXED);
> diff --git a/drivers/event/octeontx2/otx2_tim_evdev.h b/drivers/event/octeontx2/otx2_tim_evdev.h
> index 0667d4576..410880e14 100644
> --- a/drivers/event/octeontx2/otx2_tim_evdev.h
> +++ b/drivers/event/octeontx2/otx2_tim_evdev.h
> @@ -176,6 +176,32 @@ tim_priv_get(void)
> return mz->addr;
> }
>
> +#ifdef RTE_ARCH_ARM64
> +static inline uint64_t
> +tim_cntvct(void)
> +{
> + return __rte_arm64_cntvct();
> +}
> +
> +static inline uint64_t
> +tim_cntfrq(void)
> +{
> + return __rte_arm64_cntfrq();
> +}
> +#else
> +static inline uint64_t
> +tim_cntvct(void)
> +{
> + return 0;
> +}
> +
> +static inline uint64_t
> +tim_cntfrq(void)
> +{
> + return 0;
> +}
> +#endif
> +
> #define TIM_ARM_FASTPATH_MODES \
> FP(sp, 0, 0, 0, OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
> FP(mp, 0, 0, 1, OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
> diff --git a/drivers/event/octeontx2/otx2_tim_worker.c b/drivers/event/octeontx2/otx2_tim_worker.c
> index 6a3511ec0..9ee07958f 100644
> --- a/drivers/event/octeontx2/otx2_tim_worker.c
> +++ b/drivers/event/octeontx2/otx2_tim_worker.c
> @@ -41,12 +41,12 @@ tim_format_event(const struct rte_event_timer * const tim,
> static inline void
> tim_sync_start_cyc(struct otx2_tim_ring *tim_ring)
> {
> - uint64_t cur_cyc = rte_rdtsc();
> + uint64_t cur_cyc = tim_cntvct();
> uint32_t real_bkt;
>
> if (cur_cyc - tim_ring->last_updt_cyc > tim_ring->tot_int) {
> real_bkt = otx2_read64(tim_ring->base + TIM_LF_RING_REL) >> 44;
> - cur_cyc = rte_rdtsc();
> + cur_cyc = tim_cntvct();
>
> tim_ring->ring_start_cyc = cur_cyc -
> (real_bkt * tim_ring->tck_int);
> diff --git a/drivers/event/octeontx2/otx2_tim_worker.h b/drivers/event/octeontx2/otx2_tim_worker.h
> index aa69da96c..582b9c1b9 100644
> --- a/drivers/event/octeontx2/otx2_tim_worker.h
> +++ b/drivers/event/octeontx2/otx2_tim_worker.h
> @@ -132,7 +132,7 @@ tim_get_target_bucket(struct otx2_tim_ring *const tim_ring,
> const uint32_t rel_bkt, struct otx2_tim_bkt **bkt,
> struct otx2_tim_bkt **mirr_bkt)
> {
> - const uint64_t bkt_cyc = rte_rdtsc() - tim_ring->ring_start_cyc;
> + const uint64_t bkt_cyc = tim_cntvct() - tim_ring->ring_start_cyc;
> uint64_t bucket =
> rte_reciprocal_divide_u64(bkt_cyc, &tim_ring->fast_div) +
> rel_bkt;
> --
> 2.17.1
>
^ permalink raw reply [flat|nested] 17+ messages in thread
* [dpdk-dev] [PATCH v3 1/4] event/octeontx2: simplify timer bucket estimation
2021-03-21 8:49 ` [dpdk-dev] [PATCH v2 1/4] event/octeontx2: simplify timer bucket estimation pbhagavatula
` (2 preceding siblings ...)
2021-03-21 8:49 ` [dpdk-dev] [PATCH v2 4/4] event/octeontx2: timer always use virtual counter pbhagavatula
@ 2021-03-23 8:44 ` pbhagavatula
2021-03-23 8:44 ` [dpdk-dev] [PATCH v3 2/4] event/octeontx2: optimize timer arm routine pbhagavatula
` (3 more replies)
3 siblings, 4 replies; 17+ messages in thread
From: pbhagavatula @ 2021-03-23 8:44 UTC (permalink / raw)
To: jerinj, Pavan Nikhilesh; +Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Simplify timer bucket estimation: we need not align buckets to a
power of 2; instead, use reciprocal division to compute the modulo.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
drivers/event/octeontx2/otx2_tim_evdev.c | 78 ++++-----------------
drivers/event/octeontx2/otx2_tim_evdev.h | 84 ++++++++---------------
drivers/event/octeontx2/otx2_tim_worker.c | 4 +-
drivers/event/octeontx2/otx2_tim_worker.h | 40 +++++------
4 files changed, 64 insertions(+), 142 deletions(-)
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.c b/drivers/event/octeontx2/otx2_tim_evdev.c
index 4c24cc8a6..d1e967eb7 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.c
+++ b/drivers/event/octeontx2/otx2_tim_evdev.c
@@ -34,27 +34,25 @@ tim_set_fp_ops(struct otx2_tim_ring *tim_ring)
{
uint8_t prod_flag = !tim_ring->prod_type_sp;
- /* [MOD/AND] [DFB/FB] [SP][MP]*/
- const rte_event_timer_arm_burst_t arm_burst[2][2][2][2] = {
-#define FP(_name, _f4, _f3, _f2, _f1, flags) \
- [_f4][_f3][_f2][_f1] = otx2_tim_arm_burst_ ## _name,
-TIM_ARM_FASTPATH_MODES
+ /* [DFB/FB] [SP][MP]*/
+ const rte_event_timer_arm_burst_t arm_burst[2][2][2] = {
+#define FP(_name, _f3, _f2, _f1, flags) \
+ [_f3][_f2][_f1] = otx2_tim_arm_burst_##_name,
+ TIM_ARM_FASTPATH_MODES
#undef FP
};
- const rte_event_timer_arm_tmo_tick_burst_t arm_tmo_burst[2][2][2] = {
-#define FP(_name, _f3, _f2, _f1, flags) \
- [_f3][_f2][_f1] = otx2_tim_arm_tmo_tick_burst_ ## _name,
-TIM_ARM_TMO_FASTPATH_MODES
+ const rte_event_timer_arm_tmo_tick_burst_t arm_tmo_burst[2][2] = {
+#define FP(_name, _f2, _f1, flags) \
+ [_f2][_f1] = otx2_tim_arm_tmo_tick_burst_##_name,
+ TIM_ARM_TMO_FASTPATH_MODES
#undef FP
};
otx2_tim_ops.arm_burst =
- arm_burst[tim_ring->enable_stats][tim_ring->optimized]
- [tim_ring->ena_dfb][prod_flag];
+ arm_burst[tim_ring->enable_stats][tim_ring->ena_dfb][prod_flag];
otx2_tim_ops.arm_tmo_tick_burst =
- arm_tmo_burst[tim_ring->enable_stats][tim_ring->optimized]
- [tim_ring->ena_dfb];
+ arm_tmo_burst[tim_ring->enable_stats][tim_ring->ena_dfb];
otx2_tim_ops.cancel_burst = otx2_tim_timer_cancel_burst;
}
@@ -70,51 +68,6 @@ otx2_tim_ring_info_get(const struct rte_event_timer_adapter *adptr,
sizeof(struct rte_event_timer_adapter_conf));
}
-static void
-tim_optimze_bkt_param(struct otx2_tim_ring *tim_ring)
-{
- uint64_t tck_nsec;
- uint32_t hbkts;
- uint32_t lbkts;
-
- hbkts = rte_align32pow2(tim_ring->nb_bkts);
- tck_nsec = RTE_ALIGN_MUL_CEIL(tim_ring->max_tout / (hbkts - 1), 10);
-
- if ((tck_nsec < TICK2NSEC(OTX2_TIM_MIN_TMO_TKS,
- tim_ring->tenns_clk_freq) ||
- hbkts > OTX2_TIM_MAX_BUCKETS))
- hbkts = 0;
-
- lbkts = rte_align32prevpow2(tim_ring->nb_bkts);
- tck_nsec = RTE_ALIGN_MUL_CEIL((tim_ring->max_tout / (lbkts - 1)), 10);
-
- if ((tck_nsec < TICK2NSEC(OTX2_TIM_MIN_TMO_TKS,
- tim_ring->tenns_clk_freq) ||
- lbkts > OTX2_TIM_MAX_BUCKETS))
- lbkts = 0;
-
- if (!hbkts && !lbkts)
- return;
-
- if (!hbkts) {
- tim_ring->nb_bkts = lbkts;
- goto end;
- } else if (!lbkts) {
- tim_ring->nb_bkts = hbkts;
- goto end;
- }
-
- tim_ring->nb_bkts = (hbkts - tim_ring->nb_bkts) <
- (tim_ring->nb_bkts - lbkts) ? hbkts : lbkts;
-end:
- tim_ring->optimized = true;
- tim_ring->tck_nsec = RTE_ALIGN_MUL_CEIL((tim_ring->max_tout /
- (tim_ring->nb_bkts - 1)), 10);
- otx2_tim_dbg("Optimized configured values");
- otx2_tim_dbg("Nb_bkts : %" PRIu32 "", tim_ring->nb_bkts);
- otx2_tim_dbg("Tck_nsec : %" PRIu64 "", tim_ring->tck_nsec);
-}
-
static int
tim_chnk_pool_create(struct otx2_tim_ring *tim_ring,
struct rte_event_timer_adapter_conf *rcfg)
@@ -319,14 +272,6 @@ otx2_tim_ring_create(struct rte_event_timer_adapter *adptr)
tim_ring->chunk_sz);
tim_ring->nb_chunk_slots = OTX2_TIM_NB_CHUNK_SLOTS(tim_ring->chunk_sz);
- /* Try to optimize the bucket parameters. */
- if ((rcfg->flags & RTE_EVENT_TIMER_ADAPTER_F_ADJUST_RES)) {
- if (rte_is_power_of_2(tim_ring->nb_bkts))
- tim_ring->optimized = true;
- else
- tim_optimze_bkt_param(tim_ring);
- }
-
if (tim_ring->disable_npa)
tim_ring->nb_chunks = tim_ring->nb_chunks * tim_ring->nb_bkts;
else
@@ -459,6 +404,7 @@ otx2_tim_ring_start(const struct rte_event_timer_adapter *adptr)
tim_ring->tck_int = NSEC2TICK(tim_ring->tck_nsec, rte_get_timer_hz());
tim_ring->tot_int = tim_ring->tck_int * tim_ring->nb_bkts;
tim_ring->fast_div = rte_reciprocal_value_u64(tim_ring->tck_int);
+ tim_ring->fast_bkt = rte_reciprocal_value_u64(tim_ring->nb_bkts);
otx2_tim_calibrate_start_tsc(tim_ring);
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.h b/drivers/event/octeontx2/otx2_tim_evdev.h
index 44e3c7b51..bf89b85b0 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.h
+++ b/drivers/event/octeontx2/otx2_tim_evdev.h
@@ -76,8 +76,6 @@
#define OTX2_TIM_SP 0x1
#define OTX2_TIM_MP 0x2
-#define OTX2_TIM_BKT_AND 0x4
-#define OTX2_TIM_BKT_MOD 0x8
#define OTX2_TIM_ENA_FB 0x10
#define OTX2_TIM_ENA_DFB 0x20
#define OTX2_TIM_ENA_STATS 0x40
@@ -149,11 +147,11 @@ struct otx2_tim_ring {
struct otx2_tim_bkt *bkt;
struct rte_mempool *chunk_pool;
struct rte_reciprocal_u64 fast_div;
+ struct rte_reciprocal_u64 fast_bkt;
uint64_t arm_cnt;
uint8_t prod_type_sp;
uint8_t enable_stats;
uint8_t disable_npa;
- uint8_t optimized;
uint8_t ena_dfb;
uint16_t ring_id;
uint32_t aura;
@@ -178,60 +176,38 @@ tim_priv_get(void)
return mz->addr;
}
-#define TIM_ARM_FASTPATH_MODES \
-FP(mod_sp, 0, 0, 0, 0, OTX2_TIM_BKT_MOD | OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
-FP(mod_mp, 0, 0, 0, 1, OTX2_TIM_BKT_MOD | OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
-FP(mod_fb_sp, 0, 0, 1, 0, OTX2_TIM_BKT_MOD | OTX2_TIM_ENA_FB | OTX2_TIM_SP) \
-FP(mod_fb_mp, 0, 0, 1, 1, OTX2_TIM_BKT_MOD | OTX2_TIM_ENA_FB | OTX2_TIM_MP) \
-FP(and_sp, 0, 1, 0, 0, OTX2_TIM_BKT_AND | OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
-FP(and_mp, 0, 1, 0, 1, OTX2_TIM_BKT_AND | OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
-FP(and_fb_sp, 0, 1, 1, 0, OTX2_TIM_BKT_AND | OTX2_TIM_ENA_FB | OTX2_TIM_SP) \
-FP(and_fb_mp, 0, 1, 1, 1, OTX2_TIM_BKT_AND | OTX2_TIM_ENA_FB | OTX2_TIM_MP) \
-FP(stats_mod_sp, 1, 0, 0, 0, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_MOD | \
- OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
-FP(stats_mod_mp, 1, 0, 0, 1, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_MOD | \
- OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
-FP(stats_mod_fb_sp, 1, 0, 1, 0, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_MOD | \
- OTX2_TIM_ENA_FB | OTX2_TIM_SP) \
-FP(stats_mod_fb_mp, 1, 0, 1, 1, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_MOD | \
- OTX2_TIM_ENA_FB | OTX2_TIM_MP) \
-FP(stats_and_sp, 1, 1, 0, 0, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_AND | \
- OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
-FP(stats_and_mp, 1, 1, 0, 1, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_AND | \
- OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
-FP(stats_and_fb_sp, 1, 1, 1, 0, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_AND | \
- OTX2_TIM_ENA_FB | OTX2_TIM_SP) \
-FP(stats_and_fb_mp, 1, 1, 1, 1, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_AND | \
- OTX2_TIM_ENA_FB | OTX2_TIM_MP)
-
-#define TIM_ARM_TMO_FASTPATH_MODES \
-FP(mod, 0, 0, 0, OTX2_TIM_BKT_MOD | OTX2_TIM_ENA_DFB) \
-FP(mod_fb, 0, 0, 1, OTX2_TIM_BKT_MOD | OTX2_TIM_ENA_FB) \
-FP(and, 0, 1, 0, OTX2_TIM_BKT_AND | OTX2_TIM_ENA_DFB) \
-FP(and_fb, 0, 1, 1, OTX2_TIM_BKT_AND | OTX2_TIM_ENA_FB) \
-FP(stats_mod, 1, 0, 0, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_MOD | \
- OTX2_TIM_ENA_DFB) \
-FP(stats_mod_fb, 1, 0, 1, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_MOD | \
- OTX2_TIM_ENA_FB) \
-FP(stats_and, 1, 1, 0, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_AND | \
- OTX2_TIM_ENA_DFB) \
-FP(stats_and_fb, 1, 1, 1, OTX2_TIM_ENA_STATS | OTX2_TIM_BKT_AND | \
- OTX2_TIM_ENA_FB)
-
-#define FP(_name, _f4, _f3, _f2, _f1, flags) \
-uint16_t \
-otx2_tim_arm_burst_ ## _name(const struct rte_event_timer_adapter *adptr, \
- struct rte_event_timer **tim, \
- const uint16_t nb_timers);
+#define TIM_ARM_FASTPATH_MODES \
+ FP(sp, 0, 0, 0, OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
+ FP(mp, 0, 0, 1, OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
+ FP(fb_sp, 0, 1, 0, OTX2_TIM_ENA_FB | OTX2_TIM_SP) \
+ FP(fb_mp, 0, 1, 1, OTX2_TIM_ENA_FB | OTX2_TIM_MP) \
+ FP(stats_mod_sp, 1, 0, 0, \
+ OTX2_TIM_ENA_STATS | OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
+ FP(stats_mod_mp, 1, 0, 1, \
+ OTX2_TIM_ENA_STATS | OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
+ FP(stats_mod_fb_sp, 1, 1, 0, \
+ OTX2_TIM_ENA_STATS | OTX2_TIM_ENA_FB | OTX2_TIM_SP) \
+ FP(stats_mod_fb_mp, 1, 1, 1, \
+ OTX2_TIM_ENA_STATS | OTX2_TIM_ENA_FB | OTX2_TIM_MP)
+
+#define TIM_ARM_TMO_FASTPATH_MODES \
+ FP(dfb, 0, 0, OTX2_TIM_ENA_DFB) \
+ FP(fb, 0, 1, OTX2_TIM_ENA_FB) \
+ FP(stats_dfb, 1, 0, OTX2_TIM_ENA_STATS | OTX2_TIM_ENA_DFB) \
+ FP(stats_fb, 1, 1, OTX2_TIM_ENA_STATS | OTX2_TIM_ENA_FB)
+
+#define FP(_name, _f3, _f2, _f1, flags) \
+ uint16_t otx2_tim_arm_burst_##_name( \
+ const struct rte_event_timer_adapter *adptr, \
+ struct rte_event_timer **tim, const uint16_t nb_timers);
TIM_ARM_FASTPATH_MODES
#undef FP
-#define FP(_name, _f3, _f2, _f1, flags) \
-uint16_t \
-otx2_tim_arm_tmo_tick_burst_ ## _name( \
- const struct rte_event_timer_adapter *adptr, \
- struct rte_event_timer **tim, \
- const uint64_t timeout_tick, const uint16_t nb_timers);
+#define FP(_name, _f2, _f1, flags) \
+ uint16_t otx2_tim_arm_tmo_tick_burst_##_name( \
+ const struct rte_event_timer_adapter *adptr, \
+ struct rte_event_timer **tim, const uint64_t timeout_tick, \
+ const uint16_t nb_timers);
TIM_ARM_TMO_FASTPATH_MODES
#undef FP
diff --git a/drivers/event/octeontx2/otx2_tim_worker.c b/drivers/event/octeontx2/otx2_tim_worker.c
index 4b5cfdc72..eb901844d 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.c
+++ b/drivers/event/octeontx2/otx2_tim_worker.c
@@ -136,7 +136,7 @@ tim_timer_arm_tmo_brst(const struct rte_event_timer_adapter *adptr,
return set_timers;
}
-#define FP(_name, _f4, _f3, _f2, _f1, _flags) \
+#define FP(_name, _f3, _f2, _f1, _flags) \
uint16_t __rte_noinline \
otx2_tim_arm_burst_ ## _name(const struct rte_event_timer_adapter *adptr, \
struct rte_event_timer **tim, \
@@ -147,7 +147,7 @@ otx2_tim_arm_burst_ ## _name(const struct rte_event_timer_adapter *adptr, \
TIM_ARM_FASTPATH_MODES
#undef FP
-#define FP(_name, _f3, _f2, _f1, _flags) \
+#define FP(_name, _f2, _f1, _flags) \
uint16_t __rte_noinline \
otx2_tim_arm_tmo_tick_burst_ ## _name( \
const struct rte_event_timer_adapter *adptr, \
diff --git a/drivers/event/octeontx2/otx2_tim_worker.h b/drivers/event/octeontx2/otx2_tim_worker.h
index af2f864d7..f03912b81 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.h
+++ b/drivers/event/octeontx2/otx2_tim_worker.h
@@ -115,27 +115,27 @@ tim_bkt_clr_nent(struct otx2_tim_bkt *bktp)
return __atomic_and_fetch(&bktp->w1, v, __ATOMIC_ACQ_REL);
}
+static inline uint64_t
+tim_bkt_fast_mod(uint64_t n, uint64_t d, struct rte_reciprocal_u64 R)
+{
+ return (n - (d * rte_reciprocal_divide_u64(n, &R)));
+}
+
static __rte_always_inline void
-tim_get_target_bucket(struct otx2_tim_ring * const tim_ring,
+tim_get_target_bucket(struct otx2_tim_ring *const tim_ring,
const uint32_t rel_bkt, struct otx2_tim_bkt **bkt,
- struct otx2_tim_bkt **mirr_bkt, const uint8_t flag)
+ struct otx2_tim_bkt **mirr_bkt)
{
const uint64_t bkt_cyc = rte_rdtsc() - tim_ring->ring_start_cyc;
- uint32_t bucket = rte_reciprocal_divide_u64(bkt_cyc,
- &tim_ring->fast_div) + rel_bkt;
- uint32_t mirr_bucket = 0;
-
- if (flag & OTX2_TIM_BKT_MOD) {
- bucket = bucket % tim_ring->nb_bkts;
- mirr_bucket = (bucket + (tim_ring->nb_bkts >> 1)) %
- tim_ring->nb_bkts;
- }
- if (flag & OTX2_TIM_BKT_AND) {
- bucket = bucket & (tim_ring->nb_bkts - 1);
- mirr_bucket = (bucket + (tim_ring->nb_bkts >> 1)) &
- (tim_ring->nb_bkts - 1);
- }
-
+ uint64_t bucket =
+ rte_reciprocal_divide_u64(bkt_cyc, &tim_ring->fast_div) +
+ rel_bkt;
+ uint64_t mirr_bucket = 0;
+
+ bucket =
+ tim_bkt_fast_mod(bucket, tim_ring->nb_bkts, tim_ring->fast_bkt);
+ mirr_bucket = tim_bkt_fast_mod(bucket + (tim_ring->nb_bkts >> 1),
+ tim_ring->nb_bkts, tim_ring->fast_bkt);
*bkt = &tim_ring->bkt[bucket];
*mirr_bkt = &tim_ring->bkt[mirr_bucket];
}
@@ -236,7 +236,7 @@ tim_add_entry_sp(struct otx2_tim_ring * const tim_ring,
int16_t rem;
__retry:
- tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt, flags);
+ tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt);
/* Get Bucket sema*/
lock_sema = tim_bkt_fetch_sema_lock(bkt);
@@ -322,7 +322,7 @@ tim_add_entry_mp(struct otx2_tim_ring * const tim_ring,
int16_t rem;
__retry:
- tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt, flags);
+ tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt);
/* Get Bucket sema*/
lock_sema = tim_bkt_fetch_sema_lock(bkt);
@@ -454,7 +454,7 @@ tim_add_entry_brst(struct otx2_tim_ring * const tim_ring,
uint8_t lock_cnt;
__retry:
- tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt, flags);
+ tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt);
/* Only one thread beyond this. */
lock_sema = tim_bkt_inc_lock(bkt);
--
2.17.1
^ permalink raw reply [flat|nested] 17+ messages in thread
* [dpdk-dev] [PATCH v3 2/4] event/octeontx2: optimize timer arm routine
2021-03-23 8:44 ` [dpdk-dev] [PATCH v3 1/4] event/octeontx2: simplify timer bucket estimation pbhagavatula
@ 2021-03-23 8:44 ` pbhagavatula
2021-03-23 8:44 ` [dpdk-dev] [PATCH v3 3/4] event/octeontx2: reduce chunk pool memory usage pbhagavatula
` (2 subsequent siblings)
3 siblings, 0 replies; 17+ messages in thread
From: pbhagavatula @ 2021-03-23 8:44 UTC (permalink / raw)
To: jerinj, Pavan Nikhilesh; +Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Use relaxed load exclusive when polling for other threads or
hardware to complete.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
v3 Changes:
- Fix incorrect asm register usage detected by clang.
drivers/event/octeontx2/otx2_tim_worker.c | 1 +
drivers/event/octeontx2/otx2_tim_worker.h | 163 ++++++++++++----------
2 files changed, 90 insertions(+), 74 deletions(-)
diff --git a/drivers/event/octeontx2/otx2_tim_worker.c b/drivers/event/octeontx2/otx2_tim_worker.c
index eb901844d..6a3511ec0 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.c
+++ b/drivers/event/octeontx2/otx2_tim_worker.c
@@ -170,6 +170,7 @@ otx2_tim_timer_cancel_burst(const struct rte_event_timer_adapter *adptr,
int ret;
RTE_SET_USED(adptr);
+ rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
for (index = 0; index < nb_timers; index++) {
if (tim[index]->state == RTE_EVENT_TIMER_CANCELED) {
rte_errno = EALREADY;
diff --git a/drivers/event/octeontx2/otx2_tim_worker.h b/drivers/event/octeontx2/otx2_tim_worker.h
index f03912b81..5ece8fd05 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.h
+++ b/drivers/event/octeontx2/otx2_tim_worker.h
@@ -84,7 +84,13 @@ tim_bkt_inc_lock(struct otx2_tim_bkt *bktp)
static inline void
tim_bkt_dec_lock(struct otx2_tim_bkt *bktp)
{
- __atomic_add_fetch(&bktp->lock, 0xff, __ATOMIC_RELEASE);
+ __atomic_fetch_sub(&bktp->lock, 1, __ATOMIC_RELEASE);
+}
+
+static inline void
+tim_bkt_dec_lock_relaxed(struct otx2_tim_bkt *bktp)
+{
+ __atomic_fetch_sub(&bktp->lock, 1, __ATOMIC_RELAXED);
}
static inline uint32_t
@@ -246,22 +252,20 @@ tim_add_entry_sp(struct otx2_tim_ring * const tim_ring,
if (tim_bkt_get_nent(lock_sema) != 0) {
uint64_t hbt_state;
#ifdef RTE_ARCH_ARM64
- asm volatile(
- " ldaxr %[hbt], [%[w1]] \n"
- " tbz %[hbt], 33, dne%= \n"
- " sevl \n"
- "rty%=: wfe \n"
- " ldaxr %[hbt], [%[w1]] \n"
- " tbnz %[hbt], 33, rty%= \n"
- "dne%=: \n"
- : [hbt] "=&r" (hbt_state)
- : [w1] "r" ((&bkt->w1))
- : "memory"
- );
+ asm volatile(" ldxr %[hbt], [%[w1]] \n"
+ " tbz %[hbt], 33, dne%= \n"
+ " sevl \n"
+ "rty%=: wfe \n"
+ " ldxr %[hbt], [%[w1]] \n"
+ " tbnz %[hbt], 33, rty%= \n"
+ "dne%=: \n"
+ : [hbt] "=&r"(hbt_state)
+ : [w1] "r"((&bkt->w1))
+ : "memory");
#else
do {
hbt_state = __atomic_load_n(&bkt->w1,
- __ATOMIC_ACQUIRE);
+ __ATOMIC_RELAXED);
} while (hbt_state & BIT_ULL(33));
#endif
@@ -282,10 +286,10 @@ tim_add_entry_sp(struct otx2_tim_ring * const tim_ring,
if (unlikely(chunk == NULL)) {
bkt->chunk_remainder = 0;
- tim_bkt_dec_lock(bkt);
tim->impl_opaque[0] = 0;
tim->impl_opaque[1] = 0;
tim->state = RTE_EVENT_TIMER_ERROR;
+ tim_bkt_dec_lock(bkt);
return -ENOMEM;
}
mirr_bkt->current_chunk = (uintptr_t)chunk;
@@ -298,12 +302,11 @@ tim_add_entry_sp(struct otx2_tim_ring * const tim_ring,
/* Copy work entry. */
*chunk = *pent;
- tim_bkt_inc_nent(bkt);
- tim_bkt_dec_lock(bkt);
-
tim->impl_opaque[0] = (uintptr_t)chunk;
tim->impl_opaque[1] = (uintptr_t)bkt;
- tim->state = RTE_EVENT_TIMER_ARMED;
+ __atomic_store_n(&tim->state, RTE_EVENT_TIMER_ARMED, __ATOMIC_RELEASE);
+ tim_bkt_inc_nent(bkt);
+ tim_bkt_dec_lock_relaxed(bkt);
return 0;
}
@@ -331,22 +334,20 @@ tim_add_entry_mp(struct otx2_tim_ring * const tim_ring,
if (tim_bkt_get_nent(lock_sema) != 0) {
uint64_t hbt_state;
#ifdef RTE_ARCH_ARM64
- asm volatile(
- " ldaxr %[hbt], [%[w1]] \n"
- " tbz %[hbt], 33, dne%= \n"
- " sevl \n"
- "rty%=: wfe \n"
- " ldaxr %[hbt], [%[w1]] \n"
- " tbnz %[hbt], 33, rty%= \n"
- "dne%=: \n"
- : [hbt] "=&r" (hbt_state)
- : [w1] "r" ((&bkt->w1))
- : "memory"
- );
+ asm volatile(" ldxr %[hbt], [%[w1]] \n"
+ " tbz %[hbt], 33, dne%= \n"
+ " sevl \n"
+ "rty%=: wfe \n"
+ " ldxr %[hbt], [%[w1]] \n"
+ " tbnz %[hbt], 33, rty%= \n"
+ "dne%=: \n"
+ : [hbt] "=&r"(hbt_state)
+ : [w1] "r"((&bkt->w1))
+ : "memory");
#else
do {
hbt_state = __atomic_load_n(&bkt->w1,
- __ATOMIC_ACQUIRE);
+ __ATOMIC_RELAXED);
} while (hbt_state & BIT_ULL(33));
#endif
@@ -359,26 +360,24 @@ tim_add_entry_mp(struct otx2_tim_ring * const tim_ring,
rem = tim_bkt_fetch_rem(lock_sema);
if (rem < 0) {
+ tim_bkt_dec_lock(bkt);
#ifdef RTE_ARCH_ARM64
- asm volatile(
- " ldaxrh %w[rem], [%[crem]] \n"
- " tbz %w[rem], 15, dne%= \n"
- " sevl \n"
- "rty%=: wfe \n"
- " ldaxrh %w[rem], [%[crem]] \n"
- " tbnz %w[rem], 15, rty%= \n"
- "dne%=: \n"
- : [rem] "=&r" (rem)
- : [crem] "r" (&bkt->chunk_remainder)
- : "memory"
- );
+ uint64_t w1;
+ asm volatile(" ldxr %[w1], [%[crem]] \n"
+ " tbz %[w1], 63, dne%= \n"
+ " sevl \n"
+ "rty%=: wfe \n"
+ " ldxr %[w1], [%[crem]] \n"
+ " tbnz %[w1], 63, rty%= \n"
+ "dne%=: \n"
+ : [w1] "=&r"(w1)
+ : [crem] "r"(&bkt->w1)
+ : "memory");
#else
- while (__atomic_load_n(&bkt->chunk_remainder,
- __ATOMIC_ACQUIRE) < 0)
+ while (__atomic_load_n((int64_t *)&bkt->w1, __ATOMIC_RELAXED) <
+ 0)
;
#endif
- /* Goto diff bucket. */
- tim_bkt_dec_lock(bkt);
goto __retry;
} else if (!rem) {
/* Only one thread can be here*/
@@ -388,18 +387,21 @@ tim_add_entry_mp(struct otx2_tim_ring * const tim_ring,
chunk = tim_insert_chunk(bkt, mirr_bkt, tim_ring);
if (unlikely(chunk == NULL)) {
- tim_bkt_set_rem(bkt, 0);
- tim_bkt_dec_lock(bkt);
tim->impl_opaque[0] = 0;
tim->impl_opaque[1] = 0;
tim->state = RTE_EVENT_TIMER_ERROR;
+ tim_bkt_set_rem(bkt, 0);
+ tim_bkt_dec_lock(bkt);
return -ENOMEM;
}
*chunk = *pent;
- while (tim_bkt_fetch_lock(lock_sema) !=
- (-tim_bkt_fetch_rem(lock_sema)))
- lock_sema = __atomic_load_n(&bkt->w1, __ATOMIC_ACQUIRE);
-
+ if (tim_bkt_fetch_lock(lock_sema)) {
+ do {
+ lock_sema = __atomic_load_n(&bkt->w1,
+ __ATOMIC_RELAXED);
+ } while (tim_bkt_fetch_lock(lock_sema) - 1);
+ rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
+ }
mirr_bkt->current_chunk = (uintptr_t)chunk;
__atomic_store_n(&bkt->chunk_remainder,
tim_ring->nb_chunk_slots - 1, __ATOMIC_RELEASE);
@@ -409,12 +411,11 @@ tim_add_entry_mp(struct otx2_tim_ring * const tim_ring,
*chunk = *pent;
}
- /* Copy work entry. */
- tim_bkt_inc_nent(bkt);
- tim_bkt_dec_lock(bkt);
tim->impl_opaque[0] = (uintptr_t)chunk;
tim->impl_opaque[1] = (uintptr_t)bkt;
- tim->state = RTE_EVENT_TIMER_ARMED;
+ __atomic_store_n(&tim->state, RTE_EVENT_TIMER_ARMED, __ATOMIC_RELEASE);
+ tim_bkt_inc_nent(bkt);
+ tim_bkt_dec_lock_relaxed(bkt);
return 0;
}
@@ -463,6 +464,23 @@ tim_add_entry_brst(struct otx2_tim_ring * const tim_ring,
if (lock_cnt) {
tim_bkt_dec_lock(bkt);
+#ifdef RTE_ARCH_ARM64
+ asm volatile(" ldxrb %w[lock_cnt], [%[lock]] \n"
+ " tst %w[lock_cnt], 255 \n"
+ " beq dne%= \n"
+ " sevl \n"
+ "rty%=: wfe \n"
+ " ldxrb %w[lock_cnt], [%[lock]] \n"
+ " tst %w[lock_cnt], 255 \n"
+ " bne rty%= \n"
+ "dne%=: \n"
+ : [lock_cnt] "=&r"(lock_cnt)
+ : [lock] "r"(&bkt->lock)
+ : "memory");
+#else
+ while (__atomic_load_n(&bkt->lock, __ATOMIC_RELAXED))
+ ;
+#endif
goto __retry;
}
@@ -471,22 +489,20 @@ tim_add_entry_brst(struct otx2_tim_ring * const tim_ring,
if (tim_bkt_get_nent(lock_sema) != 0) {
uint64_t hbt_state;
#ifdef RTE_ARCH_ARM64
- asm volatile(
- " ldaxr %[hbt], [%[w1]] \n"
- " tbz %[hbt], 33, dne%= \n"
- " sevl \n"
- "rty%=: wfe \n"
- " ldaxr %[hbt], [%[w1]] \n"
- " tbnz %[hbt], 33, rty%= \n"
- "dne%=: \n"
- : [hbt] "=&r" (hbt_state)
- : [w1] "r" ((&bkt->w1))
- : "memory"
- );
+ asm volatile(" ldxr %[hbt], [%[w1]] \n"
+ " tbz %[hbt], 33, dne%= \n"
+ " sevl \n"
+ "rty%=: wfe \n"
+ " ldxr %[hbt], [%[w1]] \n"
+ " tbnz %[hbt], 33, rty%= \n"
+ "dne%=: \n"
+ : [hbt] "=&r"(hbt_state)
+ : [w1] "r"((&bkt->w1))
+ : "memory");
#else
do {
hbt_state = __atomic_load_n(&bkt->w1,
- __ATOMIC_ACQUIRE);
+ __ATOMIC_RELAXED);
} while (hbt_state & BIT_ULL(33));
#endif
@@ -563,19 +579,18 @@ tim_rm_entry(struct rte_event_timer *tim)
bkt = (struct otx2_tim_bkt *)(uintptr_t)tim->impl_opaque[1];
lock_sema = tim_bkt_inc_lock(bkt);
if (tim_bkt_get_hbt(lock_sema) || !tim_bkt_get_nent(lock_sema)) {
- tim_bkt_dec_lock(bkt);
tim->impl_opaque[0] = 0;
tim->impl_opaque[1] = 0;
+ tim_bkt_dec_lock(bkt);
return -ENOENT;
}
entry->w0 = 0;
entry->wqe = 0;
- tim_bkt_dec_lock(bkt);
-
tim->state = RTE_EVENT_TIMER_CANCELED;
tim->impl_opaque[0] = 0;
tim->impl_opaque[1] = 0;
+ tim_bkt_dec_lock(bkt);
return 0;
}
--
2.17.1
^ permalink raw reply [flat|nested] 17+ messages in thread
* [dpdk-dev] [PATCH v3 3/4] event/octeontx2: reduce chunk pool memory usage
2021-03-23 8:44 ` [dpdk-dev] [PATCH v3 1/4] event/octeontx2: simplify timer bucket estimation pbhagavatula
2021-03-23 8:44 ` [dpdk-dev] [PATCH v3 2/4] event/octeontx2: optimize timer arm routine pbhagavatula
@ 2021-03-23 8:44 ` pbhagavatula
2021-03-23 8:44 ` [dpdk-dev] [PATCH v3 4/4] event/octeontx2: timer always use virtual counter pbhagavatula
2021-03-24 7:44 ` [dpdk-dev] [PATCH v3 1/4] event/octeontx2: simplify timer bucket estimation Jerin Jacob
3 siblings, 0 replies; 17+ messages in thread
From: pbhagavatula @ 2021-03-23 8:44 UTC (permalink / raw)
To: jerinj, Pavan Nikhilesh; +Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Reduce amount of memory used by chunk pool when the mempool used
is OCTEONTX2 NPA.
Previously, the number of chunks configured when NPA is used was
equal to the number of timers requested plus the number of buckets.
If the max timeout is long enough w.r.t. the resolution requested,
there will be a large number of buckets, which would cause high memory
usage.
Reduce the number of chunks when NPA is used to the number of timers
requested, as the chunk lists of processed buckets are automatically
freed.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
v2 Changes:
- Describe memory allocation changes.
drivers/event/octeontx2/otx2_tim_evdev.c | 19 ++++++++++---------
drivers/event/octeontx2/otx2_tim_evdev.h | 2 +-
2 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.c b/drivers/event/octeontx2/otx2_tim_evdev.c
index d1e967eb7..4fb002ddb 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.c
+++ b/drivers/event/octeontx2/otx2_tim_evdev.c
@@ -91,6 +91,8 @@ tim_chnk_pool_create(struct otx2_tim_ring *tim_ring,
if (cache_sz > RTE_MEMPOOL_CACHE_MAX_SIZE)
cache_sz = RTE_MEMPOOL_CACHE_MAX_SIZE;
+ cache_sz = cache_sz != 0 ? cache_sz : 2;
+ tim_ring->nb_chunks += (cache_sz * rte_lcore_count());
if (!tim_ring->disable_npa) {
tim_ring->chunk_pool = rte_mempool_create_empty(pool_name,
tim_ring->nb_chunks, tim_ring->chunk_sz,
@@ -268,16 +270,15 @@ otx2_tim_ring_create(struct rte_event_timer_adapter *adptr)
}
}
- tim_ring->nb_chunks = tim_ring->nb_timers / OTX2_TIM_NB_CHUNK_SLOTS(
- tim_ring->chunk_sz);
- tim_ring->nb_chunk_slots = OTX2_TIM_NB_CHUNK_SLOTS(tim_ring->chunk_sz);
-
- if (tim_ring->disable_npa)
+ if (tim_ring->disable_npa) {
+ tim_ring->nb_chunks =
+ tim_ring->nb_timers /
+ OTX2_TIM_NB_CHUNK_SLOTS(tim_ring->chunk_sz);
tim_ring->nb_chunks = tim_ring->nb_chunks * tim_ring->nb_bkts;
- else
- tim_ring->nb_chunks = tim_ring->nb_chunks + tim_ring->nb_bkts;
-
- /* Create buckets. */
+ } else {
+ tim_ring->nb_chunks = tim_ring->nb_timers;
+ }
+ tim_ring->nb_chunk_slots = OTX2_TIM_NB_CHUNK_SLOTS(tim_ring->chunk_sz);
tim_ring->bkt = rte_zmalloc("otx2_tim_bucket", (tim_ring->nb_bkts) *
sizeof(struct otx2_tim_bkt),
RTE_CACHE_LINE_SIZE);
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.h b/drivers/event/octeontx2/otx2_tim_evdev.h
index bf89b85b0..0667d4576 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.h
+++ b/drivers/event/octeontx2/otx2_tim_evdev.h
@@ -70,7 +70,7 @@
#define OTX2_TIM_MAX_BURST (RTE_CACHE_LINE_SIZE / \
OTX2_TIM_CHUNK_ALIGNMENT)
#define OTX2_TIM_NB_CHUNK_SLOTS(sz) (((sz) / OTX2_TIM_CHUNK_ALIGNMENT) - 1)
-#define OTX2_TIM_MIN_CHUNK_SLOTS (0x1)
+#define OTX2_TIM_MIN_CHUNK_SLOTS (0x8)
#define OTX2_TIM_MAX_CHUNK_SLOTS (0x1FFE)
#define OTX2_TIM_MIN_TMO_TKS (256)
--
2.17.1
^ permalink raw reply [flat|nested] 17+ messages in thread
* [dpdk-dev] [PATCH v3 4/4] event/octeontx2: timer always use virtual counter
2021-03-23 8:44 ` [dpdk-dev] [PATCH v3 1/4] event/octeontx2: simplify timer bucket estimation pbhagavatula
2021-03-23 8:44 ` [dpdk-dev] [PATCH v3 2/4] event/octeontx2: optimize timer arm routine pbhagavatula
2021-03-23 8:44 ` [dpdk-dev] [PATCH v3 3/4] event/octeontx2: reduce chunk pool memory usage pbhagavatula
@ 2021-03-23 8:44 ` pbhagavatula
2021-03-24 7:44 ` [dpdk-dev] [PATCH v3 1/4] event/octeontx2: simplify timer bucket estimation Jerin Jacob
3 siblings, 0 replies; 17+ messages in thread
From: pbhagavatula @ 2021-03-23 8:44 UTC (permalink / raw)
To: jerinj, Pavan Nikhilesh; +Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Use the virtual counter for estimating the current bucket, as the PMU
cannot be reliably used to estimate time.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
v2 Changes:
- Use __rte_arm64_cntvct and __rte_arm64_cntfrq instead of reimplementing
the inline assembly.
drivers/event/octeontx2/otx2_tim_evdev.c | 19 +++--------------
drivers/event/octeontx2/otx2_tim_evdev.h | 26 +++++++++++++++++++++++
drivers/event/octeontx2/otx2_tim_worker.c | 4 ++--
drivers/event/octeontx2/otx2_tim_worker.h | 2 +-
4 files changed, 32 insertions(+), 19 deletions(-)
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.c b/drivers/event/octeontx2/otx2_tim_evdev.c
index 4fb002ddb..926c2dce6 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.c
+++ b/drivers/event/octeontx2/otx2_tim_evdev.c
@@ -354,7 +354,7 @@ otx2_tim_calibrate_start_tsc(struct otx2_tim_ring *tim_ring)
for (icount = 0; icount < OTX2_TIM_CALIB_ITER; icount++) {
real_bkt = otx2_read64(tim_ring->base + TIM_LF_RING_REL) >> 44;
- bkt_cyc = rte_rdtsc();
+ bkt_cyc = tim_cntvct();
bucket = (bkt_cyc - tim_ring->ring_start_cyc) /
tim_ring->tck_int;
bucket = bucket % (tim_ring->nb_bkts);
@@ -389,20 +389,8 @@ otx2_tim_ring_start(const struct rte_event_timer_adapter *adptr)
tim_err_desc(rc);
goto fail;
}
-#ifdef RTE_ARM_EAL_RDTSC_USE_PMU
- uint64_t tenns_stmp, tenns_diff;
- uint64_t pmu_stmp;
-
- pmu_stmp = rte_rdtsc();
- asm volatile("mrs %0, cntvct_el0" : "=r" (tenns_stmp));
-
- tenns_diff = tenns_stmp - rsp->timestarted;
- pmu_stmp = pmu_stmp - (NSEC2TICK(tenns_diff * 10, rte_get_timer_hz()));
- tim_ring->ring_start_cyc = pmu_stmp;
-#else
tim_ring->ring_start_cyc = rsp->timestarted;
-#endif
- tim_ring->tck_int = NSEC2TICK(tim_ring->tck_nsec, rte_get_timer_hz());
+ tim_ring->tck_int = NSEC2TICK(tim_ring->tck_nsec, tim_cntfrq());
tim_ring->tot_int = tim_ring->tck_int * tim_ring->nb_bkts;
tim_ring->fast_div = rte_reciprocal_value_u64(tim_ring->tck_int);
tim_ring->fast_bkt = rte_reciprocal_value_u64(tim_ring->nb_bkts);
@@ -470,8 +458,7 @@ otx2_tim_stats_get(const struct rte_event_timer_adapter *adapter,
struct rte_event_timer_adapter_stats *stats)
{
struct otx2_tim_ring *tim_ring = adapter->data->adapter_priv;
- uint64_t bkt_cyc = rte_rdtsc() - tim_ring->ring_start_cyc;
-
+ uint64_t bkt_cyc = tim_cntvct() - tim_ring->ring_start_cyc;
stats->evtim_exp_count = __atomic_load_n(&tim_ring->arm_cnt,
__ATOMIC_RELAXED);
diff --git a/drivers/event/octeontx2/otx2_tim_evdev.h b/drivers/event/octeontx2/otx2_tim_evdev.h
index 0667d4576..410880e14 100644
--- a/drivers/event/octeontx2/otx2_tim_evdev.h
+++ b/drivers/event/octeontx2/otx2_tim_evdev.h
@@ -176,6 +176,32 @@ tim_priv_get(void)
return mz->addr;
}
+#ifdef RTE_ARCH_ARM64
+static inline uint64_t
+tim_cntvct(void)
+{
+ return __rte_arm64_cntvct();
+}
+
+static inline uint64_t
+tim_cntfrq(void)
+{
+ return __rte_arm64_cntfrq();
+}
+#else
+static inline uint64_t
+tim_cntvct(void)
+{
+ return 0;
+}
+
+static inline uint64_t
+tim_cntfrq(void)
+{
+ return 0;
+}
+#endif
+
#define TIM_ARM_FASTPATH_MODES \
FP(sp, 0, 0, 0, OTX2_TIM_ENA_DFB | OTX2_TIM_SP) \
FP(mp, 0, 0, 1, OTX2_TIM_ENA_DFB | OTX2_TIM_MP) \
diff --git a/drivers/event/octeontx2/otx2_tim_worker.c b/drivers/event/octeontx2/otx2_tim_worker.c
index 6a3511ec0..9ee07958f 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.c
+++ b/drivers/event/octeontx2/otx2_tim_worker.c
@@ -41,12 +41,12 @@ tim_format_event(const struct rte_event_timer * const tim,
static inline void
tim_sync_start_cyc(struct otx2_tim_ring *tim_ring)
{
- uint64_t cur_cyc = rte_rdtsc();
+ uint64_t cur_cyc = tim_cntvct();
uint32_t real_bkt;
if (cur_cyc - tim_ring->last_updt_cyc > tim_ring->tot_int) {
real_bkt = otx2_read64(tim_ring->base + TIM_LF_RING_REL) >> 44;
- cur_cyc = rte_rdtsc();
+ cur_cyc = tim_cntvct();
tim_ring->ring_start_cyc = cur_cyc -
(real_bkt * tim_ring->tck_int);
diff --git a/drivers/event/octeontx2/otx2_tim_worker.h b/drivers/event/octeontx2/otx2_tim_worker.h
index 5ece8fd05..efe88a869 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.h
+++ b/drivers/event/octeontx2/otx2_tim_worker.h
@@ -132,7 +132,7 @@ tim_get_target_bucket(struct otx2_tim_ring *const tim_ring,
const uint32_t rel_bkt, struct otx2_tim_bkt **bkt,
struct otx2_tim_bkt **mirr_bkt)
{
- const uint64_t bkt_cyc = rte_rdtsc() - tim_ring->ring_start_cyc;
+ const uint64_t bkt_cyc = tim_cntvct() - tim_ring->ring_start_cyc;
uint64_t bucket =
rte_reciprocal_divide_u64(bkt_cyc, &tim_ring->fast_div) +
rel_bkt;
--
2.17.1
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [dpdk-dev] [PATCH v3 1/4] event/octeontx2: simplify timer bucket estimation
2021-03-23 8:44 ` [dpdk-dev] [PATCH v3 1/4] event/octeontx2: simplify timer bucket estimation pbhagavatula
` (2 preceding siblings ...)
2021-03-23 8:44 ` [dpdk-dev] [PATCH v3 4/4] event/octeontx2: timer always use virtual counter pbhagavatula
@ 2021-03-24 7:44 ` Jerin Jacob
3 siblings, 0 replies; 17+ messages in thread
From: Jerin Jacob @ 2021-03-24 7:44 UTC (permalink / raw)
To: Pavan Nikhilesh; +Cc: Jerin Jacob, dpdk-dev
On Tue, Mar 23, 2021 at 2:14 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Simplify timer bucket estimation we need not align buckets to
> power of 2 instead use reciprocal division to compute mod.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
Series applied to dpdk-next-eventdev/for-main. Thanks.
^ permalink raw reply [flat|nested] 17+ messages in thread
end of thread, other threads:[~2021-03-24 7:45 UTC | newest]
Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-02-25 12:23 [dpdk-dev] [PATCH 1/4] event/octeontx2: simplify timer bucket estimation pbhagavatula
2021-02-25 12:23 ` [dpdk-dev] [PATCH 2/4] event/octeontx2: optimize timer arm routine pbhagavatula
2021-02-25 12:23 ` [dpdk-dev] [PATCH 3/4] event/octeontx2: reduce chunk pool memory usage pbhagavatula
2021-03-20 13:30 ` Jerin Jacob
2021-02-25 12:23 ` [dpdk-dev] [PATCH 4/4] event/octeontx2: timer always use virtual counter pbhagavatula
2021-03-20 13:34 ` Jerin Jacob
2021-03-21 7:11 ` [dpdk-dev] [EXT] " Pavan Nikhilesh Bhagavatula
2021-03-21 8:49 ` [dpdk-dev] [PATCH v2 1/4] event/octeontx2: simplify timer bucket estimation pbhagavatula
2021-03-21 8:49 ` [dpdk-dev] [PATCH v2 2/4] event/octeontx2: optimize timer arm routine pbhagavatula
2021-03-21 8:49 ` [dpdk-dev] [PATCH v2 3/4] event/octeontx2: reduce chunk pool memory usage pbhagavatula
2021-03-21 8:49 ` [dpdk-dev] [PATCH v2 4/4] event/octeontx2: timer always use virtual counter pbhagavatula
2021-03-22 16:13 ` Jerin Jacob
2021-03-23 8:44 ` [dpdk-dev] [PATCH v3 1/4] event/octeontx2: simplify timer bucket estimation pbhagavatula
2021-03-23 8:44 ` [dpdk-dev] [PATCH v3 2/4] event/octeontx2: optimize timer arm routine pbhagavatula
2021-03-23 8:44 ` [dpdk-dev] [PATCH v3 3/4] event/octeontx2: reduce chunk pool memory usage pbhagavatula
2021-03-23 8:44 ` [dpdk-dev] [PATCH v3 4/4] event/octeontx2: timer always use virtual counter pbhagavatula
2021-03-24 7:44 ` [dpdk-dev] [PATCH v3 1/4] event/octeontx2: simplify timer bucket estimation Jerin Jacob
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).