* [PATCH 1/3] event/cnxk: use LMTST for enqueue new burst
@ 2023-04-19 20:01 pbhagavatula
2023-04-19 20:01 ` [PATCH 2/3] app/eventdev: use enqueue new event burst routine pbhagavatula
` (2 more replies)
0 siblings, 3 replies; 12+ messages in thread
From: pbhagavatula @ 2023-04-19 20:01 UTC (permalink / raw)
To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
Satha Rao, Pavan Nikhilesh, Shijith Thotton
Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Use LMTST when all events in the burst are enqueued with
rte_event::op as RTE_EVENT_OP_NEW, i.e. events are enqueued
with the `rte_event_enqueue_new_burst` API.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
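For context, a minimal application-side sketch (not part of the patch) of the
fast path this change targets: every event in the burst carries
RTE_EVENT_OP_NEW and is submitted through rte_event_enqueue_new_burst().
dev_id, port_id, queue_id and objs[] are placeholders assumed to come from the
application's own setup.

#include <rte_common.h>
#include <rte_eventdev.h>

/* Build a burst where every event is RTE_EVENT_OP_NEW and hand it to
 * rte_event_enqueue_new_burst(); this is the path the LMTST enqueue
 * below accelerates.
 */
static uint16_t
enqueue_new_burst(uint8_t dev_id, uint8_t port_id, uint8_t queue_id,
                  void *objs[], uint16_t n)
{
        struct rte_event ev[32];
        uint16_t i;

        n = RTE_MIN(n, (uint16_t)RTE_DIM(ev));
        for (i = 0; i < n; i++) {
                ev[i] = (struct rte_event){0};
                ev[i].op = RTE_EVENT_OP_NEW;    /* all-NEW burst */
                ev[i].queue_id = queue_id;
                ev[i].sched_type = RTE_SCHED_TYPE_ATOMIC;
                ev[i].flow_id = i;
                ev[i].event_ptr = objs[i];
        }

        /* May accept fewer than n events if the device back-pressures;
         * the caller is expected to retry the unsent tail.
         */
        return rte_event_enqueue_new_burst(dev_id, port_id, ev, n);
}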
drivers/common/cnxk/hw/sso.h | 1 +
drivers/common/cnxk/roc_sso.c | 10 +-
drivers/common/cnxk/roc_sso.h | 3 +
drivers/event/cnxk/cn10k_eventdev.c | 9 +-
drivers/event/cnxk/cn10k_eventdev.h | 6 +-
drivers/event/cnxk/cn10k_worker.c | 304 +++++++++++++++++++++++++++-
drivers/event/cnxk/cn9k_eventdev.c | 2 +-
drivers/event/cnxk/cnxk_eventdev.c | 15 +-
drivers/event/cnxk/cnxk_eventdev.h | 4 +-
9 files changed, 338 insertions(+), 16 deletions(-)
diff --git a/drivers/common/cnxk/hw/sso.h b/drivers/common/cnxk/hw/sso.h
index 25deaa4c14..09b8d4955f 100644
--- a/drivers/common/cnxk/hw/sso.h
+++ b/drivers/common/cnxk/hw/sso.h
@@ -157,6 +157,7 @@
#define SSO_LF_GGRP_AQ_CNT (0x1c0ull)
#define SSO_LF_GGRP_AQ_THR (0x1e0ull)
#define SSO_LF_GGRP_MISC_CNT (0x200ull)
+#define SSO_LF_GGRP_OP_AW_LMTST (0x400ull)
#define SSO_AF_IAQ_FREE_CNT_MASK 0x3FFFull
#define SSO_AF_IAQ_RSVD_FREE_MASK 0x3FFFull
diff --git a/drivers/common/cnxk/roc_sso.c b/drivers/common/cnxk/roc_sso.c
index 4a6a5080f7..78e98bbdd6 100644
--- a/drivers/common/cnxk/roc_sso.c
+++ b/drivers/common/cnxk/roc_sso.c
@@ -6,6 +6,7 @@
#include "roc_priv.h"
#define SSO_XAQ_CACHE_CNT (0x7)
+#define SSO_XAQ_SLACK (16)
/* Private functions. */
int
@@ -493,9 +494,13 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct roc_sso_xaq_data *xaq,
xaq->nb_xae = nb_xae;
- /* Taken from HRM 14.3.3(4) */
+ /** SSO will reserve upto 0x4 XAQ buffers per group when GetWork engine
+ * is inactive and it might prefetch an additional 0x3 buffers due to
+ * pipelining.
+ */
xaq->nb_xaq = (SSO_XAQ_CACHE_CNT * nb_hwgrp);
xaq->nb_xaq += PLT_MAX(1 + ((xaq->nb_xae - 1) / xae_waes), xaq->nb_xaq);
+ xaq->nb_xaq += SSO_XAQ_SLACK;
xaq->mem = plt_zmalloc(xaq_buf_size * xaq->nb_xaq, xaq_buf_size);
if (xaq->mem == NULL) {
@@ -537,7 +542,8 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct roc_sso_xaq_data *xaq,
* There should be a minimum headroom of 7 XAQs per HWGRP for SSO
* to request XAQ to cache them even before enqueue is called.
*/
- xaq->xaq_lmt = xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT);
+ xaq->xaq_lmt =
+ xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT) - SSO_XAQ_SLACK;
return 0;
npa_fill_fail:
diff --git a/drivers/common/cnxk/roc_sso.h b/drivers/common/cnxk/roc_sso.h
index e67797b046..a2bb6fcb22 100644
--- a/drivers/common/cnxk/roc_sso.h
+++ b/drivers/common/cnxk/roc_sso.h
@@ -7,6 +7,9 @@
#include "hw/ssow.h"
+#define ROC_SSO_AW_PER_LMT_LINE_LOG2 3
+#define ROC_SSO_XAE_PER_XAQ 352
+
struct roc_sso_hwgrp_qos {
uint16_t hwgrp;
uint8_t xaq_prcnt;
diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index 071ea5a212..855c92da83 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -91,8 +91,10 @@ cn10k_sso_hws_setup(void *arg, void *hws, uintptr_t grp_base)
uint64_t val;
ws->grp_base = grp_base;
- ws->fc_mem = (uint64_t *)dev->fc_iova;
+ ws->fc_mem = (int64_t *)dev->fc_iova;
ws->xaq_lmt = dev->xaq_lmt;
+ ws->fc_cache_space = dev->fc_cache_space;
+ ws->aw_lmt = ws->lmt_base;
/* Set get_work timeout for HWS */
val = NSEC2USEC(dev->deq_tmo_ns);
@@ -624,6 +626,7 @@ cn10k_sso_info_get(struct rte_eventdev *event_dev,
dev_info->driver_name = RTE_STR(EVENTDEV_NAME_CN10K_PMD);
cnxk_sso_info_get(dev, dev_info);
+ dev_info->max_event_port_enqueue_depth = UINT32_MAX;
}
static int
@@ -632,7 +635,7 @@ cn10k_sso_dev_configure(const struct rte_eventdev *event_dev)
struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
int rc;
- rc = cnxk_sso_dev_validate(event_dev);
+ rc = cnxk_sso_dev_validate(event_dev, 1, UINT32_MAX);
if (rc < 0) {
plt_err("Invalid event device configuration");
return -EINVAL;
@@ -871,7 +874,7 @@ cn10k_sso_set_priv_mem(const struct rte_eventdev *event_dev, void *lookup_mem)
for (i = 0; i < dev->nb_event_ports; i++) {
struct cn10k_sso_hws *ws = event_dev->data->ports[i];
ws->xaq_lmt = dev->xaq_lmt;
- ws->fc_mem = (uint64_t *)dev->fc_iova;
+ ws->fc_mem = (int64_t *)dev->fc_iova;
ws->tstamp = dev->tstamp;
if (lookup_mem)
ws->lookup_mem = lookup_mem;
diff --git a/drivers/event/cnxk/cn10k_eventdev.h b/drivers/event/cnxk/cn10k_eventdev.h
index aaa01d1ec1..29567728cd 100644
--- a/drivers/event/cnxk/cn10k_eventdev.h
+++ b/drivers/event/cnxk/cn10k_eventdev.h
@@ -19,9 +19,11 @@ struct cn10k_sso_hws {
struct cnxk_timesync_info **tstamp;
uint64_t meta_aura;
/* Add Work Fastpath data */
- uint64_t xaq_lmt __rte_cache_aligned;
- uint64_t *fc_mem;
+ int64_t *fc_mem __rte_cache_aligned;
+ int64_t *fc_cache_space;
+ uintptr_t aw_lmt;
uintptr_t grp_base;
+ int32_t xaq_lmt;
/* Tx Fastpath data */
uintptr_t lmt_base __rte_cache_aligned;
uint64_t lso_tun_fmt;
diff --git a/drivers/event/cnxk/cn10k_worker.c b/drivers/event/cnxk/cn10k_worker.c
index 562d2fca13..ba3ea4bb35 100644
--- a/drivers/event/cnxk/cn10k_worker.c
+++ b/drivers/event/cnxk/cn10k_worker.c
@@ -77,6 +77,36 @@ cn10k_sso_hws_forward_event(struct cn10k_sso_hws *ws,
cn10k_sso_hws_fwd_group(ws, ev, grp);
}
+static inline int32_t
+sso_read_xaq_space(struct cn10k_sso_hws *ws)
+{
+ return (ws->xaq_lmt - __atomic_load_n(ws->fc_mem, __ATOMIC_RELAXED)) *
+ ROC_SSO_XAE_PER_XAQ;
+}
+
+static inline void
+sso_lmt_aw_wait_fc(struct cn10k_sso_hws *ws, int64_t req)
+{
+ int64_t cached, refill;
+
+retry:
+ while (__atomic_load_n(ws->fc_cache_space, __ATOMIC_RELAXED) < 0)
+ ;
+
+ cached = __atomic_sub_fetch(ws->fc_cache_space, req, __ATOMIC_ACQUIRE);
+ /* Check if there is enough space, else update and retry. */
+ if (cached < 0) {
+ /* Check if we have space else retry. */
+ do {
+ refill = sso_read_xaq_space(ws);
+ } while (refill <= 0);
+ __atomic_compare_exchange(ws->fc_cache_space, &cached, &refill,
+ 0, __ATOMIC_RELEASE,
+ __ATOMIC_RELAXED);
+ goto retry;
+ }
+}
+
uint16_t __rte_hot
cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
{
@@ -103,6 +133,253 @@ cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
return 1;
}
+#define VECTOR_SIZE_BITS 0xFFFFFFFFFFF80000ULL
+#define VECTOR_GET_LINE_OFFSET(line) (19 + (3 * line))
+
+static uint64_t
+vector_size_partial_mask(uint16_t off, uint16_t cnt)
+{
+ return (VECTOR_SIZE_BITS & ~(~0x0ULL << off)) |
+ ((uint64_t)(cnt - 1) << off);
+}
+
+static __rte_always_inline uint16_t
+cn10k_sso_hws_new_event_lmtst(struct cn10k_sso_hws *ws, uint8_t queue_id,
+ const struct rte_event ev[], uint16_t n)
+{
+ uint16_t lines, partial_line, burst, left;
+ uint64_t wdata[2], pa[2] = {0};
+ uintptr_t lmt_addr;
+ uint16_t sz0, sz1;
+ uint16_t lmt_id;
+
+ sz0 = sz1 = 0;
+ lmt_addr = ws->lmt_base;
+ ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
+
+ left = n;
+again:
+ burst = RTE_MIN(
+ BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2 + ROC_LMT_LINES_PER_CORE_LOG2),
+ left);
+
+ /* Set wdata */
+ lines = burst >> ROC_SSO_AW_PER_LMT_LINE_LOG2;
+ partial_line = burst & (BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2) - 1);
+ wdata[0] = wdata[1] = 0;
+ if (lines > BIT(ROC_LMT_LINES_PER_STR_LOG2)) {
+ wdata[0] = lmt_id;
+ wdata[0] |= 15ULL << 12;
+ wdata[0] |= VECTOR_SIZE_BITS;
+ pa[0] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ (0x7 << 4);
+ sz0 = 16 << ROC_SSO_AW_PER_LMT_LINE_LOG2;
+
+ wdata[1] = lmt_id + 16;
+ pa[1] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ (0x7 << 4);
+
+ lines -= 17;
+ wdata[1] |= partial_line ? (uint64_t)(lines + 1) << 12 :
+ (uint64_t)(lines << 12);
+ wdata[1] |= partial_line ?
+ vector_size_partial_mask(
+ VECTOR_GET_LINE_OFFSET(lines),
+ partial_line) :
+ VECTOR_SIZE_BITS;
+ sz1 = burst - sz0;
+ partial_line = 0;
+ } else if (lines) {
+ /* We need to handle two cases here:
+ * 1. Partial line spill over to wdata[1] i.e. lines == 16
+ * 2. Partial line with spill lines < 16.
+ */
+ wdata[0] = lmt_id;
+ pa[0] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ (0x7 << 4);
+ sz0 = lines << ROC_SSO_AW_PER_LMT_LINE_LOG2;
+ if (lines == 16) {
+ wdata[0] |= 15ULL << 12;
+ wdata[0] |= VECTOR_SIZE_BITS;
+ if (partial_line) {
+ wdata[1] = lmt_id + 16;
+ pa[1] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ ((partial_line - 1) << 4);
+ }
+ } else {
+ lines -= 1;
+ wdata[0] |= partial_line ? (uint64_t)(lines + 1) << 12 :
+ (uint64_t)(lines << 12);
+ wdata[0] |=
+ partial_line ?
+ vector_size_partial_mask(
+ VECTOR_GET_LINE_OFFSET(lines),
+ partial_line) :
+ VECTOR_SIZE_BITS;
+ sz0 += partial_line;
+ }
+ sz1 = burst - sz0;
+ partial_line = 0;
+ }
+
+ /* Only partial lines */
+ if (partial_line) {
+ wdata[0] = lmt_id;
+ pa[0] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ ((partial_line - 1) << 4);
+ sz0 = partial_line;
+ sz1 = burst - sz0;
+ }
+
+#if defined(RTE_ARCH_ARM64)
+ uint64x2_t aw_mask = {0xC0FFFFFFFFULL, ~0x0ULL};
+ uint64x2_t tt_mask = {0x300000000ULL, 0};
+ uint16_t parts;
+
+ while (burst) {
+ parts = burst > 7 ? 8 : plt_align32prevpow2(burst);
+ burst -= parts;
+ /* Lets try to fill atleast one line per burst. */
+ switch (parts) {
+ case 8: {
+ uint64x2_t aw0, aw1, aw2, aw3, aw4, aw5, aw6, aw7;
+
+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
+ aw_mask);
+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
+ aw_mask);
+ aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
+ aw_mask);
+ aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
+ aw_mask);
+ aw4 = vandq_u64(vld1q_u64((const uint64_t *)&ev[4]),
+ aw_mask);
+ aw5 = vandq_u64(vld1q_u64((const uint64_t *)&ev[5]),
+ aw_mask);
+ aw6 = vandq_u64(vld1q_u64((const uint64_t *)&ev[6]),
+ aw_mask);
+ aw7 = vandq_u64(vld1q_u64((const uint64_t *)&ev[7]),
+ aw_mask);
+
+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
+ aw0);
+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
+ aw1);
+ aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6), tt_mask),
+ aw2);
+ aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6), tt_mask),
+ aw3);
+ aw4 = vorrq_u64(vandq_u64(vshrq_n_u64(aw4, 6), tt_mask),
+ aw4);
+ aw5 = vorrq_u64(vandq_u64(vshrq_n_u64(aw5, 6), tt_mask),
+ aw5);
+ aw6 = vorrq_u64(vandq_u64(vshrq_n_u64(aw6, 6), tt_mask),
+ aw6);
+ aw7 = vorrq_u64(vandq_u64(vshrq_n_u64(aw7, 6), tt_mask),
+ aw7);
+
+ vst1q_u64((void *)lmt_addr, aw0);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 64), aw4);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 80), aw5);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 96), aw6);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 112), aw7);
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 128);
+ } break;
+ case 4: {
+ uint64x2_t aw0, aw1, aw2, aw3;
+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
+ aw_mask);
+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
+ aw_mask);
+ aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
+ aw_mask);
+ aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
+ aw_mask);
+
+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
+ aw0);
+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
+ aw1);
+ aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6), tt_mask),
+ aw2);
+ aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6), tt_mask),
+ aw3);
+
+ vst1q_u64((void *)lmt_addr, aw0);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 64);
+ } break;
+ case 2: {
+ uint64x2_t aw0, aw1;
+
+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
+ aw_mask);
+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
+ aw_mask);
+
+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
+ aw0);
+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
+ aw1);
+
+ vst1q_u64((void *)lmt_addr, aw0);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 32);
+ } break;
+ case 1: {
+ __uint128_t aw0;
+
+ aw0 = ev[0].u64;
+ aw0 <<= 64;
+ aw0 |= ev[0].event & (BIT_ULL(32) - 1);
+ aw0 |= (uint64_t)ev[0].sched_type << 32;
+
+ *((__uint128_t *)lmt_addr) = aw0;
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
+ } break;
+ }
+ ev += parts;
+ }
+#else
+ uint16_t i;
+
+ for (i = 0; i < burst; i++) {
+ __uint128_t aw0;
+
+ aw0 = ev[0].u64;
+ aw0 <<= 64;
+ aw0 |= ev[0].event & (BIT_ULL(32) - 1);
+ aw0 |= (uint64_t)ev[0].sched_type << 32;
+ *((__uint128_t *)lmt_addr) = aw0;
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
+ }
+#endif
+
+ /* wdata[0] will be always valid */
+ sso_lmt_aw_wait_fc(ws, sz0);
+ roc_lmt_submit_steorl(wdata[0], pa[0]);
+ if (wdata[1]) {
+ sso_lmt_aw_wait_fc(ws, sz1);
+ roc_lmt_submit_steorl(wdata[1], pa[1]);
+ }
+
+ left -= (sz0 + sz1);
+ if (left)
+ goto again;
+
+ return n;
+}
+
uint16_t __rte_hot
cn10k_sso_hws_enq_burst(void *port, const struct rte_event ev[],
uint16_t nb_events)
@@ -115,13 +392,32 @@ uint16_t __rte_hot
cn10k_sso_hws_enq_new_burst(void *port, const struct rte_event ev[],
uint16_t nb_events)
{
+ uint16_t idx = 0, done = 0, rc = 0;
struct cn10k_sso_hws *ws = port;
- uint16_t i, rc = 1;
+ uint8_t queue_id;
+ int32_t space;
+
+ /* Do a common back-pressure check and return */
+ space = sso_read_xaq_space(ws) - ROC_SSO_XAE_PER_XAQ;
+ if (space <= 0)
+ return 0;
+ nb_events = space < nb_events ? space : nb_events;
+
+ do {
+ queue_id = ev[idx].queue_id;
+ for (idx = idx + 1; idx < nb_events; idx++)
+ if (queue_id != ev[idx].queue_id)
+ break;
+
+ rc = cn10k_sso_hws_new_event_lmtst(ws, queue_id, &ev[done],
+ idx - done);
+ if (rc != (idx - done))
+ return rc + done;
+ done += rc;
- for (i = 0; i < nb_events && rc; i++)
- rc = cn10k_sso_hws_new_event(ws, &ev[i]);
+ } while (done < nb_events);
- return nb_events;
+ return done;
}
uint16_t __rte_hot
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index 7e8339bd3a..e59e537311 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -753,7 +753,7 @@ cn9k_sso_dev_configure(const struct rte_eventdev *event_dev)
struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
int rc;
- rc = cnxk_sso_dev_validate(event_dev);
+ rc = cnxk_sso_dev_validate(event_dev, 1, 1);
if (rc < 0) {
plt_err("Invalid event device configuration");
return -EINVAL;
diff --git a/drivers/event/cnxk/cnxk_eventdev.c b/drivers/event/cnxk/cnxk_eventdev.c
index cb9ba5d353..99f9cdcd0d 100644
--- a/drivers/event/cnxk/cnxk_eventdev.c
+++ b/drivers/event/cnxk/cnxk_eventdev.c
@@ -145,7 +145,8 @@ cnxk_sso_restore_links(const struct rte_eventdev *event_dev,
}
int
-cnxk_sso_dev_validate(const struct rte_eventdev *event_dev)
+cnxk_sso_dev_validate(const struct rte_eventdev *event_dev, uint32_t deq_depth,
+ uint32_t enq_depth)
{
struct rte_event_dev_config *conf = &event_dev->data->dev_conf;
struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
@@ -173,12 +174,12 @@ cnxk_sso_dev_validate(const struct rte_eventdev *event_dev)
return -EINVAL;
}
- if (conf->nb_event_port_dequeue_depth > 1) {
+ if (conf->nb_event_port_dequeue_depth > deq_depth) {
plt_err("Unsupported event port deq depth requested");
return -EINVAL;
}
- if (conf->nb_event_port_enqueue_depth > 1) {
+ if (conf->nb_event_port_enqueue_depth > enq_depth) {
plt_err("Unsupported event port enq depth requested");
return -EINVAL;
}
@@ -630,6 +631,14 @@ cnxk_sso_init(struct rte_eventdev *event_dev)
}
dev = cnxk_sso_pmd_priv(event_dev);
+ dev->fc_cache_space = rte_zmalloc("fc_cache", PLT_CACHE_LINE_SIZE,
+ PLT_CACHE_LINE_SIZE);
+ if (dev->fc_cache_space == NULL) {
+ plt_memzone_free(mz);
+ plt_err("Failed to reserve memory for XAQ fc cache");
+ return -ENOMEM;
+ }
+
pci_dev = container_of(event_dev->dev, struct rte_pci_device, device);
dev->sso.pci_dev = pci_dev;
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index c7cbd722ab..a2f30bfe5f 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -90,6 +90,7 @@ struct cnxk_sso_evdev {
uint32_t max_dequeue_timeout_ns;
int32_t max_num_events;
uint64_t xaq_lmt;
+ int64_t *fc_cache_space;
rte_iova_t fc_iova;
uint64_t rx_offloads;
uint64_t tx_offloads;
@@ -206,7 +207,8 @@ int cnxk_sso_fini(struct rte_eventdev *event_dev);
int cnxk_sso_remove(struct rte_pci_device *pci_dev);
void cnxk_sso_info_get(struct cnxk_sso_evdev *dev,
struct rte_event_dev_info *dev_info);
-int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev);
+int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev,
+ uint32_t deq_depth, uint32_t enq_depth);
int cnxk_setup_event_ports(const struct rte_eventdev *event_dev,
cnxk_sso_init_hws_mem_t init_hws_mem,
cnxk_sso_hws_setup_t hws_setup);
--
2.25.1
* [PATCH 2/3] app/eventdev: use enqueue new event burst routine
2023-04-19 20:01 [PATCH 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
@ 2023-04-19 20:01 ` pbhagavatula
2023-04-19 20:01 ` [PATCH 3/3] app/eventdev: prevent mempool exhaustion pbhagavatula
2023-04-25 19:51 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
2 siblings, 0 replies; 12+ messages in thread
From: pbhagavatula @ 2023-04-19 20:01 UTC (permalink / raw)
To: jerinj; +Cc: dev, Pavan Nikhilesh
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Use the `rte_event_enqueue_new_burst` routine to enqueue events
with rte_event::op as RTE_EVENT_OP_NEW. This allows PMDs to use
optimized enqueue routines.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
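A simplified sketch of the producer retry pattern after this change (names
trimmed, not the exact test-eventdev code): the burst is pushed with
rte_event_enqueue_new_burst() and only the unaccepted tail is re-submitted
until the whole burst lands or the test is stopped.

#include <stdbool.h>
#include <rte_eventdev.h>
#include <rte_pause.h>

/* Sketch of the producer-side retry loop; dev_id, port, ev[],
 * burst_size and the stop flag are assumed to come from the harness.
 */
static inline void
produce_new_burst(uint8_t dev_id, uint8_t port, struct rte_event ev[],
                  uint16_t burst_size, volatile bool *stop)
{
        uint16_t enq;

        enq = rte_event_enqueue_new_burst(dev_id, port, ev, burst_size);
        while (enq < burst_size) {
                /* Retry only the tail that was not accepted yet. */
                enq += rte_event_enqueue_new_burst(dev_id, port, ev + enq,
                                                   burst_size - enq);
                if (*stop)
                        break;
                rte_pause();
        }
}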
app/test-eventdev/evt_options.c | 2 +-
app/test-eventdev/test_perf_common.c | 58 +++++++++++++++++-----------
2 files changed, 37 insertions(+), 23 deletions(-)
diff --git a/app/test-eventdev/evt_options.c b/app/test-eventdev/evt_options.c
index b175c067cd..03fb3bfce0 100644
--- a/app/test-eventdev/evt_options.c
+++ b/app/test-eventdev/evt_options.c
@@ -27,7 +27,7 @@ evt_options_default(struct evt_options *opt)
opt->nb_flows = 1024;
opt->socket_id = SOCKET_ID_ANY;
opt->pool_sz = 16 * 1024;
- opt->prod_enq_burst_sz = 1;
+ opt->prod_enq_burst_sz = 0;
opt->wkr_deq_dep = 16;
opt->nb_pkts = (1ULL << 26); /* do ~64M packets */
opt->nb_timers = 1E8;
diff --git a/app/test-eventdev/test_perf_common.c b/app/test-eventdev/test_perf_common.c
index fd434666cb..68af3cb346 100644
--- a/app/test-eventdev/test_perf_common.c
+++ b/app/test-eventdev/test_perf_common.c
@@ -131,8 +131,10 @@ perf_producer(void *arg)
uint32_t flow_counter = 0;
uint64_t count = 0;
struct perf_elt *m[BURST_SIZE + 1] = {NULL};
+ uint8_t enable_fwd_latency;
struct rte_event ev;
+ enable_fwd_latency = opt->fwd_latency;
if (opt->verbose_level > 1)
printf("%s(): lcore %d dev_id %d port=%d queue %d\n", __func__,
rte_lcore_id(), dev_id, port, p->queue_id);
@@ -151,13 +153,16 @@ perf_producer(void *arg)
for (i = 0; i < BURST_SIZE; i++) {
ev.flow_id = flow_counter++ % nb_flows;
ev.event_ptr = m[i];
- m[i]->timestamp = rte_get_timer_cycles();
- while (rte_event_enqueue_burst(dev_id,
- port, &ev, 1) != 1) {
+ if (enable_fwd_latency)
+ m[i]->timestamp = rte_get_timer_cycles();
+ while (rte_event_enqueue_new_burst(dev_id, port, &ev,
+ 1) != 1) {
if (t->done)
break;
rte_pause();
- m[i]->timestamp = rte_get_timer_cycles();
+ if (enable_fwd_latency)
+ m[i]->timestamp =
+ rte_get_timer_cycles();
}
}
count += BURST_SIZE;
@@ -171,7 +176,6 @@ perf_producer_burst(void *arg)
{
uint32_t i;
uint64_t timestamp;
- struct rte_event_dev_info dev_info;
struct prod_data *p = arg;
struct test_perf *t = p->t;
struct evt_options *opt = t->opt;
@@ -183,15 +187,13 @@ perf_producer_burst(void *arg)
uint32_t flow_counter = 0;
uint16_t enq = 0;
uint64_t count = 0;
- struct perf_elt *m[MAX_PROD_ENQ_BURST_SIZE + 1];
- struct rte_event ev[MAX_PROD_ENQ_BURST_SIZE + 1];
+ struct perf_elt *m[opt->prod_enq_burst_sz + 1];
+ struct rte_event ev[opt->prod_enq_burst_sz + 1];
uint32_t burst_size = opt->prod_enq_burst_sz;
+ uint8_t enable_fwd_latency;
- memset(m, 0, sizeof(*m) * (MAX_PROD_ENQ_BURST_SIZE + 1));
- rte_event_dev_info_get(dev_id, &dev_info);
- if (dev_info.max_event_port_enqueue_depth < burst_size)
- burst_size = dev_info.max_event_port_enqueue_depth;
-
+ enable_fwd_latency = opt->fwd_latency;
+ memset(m, 0, sizeof(*m) * (opt->prod_enq_burst_sz + 1));
if (opt->verbose_level > 1)
printf("%s(): lcore %d dev_id %d port=%d queue %d\n", __func__,
rte_lcore_id(), dev_id, port, p->queue_id);
@@ -212,19 +214,21 @@ perf_producer_burst(void *arg)
for (i = 0; i < burst_size; i++) {
ev[i].flow_id = flow_counter++ % nb_flows;
ev[i].event_ptr = m[i];
- m[i]->timestamp = timestamp;
+ if (enable_fwd_latency)
+ m[i]->timestamp = timestamp;
}
- enq = rte_event_enqueue_burst(dev_id, port, ev, burst_size);
+ enq = rte_event_enqueue_new_burst(dev_id, port, ev, burst_size);
while (enq < burst_size) {
- enq += rte_event_enqueue_burst(dev_id, port,
- ev + enq,
- burst_size - enq);
+ enq += rte_event_enqueue_new_burst(
+ dev_id, port, ev + enq, burst_size - enq);
if (t->done)
break;
rte_pause();
- timestamp = rte_get_timer_cycles();
- for (i = enq; i < burst_size; i++)
- m[i]->timestamp = timestamp;
+ if (enable_fwd_latency) {
+ timestamp = rte_get_timer_cycles();
+ for (i = enq; i < burst_size; i++)
+ m[i]->timestamp = timestamp;
+ }
}
count += burst_size;
}
@@ -799,9 +803,19 @@ perf_event_crypto_producer_burst(void *arg)
static int
perf_producer_wrapper(void *arg)
{
+ struct rte_event_dev_info dev_info;
struct prod_data *p = arg;
struct test_perf *t = p->t;
- bool burst = evt_has_burst_mode(p->dev_id);
+
+ rte_event_dev_info_get(p->dev_id, &dev_info);
+ if (!t->opt->prod_enq_burst_sz) {
+ t->opt->prod_enq_burst_sz = MAX_PROD_ENQ_BURST_SIZE;
+ if (dev_info.max_event_port_enqueue_depth > 0 &&
+ (uint32_t)dev_info.max_event_port_enqueue_depth <
+ t->opt->prod_enq_burst_sz)
+ t->opt->prod_enq_burst_sz =
+ dev_info.max_event_port_enqueue_depth;
+ }
/* In case of synthetic producer, launch perf_producer or
* perf_producer_burst depending on producer enqueue burst size
@@ -811,7 +825,7 @@ perf_producer_wrapper(void *arg)
return perf_producer(arg);
else if (t->opt->prod_type == EVT_PROD_TYPE_SYNT &&
t->opt->prod_enq_burst_sz > 1) {
- if (!burst)
+ if (dev_info.max_event_port_enqueue_depth == 1)
evt_err("This event device does not support burst mode");
else
return perf_producer_burst(arg);
--
2.25.1
* [PATCH 3/3] app/eventdev: prevent mempool exhaustion
2023-04-19 20:01 [PATCH 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
2023-04-19 20:01 ` [PATCH 2/3] app/eventdev: use enqueue new event burst routine pbhagavatula
@ 2023-04-19 20:01 ` pbhagavatula
2023-04-25 19:51 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
2 siblings, 0 replies; 12+ messages in thread
From: pbhagavatula @ 2023-04-19 20:01 UTC (permalink / raw)
To: jerinj; +Cc: dev, Pavan Nikhilesh
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Prevent mempool exhaustion due to elements being stuck in lcore
local caches.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
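The sizing idea, shown as a stand-alone sketch (helper name is made up): each
worker lcore cache can transiently hold up to roughly 1.5x its configured size
before it flushes, so the per-lcore cache is capped such that all caches
together cannot park the whole pool.

#include <rte_common.h>
#include <rte_mempool.h>

/* Cap the per-lcore mempool cache so that the worker caches together
 * cannot hold the entire pool and starve the producers.
 */
static unsigned int
perf_cache_size(unsigned int pool_sz, unsigned int nb_workers)
{
        unsigned int cache_sz;

        cache_sz = (pool_sz / 1.5) / nb_workers;
        return RTE_MIN((unsigned int)RTE_MEMPOOL_CACHE_MAX_SIZE, cache_sz);
}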
app/test-eventdev/test_perf_common.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/app/test-eventdev/test_perf_common.c b/app/test-eventdev/test_perf_common.c
index 68af3cb346..5e0255cfeb 100644
--- a/app/test-eventdev/test_perf_common.c
+++ b/app/test-eventdev/test_perf_common.c
@@ -1859,34 +1859,35 @@ int
perf_mempool_setup(struct evt_test *test, struct evt_options *opt)
{
struct test_perf *t = evt_test_priv(test);
+ unsigned int cache_sz;
+ cache_sz = RTE_MIN(RTE_MEMPOOL_CACHE_MAX_SIZE, (opt->pool_sz / 1.5) / t->nb_workers);
if (opt->prod_type == EVT_PROD_TYPE_SYNT ||
opt->prod_type == EVT_PROD_TYPE_EVENT_TIMER_ADPTR) {
t->pool = rte_mempool_create(test->name, /* mempool name */
opt->pool_sz, /* number of elements*/
sizeof(struct perf_elt), /* element size*/
- 512, /* cache size*/
+ cache_sz, /* cache size*/
0, NULL, NULL,
perf_elt_init, /* obj constructor */
NULL, opt->socket_id, 0); /* flags */
} else if (opt->prod_type == EVT_PROD_TYPE_EVENT_CRYPTO_ADPTR &&
- opt->crypto_op_type == RTE_CRYPTO_OP_TYPE_ASYMMETRIC) {
+ opt->crypto_op_type == RTE_CRYPTO_OP_TYPE_ASYMMETRIC) {
t->pool = rte_mempool_create(test->name, /* mempool name */
opt->pool_sz, /* number of elements*/
sizeof(struct perf_elt) + modex_test_case.result_len,
/* element size*/
- 512, /* cache size*/
+ cache_sz, /* cache size*/
0, NULL, NULL,
NULL, /* obj constructor */
NULL, opt->socket_id, 0); /* flags */
} else {
t->pool = rte_pktmbuf_pool_create(test->name, /* mempool name */
opt->pool_sz, /* number of elements*/
- 512, /* cache size*/
+ cache_sz, /* cache size*/
0,
RTE_MBUF_DEFAULT_BUF_SIZE,
opt->socket_id); /* flags */
-
}
if (t->pool == NULL) {
--
2.25.1
* [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst
2023-04-19 20:01 [PATCH 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
2023-04-19 20:01 ` [PATCH 2/3] app/eventdev: use enqueue new event burst routine pbhagavatula
2023-04-19 20:01 ` [PATCH 3/3] app/eventdev: prevent mempool exhaustion pbhagavatula
@ 2023-04-25 19:51 ` pbhagavatula
2023-04-25 19:51 ` [PATCH v2 2/3] app/eventdev: use enqueue new event burst routine pbhagavatula
` (3 more replies)
2 siblings, 4 replies; 12+ messages in thread
From: pbhagavatula @ 2023-04-25 19:51 UTC (permalink / raw)
To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
Satha Rao, Pavan Nikhilesh, Shijith Thotton
Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Use LMTST when all events in the burst are enqueued with
rte_event::op as RTE_EVENT_OP_NEW, i.e. events are enqueued
with the `rte_event_enqueue_new_burst` API.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
v2 Changes:
- Fix spell check.
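A rough sketch of the batching math used by the new LMTST enqueue (placeholder
macro names, not the ROC definitions; the 32-lines-per-burst figure is an
assumption): each event becomes a 16-byte add-work word, eight of them fill
one 128-byte LMT line, and a burst is carved into full lines plus an optional
partial line that is encoded separately in the LMTST write data.

#include <rte_common.h>

#define AW_PER_LMT_LINE_LOG2     3  /* 8 x 16 B add-work words per 128 B line */
#define LMT_LINES_PER_BURST_LOG2 5  /* assumption: up to 32 LMT lines per pass */

/* Split an n-event burst the way the enqueue routine does: as many full
 * LMT lines as possible, then one partial line for the remainder.
 */
static void
split_burst(unsigned int n, unsigned int *full_lines, unsigned int *partial)
{
        unsigned int burst;

        /* One pass can carry at most (lines per pass) * 8 events. */
        burst = RTE_MIN(1u << (AW_PER_LMT_LINE_LOG2 + LMT_LINES_PER_BURST_LOG2), n);
        *full_lines = burst >> AW_PER_LMT_LINE_LOG2;            /* burst / 8 */
        *partial = burst & ((1u << AW_PER_LMT_LINE_LOG2) - 1);  /* burst % 8 */
}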
drivers/common/cnxk/hw/sso.h | 1 +
drivers/common/cnxk/roc_sso.c | 10 +-
drivers/common/cnxk/roc_sso.h | 3 +
drivers/event/cnxk/cn10k_eventdev.c | 9 +-
drivers/event/cnxk/cn10k_eventdev.h | 6 +-
drivers/event/cnxk/cn10k_worker.c | 304 +++++++++++++++++++++++++++-
drivers/event/cnxk/cn9k_eventdev.c | 2 +-
drivers/event/cnxk/cnxk_eventdev.c | 15 +-
drivers/event/cnxk/cnxk_eventdev.h | 4 +-
9 files changed, 338 insertions(+), 16 deletions(-)
diff --git a/drivers/common/cnxk/hw/sso.h b/drivers/common/cnxk/hw/sso.h
index 25deaa4c14..09b8d4955f 100644
--- a/drivers/common/cnxk/hw/sso.h
+++ b/drivers/common/cnxk/hw/sso.h
@@ -157,6 +157,7 @@
#define SSO_LF_GGRP_AQ_CNT (0x1c0ull)
#define SSO_LF_GGRP_AQ_THR (0x1e0ull)
#define SSO_LF_GGRP_MISC_CNT (0x200ull)
+#define SSO_LF_GGRP_OP_AW_LMTST (0x400ull)
#define SSO_AF_IAQ_FREE_CNT_MASK 0x3FFFull
#define SSO_AF_IAQ_RSVD_FREE_MASK 0x3FFFull
diff --git a/drivers/common/cnxk/roc_sso.c b/drivers/common/cnxk/roc_sso.c
index 4a6a5080f7..99a55e49b0 100644
--- a/drivers/common/cnxk/roc_sso.c
+++ b/drivers/common/cnxk/roc_sso.c
@@ -6,6 +6,7 @@
#include "roc_priv.h"
#define SSO_XAQ_CACHE_CNT (0x7)
+#define SSO_XAQ_SLACK (16)
/* Private functions. */
int
@@ -493,9 +494,13 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct roc_sso_xaq_data *xaq,
xaq->nb_xae = nb_xae;
- /* Taken from HRM 14.3.3(4) */
+ /** SSO will reserve up to 0x4 XAQ buffers per group when GetWork engine
+ * is inactive and it might prefetch an additional 0x3 buffers due to
+ * pipelining.
+ */
xaq->nb_xaq = (SSO_XAQ_CACHE_CNT * nb_hwgrp);
xaq->nb_xaq += PLT_MAX(1 + ((xaq->nb_xae - 1) / xae_waes), xaq->nb_xaq);
+ xaq->nb_xaq += SSO_XAQ_SLACK;
xaq->mem = plt_zmalloc(xaq_buf_size * xaq->nb_xaq, xaq_buf_size);
if (xaq->mem == NULL) {
@@ -537,7 +542,8 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct roc_sso_xaq_data *xaq,
* There should be a minimum headroom of 7 XAQs per HWGRP for SSO
* to request XAQ to cache them even before enqueue is called.
*/
- xaq->xaq_lmt = xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT);
+ xaq->xaq_lmt =
+ xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT) - SSO_XAQ_SLACK;
return 0;
npa_fill_fail:
diff --git a/drivers/common/cnxk/roc_sso.h b/drivers/common/cnxk/roc_sso.h
index e67797b046..a2bb6fcb22 100644
--- a/drivers/common/cnxk/roc_sso.h
+++ b/drivers/common/cnxk/roc_sso.h
@@ -7,6 +7,9 @@
#include "hw/ssow.h"
+#define ROC_SSO_AW_PER_LMT_LINE_LOG2 3
+#define ROC_SSO_XAE_PER_XAQ 352
+
struct roc_sso_hwgrp_qos {
uint16_t hwgrp;
uint8_t xaq_prcnt;
diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index 071ea5a212..855c92da83 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -91,8 +91,10 @@ cn10k_sso_hws_setup(void *arg, void *hws, uintptr_t grp_base)
uint64_t val;
ws->grp_base = grp_base;
- ws->fc_mem = (uint64_t *)dev->fc_iova;
+ ws->fc_mem = (int64_t *)dev->fc_iova;
ws->xaq_lmt = dev->xaq_lmt;
+ ws->fc_cache_space = dev->fc_cache_space;
+ ws->aw_lmt = ws->lmt_base;
/* Set get_work timeout for HWS */
val = NSEC2USEC(dev->deq_tmo_ns);
@@ -624,6 +626,7 @@ cn10k_sso_info_get(struct rte_eventdev *event_dev,
dev_info->driver_name = RTE_STR(EVENTDEV_NAME_CN10K_PMD);
cnxk_sso_info_get(dev, dev_info);
+ dev_info->max_event_port_enqueue_depth = UINT32_MAX;
}
static int
@@ -632,7 +635,7 @@ cn10k_sso_dev_configure(const struct rte_eventdev *event_dev)
struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
int rc;
- rc = cnxk_sso_dev_validate(event_dev);
+ rc = cnxk_sso_dev_validate(event_dev, 1, UINT32_MAX);
if (rc < 0) {
plt_err("Invalid event device configuration");
return -EINVAL;
@@ -871,7 +874,7 @@ cn10k_sso_set_priv_mem(const struct rte_eventdev *event_dev, void *lookup_mem)
for (i = 0; i < dev->nb_event_ports; i++) {
struct cn10k_sso_hws *ws = event_dev->data->ports[i];
ws->xaq_lmt = dev->xaq_lmt;
- ws->fc_mem = (uint64_t *)dev->fc_iova;
+ ws->fc_mem = (int64_t *)dev->fc_iova;
ws->tstamp = dev->tstamp;
if (lookup_mem)
ws->lookup_mem = lookup_mem;
diff --git a/drivers/event/cnxk/cn10k_eventdev.h b/drivers/event/cnxk/cn10k_eventdev.h
index aaa01d1ec1..29567728cd 100644
--- a/drivers/event/cnxk/cn10k_eventdev.h
+++ b/drivers/event/cnxk/cn10k_eventdev.h
@@ -19,9 +19,11 @@ struct cn10k_sso_hws {
struct cnxk_timesync_info **tstamp;
uint64_t meta_aura;
/* Add Work Fastpath data */
- uint64_t xaq_lmt __rte_cache_aligned;
- uint64_t *fc_mem;
+ int64_t *fc_mem __rte_cache_aligned;
+ int64_t *fc_cache_space;
+ uintptr_t aw_lmt;
uintptr_t grp_base;
+ int32_t xaq_lmt;
/* Tx Fastpath data */
uintptr_t lmt_base __rte_cache_aligned;
uint64_t lso_tun_fmt;
diff --git a/drivers/event/cnxk/cn10k_worker.c b/drivers/event/cnxk/cn10k_worker.c
index 562d2fca13..1028a12c64 100644
--- a/drivers/event/cnxk/cn10k_worker.c
+++ b/drivers/event/cnxk/cn10k_worker.c
@@ -77,6 +77,36 @@ cn10k_sso_hws_forward_event(struct cn10k_sso_hws *ws,
cn10k_sso_hws_fwd_group(ws, ev, grp);
}
+static inline int32_t
+sso_read_xaq_space(struct cn10k_sso_hws *ws)
+{
+ return (ws->xaq_lmt - __atomic_load_n(ws->fc_mem, __ATOMIC_RELAXED)) *
+ ROC_SSO_XAE_PER_XAQ;
+}
+
+static inline void
+sso_lmt_aw_wait_fc(struct cn10k_sso_hws *ws, int64_t req)
+{
+ int64_t cached, refill;
+
+retry:
+ while (__atomic_load_n(ws->fc_cache_space, __ATOMIC_RELAXED) < 0)
+ ;
+
+ cached = __atomic_sub_fetch(ws->fc_cache_space, req, __ATOMIC_ACQUIRE);
+ /* Check if there is enough space, else update and retry. */
+ if (cached < 0) {
+ /* Check if we have space else retry. */
+ do {
+ refill = sso_read_xaq_space(ws);
+ } while (refill <= 0);
+ __atomic_compare_exchange(ws->fc_cache_space, &cached, &refill,
+ 0, __ATOMIC_RELEASE,
+ __ATOMIC_RELAXED);
+ goto retry;
+ }
+}
+
uint16_t __rte_hot
cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
{
@@ -103,6 +133,253 @@ cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
return 1;
}
+#define VECTOR_SIZE_BITS 0xFFFFFFFFFFF80000ULL
+#define VECTOR_GET_LINE_OFFSET(line) (19 + (3 * line))
+
+static uint64_t
+vector_size_partial_mask(uint16_t off, uint16_t cnt)
+{
+ return (VECTOR_SIZE_BITS & ~(~0x0ULL << off)) |
+ ((uint64_t)(cnt - 1) << off);
+}
+
+static __rte_always_inline uint16_t
+cn10k_sso_hws_new_event_lmtst(struct cn10k_sso_hws *ws, uint8_t queue_id,
+ const struct rte_event ev[], uint16_t n)
+{
+ uint16_t lines, partial_line, burst, left;
+ uint64_t wdata[2], pa[2] = {0};
+ uintptr_t lmt_addr;
+ uint16_t sz0, sz1;
+ uint16_t lmt_id;
+
+ sz0 = sz1 = 0;
+ lmt_addr = ws->lmt_base;
+ ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
+
+ left = n;
+again:
+ burst = RTE_MIN(
+ BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2 + ROC_LMT_LINES_PER_CORE_LOG2),
+ left);
+
+ /* Set wdata */
+ lines = burst >> ROC_SSO_AW_PER_LMT_LINE_LOG2;
+ partial_line = burst & (BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2) - 1);
+ wdata[0] = wdata[1] = 0;
+ if (lines > BIT(ROC_LMT_LINES_PER_STR_LOG2)) {
+ wdata[0] = lmt_id;
+ wdata[0] |= 15ULL << 12;
+ wdata[0] |= VECTOR_SIZE_BITS;
+ pa[0] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ (0x7 << 4);
+ sz0 = 16 << ROC_SSO_AW_PER_LMT_LINE_LOG2;
+
+ wdata[1] = lmt_id + 16;
+ pa[1] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ (0x7 << 4);
+
+ lines -= 17;
+ wdata[1] |= partial_line ? (uint64_t)(lines + 1) << 12 :
+ (uint64_t)(lines << 12);
+ wdata[1] |= partial_line ?
+ vector_size_partial_mask(
+ VECTOR_GET_LINE_OFFSET(lines),
+ partial_line) :
+ VECTOR_SIZE_BITS;
+ sz1 = burst - sz0;
+ partial_line = 0;
+ } else if (lines) {
+ /* We need to handle two cases here:
+ * 1. Partial line spill over to wdata[1] i.e. lines == 16
+ * 2. Partial line with spill lines < 16.
+ */
+ wdata[0] = lmt_id;
+ pa[0] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ (0x7 << 4);
+ sz0 = lines << ROC_SSO_AW_PER_LMT_LINE_LOG2;
+ if (lines == 16) {
+ wdata[0] |= 15ULL << 12;
+ wdata[0] |= VECTOR_SIZE_BITS;
+ if (partial_line) {
+ wdata[1] = lmt_id + 16;
+ pa[1] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ ((partial_line - 1) << 4);
+ }
+ } else {
+ lines -= 1;
+ wdata[0] |= partial_line ? (uint64_t)(lines + 1) << 12 :
+ (uint64_t)(lines << 12);
+ wdata[0] |=
+ partial_line ?
+ vector_size_partial_mask(
+ VECTOR_GET_LINE_OFFSET(lines),
+ partial_line) :
+ VECTOR_SIZE_BITS;
+ sz0 += partial_line;
+ }
+ sz1 = burst - sz0;
+ partial_line = 0;
+ }
+
+ /* Only partial lines */
+ if (partial_line) {
+ wdata[0] = lmt_id;
+ pa[0] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ ((partial_line - 1) << 4);
+ sz0 = partial_line;
+ sz1 = burst - sz0;
+ }
+
+#if defined(RTE_ARCH_ARM64)
+ uint64x2_t aw_mask = {0xC0FFFFFFFFULL, ~0x0ULL};
+ uint64x2_t tt_mask = {0x300000000ULL, 0};
+ uint16_t parts;
+
+ while (burst) {
+ parts = burst > 7 ? 8 : plt_align32prevpow2(burst);
+ burst -= parts;
+ /* Lets try to fill at least one line per burst. */
+ switch (parts) {
+ case 8: {
+ uint64x2_t aw0, aw1, aw2, aw3, aw4, aw5, aw6, aw7;
+
+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
+ aw_mask);
+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
+ aw_mask);
+ aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
+ aw_mask);
+ aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
+ aw_mask);
+ aw4 = vandq_u64(vld1q_u64((const uint64_t *)&ev[4]),
+ aw_mask);
+ aw5 = vandq_u64(vld1q_u64((const uint64_t *)&ev[5]),
+ aw_mask);
+ aw6 = vandq_u64(vld1q_u64((const uint64_t *)&ev[6]),
+ aw_mask);
+ aw7 = vandq_u64(vld1q_u64((const uint64_t *)&ev[7]),
+ aw_mask);
+
+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
+ aw0);
+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
+ aw1);
+ aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6), tt_mask),
+ aw2);
+ aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6), tt_mask),
+ aw3);
+ aw4 = vorrq_u64(vandq_u64(vshrq_n_u64(aw4, 6), tt_mask),
+ aw4);
+ aw5 = vorrq_u64(vandq_u64(vshrq_n_u64(aw5, 6), tt_mask),
+ aw5);
+ aw6 = vorrq_u64(vandq_u64(vshrq_n_u64(aw6, 6), tt_mask),
+ aw6);
+ aw7 = vorrq_u64(vandq_u64(vshrq_n_u64(aw7, 6), tt_mask),
+ aw7);
+
+ vst1q_u64((void *)lmt_addr, aw0);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 64), aw4);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 80), aw5);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 96), aw6);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 112), aw7);
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 128);
+ } break;
+ case 4: {
+ uint64x2_t aw0, aw1, aw2, aw3;
+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
+ aw_mask);
+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
+ aw_mask);
+ aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
+ aw_mask);
+ aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
+ aw_mask);
+
+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
+ aw0);
+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
+ aw1);
+ aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6), tt_mask),
+ aw2);
+ aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6), tt_mask),
+ aw3);
+
+ vst1q_u64((void *)lmt_addr, aw0);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 64);
+ } break;
+ case 2: {
+ uint64x2_t aw0, aw1;
+
+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
+ aw_mask);
+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
+ aw_mask);
+
+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
+ aw0);
+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
+ aw1);
+
+ vst1q_u64((void *)lmt_addr, aw0);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 32);
+ } break;
+ case 1: {
+ __uint128_t aw0;
+
+ aw0 = ev[0].u64;
+ aw0 <<= 64;
+ aw0 |= ev[0].event & (BIT_ULL(32) - 1);
+ aw0 |= (uint64_t)ev[0].sched_type << 32;
+
+ *((__uint128_t *)lmt_addr) = aw0;
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
+ } break;
+ }
+ ev += parts;
+ }
+#else
+ uint16_t i;
+
+ for (i = 0; i < burst; i++) {
+ __uint128_t aw0;
+
+ aw0 = ev[0].u64;
+ aw0 <<= 64;
+ aw0 |= ev[0].event & (BIT_ULL(32) - 1);
+ aw0 |= (uint64_t)ev[0].sched_type << 32;
+ *((__uint128_t *)lmt_addr) = aw0;
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
+ }
+#endif
+
+ /* wdata[0] will be always valid */
+ sso_lmt_aw_wait_fc(ws, sz0);
+ roc_lmt_submit_steorl(wdata[0], pa[0]);
+ if (wdata[1]) {
+ sso_lmt_aw_wait_fc(ws, sz1);
+ roc_lmt_submit_steorl(wdata[1], pa[1]);
+ }
+
+ left -= (sz0 + sz1);
+ if (left)
+ goto again;
+
+ return n;
+}
+
uint16_t __rte_hot
cn10k_sso_hws_enq_burst(void *port, const struct rte_event ev[],
uint16_t nb_events)
@@ -115,13 +392,32 @@ uint16_t __rte_hot
cn10k_sso_hws_enq_new_burst(void *port, const struct rte_event ev[],
uint16_t nb_events)
{
+ uint16_t idx = 0, done = 0, rc = 0;
struct cn10k_sso_hws *ws = port;
- uint16_t i, rc = 1;
+ uint8_t queue_id;
+ int32_t space;
+
+ /* Do a common back-pressure check and return */
+ space = sso_read_xaq_space(ws) - ROC_SSO_XAE_PER_XAQ;
+ if (space <= 0)
+ return 0;
+ nb_events = space < nb_events ? space : nb_events;
+
+ do {
+ queue_id = ev[idx].queue_id;
+ for (idx = idx + 1; idx < nb_events; idx++)
+ if (queue_id != ev[idx].queue_id)
+ break;
+
+ rc = cn10k_sso_hws_new_event_lmtst(ws, queue_id, &ev[done],
+ idx - done);
+ if (rc != (idx - done))
+ return rc + done;
+ done += rc;
- for (i = 0; i < nb_events && rc; i++)
- rc = cn10k_sso_hws_new_event(ws, &ev[i]);
+ } while (done < nb_events);
- return nb_events;
+ return done;
}
uint16_t __rte_hot
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index 7e8339bd3a..e59e537311 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -753,7 +753,7 @@ cn9k_sso_dev_configure(const struct rte_eventdev *event_dev)
struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
int rc;
- rc = cnxk_sso_dev_validate(event_dev);
+ rc = cnxk_sso_dev_validate(event_dev, 1, 1);
if (rc < 0) {
plt_err("Invalid event device configuration");
return -EINVAL;
diff --git a/drivers/event/cnxk/cnxk_eventdev.c b/drivers/event/cnxk/cnxk_eventdev.c
index cb9ba5d353..99f9cdcd0d 100644
--- a/drivers/event/cnxk/cnxk_eventdev.c
+++ b/drivers/event/cnxk/cnxk_eventdev.c
@@ -145,7 +145,8 @@ cnxk_sso_restore_links(const struct rte_eventdev *event_dev,
}
int
-cnxk_sso_dev_validate(const struct rte_eventdev *event_dev)
+cnxk_sso_dev_validate(const struct rte_eventdev *event_dev, uint32_t deq_depth,
+ uint32_t enq_depth)
{
struct rte_event_dev_config *conf = &event_dev->data->dev_conf;
struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
@@ -173,12 +174,12 @@ cnxk_sso_dev_validate(const struct rte_eventdev *event_dev)
return -EINVAL;
}
- if (conf->nb_event_port_dequeue_depth > 1) {
+ if (conf->nb_event_port_dequeue_depth > deq_depth) {
plt_err("Unsupported event port deq depth requested");
return -EINVAL;
}
- if (conf->nb_event_port_enqueue_depth > 1) {
+ if (conf->nb_event_port_enqueue_depth > enq_depth) {
plt_err("Unsupported event port enq depth requested");
return -EINVAL;
}
@@ -630,6 +631,14 @@ cnxk_sso_init(struct rte_eventdev *event_dev)
}
dev = cnxk_sso_pmd_priv(event_dev);
+ dev->fc_cache_space = rte_zmalloc("fc_cache", PLT_CACHE_LINE_SIZE,
+ PLT_CACHE_LINE_SIZE);
+ if (dev->fc_cache_space == NULL) {
+ plt_memzone_free(mz);
+ plt_err("Failed to reserve memory for XAQ fc cache");
+ return -ENOMEM;
+ }
+
pci_dev = container_of(event_dev->dev, struct rte_pci_device, device);
dev->sso.pci_dev = pci_dev;
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index c7cbd722ab..a2f30bfe5f 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -90,6 +90,7 @@ struct cnxk_sso_evdev {
uint32_t max_dequeue_timeout_ns;
int32_t max_num_events;
uint64_t xaq_lmt;
+ int64_t *fc_cache_space;
rte_iova_t fc_iova;
uint64_t rx_offloads;
uint64_t tx_offloads;
@@ -206,7 +207,8 @@ int cnxk_sso_fini(struct rte_eventdev *event_dev);
int cnxk_sso_remove(struct rte_pci_device *pci_dev);
void cnxk_sso_info_get(struct cnxk_sso_evdev *dev,
struct rte_event_dev_info *dev_info);
-int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev);
+int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev,
+ uint32_t deq_depth, uint32_t enq_depth);
int cnxk_setup_event_ports(const struct rte_eventdev *event_dev,
cnxk_sso_init_hws_mem_t init_hws_mem,
cnxk_sso_hws_setup_t hws_setup);
--
2.25.1
* [PATCH v2 2/3] app/eventdev: use enqueue new event burst routine
2023-04-25 19:51 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
@ 2023-04-25 19:51 ` pbhagavatula
2023-05-18 15:47 ` [EXT] " Shijith Thotton
2023-04-25 19:51 ` [PATCH v2 3/3] app/eventdev: prevent mempool exhaustion pbhagavatula
` (2 subsequent siblings)
3 siblings, 1 reply; 12+ messages in thread
From: pbhagavatula @ 2023-04-25 19:51 UTC (permalink / raw)
To: jerinj; +Cc: dev, Pavan Nikhilesh
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Use the `rte_event_enqueue_new_burst` routine to enqueue events
with rte_event::op as RTE_EVENT_OP_NEW. This allows PMDs to use
optimized enqueue routines.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
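The burst-size defaulting added to perf_producer_wrapper(), shown as a
stand-alone sketch (names simplified; the MAX_PROD_ENQ_BURST_SIZE value here is
an assumption): when --prod_enq_burst_sz is not given, the application maximum
is used and then clamped to the device's reported max_event_port_enqueue_depth.

#include <rte_eventdev.h>

#define MAX_PROD_ENQ_BURST_SIZE 128 /* assumption: app-level upper bound */

/* Pick the producer enqueue burst size: the user value if set, otherwise
 * the app maximum clamped to what the event device accepts per call.
 */
static uint32_t
pick_prod_burst_sz(uint8_t dev_id, uint32_t user_burst_sz)
{
        struct rte_event_dev_info info;

        if (user_burst_sz)
                return user_burst_sz;

        rte_event_dev_info_get(dev_id, &info);
        if (info.max_event_port_enqueue_depth > 0 &&
            (uint32_t)info.max_event_port_enqueue_depth < MAX_PROD_ENQ_BURST_SIZE)
                return info.max_event_port_enqueue_depth;

        return MAX_PROD_ENQ_BURST_SIZE;
}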
app/test-eventdev/evt_options.c | 2 +-
app/test-eventdev/test_perf_common.c | 58 +++++++++++++++++-----------
2 files changed, 37 insertions(+), 23 deletions(-)
diff --git a/app/test-eventdev/evt_options.c b/app/test-eventdev/evt_options.c
index b175c067cd..03fb3bfce0 100644
--- a/app/test-eventdev/evt_options.c
+++ b/app/test-eventdev/evt_options.c
@@ -27,7 +27,7 @@ evt_options_default(struct evt_options *opt)
opt->nb_flows = 1024;
opt->socket_id = SOCKET_ID_ANY;
opt->pool_sz = 16 * 1024;
- opt->prod_enq_burst_sz = 1;
+ opt->prod_enq_burst_sz = 0;
opt->wkr_deq_dep = 16;
opt->nb_pkts = (1ULL << 26); /* do ~64M packets */
opt->nb_timers = 1E8;
diff --git a/app/test-eventdev/test_perf_common.c b/app/test-eventdev/test_perf_common.c
index fd434666cb..68af3cb346 100644
--- a/app/test-eventdev/test_perf_common.c
+++ b/app/test-eventdev/test_perf_common.c
@@ -131,8 +131,10 @@ perf_producer(void *arg)
uint32_t flow_counter = 0;
uint64_t count = 0;
struct perf_elt *m[BURST_SIZE + 1] = {NULL};
+ uint8_t enable_fwd_latency;
struct rte_event ev;
+ enable_fwd_latency = opt->fwd_latency;
if (opt->verbose_level > 1)
printf("%s(): lcore %d dev_id %d port=%d queue %d\n", __func__,
rte_lcore_id(), dev_id, port, p->queue_id);
@@ -151,13 +153,16 @@ perf_producer(void *arg)
for (i = 0; i < BURST_SIZE; i++) {
ev.flow_id = flow_counter++ % nb_flows;
ev.event_ptr = m[i];
- m[i]->timestamp = rte_get_timer_cycles();
- while (rte_event_enqueue_burst(dev_id,
- port, &ev, 1) != 1) {
+ if (enable_fwd_latency)
+ m[i]->timestamp = rte_get_timer_cycles();
+ while (rte_event_enqueue_new_burst(dev_id, port, &ev,
+ 1) != 1) {
if (t->done)
break;
rte_pause();
- m[i]->timestamp = rte_get_timer_cycles();
+ if (enable_fwd_latency)
+ m[i]->timestamp =
+ rte_get_timer_cycles();
}
}
count += BURST_SIZE;
@@ -171,7 +176,6 @@ perf_producer_burst(void *arg)
{
uint32_t i;
uint64_t timestamp;
- struct rte_event_dev_info dev_info;
struct prod_data *p = arg;
struct test_perf *t = p->t;
struct evt_options *opt = t->opt;
@@ -183,15 +187,13 @@ perf_producer_burst(void *arg)
uint32_t flow_counter = 0;
uint16_t enq = 0;
uint64_t count = 0;
- struct perf_elt *m[MAX_PROD_ENQ_BURST_SIZE + 1];
- struct rte_event ev[MAX_PROD_ENQ_BURST_SIZE + 1];
+ struct perf_elt *m[opt->prod_enq_burst_sz + 1];
+ struct rte_event ev[opt->prod_enq_burst_sz + 1];
uint32_t burst_size = opt->prod_enq_burst_sz;
+ uint8_t enable_fwd_latency;
- memset(m, 0, sizeof(*m) * (MAX_PROD_ENQ_BURST_SIZE + 1));
- rte_event_dev_info_get(dev_id, &dev_info);
- if (dev_info.max_event_port_enqueue_depth < burst_size)
- burst_size = dev_info.max_event_port_enqueue_depth;
-
+ enable_fwd_latency = opt->fwd_latency;
+ memset(m, 0, sizeof(*m) * (opt->prod_enq_burst_sz + 1));
if (opt->verbose_level > 1)
printf("%s(): lcore %d dev_id %d port=%d queue %d\n", __func__,
rte_lcore_id(), dev_id, port, p->queue_id);
@@ -212,19 +214,21 @@ perf_producer_burst(void *arg)
for (i = 0; i < burst_size; i++) {
ev[i].flow_id = flow_counter++ % nb_flows;
ev[i].event_ptr = m[i];
- m[i]->timestamp = timestamp;
+ if (enable_fwd_latency)
+ m[i]->timestamp = timestamp;
}
- enq = rte_event_enqueue_burst(dev_id, port, ev, burst_size);
+ enq = rte_event_enqueue_new_burst(dev_id, port, ev, burst_size);
while (enq < burst_size) {
- enq += rte_event_enqueue_burst(dev_id, port,
- ev + enq,
- burst_size - enq);
+ enq += rte_event_enqueue_new_burst(
+ dev_id, port, ev + enq, burst_size - enq);
if (t->done)
break;
rte_pause();
- timestamp = rte_get_timer_cycles();
- for (i = enq; i < burst_size; i++)
- m[i]->timestamp = timestamp;
+ if (enable_fwd_latency) {
+ timestamp = rte_get_timer_cycles();
+ for (i = enq; i < burst_size; i++)
+ m[i]->timestamp = timestamp;
+ }
}
count += burst_size;
}
@@ -799,9 +803,19 @@ perf_event_crypto_producer_burst(void *arg)
static int
perf_producer_wrapper(void *arg)
{
+ struct rte_event_dev_info dev_info;
struct prod_data *p = arg;
struct test_perf *t = p->t;
- bool burst = evt_has_burst_mode(p->dev_id);
+
+ rte_event_dev_info_get(p->dev_id, &dev_info);
+ if (!t->opt->prod_enq_burst_sz) {
+ t->opt->prod_enq_burst_sz = MAX_PROD_ENQ_BURST_SIZE;
+ if (dev_info.max_event_port_enqueue_depth > 0 &&
+ (uint32_t)dev_info.max_event_port_enqueue_depth <
+ t->opt->prod_enq_burst_sz)
+ t->opt->prod_enq_burst_sz =
+ dev_info.max_event_port_enqueue_depth;
+ }
/* In case of synthetic producer, launch perf_producer or
* perf_producer_burst depending on producer enqueue burst size
@@ -811,7 +825,7 @@ perf_producer_wrapper(void *arg)
return perf_producer(arg);
else if (t->opt->prod_type == EVT_PROD_TYPE_SYNT &&
t->opt->prod_enq_burst_sz > 1) {
- if (!burst)
+ if (dev_info.max_event_port_enqueue_depth == 1)
evt_err("This event device does not support burst mode");
else
return perf_producer_burst(arg);
--
2.25.1
* [PATCH v2 3/3] app/eventdev: prevent mempool exhaustion
2023-04-25 19:51 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
2023-04-25 19:51 ` [PATCH v2 2/3] app/eventdev: use enqueue new event burst routine pbhagavatula
@ 2023-04-25 19:51 ` pbhagavatula
2023-05-18 15:47 ` [EXT] " Shijith Thotton
2023-05-18 15:42 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst Shijith Thotton
2023-05-22 11:56 ` [PATCH v3] " pbhagavatula
3 siblings, 1 reply; 12+ messages in thread
From: pbhagavatula @ 2023-04-25 19:51 UTC (permalink / raw)
To: jerinj; +Cc: dev, Pavan Nikhilesh
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Prevent mempool exhaustion due to elements being stuck in lcore
local caches.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
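To put numbers on this (assuming, for illustration, 64 worker lcores and the
default pool_sz of 16 * 1024): the previous fixed cache of 512 lets the lcore
caches alone hold 64 * 512 = 32768 elements, twice the pool, so the producer
can find the pool empty even though nothing is in flight. With the cap below,
cache_sz = (16384 / 1.5) / 64 ~= 170, so at most roughly two thirds of the pool
can sit in lcore caches at any time.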
app/test-eventdev/test_perf_common.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/app/test-eventdev/test_perf_common.c b/app/test-eventdev/test_perf_common.c
index 68af3cb346..5e0255cfeb 100644
--- a/app/test-eventdev/test_perf_common.c
+++ b/app/test-eventdev/test_perf_common.c
@@ -1859,34 +1859,35 @@ int
perf_mempool_setup(struct evt_test *test, struct evt_options *opt)
{
struct test_perf *t = evt_test_priv(test);
+ unsigned int cache_sz;
+ cache_sz = RTE_MIN(RTE_MEMPOOL_CACHE_MAX_SIZE, (opt->pool_sz / 1.5) / t->nb_workers);
if (opt->prod_type == EVT_PROD_TYPE_SYNT ||
opt->prod_type == EVT_PROD_TYPE_EVENT_TIMER_ADPTR) {
t->pool = rte_mempool_create(test->name, /* mempool name */
opt->pool_sz, /* number of elements*/
sizeof(struct perf_elt), /* element size*/
- 512, /* cache size*/
+ cache_sz, /* cache size*/
0, NULL, NULL,
perf_elt_init, /* obj constructor */
NULL, opt->socket_id, 0); /* flags */
} else if (opt->prod_type == EVT_PROD_TYPE_EVENT_CRYPTO_ADPTR &&
- opt->crypto_op_type == RTE_CRYPTO_OP_TYPE_ASYMMETRIC) {
+ opt->crypto_op_type == RTE_CRYPTO_OP_TYPE_ASYMMETRIC) {
t->pool = rte_mempool_create(test->name, /* mempool name */
opt->pool_sz, /* number of elements*/
sizeof(struct perf_elt) + modex_test_case.result_len,
/* element size*/
- 512, /* cache size*/
+ cache_sz, /* cache size*/
0, NULL, NULL,
NULL, /* obj constructor */
NULL, opt->socket_id, 0); /* flags */
} else {
t->pool = rte_pktmbuf_pool_create(test->name, /* mempool name */
opt->pool_sz, /* number of elements*/
- 512, /* cache size*/
+ cache_sz, /* cache size*/
0,
RTE_MBUF_DEFAULT_BUF_SIZE,
opt->socket_id); /* flags */
-
}
if (t->pool == NULL) {
--
2.25.1
* RE: [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst
2023-04-25 19:51 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
2023-04-25 19:51 ` [PATCH v2 2/3] app/eventdev: use enqueue new event burst routine pbhagavatula
2023-04-25 19:51 ` [PATCH v2 3/3] app/eventdev: prevent mempool exhaustion pbhagavatula
@ 2023-05-18 15:42 ` Shijith Thotton
2023-05-22 7:23 ` Jerin Jacob
2023-05-22 11:56 ` [PATCH v3] " pbhagavatula
3 siblings, 1 reply; 12+ messages in thread
From: Shijith Thotton @ 2023-05-18 15:42 UTC (permalink / raw)
To: Pavan Nikhilesh Bhagavatula, Jerin Jacob Kollanukkaran,
Nithin Kumar Dabilpuram, Kiran Kumar Kokkilagadda,
Sunil Kumar Kori, Satha Koteswara Rao Kottidi,
Pavan Nikhilesh Bhagavatula
Cc: dev
>From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
>Use LMTST when all events in the burst are enqueue with
>rte_event:op as RTE_EVENT_OP_NEW i.e. events are enqueued
>with the `rte_event_enqueue_new_burst` API.
>
>Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Acked-by: Shijith Thotton <sthotton@marvell.com>
>---
>v2 Changes:
>- Fix spell check.
>
> drivers/common/cnxk/hw/sso.h | 1 +
> drivers/common/cnxk/roc_sso.c | 10 +-
> drivers/common/cnxk/roc_sso.h | 3 +
> drivers/event/cnxk/cn10k_eventdev.c | 9 +-
> drivers/event/cnxk/cn10k_eventdev.h | 6 +-
> drivers/event/cnxk/cn10k_worker.c | 304 +++++++++++++++++++++++++++-
> drivers/event/cnxk/cn9k_eventdev.c | 2 +-
> drivers/event/cnxk/cnxk_eventdev.c | 15 +-
> drivers/event/cnxk/cnxk_eventdev.h | 4 +-
> 9 files changed, 338 insertions(+), 16 deletions(-)
>
>diff --git a/drivers/common/cnxk/hw/sso.h b/drivers/common/cnxk/hw/sso.h
>index 25deaa4c14..09b8d4955f 100644
>--- a/drivers/common/cnxk/hw/sso.h
>+++ b/drivers/common/cnxk/hw/sso.h
>@@ -157,6 +157,7 @@
> #define SSO_LF_GGRP_AQ_CNT (0x1c0ull)
> #define SSO_LF_GGRP_AQ_THR (0x1e0ull)
> #define SSO_LF_GGRP_MISC_CNT (0x200ull)
>+#define SSO_LF_GGRP_OP_AW_LMTST (0x400ull)
>
> #define SSO_AF_IAQ_FREE_CNT_MASK 0x3FFFull
> #define SSO_AF_IAQ_RSVD_FREE_MASK 0x3FFFull
>diff --git a/drivers/common/cnxk/roc_sso.c b/drivers/common/cnxk/roc_sso.c
>index 4a6a5080f7..99a55e49b0 100644
>--- a/drivers/common/cnxk/roc_sso.c
>+++ b/drivers/common/cnxk/roc_sso.c
>@@ -6,6 +6,7 @@
> #include "roc_priv.h"
>
> #define SSO_XAQ_CACHE_CNT (0x7)
>+#define SSO_XAQ_SLACK (16)
>
> /* Private functions. */
> int
>@@ -493,9 +494,13 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct
>roc_sso_xaq_data *xaq,
>
> xaq->nb_xae = nb_xae;
>
>- /* Taken from HRM 14.3.3(4) */
>+ /** SSO will reserve up to 0x4 XAQ buffers per group when GetWork
>engine
>+ * is inactive and it might prefetch an additional 0x3 buffers due to
>+ * pipelining.
>+ */
> xaq->nb_xaq = (SSO_XAQ_CACHE_CNT * nb_hwgrp);
> xaq->nb_xaq += PLT_MAX(1 + ((xaq->nb_xae - 1) / xae_waes), xaq-
>>nb_xaq);
>+ xaq->nb_xaq += SSO_XAQ_SLACK;
>
> xaq->mem = plt_zmalloc(xaq_buf_size * xaq->nb_xaq, xaq_buf_size);
> if (xaq->mem == NULL) {
>@@ -537,7 +542,8 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct
>roc_sso_xaq_data *xaq,
> * There should be a minimum headroom of 7 XAQs per HWGRP for SSO
> * to request XAQ to cache them even before enqueue is called.
> */
>- xaq->xaq_lmt = xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT);
>+ xaq->xaq_lmt =
>+ xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT) -
>SSO_XAQ_SLACK;
>
> return 0;
> npa_fill_fail:
>diff --git a/drivers/common/cnxk/roc_sso.h b/drivers/common/cnxk/roc_sso.h
>index e67797b046..a2bb6fcb22 100644
>--- a/drivers/common/cnxk/roc_sso.h
>+++ b/drivers/common/cnxk/roc_sso.h
>@@ -7,6 +7,9 @@
>
> #include "hw/ssow.h"
>
>+#define ROC_SSO_AW_PER_LMT_LINE_LOG2 3
>+#define ROC_SSO_XAE_PER_XAQ 352
>+
> struct roc_sso_hwgrp_qos {
> uint16_t hwgrp;
> uint8_t xaq_prcnt;
>diff --git a/drivers/event/cnxk/cn10k_eventdev.c
>b/drivers/event/cnxk/cn10k_eventdev.c
>index 071ea5a212..855c92da83 100644
>--- a/drivers/event/cnxk/cn10k_eventdev.c
>+++ b/drivers/event/cnxk/cn10k_eventdev.c
>@@ -91,8 +91,10 @@ cn10k_sso_hws_setup(void *arg, void *hws, uintptr_t
>grp_base)
> uint64_t val;
>
> ws->grp_base = grp_base;
>- ws->fc_mem = (uint64_t *)dev->fc_iova;
>+ ws->fc_mem = (int64_t *)dev->fc_iova;
> ws->xaq_lmt = dev->xaq_lmt;
>+ ws->fc_cache_space = dev->fc_cache_space;
>+ ws->aw_lmt = ws->lmt_base;
>
> /* Set get_work timeout for HWS */
> val = NSEC2USEC(dev->deq_tmo_ns);
>@@ -624,6 +626,7 @@ cn10k_sso_info_get(struct rte_eventdev *event_dev,
>
> dev_info->driver_name = RTE_STR(EVENTDEV_NAME_CN10K_PMD);
> cnxk_sso_info_get(dev, dev_info);
>+ dev_info->max_event_port_enqueue_depth = UINT32_MAX;
> }
>
> static int
>@@ -632,7 +635,7 @@ cn10k_sso_dev_configure(const struct rte_eventdev
>*event_dev)
> struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> int rc;
>
>- rc = cnxk_sso_dev_validate(event_dev);
>+ rc = cnxk_sso_dev_validate(event_dev, 1, UINT32_MAX);
> if (rc < 0) {
> plt_err("Invalid event device configuration");
> return -EINVAL;
>@@ -871,7 +874,7 @@ cn10k_sso_set_priv_mem(const struct rte_eventdev
>*event_dev, void *lookup_mem)
> for (i = 0; i < dev->nb_event_ports; i++) {
> struct cn10k_sso_hws *ws = event_dev->data->ports[i];
> ws->xaq_lmt = dev->xaq_lmt;
>- ws->fc_mem = (uint64_t *)dev->fc_iova;
>+ ws->fc_mem = (int64_t *)dev->fc_iova;
> ws->tstamp = dev->tstamp;
> if (lookup_mem)
> ws->lookup_mem = lookup_mem;
>diff --git a/drivers/event/cnxk/cn10k_eventdev.h
>b/drivers/event/cnxk/cn10k_eventdev.h
>index aaa01d1ec1..29567728cd 100644
>--- a/drivers/event/cnxk/cn10k_eventdev.h
>+++ b/drivers/event/cnxk/cn10k_eventdev.h
>@@ -19,9 +19,11 @@ struct cn10k_sso_hws {
> struct cnxk_timesync_info **tstamp;
> uint64_t meta_aura;
> /* Add Work Fastpath data */
>- uint64_t xaq_lmt __rte_cache_aligned;
>- uint64_t *fc_mem;
>+ int64_t *fc_mem __rte_cache_aligned;
>+ int64_t *fc_cache_space;
>+ uintptr_t aw_lmt;
> uintptr_t grp_base;
>+ int32_t xaq_lmt;
> /* Tx Fastpath data */
> uintptr_t lmt_base __rte_cache_aligned;
> uint64_t lso_tun_fmt;
>diff --git a/drivers/event/cnxk/cn10k_worker.c
>b/drivers/event/cnxk/cn10k_worker.c
>index 562d2fca13..1028a12c64 100644
>--- a/drivers/event/cnxk/cn10k_worker.c
>+++ b/drivers/event/cnxk/cn10k_worker.c
>@@ -77,6 +77,36 @@ cn10k_sso_hws_forward_event(struct cn10k_sso_hws *ws,
> cn10k_sso_hws_fwd_group(ws, ev, grp);
> }
>
>+static inline int32_t
>+sso_read_xaq_space(struct cn10k_sso_hws *ws)
>+{
>+ return (ws->xaq_lmt - __atomic_load_n(ws->fc_mem, __ATOMIC_RELAXED)) *
>+ ROC_SSO_XAE_PER_XAQ;
>+}
>+
>+static inline void
>+sso_lmt_aw_wait_fc(struct cn10k_sso_hws *ws, int64_t req)
>+{
>+ int64_t cached, refill;
>+
>+retry:
>+ while (__atomic_load_n(ws->fc_cache_space, __ATOMIC_RELAXED) < 0)
>+ ;
>+
>+ cached = __atomic_sub_fetch(ws->fc_cache_space, req, __ATOMIC_ACQUIRE);
>+ /* Check if there is enough space, else update and retry. */
>+ if (cached < 0) {
>+ /* Check if we have space else retry. */
>+ do {
>+ refill = sso_read_xaq_space(ws);
>+ } while (refill <= 0);
>+ __atomic_compare_exchange(ws->fc_cache_space, &cached, &refill,
>+ 0, __ATOMIC_RELEASE,
>+ __ATOMIC_RELAXED);
>+ goto retry;
>+ }
>+}
>+
> uint16_t __rte_hot
> cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
> {
>@@ -103,6 +133,253 @@ cn10k_sso_hws_enq(void *port, const struct rte_event
>*ev)
> return 1;
> }
>
>+#define VECTOR_SIZE_BITS 0xFFFFFFFFFFF80000ULL
>+#define VECTOR_GET_LINE_OFFSET(line) (19 + (3 * line))
>+
>+static uint64_t
>+vector_size_partial_mask(uint16_t off, uint16_t cnt)
>+{
>+ return (VECTOR_SIZE_BITS & ~(~0x0ULL << off)) |
>+ ((uint64_t)(cnt - 1) << off);
>+}
>+
>+static __rte_always_inline uint16_t
>+cn10k_sso_hws_new_event_lmtst(struct cn10k_sso_hws *ws, uint8_t queue_id,
>+ const struct rte_event ev[], uint16_t n)
>+{
>+ uint16_t lines, partial_line, burst, left;
>+ uint64_t wdata[2], pa[2] = {0};
>+ uintptr_t lmt_addr;
>+ uint16_t sz0, sz1;
>+ uint16_t lmt_id;
>+
>+ sz0 = sz1 = 0;
>+ lmt_addr = ws->lmt_base;
>+ ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
>+
>+ left = n;
>+again:
>+ burst = RTE_MIN(
>+ BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2 + ROC_LMT_LINES_PER_CORE_LOG2),
>+ left);
>+
>+ /* Set wdata */
>+ lines = burst >> ROC_SSO_AW_PER_LMT_LINE_LOG2;
>+ partial_line = burst & (BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2) - 1);
>+ wdata[0] = wdata[1] = 0;
>+ if (lines > BIT(ROC_LMT_LINES_PER_STR_LOG2)) {
>+ wdata[0] = lmt_id;
>+ wdata[0] |= 15ULL << 12;
>+ wdata[0] |= VECTOR_SIZE_BITS;
>+ pa[0] = (ws->grp_base + (queue_id << 12) +
>+ SSO_LF_GGRP_OP_AW_LMTST) |
>+ (0x7 << 4);
>+ sz0 = 16 << ROC_SSO_AW_PER_LMT_LINE_LOG2;
>+
>+ wdata[1] = lmt_id + 16;
>+ pa[1] = (ws->grp_base + (queue_id << 12) +
>+ SSO_LF_GGRP_OP_AW_LMTST) |
>+ (0x7 << 4);
>+
>+ lines -= 17;
>+ wdata[1] |= partial_line ? (uint64_t)(lines + 1) << 12 :
>+ (uint64_t)(lines << 12);
>+ wdata[1] |= partial_line ?
>+ vector_size_partial_mask(
>+ VECTOR_GET_LINE_OFFSET(lines),
>+ partial_line) :
>+ VECTOR_SIZE_BITS;
>+ sz1 = burst - sz0;
>+ partial_line = 0;
>+ } else if (lines) {
>+ /* We need to handle two cases here:
>+ * 1. Partial line spill over to wdata[1] i.e. lines == 16
>+ * 2. Partial line with spill lines < 16.
>+ */
>+ wdata[0] = lmt_id;
>+ pa[0] = (ws->grp_base + (queue_id << 12) +
>+ SSO_LF_GGRP_OP_AW_LMTST) |
>+ (0x7 << 4);
>+ sz0 = lines << ROC_SSO_AW_PER_LMT_LINE_LOG2;
>+ if (lines == 16) {
>+ wdata[0] |= 15ULL << 12;
>+ wdata[0] |= VECTOR_SIZE_BITS;
>+ if (partial_line) {
>+ wdata[1] = lmt_id + 16;
>+ pa[1] = (ws->grp_base + (queue_id << 12) +
>+ SSO_LF_GGRP_OP_AW_LMTST) |
>+ ((partial_line - 1) << 4);
>+ }
>+ } else {
>+ lines -= 1;
>+ wdata[0] |= partial_line ? (uint64_t)(lines + 1) << 12 :
>+ (uint64_t)(lines << 12);
>+ wdata[0] |=
>+ partial_line ?
>+ vector_size_partial_mask(
>+ VECTOR_GET_LINE_OFFSET(lines),
>+ partial_line) :
>+ VECTOR_SIZE_BITS;
>+ sz0 += partial_line;
>+ }
>+ sz1 = burst - sz0;
>+ partial_line = 0;
>+ }
>+
>+ /* Only partial lines */
>+ if (partial_line) {
>+ wdata[0] = lmt_id;
>+ pa[0] = (ws->grp_base + (queue_id << 12) +
>+ SSO_LF_GGRP_OP_AW_LMTST) |
>+ ((partial_line - 1) << 4);
>+ sz0 = partial_line;
>+ sz1 = burst - sz0;
>+ }
>+
>+#if defined(RTE_ARCH_ARM64)
>+ uint64x2_t aw_mask = {0xC0FFFFFFFFULL, ~0x0ULL};
>+ uint64x2_t tt_mask = {0x300000000ULL, 0};
>+ uint16_t parts;
>+
>+ while (burst) {
>+ parts = burst > 7 ? 8 : plt_align32prevpow2(burst);
>+ burst -= parts;
>+ /* Lets try to fill at least one line per burst. */
>+ switch (parts) {
>+ case 8: {
>+ uint64x2_t aw0, aw1, aw2, aw3, aw4, aw5, aw6, aw7;
>+
>+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
>+ aw_mask);
>+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
>+ aw_mask);
>+ aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
>+ aw_mask);
>+ aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
>+ aw_mask);
>+ aw4 = vandq_u64(vld1q_u64((const uint64_t *)&ev[4]),
>+ aw_mask);
>+ aw5 = vandq_u64(vld1q_u64((const uint64_t *)&ev[5]),
>+ aw_mask);
>+ aw6 = vandq_u64(vld1q_u64((const uint64_t *)&ev[6]),
>+ aw_mask);
>+ aw7 = vandq_u64(vld1q_u64((const uint64_t *)&ev[7]),
>+ aw_mask);
>+
>+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6),
>tt_mask),
>+ aw0);
>+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6),
>tt_mask),
>+ aw1);
>+ aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6),
>tt_mask),
>+ aw2);
>+ aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6),
>tt_mask),
>+ aw3);
>+ aw4 = vorrq_u64(vandq_u64(vshrq_n_u64(aw4, 6),
>tt_mask),
>+ aw4);
>+ aw5 = vorrq_u64(vandq_u64(vshrq_n_u64(aw5, 6),
>tt_mask),
>+ aw5);
>+ aw6 = vorrq_u64(vandq_u64(vshrq_n_u64(aw6, 6),
>tt_mask),
>+ aw6);
>+ aw7 = vorrq_u64(vandq_u64(vshrq_n_u64(aw7, 6),
>tt_mask),
>+ aw7);
>+
>+ vst1q_u64((void *)lmt_addr, aw0);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 64), aw4);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 80), aw5);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 96), aw6);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 112), aw7);
>+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 128);
>+ } break;
>+ case 4: {
>+ uint64x2_t aw0, aw1, aw2, aw3;
>+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
>+ aw_mask);
>+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
>+ aw_mask);
>+ aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
>+ aw_mask);
>+ aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
>+ aw_mask);
>+
>+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6),
>tt_mask),
>+ aw0);
>+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6),
>tt_mask),
>+ aw1);
>+ aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6),
>tt_mask),
>+ aw2);
>+ aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6),
>tt_mask),
>+ aw3);
>+
>+ vst1q_u64((void *)lmt_addr, aw0);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
>+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 64);
>+ } break;
>+ case 2: {
>+ uint64x2_t aw0, aw1;
>+
>+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
>+ aw_mask);
>+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
>+ aw_mask);
>+
>+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6),
>tt_mask),
>+ aw0);
>+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6),
>tt_mask),
>+ aw1);
>+
>+ vst1q_u64((void *)lmt_addr, aw0);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
>+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 32);
>+ } break;
>+ case 1: {
>+ __uint128_t aw0;
>+
>+ aw0 = ev[0].u64;
>+ aw0 <<= 64;
>+ aw0 |= ev[0].event & (BIT_ULL(32) - 1);
>+ aw0 |= (uint64_t)ev[0].sched_type << 32;
>+
>+ *((__uint128_t *)lmt_addr) = aw0;
>+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
>+ } break;
>+ }
>+ ev += parts;
>+ }
>+#else
>+ uint16_t i;
>+
>+ for (i = 0; i < burst; i++) {
>+ __uint128_t aw0;
>+
>+ aw0 = ev[0].u64;
>+ aw0 <<= 64;
>+ aw0 |= ev[0].event & (BIT_ULL(32) - 1);
>+ aw0 |= (uint64_t)ev[0].sched_type << 32;
>+ *((__uint128_t *)lmt_addr) = aw0;
>+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
>+ }
>+#endif
>+
>+ /* wdata[0] will be always valid */
>+ sso_lmt_aw_wait_fc(ws, sz0);
>+ roc_lmt_submit_steorl(wdata[0], pa[0]);
>+ if (wdata[1]) {
>+ sso_lmt_aw_wait_fc(ws, sz1);
>+ roc_lmt_submit_steorl(wdata[1], pa[1]);
>+ }
>+
>+ left -= (sz0 + sz1);
>+ if (left)
>+ goto again;
>+
>+ return n;
>+}
>+
> uint16_t __rte_hot
> cn10k_sso_hws_enq_burst(void *port, const struct rte_event ev[],
> uint16_t nb_events)
>@@ -115,13 +392,32 @@ uint16_t __rte_hot
> cn10k_sso_hws_enq_new_burst(void *port, const struct rte_event ev[],
> uint16_t nb_events)
> {
>+ uint16_t idx = 0, done = 0, rc = 0;
> struct cn10k_sso_hws *ws = port;
>- uint16_t i, rc = 1;
>+ uint8_t queue_id;
>+ int32_t space;
>+
>+ /* Do a common back-pressure check and return */
>+ space = sso_read_xaq_space(ws) - ROC_SSO_XAE_PER_XAQ;
>+ if (space <= 0)
>+ return 0;
>+ nb_events = space < nb_events ? space : nb_events;
>+
>+ do {
>+ queue_id = ev[idx].queue_id;
>+ for (idx = idx + 1; idx < nb_events; idx++)
>+ if (queue_id != ev[idx].queue_id)
>+ break;
>+
>+ rc = cn10k_sso_hws_new_event_lmtst(ws, queue_id, &ev[done],
>+ idx - done);
>+ if (rc != (idx - done))
>+ return rc + done;
>+ done += rc;
>
>- for (i = 0; i < nb_events && rc; i++)
>- rc = cn10k_sso_hws_new_event(ws, &ev[i]);
>+ } while (done < nb_events);
>
>- return nb_events;
>+ return done;
> }
>
> uint16_t __rte_hot
>diff --git a/drivers/event/cnxk/cn9k_eventdev.c
>b/drivers/event/cnxk/cn9k_eventdev.c
>index 7e8339bd3a..e59e537311 100644
>--- a/drivers/event/cnxk/cn9k_eventdev.c
>+++ b/drivers/event/cnxk/cn9k_eventdev.c
>@@ -753,7 +753,7 @@ cn9k_sso_dev_configure(const struct rte_eventdev
>*event_dev)
> struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> int rc;
>
>- rc = cnxk_sso_dev_validate(event_dev);
>+ rc = cnxk_sso_dev_validate(event_dev, 1, 1);
> if (rc < 0) {
> plt_err("Invalid event device configuration");
> return -EINVAL;
>diff --git a/drivers/event/cnxk/cnxk_eventdev.c
>b/drivers/event/cnxk/cnxk_eventdev.c
>index cb9ba5d353..99f9cdcd0d 100644
>--- a/drivers/event/cnxk/cnxk_eventdev.c
>+++ b/drivers/event/cnxk/cnxk_eventdev.c
>@@ -145,7 +145,8 @@ cnxk_sso_restore_links(const struct rte_eventdev
>*event_dev,
> }
>
> int
>-cnxk_sso_dev_validate(const struct rte_eventdev *event_dev)
>+cnxk_sso_dev_validate(const struct rte_eventdev *event_dev, uint32_t deq_depth,
>+ uint32_t enq_depth)
> {
> struct rte_event_dev_config *conf = &event_dev->data->dev_conf;
> struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
>@@ -173,12 +174,12 @@ cnxk_sso_dev_validate(const struct rte_eventdev
>*event_dev)
> return -EINVAL;
> }
>
>- if (conf->nb_event_port_dequeue_depth > 1) {
>+ if (conf->nb_event_port_dequeue_depth > deq_depth) {
> plt_err("Unsupported event port deq depth requested");
> return -EINVAL;
> }
>
>- if (conf->nb_event_port_enqueue_depth > 1) {
>+ if (conf->nb_event_port_enqueue_depth > enq_depth) {
> plt_err("Unsupported event port enq depth requested");
> return -EINVAL;
> }
>@@ -630,6 +631,14 @@ cnxk_sso_init(struct rte_eventdev *event_dev)
> }
>
> dev = cnxk_sso_pmd_priv(event_dev);
>+ dev->fc_cache_space = rte_zmalloc("fc_cache", PLT_CACHE_LINE_SIZE,
>+ PLT_CACHE_LINE_SIZE);
>+ if (dev->fc_cache_space == NULL) {
>+ plt_memzone_free(mz);
>+ plt_err("Failed to reserve memory for XAQ fc cache");
>+ return -ENOMEM;
>+ }
>+
> pci_dev = container_of(event_dev->dev, struct rte_pci_device, device);
> dev->sso.pci_dev = pci_dev;
>
>diff --git a/drivers/event/cnxk/cnxk_eventdev.h
>b/drivers/event/cnxk/cnxk_eventdev.h
>index c7cbd722ab..a2f30bfe5f 100644
>--- a/drivers/event/cnxk/cnxk_eventdev.h
>+++ b/drivers/event/cnxk/cnxk_eventdev.h
>@@ -90,6 +90,7 @@ struct cnxk_sso_evdev {
> uint32_t max_dequeue_timeout_ns;
> int32_t max_num_events;
> uint64_t xaq_lmt;
>+ int64_t *fc_cache_space;
> rte_iova_t fc_iova;
> uint64_t rx_offloads;
> uint64_t tx_offloads;
>@@ -206,7 +207,8 @@ int cnxk_sso_fini(struct rte_eventdev *event_dev);
> int cnxk_sso_remove(struct rte_pci_device *pci_dev);
> void cnxk_sso_info_get(struct cnxk_sso_evdev *dev,
> struct rte_event_dev_info *dev_info);
>-int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev);
>+int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev,
>+ uint32_t deq_depth, uint32_t enq_depth);
> int cnxk_setup_event_ports(const struct rte_eventdev *event_dev,
> cnxk_sso_init_hws_mem_t init_hws_mem,
> cnxk_sso_hws_setup_t hws_setup);
>--
>2.25.1
* RE: [EXT] [PATCH v2 2/3] app/eventdev: use enqueue new event burst routine
2023-04-25 19:51 ` [PATCH v2 2/3] app/eventdev: use enqueue new event burst routine pbhagavatula
@ 2023-05-18 15:47 ` Shijith Thotton
0 siblings, 0 replies; 12+ messages in thread
From: Shijith Thotton @ 2023-05-18 15:47 UTC (permalink / raw)
To: Pavan Nikhilesh Bhagavatula, Jerin Jacob Kollanukkaran
Cc: dev, Pavan Nikhilesh Bhagavatula
>From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
>Use the `rte_event_enqueue_new_burst` routine to enqueue events
>with rte_event::op as RTE_EVENT_OP_NEW. This allows PMDs to use
>optimized enqueue routines.
>
>Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Acked-by: Shijith Thotton <sthotton@marvell.com>
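
For context, a minimal, hypothetical producer sketch of the pattern this patch
switches to: every event in the burst carries RTE_EVENT_OP_NEW and is submitted
through rte_event_enqueue_new_burst(). The device/port ids, the scheduling type
and the busy retry loop are illustrative assumptions, not taken from the patch.

#include <rte_eventdev.h>

/* Hedged sketch: enqueue a burst in which every event is RTE_EVENT_OP_NEW so
 * the PMD may take its optimized new-event enqueue path. Event payloads are
 * assumed to be filled in elsewhere.
 */
static uint16_t
enqueue_all_new(uint8_t dev_id, uint8_t port_id, uint8_t queue_id,
		struct rte_event *ev, uint16_t nb_events)
{
	uint16_t i, enq = 0;

	for (i = 0; i < nb_events; i++) {
		ev[i].op = RTE_EVENT_OP_NEW;
		ev[i].queue_id = queue_id;
		ev[i].sched_type = RTE_SCHED_TYPE_ATOMIC;
	}

	while (enq < nb_events)
		enq += rte_event_enqueue_new_burst(dev_id, port_id, ev + enq,
						   nb_events - enq);
	return enq;
}
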
>---
> app/test-eventdev/evt_options.c | 2 +-
> app/test-eventdev/test_perf_common.c | 58 +++++++++++++++++-----------
> 2 files changed, 37 insertions(+), 23 deletions(-)
>
>diff --git a/app/test-eventdev/evt_options.c b/app/test-eventdev/evt_options.c
>index b175c067cd..03fb3bfce0 100644
>--- a/app/test-eventdev/evt_options.c
>+++ b/app/test-eventdev/evt_options.c
>@@ -27,7 +27,7 @@ evt_options_default(struct evt_options *opt)
> opt->nb_flows = 1024;
> opt->socket_id = SOCKET_ID_ANY;
> opt->pool_sz = 16 * 1024;
>- opt->prod_enq_burst_sz = 1;
>+ opt->prod_enq_burst_sz = 0;
> opt->wkr_deq_dep = 16;
> opt->nb_pkts = (1ULL << 26); /* do ~64M packets */
> opt->nb_timers = 1E8;
>diff --git a/app/test-eventdev/test_perf_common.c b/app/test-
>eventdev/test_perf_common.c
>index fd434666cb..68af3cb346 100644
>--- a/app/test-eventdev/test_perf_common.c
>+++ b/app/test-eventdev/test_perf_common.c
>@@ -131,8 +131,10 @@ perf_producer(void *arg)
> uint32_t flow_counter = 0;
> uint64_t count = 0;
> struct perf_elt *m[BURST_SIZE + 1] = {NULL};
>+ uint8_t enable_fwd_latency;
> struct rte_event ev;
>
>+ enable_fwd_latency = opt->fwd_latency;
> if (opt->verbose_level > 1)
> printf("%s(): lcore %d dev_id %d port=%d queue %d\n",
>__func__,
> rte_lcore_id(), dev_id, port, p->queue_id);
>@@ -151,13 +153,16 @@ perf_producer(void *arg)
> for (i = 0; i < BURST_SIZE; i++) {
> ev.flow_id = flow_counter++ % nb_flows;
> ev.event_ptr = m[i];
>- m[i]->timestamp = rte_get_timer_cycles();
>- while (rte_event_enqueue_burst(dev_id,
>- port, &ev, 1) != 1) {
>+ if (enable_fwd_latency)
>+ m[i]->timestamp = rte_get_timer_cycles();
>+ while (rte_event_enqueue_new_burst(dev_id, port, &ev,
>+ 1) != 1) {
> if (t->done)
> break;
> rte_pause();
>- m[i]->timestamp = rte_get_timer_cycles();
>+ if (enable_fwd_latency)
>+ m[i]->timestamp =
>+ rte_get_timer_cycles();
> }
> }
> count += BURST_SIZE;
>@@ -171,7 +176,6 @@ perf_producer_burst(void *arg)
> {
> uint32_t i;
> uint64_t timestamp;
>- struct rte_event_dev_info dev_info;
> struct prod_data *p = arg;
> struct test_perf *t = p->t;
> struct evt_options *opt = t->opt;
>@@ -183,15 +187,13 @@ perf_producer_burst(void *arg)
> uint32_t flow_counter = 0;
> uint16_t enq = 0;
> uint64_t count = 0;
>- struct perf_elt *m[MAX_PROD_ENQ_BURST_SIZE + 1];
>- struct rte_event ev[MAX_PROD_ENQ_BURST_SIZE + 1];
>+ struct perf_elt *m[opt->prod_enq_burst_sz + 1];
>+ struct rte_event ev[opt->prod_enq_burst_sz + 1];
> uint32_t burst_size = opt->prod_enq_burst_sz;
>+ uint8_t enable_fwd_latency;
>
>- memset(m, 0, sizeof(*m) * (MAX_PROD_ENQ_BURST_SIZE + 1));
>- rte_event_dev_info_get(dev_id, &dev_info);
>- if (dev_info.max_event_port_enqueue_depth < burst_size)
>- burst_size = dev_info.max_event_port_enqueue_depth;
>-
>+ enable_fwd_latency = opt->fwd_latency;
>+ memset(m, 0, sizeof(*m) * (opt->prod_enq_burst_sz + 1));
> if (opt->verbose_level > 1)
> printf("%s(): lcore %d dev_id %d port=%d queue %d\n",
>__func__,
> rte_lcore_id(), dev_id, port, p->queue_id);
>@@ -212,19 +214,21 @@ perf_producer_burst(void *arg)
> for (i = 0; i < burst_size; i++) {
> ev[i].flow_id = flow_counter++ % nb_flows;
> ev[i].event_ptr = m[i];
>- m[i]->timestamp = timestamp;
>+ if (enable_fwd_latency)
>+ m[i]->timestamp = timestamp;
> }
>- enq = rte_event_enqueue_burst(dev_id, port, ev, burst_size);
>+ enq = rte_event_enqueue_new_burst(dev_id, port, ev, burst_size);
> while (enq < burst_size) {
>- enq += rte_event_enqueue_burst(dev_id, port,
>- ev + enq,
>- burst_size - enq);
>+ enq += rte_event_enqueue_new_burst(
>+ dev_id, port, ev + enq, burst_size - enq);
> if (t->done)
> break;
> rte_pause();
>- timestamp = rte_get_timer_cycles();
>- for (i = enq; i < burst_size; i++)
>- m[i]->timestamp = timestamp;
>+ if (enable_fwd_latency) {
>+ timestamp = rte_get_timer_cycles();
>+ for (i = enq; i < burst_size; i++)
>+ m[i]->timestamp = timestamp;
>+ }
> }
> count += burst_size;
> }
>@@ -799,9 +803,19 @@ perf_event_crypto_producer_burst(void *arg)
> static int
> perf_producer_wrapper(void *arg)
> {
>+ struct rte_event_dev_info dev_info;
> struct prod_data *p = arg;
> struct test_perf *t = p->t;
>- bool burst = evt_has_burst_mode(p->dev_id);
>+
>+ rte_event_dev_info_get(p->dev_id, &dev_info);
>+ if (!t->opt->prod_enq_burst_sz) {
>+ t->opt->prod_enq_burst_sz = MAX_PROD_ENQ_BURST_SIZE;
>+ if (dev_info.max_event_port_enqueue_depth > 0 &&
>+ (uint32_t)dev_info.max_event_port_enqueue_depth <
>+ t->opt->prod_enq_burst_sz)
>+ t->opt->prod_enq_burst_sz =
>+ dev_info.max_event_port_enqueue_depth;
>+ }
>
> /* In case of synthetic producer, launch perf_producer or
> * perf_producer_burst depending on producer enqueue burst size
>@@ -811,7 +825,7 @@ perf_producer_wrapper(void *arg)
> return perf_producer(arg);
> else if (t->opt->prod_type == EVT_PROD_TYPE_SYNT &&
> t->opt->prod_enq_burst_sz > 1) {
>- if (!burst)
>+ if (dev_info.max_event_port_enqueue_depth == 1)
> evt_err("This event device does not support burst mode");
> else
> return perf_producer_burst(arg);
>--
>2.25.1
* RE: [EXT] [PATCH v2 3/3] app/eventdev: prevent mempool exhaustion
2023-04-25 19:51 ` [PATCH v2 3/3] app/eventdev: prevent mempool exhaustion pbhagavatula
@ 2023-05-18 15:47 ` Shijith Thotton
0 siblings, 0 replies; 12+ messages in thread
From: Shijith Thotton @ 2023-05-18 15:47 UTC (permalink / raw)
To: Pavan Nikhilesh Bhagavatula, Jerin Jacob Kollanukkaran
Cc: dev, Pavan Nikhilesh Bhagavatula
>From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
>Prevent mempool exhaustion due to elements being stuck in lcore
>local caches.
>
>Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Acked-by: Shijith Thotton <sthotton@marvell.com>
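
As a rough illustration (my sketch, not part of the patch), the sizing rule in
the diff below caps each lcore cache so that all worker caches together stay at
about two thirds of the pool, which is what keeps cached elements from
exhausting it:

#include <rte_common.h>
#include <rte_mempool.h>

/* Hedged sketch of:
 * cache_sz = min(RTE_MEMPOOL_CACHE_MAX_SIZE, (pool_sz / 1.5) / nb_workers)
 */
static unsigned int
calc_cache_sz(unsigned int pool_sz, unsigned int nb_workers)
{
	unsigned int per_worker = (unsigned int)((pool_sz / 1.5) / nb_workers);

	return RTE_MIN((unsigned int)RTE_MEMPOOL_CACHE_MAX_SIZE, per_worker);
}

/* e.g. with pool_sz = 16 * 1024: 4 workers -> 512 (capped), 64 workers -> 170 */
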
>---
> app/test-eventdev/test_perf_common.c | 11 ++++++-----
> 1 file changed, 6 insertions(+), 5 deletions(-)
>
>diff --git a/app/test-eventdev/test_perf_common.c b/app/test-
>eventdev/test_perf_common.c
>index 68af3cb346..5e0255cfeb 100644
>--- a/app/test-eventdev/test_perf_common.c
>+++ b/app/test-eventdev/test_perf_common.c
>@@ -1859,34 +1859,35 @@ int
> perf_mempool_setup(struct evt_test *test, struct evt_options *opt)
> {
> struct test_perf *t = evt_test_priv(test);
>+ unsigned int cache_sz;
>
>+ cache_sz = RTE_MIN(RTE_MEMPOOL_CACHE_MAX_SIZE, (opt->pool_sz / 1.5) / t->nb_workers);
> if (opt->prod_type == EVT_PROD_TYPE_SYNT ||
> opt->prod_type ==
>EVT_PROD_TYPE_EVENT_TIMER_ADPTR) {
> t->pool = rte_mempool_create(test->name, /* mempool name */
> opt->pool_sz, /* number of elements*/
> sizeof(struct perf_elt), /* element size*/
>- 512, /* cache size*/
>+ cache_sz, /* cache size*/
> 0, NULL, NULL,
> perf_elt_init, /* obj constructor */
> NULL, opt->socket_id, 0); /* flags */
> } else if (opt->prod_type == EVT_PROD_TYPE_EVENT_CRYPTO_ADPTR &&
>- opt->crypto_op_type ==
>RTE_CRYPTO_OP_TYPE_ASYMMETRIC) {
>+ opt->crypto_op_type == RTE_CRYPTO_OP_TYPE_ASYMMETRIC)
>{
> t->pool = rte_mempool_create(test->name, /* mempool name */
> opt->pool_sz, /* number of elements*/
> sizeof(struct perf_elt) +
>modex_test_case.result_len,
> /* element size*/
>- 512, /* cache size*/
>+ cache_sz, /* cache size*/
> 0, NULL, NULL,
> NULL, /* obj constructor */
> NULL, opt->socket_id, 0); /* flags */
> } else {
> t->pool = rte_pktmbuf_pool_create(test->name, /* mempool
>name */
> opt->pool_sz, /* number of elements*/
>- 512, /* cache size*/
>+ cache_sz, /* cache size*/
> 0,
> RTE_MBUF_DEFAULT_BUF_SIZE,
> opt->socket_id); /* flags */
>-
> }
>
> if (t->pool == NULL) {
>--
>2.25.1
* Re: [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst
2023-05-18 15:42 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst Shijith Thotton
@ 2023-05-22 7:23 ` Jerin Jacob
0 siblings, 0 replies; 12+ messages in thread
From: Jerin Jacob @ 2023-05-22 7:23 UTC (permalink / raw)
To: Shijith Thotton
Cc: Pavan Nikhilesh Bhagavatula, Jerin Jacob Kollanukkaran,
Nithin Kumar Dabilpuram, Kiran Kumar Kokkilagadda,
Sunil Kumar Kori, Satha Koteswara Rao Kottidi, dev
On Thu, May 18, 2023 at 9:12 PM Shijith Thotton <sthotton@marvell.com> wrote:
>
> >From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> >
> >Use LMTST when all events in the burst are enqueued with
> >rte_event::op as RTE_EVENT_OP_NEW, i.e. events are enqueued
> >with the `rte_event_enqueue_new_burst` API.
> >
> >Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Acked-by: Shijith Thotton <sthotton@marvell.com>
Applied 2/3 and 3/3. to dpdk-next-net-eventdev/for-main. Thanks
Please resend 1/3 to fix the following
### [PATCH] event/cnxk: use LMTST for enqueue new burst
Warning in drivers/event/cnxk/cn10k_worker.c:
Using __atomic_op_fetch, prefer __atomic_fetch_op
2/3 valid patches
checkpatch failed
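
The warning is only about the builtins' return-value convention:
__atomic_sub_fetch() returns the value after the subtraction, while
__atomic_fetch_sub() returns the value before it. A minimal sketch (mine, not
from the patch; the helper name is illustrative) of the equivalent rewrite that
v3 applies in sso_lmt_aw_wait_fc():

#include <stdint.h>

/* Equivalent forms: the preferred builtin returns the old value, so subtract
 * the request once more to get the post-reservation count.
 */
static inline int64_t
reserve_cached_space(int64_t *fc_cache_space, int64_t req)
{
	/* was: return __atomic_sub_fetch(fc_cache_space, req, __ATOMIC_ACQUIRE); */
	return __atomic_fetch_sub(fc_cache_space, req, __ATOMIC_ACQUIRE) - req;
}
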
>
> >---
> >v2 Changes:
> >- Fix spell check.
> >
> > drivers/common/cnxk/hw/sso.h | 1 +
> > drivers/common/cnxk/roc_sso.c | 10 +-
> > drivers/common/cnxk/roc_sso.h | 3 +
> > drivers/event/cnxk/cn10k_eventdev.c | 9 +-
> > drivers/event/cnxk/cn10k_eventdev.h | 6 +-
> > drivers/event/cnxk/cn10k_worker.c | 304 +++++++++++++++++++++++++++-
> > drivers/event/cnxk/cn9k_eventdev.c | 2 +-
> > drivers/event/cnxk/cnxk_eventdev.c | 15 +-
> > drivers/event/cnxk/cnxk_eventdev.h | 4 +-
> > 9 files changed, 338 insertions(+), 16 deletions(-)
> >
> >diff --git a/drivers/common/cnxk/hw/sso.h b/drivers/common/cnxk/hw/sso.h
> >index 25deaa4c14..09b8d4955f 100644
> >--- a/drivers/common/cnxk/hw/sso.h
> >+++ b/drivers/common/cnxk/hw/sso.h
> >@@ -157,6 +157,7 @@
> > #define SSO_LF_GGRP_AQ_CNT (0x1c0ull)
> > #define SSO_LF_GGRP_AQ_THR (0x1e0ull)
> > #define SSO_LF_GGRP_MISC_CNT (0x200ull)
> >+#define SSO_LF_GGRP_OP_AW_LMTST (0x400ull)
> >
> > #define SSO_AF_IAQ_FREE_CNT_MASK 0x3FFFull
> > #define SSO_AF_IAQ_RSVD_FREE_MASK 0x3FFFull
> >diff --git a/drivers/common/cnxk/roc_sso.c b/drivers/common/cnxk/roc_sso.c
> >index 4a6a5080f7..99a55e49b0 100644
> >--- a/drivers/common/cnxk/roc_sso.c
> >+++ b/drivers/common/cnxk/roc_sso.c
> >@@ -6,6 +6,7 @@
> > #include "roc_priv.h"
> >
> > #define SSO_XAQ_CACHE_CNT (0x7)
> >+#define SSO_XAQ_SLACK (16)
> >
> > /* Private functions. */
> > int
> >@@ -493,9 +494,13 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct
> >roc_sso_xaq_data *xaq,
> >
> > xaq->nb_xae = nb_xae;
> >
> >- /* Taken from HRM 14.3.3(4) */
> >+ /** SSO will reserve up to 0x4 XAQ buffers per group when GetWork
> >engine
> >+ * is inactive and it might prefetch an additional 0x3 buffers due to
> >+ * pipelining.
> >+ */
> > xaq->nb_xaq = (SSO_XAQ_CACHE_CNT * nb_hwgrp);
> > xaq->nb_xaq += PLT_MAX(1 + ((xaq->nb_xae - 1) / xae_waes), xaq-
> >>nb_xaq);
> >+ xaq->nb_xaq += SSO_XAQ_SLACK;
> >
> > xaq->mem = plt_zmalloc(xaq_buf_size * xaq->nb_xaq, xaq_buf_size);
> > if (xaq->mem == NULL) {
> >@@ -537,7 +542,8 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct
> >roc_sso_xaq_data *xaq,
> > * There should be a minimum headroom of 7 XAQs per HWGRP for SSO
> > * to request XAQ to cache them even before enqueue is called.
> > */
> >- xaq->xaq_lmt = xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT);
> >+ xaq->xaq_lmt =
> >+ xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT) -
> >SSO_XAQ_SLACK;
> >
> > return 0;
> > npa_fill_fail:
> >diff --git a/drivers/common/cnxk/roc_sso.h b/drivers/common/cnxk/roc_sso.h
> >index e67797b046..a2bb6fcb22 100644
> >--- a/drivers/common/cnxk/roc_sso.h
> >+++ b/drivers/common/cnxk/roc_sso.h
> >@@ -7,6 +7,9 @@
> >
> > #include "hw/ssow.h"
> >
> >+#define ROC_SSO_AW_PER_LMT_LINE_LOG2 3
> >+#define ROC_SSO_XAE_PER_XAQ 352
> >+
> > struct roc_sso_hwgrp_qos {
> > uint16_t hwgrp;
> > uint8_t xaq_prcnt;
> >diff --git a/drivers/event/cnxk/cn10k_eventdev.c
> >b/drivers/event/cnxk/cn10k_eventdev.c
> >index 071ea5a212..855c92da83 100644
> >--- a/drivers/event/cnxk/cn10k_eventdev.c
> >+++ b/drivers/event/cnxk/cn10k_eventdev.c
> >@@ -91,8 +91,10 @@ cn10k_sso_hws_setup(void *arg, void *hws, uintptr_t
> >grp_base)
> > uint64_t val;
> >
> > ws->grp_base = grp_base;
> >- ws->fc_mem = (uint64_t *)dev->fc_iova;
> >+ ws->fc_mem = (int64_t *)dev->fc_iova;
> > ws->xaq_lmt = dev->xaq_lmt;
> >+ ws->fc_cache_space = dev->fc_cache_space;
> >+ ws->aw_lmt = ws->lmt_base;
> >
> > /* Set get_work timeout for HWS */
> > val = NSEC2USEC(dev->deq_tmo_ns);
> >@@ -624,6 +626,7 @@ cn10k_sso_info_get(struct rte_eventdev *event_dev,
> >
> > dev_info->driver_name = RTE_STR(EVENTDEV_NAME_CN10K_PMD);
> > cnxk_sso_info_get(dev, dev_info);
> >+ dev_info->max_event_port_enqueue_depth = UINT32_MAX;
> > }
> >
> > static int
> >@@ -632,7 +635,7 @@ cn10k_sso_dev_configure(const struct rte_eventdev
> >*event_dev)
> > struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> > int rc;
> >
> >- rc = cnxk_sso_dev_validate(event_dev);
> >+ rc = cnxk_sso_dev_validate(event_dev, 1, UINT32_MAX);
> > if (rc < 0) {
> > plt_err("Invalid event device configuration");
> > return -EINVAL;
> >@@ -871,7 +874,7 @@ cn10k_sso_set_priv_mem(const struct rte_eventdev
> >*event_dev, void *lookup_mem)
> > for (i = 0; i < dev->nb_event_ports; i++) {
> > struct cn10k_sso_hws *ws = event_dev->data->ports[i];
> > ws->xaq_lmt = dev->xaq_lmt;
> >- ws->fc_mem = (uint64_t *)dev->fc_iova;
> >+ ws->fc_mem = (int64_t *)dev->fc_iova;
> > ws->tstamp = dev->tstamp;
> > if (lookup_mem)
> > ws->lookup_mem = lookup_mem;
> >diff --git a/drivers/event/cnxk/cn10k_eventdev.h
> >b/drivers/event/cnxk/cn10k_eventdev.h
> >index aaa01d1ec1..29567728cd 100644
> >--- a/drivers/event/cnxk/cn10k_eventdev.h
> >+++ b/drivers/event/cnxk/cn10k_eventdev.h
> >@@ -19,9 +19,11 @@ struct cn10k_sso_hws {
> > struct cnxk_timesync_info **tstamp;
> > uint64_t meta_aura;
> > /* Add Work Fastpath data */
> >- uint64_t xaq_lmt __rte_cache_aligned;
> >- uint64_t *fc_mem;
> >+ int64_t *fc_mem __rte_cache_aligned;
> >+ int64_t *fc_cache_space;
> >+ uintptr_t aw_lmt;
> > uintptr_t grp_base;
> >+ int32_t xaq_lmt;
> > /* Tx Fastpath data */
> > uintptr_t lmt_base __rte_cache_aligned;
> > uint64_t lso_tun_fmt;
> >diff --git a/drivers/event/cnxk/cn10k_worker.c
> >b/drivers/event/cnxk/cn10k_worker.c
> >index 562d2fca13..1028a12c64 100644
> >--- a/drivers/event/cnxk/cn10k_worker.c
> >+++ b/drivers/event/cnxk/cn10k_worker.c
> >@@ -77,6 +77,36 @@ cn10k_sso_hws_forward_event(struct cn10k_sso_hws *ws,
> > cn10k_sso_hws_fwd_group(ws, ev, grp);
> > }
> >
> >+static inline int32_t
> >+sso_read_xaq_space(struct cn10k_sso_hws *ws)
> >+{
> >+ return (ws->xaq_lmt - __atomic_load_n(ws->fc_mem,
> >__ATOMIC_RELAXED)) *
> >+ ROC_SSO_XAE_PER_XAQ;
> >+}
> >+
> >+static inline void
> >+sso_lmt_aw_wait_fc(struct cn10k_sso_hws *ws, int64_t req)
> >+{
> >+ int64_t cached, refill;
> >+
> >+retry:
> >+ while (__atomic_load_n(ws->fc_cache_space, __ATOMIC_RELAXED) < 0)
> >+ ;
> >+
> >+ cached = __atomic_sub_fetch(ws->fc_cache_space, req,
> >__ATOMIC_ACQUIRE);
> >+ /* Check if there is enough space, else update and retry. */
> >+ if (cached < 0) {
> >+ /* Check if we have space else retry. */
> >+ do {
> >+ refill = sso_read_xaq_space(ws);
> >+ } while (refill <= 0);
> >+ __atomic_compare_exchange(ws->fc_cache_space, &cached,
> >&refill,
> >+ 0, __ATOMIC_RELEASE,
> >+ __ATOMIC_RELAXED);
> >+ goto retry;
> >+ }
> >+}
> >+
> > uint16_t __rte_hot
> > cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
> > {
> >@@ -103,6 +133,253 @@ cn10k_sso_hws_enq(void *port, const struct rte_event
> >*ev)
> > return 1;
> > }
> >
> >+#define VECTOR_SIZE_BITS 0xFFFFFFFFFFF80000ULL
> >+#define VECTOR_GET_LINE_OFFSET(line) (19 + (3 * line))
> >+
> >+static uint64_t
> >+vector_size_partial_mask(uint16_t off, uint16_t cnt)
> >+{
> >+ return (VECTOR_SIZE_BITS & ~(~0x0ULL << off)) |
> >+ ((uint64_t)(cnt - 1) << off);
> >+}
> >+
> >+static __rte_always_inline uint16_t
> >+cn10k_sso_hws_new_event_lmtst(struct cn10k_sso_hws *ws, uint8_t
> >queue_id,
> >+ const struct rte_event ev[], uint16_t n)
> >+{
> >+ uint16_t lines, partial_line, burst, left;
> >+ uint64_t wdata[2], pa[2] = {0};
> >+ uintptr_t lmt_addr;
> >+ uint16_t sz0, sz1;
> >+ uint16_t lmt_id;
> >+
> >+ sz0 = sz1 = 0;
> >+ lmt_addr = ws->lmt_base;
> >+ ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
> >+
> >+ left = n;
> >+again:
> >+ burst = RTE_MIN(
> >+ BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2 +
> >ROC_LMT_LINES_PER_CORE_LOG2),
> >+ left);
> >+
> >+ /* Set wdata */
> >+ lines = burst >> ROC_SSO_AW_PER_LMT_LINE_LOG2;
> >+ partial_line = burst & (BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2) - 1);
> >+ wdata[0] = wdata[1] = 0;
> >+ if (lines > BIT(ROC_LMT_LINES_PER_STR_LOG2)) {
> >+ wdata[0] = lmt_id;
> >+ wdata[0] |= 15ULL << 12;
> >+ wdata[0] |= VECTOR_SIZE_BITS;
> >+ pa[0] = (ws->grp_base + (queue_id << 12) +
> >+ SSO_LF_GGRP_OP_AW_LMTST) |
> >+ (0x7 << 4);
> >+ sz0 = 16 << ROC_SSO_AW_PER_LMT_LINE_LOG2;
> >+
> >+ wdata[1] = lmt_id + 16;
> >+ pa[1] = (ws->grp_base + (queue_id << 12) +
> >+ SSO_LF_GGRP_OP_AW_LMTST) |
> >+ (0x7 << 4);
> >+
> >+ lines -= 17;
> >+ wdata[1] |= partial_line ? (uint64_t)(lines + 1) << 12 :
> >+ (uint64_t)(lines << 12);
> >+ wdata[1] |= partial_line ?
> >+ vector_size_partial_mask(
> >+ VECTOR_GET_LINE_OFFSET(lines),
> >+ partial_line) :
> >+ VECTOR_SIZE_BITS;
> >+ sz1 = burst - sz0;
> >+ partial_line = 0;
> >+ } else if (lines) {
> >+ /* We need to handle two cases here:
> >+ * 1. Partial line spill over to wdata[1] i.e. lines == 16
> >+ * 2. Partial line with spill lines < 16.
> >+ */
> >+ wdata[0] = lmt_id;
> >+ pa[0] = (ws->grp_base + (queue_id << 12) +
> >+ SSO_LF_GGRP_OP_AW_LMTST) |
> >+ (0x7 << 4);
> >+ sz0 = lines << ROC_SSO_AW_PER_LMT_LINE_LOG2;
> >+ if (lines == 16) {
> >+ wdata[0] |= 15ULL << 12;
> >+ wdata[0] |= VECTOR_SIZE_BITS;
> >+ if (partial_line) {
> >+ wdata[1] = lmt_id + 16;
> >+ pa[1] = (ws->grp_base + (queue_id << 12) +
> >+ SSO_LF_GGRP_OP_AW_LMTST) |
> >+ ((partial_line - 1) << 4);
> >+ }
> >+ } else {
> >+ lines -= 1;
> >+ wdata[0] |= partial_line ? (uint64_t)(lines + 1) << 12 :
> >+ (uint64_t)(lines << 12);
> >+ wdata[0] |=
> >+ partial_line ?
> >+ vector_size_partial_mask(
> >+ VECTOR_GET_LINE_OFFSET(lines),
> >+ partial_line) :
> >+ VECTOR_SIZE_BITS;
> >+ sz0 += partial_line;
> >+ }
> >+ sz1 = burst - sz0;
> >+ partial_line = 0;
> >+ }
> >+
> >+ /* Only partial lines */
> >+ if (partial_line) {
> >+ wdata[0] = lmt_id;
> >+ pa[0] = (ws->grp_base + (queue_id << 12) +
> >+ SSO_LF_GGRP_OP_AW_LMTST) |
> >+ ((partial_line - 1) << 4);
> >+ sz0 = partial_line;
> >+ sz1 = burst - sz0;
> >+ }
> >+
> >+#if defined(RTE_ARCH_ARM64)
> >+ uint64x2_t aw_mask = {0xC0FFFFFFFFULL, ~0x0ULL};
> >+ uint64x2_t tt_mask = {0x300000000ULL, 0};
> >+ uint16_t parts;
> >+
> >+ while (burst) {
> >+ parts = burst > 7 ? 8 : plt_align32prevpow2(burst);
> >+ burst -= parts;
> >+ /* Lets try to fill at least one line per burst. */
> >+ switch (parts) {
> >+ case 8: {
> >+ uint64x2_t aw0, aw1, aw2, aw3, aw4, aw5, aw6, aw7;
> >+
> >+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
> >+ aw_mask);
> >+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
> >+ aw_mask);
> >+ aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
> >+ aw_mask);
> >+ aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
> >+ aw_mask);
> >+ aw4 = vandq_u64(vld1q_u64((const uint64_t *)&ev[4]),
> >+ aw_mask);
> >+ aw5 = vandq_u64(vld1q_u64((const uint64_t *)&ev[5]),
> >+ aw_mask);
> >+ aw6 = vandq_u64(vld1q_u64((const uint64_t *)&ev[6]),
> >+ aw_mask);
> >+ aw7 = vandq_u64(vld1q_u64((const uint64_t *)&ev[7]),
> >+ aw_mask);
> >+
> >+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6),
> >tt_mask),
> >+ aw0);
> >+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6),
> >tt_mask),
> >+ aw1);
> >+ aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6),
> >tt_mask),
> >+ aw2);
> >+ aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6),
> >tt_mask),
> >+ aw3);
> >+ aw4 = vorrq_u64(vandq_u64(vshrq_n_u64(aw4, 6),
> >tt_mask),
> >+ aw4);
> >+ aw5 = vorrq_u64(vandq_u64(vshrq_n_u64(aw5, 6),
> >tt_mask),
> >+ aw5);
> >+ aw6 = vorrq_u64(vandq_u64(vshrq_n_u64(aw6, 6),
> >tt_mask),
> >+ aw6);
> >+ aw7 = vorrq_u64(vandq_u64(vshrq_n_u64(aw7, 6),
> >tt_mask),
> >+ aw7);
> >+
> >+ vst1q_u64((void *)lmt_addr, aw0);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 64), aw4);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 80), aw5);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 96), aw6);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 112), aw7);
> >+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 128);
> >+ } break;
> >+ case 4: {
> >+ uint64x2_t aw0, aw1, aw2, aw3;
> >+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
> >+ aw_mask);
> >+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
> >+ aw_mask);
> >+ aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
> >+ aw_mask);
> >+ aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
> >+ aw_mask);
> >+
> >+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6),
> >tt_mask),
> >+ aw0);
> >+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6),
> >tt_mask),
> >+ aw1);
> >+ aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6),
> >tt_mask),
> >+ aw2);
> >+ aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6),
> >tt_mask),
> >+ aw3);
> >+
> >+ vst1q_u64((void *)lmt_addr, aw0);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
> >+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 64);
> >+ } break;
> >+ case 2: {
> >+ uint64x2_t aw0, aw1;
> >+
> >+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
> >+ aw_mask);
> >+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
> >+ aw_mask);
> >+
> >+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6),
> >tt_mask),
> >+ aw0);
> >+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6),
> >tt_mask),
> >+ aw1);
> >+
> >+ vst1q_u64((void *)lmt_addr, aw0);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
> >+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 32);
> >+ } break;
> >+ case 1: {
> >+ __uint128_t aw0;
> >+
> >+ aw0 = ev[0].u64;
> >+ aw0 <<= 64;
> >+ aw0 |= ev[0].event & (BIT_ULL(32) - 1);
> >+ aw0 |= (uint64_t)ev[0].sched_type << 32;
> >+
> >+ *((__uint128_t *)lmt_addr) = aw0;
> >+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
> >+ } break;
> >+ }
> >+ ev += parts;
> >+ }
> >+#else
> >+ uint16_t i;
> >+
> >+ for (i = 0; i < burst; i++) {
> >+ __uint128_t aw0;
> >+
> >+ aw0 = ev[0].u64;
> >+ aw0 <<= 64;
> >+ aw0 |= ev[0].event & (BIT_ULL(32) - 1);
> >+ aw0 |= (uint64_t)ev[0].sched_type << 32;
> >+ *((__uint128_t *)lmt_addr) = aw0;
> >+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
> >+ }
> >+#endif
> >+
> >+ /* wdata[0] will be always valid */
> >+ sso_lmt_aw_wait_fc(ws, sz0);
> >+ roc_lmt_submit_steorl(wdata[0], pa[0]);
> >+ if (wdata[1]) {
> >+ sso_lmt_aw_wait_fc(ws, sz1);
> >+ roc_lmt_submit_steorl(wdata[1], pa[1]);
> >+ }
> >+
> >+ left -= (sz0 + sz1);
> >+ if (left)
> >+ goto again;
> >+
> >+ return n;
> >+}
> >+
> > uint16_t __rte_hot
> > cn10k_sso_hws_enq_burst(void *port, const struct rte_event ev[],
> > uint16_t nb_events)
> >@@ -115,13 +392,32 @@ uint16_t __rte_hot
> > cn10k_sso_hws_enq_new_burst(void *port, const struct rte_event ev[],
> > uint16_t nb_events)
> > {
> >+ uint16_t idx = 0, done = 0, rc = 0;
> > struct cn10k_sso_hws *ws = port;
> >- uint16_t i, rc = 1;
> >+ uint8_t queue_id;
> >+ int32_t space;
> >+
> >+ /* Do a common back-pressure check and return */
> >+ space = sso_read_xaq_space(ws) - ROC_SSO_XAE_PER_XAQ;
> >+ if (space <= 0)
> >+ return 0;
> >+ nb_events = space < nb_events ? space : nb_events;
> >+
> >+ do {
> >+ queue_id = ev[idx].queue_id;
> >+ for (idx = idx + 1; idx < nb_events; idx++)
> >+ if (queue_id != ev[idx].queue_id)
> >+ break;
> >+
> >+ rc = cn10k_sso_hws_new_event_lmtst(ws, queue_id, &ev[done],
> >+ idx - done);
> >+ if (rc != (idx - done))
> >+ return rc + done;
> >+ done += rc;
> >
> >- for (i = 0; i < nb_events && rc; i++)
> >- rc = cn10k_sso_hws_new_event(ws, &ev[i]);
> >+ } while (done < nb_events);
> >
> >- return nb_events;
> >+ return done;
> > }
> >
> > uint16_t __rte_hot
> >diff --git a/drivers/event/cnxk/cn9k_eventdev.c
> >b/drivers/event/cnxk/cn9k_eventdev.c
> >index 7e8339bd3a..e59e537311 100644
> >--- a/drivers/event/cnxk/cn9k_eventdev.c
> >+++ b/drivers/event/cnxk/cn9k_eventdev.c
> >@@ -753,7 +753,7 @@ cn9k_sso_dev_configure(const struct rte_eventdev
> >*event_dev)
> > struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> > int rc;
> >
> >- rc = cnxk_sso_dev_validate(event_dev);
> >+ rc = cnxk_sso_dev_validate(event_dev, 1, 1);
> > if (rc < 0) {
> > plt_err("Invalid event device configuration");
> > return -EINVAL;
> >diff --git a/drivers/event/cnxk/cnxk_eventdev.c
> >b/drivers/event/cnxk/cnxk_eventdev.c
> >index cb9ba5d353..99f9cdcd0d 100644
> >--- a/drivers/event/cnxk/cnxk_eventdev.c
> >+++ b/drivers/event/cnxk/cnxk_eventdev.c
> >@@ -145,7 +145,8 @@ cnxk_sso_restore_links(const struct rte_eventdev
> >*event_dev,
> > }
> >
> > int
> >-cnxk_sso_dev_validate(const struct rte_eventdev *event_dev)
> >+cnxk_sso_dev_validate(const struct rte_eventdev *event_dev, uint32_t
> >deq_depth,
> >+ uint32_t enq_depth)
> > {
> > struct rte_event_dev_config *conf = &event_dev->data->dev_conf;
> > struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> >@@ -173,12 +174,12 @@ cnxk_sso_dev_validate(const struct rte_eventdev
> >*event_dev)
> > return -EINVAL;
> > }
> >
> >- if (conf->nb_event_port_dequeue_depth > 1) {
> >+ if (conf->nb_event_port_dequeue_depth > deq_depth) {
> > plt_err("Unsupported event port deq depth requested");
> > return -EINVAL;
> > }
> >
> >- if (conf->nb_event_port_enqueue_depth > 1) {
> >+ if (conf->nb_event_port_enqueue_depth > enq_depth) {
> > plt_err("Unsupported event port enq depth requested");
> > return -EINVAL;
> > }
> >@@ -630,6 +631,14 @@ cnxk_sso_init(struct rte_eventdev *event_dev)
> > }
> >
> > dev = cnxk_sso_pmd_priv(event_dev);
> >+ dev->fc_cache_space = rte_zmalloc("fc_cache", PLT_CACHE_LINE_SIZE,
> >+ PLT_CACHE_LINE_SIZE);
> >+ if (dev->fc_cache_space == NULL) {
> >+ plt_memzone_free(mz);
> >+ plt_err("Failed to reserve memory for XAQ fc cache");
> >+ return -ENOMEM;
> >+ }
> >+
> > pci_dev = container_of(event_dev->dev, struct rte_pci_device, device);
> > dev->sso.pci_dev = pci_dev;
> >
> >diff --git a/drivers/event/cnxk/cnxk_eventdev.h
> >b/drivers/event/cnxk/cnxk_eventdev.h
> >index c7cbd722ab..a2f30bfe5f 100644
> >--- a/drivers/event/cnxk/cnxk_eventdev.h
> >+++ b/drivers/event/cnxk/cnxk_eventdev.h
> >@@ -90,6 +90,7 @@ struct cnxk_sso_evdev {
> > uint32_t max_dequeue_timeout_ns;
> > int32_t max_num_events;
> > uint64_t xaq_lmt;
> >+ int64_t *fc_cache_space;
> > rte_iova_t fc_iova;
> > uint64_t rx_offloads;
> > uint64_t tx_offloads;
> >@@ -206,7 +207,8 @@ int cnxk_sso_fini(struct rte_eventdev *event_dev);
> > int cnxk_sso_remove(struct rte_pci_device *pci_dev);
> > void cnxk_sso_info_get(struct cnxk_sso_evdev *dev,
> > struct rte_event_dev_info *dev_info);
> >-int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev);
> >+int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev,
> >+ uint32_t deq_depth, uint32_t enq_depth);
> > int cnxk_setup_event_ports(const struct rte_eventdev *event_dev,
> > cnxk_sso_init_hws_mem_t init_hws_mem,
> > cnxk_sso_hws_setup_t hws_setup);
> >--
> >2.25.1
>
* [PATCH v3] event/cnxk: use LMTST for enqueue new burst
2023-04-25 19:51 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
` (2 preceding siblings ...)
2023-05-18 15:42 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst Shijith Thotton
@ 2023-05-22 11:56 ` pbhagavatula
2023-05-23 6:09 ` Jerin Jacob
3 siblings, 1 reply; 12+ messages in thread
From: pbhagavatula @ 2023-05-22 11:56 UTC (permalink / raw)
To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
Satha Rao, Pavan Nikhilesh, Shijith Thotton
Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Use LMTST when all events in the burst are enqueued with
rte_event::op as RTE_EVENT_OP_NEW, i.e. events are enqueued
with the `rte_event_enqueue_new_burst` API.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Acked-by: Shijith Thotton <sthotton@marvell.com>
---
v3 Changes:
- Fix checkpatch issues.
v2 Changes:
- Fix spell check.
drivers/common/cnxk/hw/sso.h | 1 +
drivers/common/cnxk/roc_sso.c | 10 +-
drivers/common/cnxk/roc_sso.h | 3 +
drivers/event/cnxk/cn10k_eventdev.c | 9 +-
drivers/event/cnxk/cn10k_eventdev.h | 6 +-
drivers/event/cnxk/cn10k_worker.c | 304 +++++++++++++++++++++++++++-
drivers/event/cnxk/cn9k_eventdev.c | 2 +-
drivers/event/cnxk/cnxk_eventdev.c | 15 +-
drivers/event/cnxk/cnxk_eventdev.h | 4 +-
9 files changed, 338 insertions(+), 16 deletions(-)
diff --git a/drivers/common/cnxk/hw/sso.h b/drivers/common/cnxk/hw/sso.h
index 25deaa4c14..09b8d4955f 100644
--- a/drivers/common/cnxk/hw/sso.h
+++ b/drivers/common/cnxk/hw/sso.h
@@ -157,6 +157,7 @@
#define SSO_LF_GGRP_AQ_CNT (0x1c0ull)
#define SSO_LF_GGRP_AQ_THR (0x1e0ull)
#define SSO_LF_GGRP_MISC_CNT (0x200ull)
+#define SSO_LF_GGRP_OP_AW_LMTST (0x400ull)
#define SSO_AF_IAQ_FREE_CNT_MASK 0x3FFFull
#define SSO_AF_IAQ_RSVD_FREE_MASK 0x3FFFull
diff --git a/drivers/common/cnxk/roc_sso.c b/drivers/common/cnxk/roc_sso.c
index 4a6a5080f7..99a55e49b0 100644
--- a/drivers/common/cnxk/roc_sso.c
+++ b/drivers/common/cnxk/roc_sso.c
@@ -6,6 +6,7 @@
#include "roc_priv.h"
#define SSO_XAQ_CACHE_CNT (0x7)
+#define SSO_XAQ_SLACK (16)
/* Private functions. */
int
@@ -493,9 +494,13 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct roc_sso_xaq_data *xaq,
xaq->nb_xae = nb_xae;
- /* Taken from HRM 14.3.3(4) */
+ /** SSO will reserve up to 0x4 XAQ buffers per group when GetWork engine
+ * is inactive and it might prefetch an additional 0x3 buffers due to
+ * pipelining.
+ */
xaq->nb_xaq = (SSO_XAQ_CACHE_CNT * nb_hwgrp);
xaq->nb_xaq += PLT_MAX(1 + ((xaq->nb_xae - 1) / xae_waes), xaq->nb_xaq);
+ xaq->nb_xaq += SSO_XAQ_SLACK;
xaq->mem = plt_zmalloc(xaq_buf_size * xaq->nb_xaq, xaq_buf_size);
if (xaq->mem == NULL) {
@@ -537,7 +542,8 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct roc_sso_xaq_data *xaq,
* There should be a minimum headroom of 7 XAQs per HWGRP for SSO
* to request XAQ to cache them even before enqueue is called.
*/
- xaq->xaq_lmt = xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT);
+ xaq->xaq_lmt =
+ xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT) - SSO_XAQ_SLACK;
return 0;
npa_fill_fail:
diff --git a/drivers/common/cnxk/roc_sso.h b/drivers/common/cnxk/roc_sso.h
index e67797b046..a2bb6fcb22 100644
--- a/drivers/common/cnxk/roc_sso.h
+++ b/drivers/common/cnxk/roc_sso.h
@@ -7,6 +7,9 @@
#include "hw/ssow.h"
+#define ROC_SSO_AW_PER_LMT_LINE_LOG2 3
+#define ROC_SSO_XAE_PER_XAQ 352
+
struct roc_sso_hwgrp_qos {
uint16_t hwgrp;
uint8_t xaq_prcnt;
diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index 071ea5a212..855c92da83 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -91,8 +91,10 @@ cn10k_sso_hws_setup(void *arg, void *hws, uintptr_t grp_base)
uint64_t val;
ws->grp_base = grp_base;
- ws->fc_mem = (uint64_t *)dev->fc_iova;
+ ws->fc_mem = (int64_t *)dev->fc_iova;
ws->xaq_lmt = dev->xaq_lmt;
+ ws->fc_cache_space = dev->fc_cache_space;
+ ws->aw_lmt = ws->lmt_base;
/* Set get_work timeout for HWS */
val = NSEC2USEC(dev->deq_tmo_ns);
@@ -624,6 +626,7 @@ cn10k_sso_info_get(struct rte_eventdev *event_dev,
dev_info->driver_name = RTE_STR(EVENTDEV_NAME_CN10K_PMD);
cnxk_sso_info_get(dev, dev_info);
+ dev_info->max_event_port_enqueue_depth = UINT32_MAX;
}
static int
@@ -632,7 +635,7 @@ cn10k_sso_dev_configure(const struct rte_eventdev *event_dev)
struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
int rc;
- rc = cnxk_sso_dev_validate(event_dev);
+ rc = cnxk_sso_dev_validate(event_dev, 1, UINT32_MAX);
if (rc < 0) {
plt_err("Invalid event device configuration");
return -EINVAL;
@@ -871,7 +874,7 @@ cn10k_sso_set_priv_mem(const struct rte_eventdev *event_dev, void *lookup_mem)
for (i = 0; i < dev->nb_event_ports; i++) {
struct cn10k_sso_hws *ws = event_dev->data->ports[i];
ws->xaq_lmt = dev->xaq_lmt;
- ws->fc_mem = (uint64_t *)dev->fc_iova;
+ ws->fc_mem = (int64_t *)dev->fc_iova;
ws->tstamp = dev->tstamp;
if (lookup_mem)
ws->lookup_mem = lookup_mem;
diff --git a/drivers/event/cnxk/cn10k_eventdev.h b/drivers/event/cnxk/cn10k_eventdev.h
index aaa01d1ec1..29567728cd 100644
--- a/drivers/event/cnxk/cn10k_eventdev.h
+++ b/drivers/event/cnxk/cn10k_eventdev.h
@@ -19,9 +19,11 @@ struct cn10k_sso_hws {
struct cnxk_timesync_info **tstamp;
uint64_t meta_aura;
/* Add Work Fastpath data */
- uint64_t xaq_lmt __rte_cache_aligned;
- uint64_t *fc_mem;
+ int64_t *fc_mem __rte_cache_aligned;
+ int64_t *fc_cache_space;
+ uintptr_t aw_lmt;
uintptr_t grp_base;
+ int32_t xaq_lmt;
/* Tx Fastpath data */
uintptr_t lmt_base __rte_cache_aligned;
uint64_t lso_tun_fmt;
diff --git a/drivers/event/cnxk/cn10k_worker.c b/drivers/event/cnxk/cn10k_worker.c
index 562d2fca13..9b5bf90159 100644
--- a/drivers/event/cnxk/cn10k_worker.c
+++ b/drivers/event/cnxk/cn10k_worker.c
@@ -77,6 +77,36 @@ cn10k_sso_hws_forward_event(struct cn10k_sso_hws *ws,
cn10k_sso_hws_fwd_group(ws, ev, grp);
}
+static inline int32_t
+sso_read_xaq_space(struct cn10k_sso_hws *ws)
+{
+ return (ws->xaq_lmt - __atomic_load_n(ws->fc_mem, __ATOMIC_RELAXED)) *
+ ROC_SSO_XAE_PER_XAQ;
+}
+
+static inline void
+sso_lmt_aw_wait_fc(struct cn10k_sso_hws *ws, int64_t req)
+{
+ int64_t cached, refill;
+
+retry:
+ while (__atomic_load_n(ws->fc_cache_space, __ATOMIC_RELAXED) < 0)
+ ;
+
+ cached = __atomic_fetch_sub(ws->fc_cache_space, req, __ATOMIC_ACQUIRE) - req;
+ /* Check if there is enough space, else update and retry. */
+ if (cached < 0) {
+ /* Check if we have space else retry. */
+ do {
+ refill = sso_read_xaq_space(ws);
+ } while (refill <= 0);
+ __atomic_compare_exchange(ws->fc_cache_space, &cached, &refill,
+ 0, __ATOMIC_RELEASE,
+ __ATOMIC_RELAXED);
+ goto retry;
+ }
+}
+
uint16_t __rte_hot
cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
{
@@ -103,6 +133,253 @@ cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
return 1;
}
+#define VECTOR_SIZE_BITS 0xFFFFFFFFFFF80000ULL
+#define VECTOR_GET_LINE_OFFSET(line) (19 + (3 * line))
+
+static uint64_t
+vector_size_partial_mask(uint16_t off, uint16_t cnt)
+{
+ return (VECTOR_SIZE_BITS & ~(~0x0ULL << off)) |
+ ((uint64_t)(cnt - 1) << off);
+}
+
+static __rte_always_inline uint16_t
+cn10k_sso_hws_new_event_lmtst(struct cn10k_sso_hws *ws, uint8_t queue_id,
+ const struct rte_event ev[], uint16_t n)
+{
+ uint16_t lines, partial_line, burst, left;
+ uint64_t wdata[2], pa[2] = {0};
+ uintptr_t lmt_addr;
+ uint16_t sz0, sz1;
+ uint16_t lmt_id;
+
+ sz0 = sz1 = 0;
+ lmt_addr = ws->lmt_base;
+ ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
+
+ left = n;
+again:
+ burst = RTE_MIN(
+ BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2 + ROC_LMT_LINES_PER_CORE_LOG2),
+ left);
+
+ /* Set wdata */
+ lines = burst >> ROC_SSO_AW_PER_LMT_LINE_LOG2;
+ partial_line = burst & (BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2) - 1);
+ wdata[0] = wdata[1] = 0;
+ if (lines > BIT(ROC_LMT_LINES_PER_STR_LOG2)) {
+ wdata[0] = lmt_id;
+ wdata[0] |= 15ULL << 12;
+ wdata[0] |= VECTOR_SIZE_BITS;
+ pa[0] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ (0x7 << 4);
+ sz0 = 16 << ROC_SSO_AW_PER_LMT_LINE_LOG2;
+
+ wdata[1] = lmt_id + 16;
+ pa[1] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ (0x7 << 4);
+
+ lines -= 17;
+ wdata[1] |= partial_line ? (uint64_t)(lines + 1) << 12 :
+ (uint64_t)(lines << 12);
+ wdata[1] |= partial_line ?
+ vector_size_partial_mask(
+ VECTOR_GET_LINE_OFFSET(lines),
+ partial_line) :
+ VECTOR_SIZE_BITS;
+ sz1 = burst - sz0;
+ partial_line = 0;
+ } else if (lines) {
+ /* We need to handle two cases here:
+ * 1. Partial line spill over to wdata[1] i.e. lines == 16
+ * 2. Partial line with spill lines < 16.
+ */
+ wdata[0] = lmt_id;
+ pa[0] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ (0x7 << 4);
+ sz0 = lines << ROC_SSO_AW_PER_LMT_LINE_LOG2;
+ if (lines == 16) {
+ wdata[0] |= 15ULL << 12;
+ wdata[0] |= VECTOR_SIZE_BITS;
+ if (partial_line) {
+ wdata[1] = lmt_id + 16;
+ pa[1] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ ((partial_line - 1) << 4);
+ }
+ } else {
+ lines -= 1;
+ wdata[0] |= partial_line ? (uint64_t)(lines + 1) << 12 :
+ (uint64_t)(lines << 12);
+ wdata[0] |=
+ partial_line ?
+ vector_size_partial_mask(
+ VECTOR_GET_LINE_OFFSET(lines),
+ partial_line) :
+ VECTOR_SIZE_BITS;
+ sz0 += partial_line;
+ }
+ sz1 = burst - sz0;
+ partial_line = 0;
+ }
+
+ /* Only partial lines */
+ if (partial_line) {
+ wdata[0] = lmt_id;
+ pa[0] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ ((partial_line - 1) << 4);
+ sz0 = partial_line;
+ sz1 = burst - sz0;
+ }
+
+#if defined(RTE_ARCH_ARM64)
+ uint64x2_t aw_mask = {0xC0FFFFFFFFULL, ~0x0ULL};
+ uint64x2_t tt_mask = {0x300000000ULL, 0};
+ uint16_t parts;
+
+ while (burst) {
+ parts = burst > 7 ? 8 : plt_align32prevpow2(burst);
+ burst -= parts;
+ /* Lets try to fill at least one line per burst. */
+ switch (parts) {
+ case 8: {
+ uint64x2_t aw0, aw1, aw2, aw3, aw4, aw5, aw6, aw7;
+
+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
+ aw_mask);
+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
+ aw_mask);
+ aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
+ aw_mask);
+ aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
+ aw_mask);
+ aw4 = vandq_u64(vld1q_u64((const uint64_t *)&ev[4]),
+ aw_mask);
+ aw5 = vandq_u64(vld1q_u64((const uint64_t *)&ev[5]),
+ aw_mask);
+ aw6 = vandq_u64(vld1q_u64((const uint64_t *)&ev[6]),
+ aw_mask);
+ aw7 = vandq_u64(vld1q_u64((const uint64_t *)&ev[7]),
+ aw_mask);
+
+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
+ aw0);
+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
+ aw1);
+ aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6), tt_mask),
+ aw2);
+ aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6), tt_mask),
+ aw3);
+ aw4 = vorrq_u64(vandq_u64(vshrq_n_u64(aw4, 6), tt_mask),
+ aw4);
+ aw5 = vorrq_u64(vandq_u64(vshrq_n_u64(aw5, 6), tt_mask),
+ aw5);
+ aw6 = vorrq_u64(vandq_u64(vshrq_n_u64(aw6, 6), tt_mask),
+ aw6);
+ aw7 = vorrq_u64(vandq_u64(vshrq_n_u64(aw7, 6), tt_mask),
+ aw7);
+
+ vst1q_u64((void *)lmt_addr, aw0);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 64), aw4);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 80), aw5);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 96), aw6);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 112), aw7);
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 128);
+ } break;
+ case 4: {
+ uint64x2_t aw0, aw1, aw2, aw3;
+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
+ aw_mask);
+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
+ aw_mask);
+ aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
+ aw_mask);
+ aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
+ aw_mask);
+
+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
+ aw0);
+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
+ aw1);
+ aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6), tt_mask),
+ aw2);
+ aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6), tt_mask),
+ aw3);
+
+ vst1q_u64((void *)lmt_addr, aw0);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 64);
+ } break;
+ case 2: {
+ uint64x2_t aw0, aw1;
+
+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
+ aw_mask);
+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
+ aw_mask);
+
+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
+ aw0);
+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
+ aw1);
+
+ vst1q_u64((void *)lmt_addr, aw0);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 32);
+ } break;
+ case 1: {
+ __uint128_t aw0;
+
+ aw0 = ev[0].u64;
+ aw0 <<= 64;
+ aw0 |= ev[0].event & (BIT_ULL(32) - 1);
+ aw0 |= (uint64_t)ev[0].sched_type << 32;
+
+ *((__uint128_t *)lmt_addr) = aw0;
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
+ } break;
+ }
+ ev += parts;
+ }
+#else
+ uint16_t i;
+
+ for (i = 0; i < burst; i++) {
+ __uint128_t aw0;
+
+ aw0 = ev[0].u64;
+ aw0 <<= 64;
+ aw0 |= ev[0].event & (BIT_ULL(32) - 1);
+ aw0 |= (uint64_t)ev[0].sched_type << 32;
+ *((__uint128_t *)lmt_addr) = aw0;
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
+ }
+#endif
+
+ /* wdata[0] will be always valid */
+ sso_lmt_aw_wait_fc(ws, sz0);
+ roc_lmt_submit_steorl(wdata[0], pa[0]);
+ if (wdata[1]) {
+ sso_lmt_aw_wait_fc(ws, sz1);
+ roc_lmt_submit_steorl(wdata[1], pa[1]);
+ }
+
+ left -= (sz0 + sz1);
+ if (left)
+ goto again;
+
+ return n;
+}
+
uint16_t __rte_hot
cn10k_sso_hws_enq_burst(void *port, const struct rte_event ev[],
uint16_t nb_events)
@@ -115,13 +392,32 @@ uint16_t __rte_hot
cn10k_sso_hws_enq_new_burst(void *port, const struct rte_event ev[],
uint16_t nb_events)
{
+ uint16_t idx = 0, done = 0, rc = 0;
struct cn10k_sso_hws *ws = port;
- uint16_t i, rc = 1;
+ uint8_t queue_id;
+ int32_t space;
+
+ /* Do a common back-pressure check and return */
+ space = sso_read_xaq_space(ws) - ROC_SSO_XAE_PER_XAQ;
+ if (space <= 0)
+ return 0;
+ nb_events = space < nb_events ? space : nb_events;
+
+ do {
+ queue_id = ev[idx].queue_id;
+ for (idx = idx + 1; idx < nb_events; idx++)
+ if (queue_id != ev[idx].queue_id)
+ break;
+
+ rc = cn10k_sso_hws_new_event_lmtst(ws, queue_id, &ev[done],
+ idx - done);
+ if (rc != (idx - done))
+ return rc + done;
+ done += rc;
- for (i = 0; i < nb_events && rc; i++)
- rc = cn10k_sso_hws_new_event(ws, &ev[i]);
+ } while (done < nb_events);
- return nb_events;
+ return done;
}
uint16_t __rte_hot
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index 7e8339bd3a..e59e537311 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -753,7 +753,7 @@ cn9k_sso_dev_configure(const struct rte_eventdev *event_dev)
struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
int rc;
- rc = cnxk_sso_dev_validate(event_dev);
+ rc = cnxk_sso_dev_validate(event_dev, 1, 1);
if (rc < 0) {
plt_err("Invalid event device configuration");
return -EINVAL;
diff --git a/drivers/event/cnxk/cnxk_eventdev.c b/drivers/event/cnxk/cnxk_eventdev.c
index cb9ba5d353..99f9cdcd0d 100644
--- a/drivers/event/cnxk/cnxk_eventdev.c
+++ b/drivers/event/cnxk/cnxk_eventdev.c
@@ -145,7 +145,8 @@ cnxk_sso_restore_links(const struct rte_eventdev *event_dev,
}
int
-cnxk_sso_dev_validate(const struct rte_eventdev *event_dev)
+cnxk_sso_dev_validate(const struct rte_eventdev *event_dev, uint32_t deq_depth,
+ uint32_t enq_depth)
{
struct rte_event_dev_config *conf = &event_dev->data->dev_conf;
struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
@@ -173,12 +174,12 @@ cnxk_sso_dev_validate(const struct rte_eventdev *event_dev)
return -EINVAL;
}
- if (conf->nb_event_port_dequeue_depth > 1) {
+ if (conf->nb_event_port_dequeue_depth > deq_depth) {
plt_err("Unsupported event port deq depth requested");
return -EINVAL;
}
- if (conf->nb_event_port_enqueue_depth > 1) {
+ if (conf->nb_event_port_enqueue_depth > enq_depth) {
plt_err("Unsupported event port enq depth requested");
return -EINVAL;
}
@@ -630,6 +631,14 @@ cnxk_sso_init(struct rte_eventdev *event_dev)
}
dev = cnxk_sso_pmd_priv(event_dev);
+ dev->fc_cache_space = rte_zmalloc("fc_cache", PLT_CACHE_LINE_SIZE,
+ PLT_CACHE_LINE_SIZE);
+ if (dev->fc_cache_space == NULL) {
+ plt_memzone_free(mz);
+ plt_err("Failed to reserve memory for XAQ fc cache");
+ return -ENOMEM;
+ }
+
pci_dev = container_of(event_dev->dev, struct rte_pci_device, device);
dev->sso.pci_dev = pci_dev;
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index c7cbd722ab..a2f30bfe5f 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -90,6 +90,7 @@ struct cnxk_sso_evdev {
uint32_t max_dequeue_timeout_ns;
int32_t max_num_events;
uint64_t xaq_lmt;
+ int64_t *fc_cache_space;
rte_iova_t fc_iova;
uint64_t rx_offloads;
uint64_t tx_offloads;
@@ -206,7 +207,8 @@ int cnxk_sso_fini(struct rte_eventdev *event_dev);
int cnxk_sso_remove(struct rte_pci_device *pci_dev);
void cnxk_sso_info_get(struct cnxk_sso_evdev *dev,
struct rte_event_dev_info *dev_info);
-int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev);
+int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev,
+ uint32_t deq_depth, uint32_t enq_depth);
int cnxk_setup_event_ports(const struct rte_eventdev *event_dev,
cnxk_sso_init_hws_mem_t init_hws_mem,
cnxk_sso_hws_setup_t hws_setup);
--
2.25.1
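
The enqueue path added above combines two ideas: events destined for the same queue are grouped so that each group can be pushed through a single LMTST submission, and XAQ back-pressure is tracked through a per-process cached credit counter that is refilled from the hardware-visible counter only when the cache underflows. The standalone C sketch below illustrates both mechanisms outside the driver; struct fake_event, read_hw_space(), submit_chunk() and the xaq_lmt value are invented stand-ins for this example and are not part of the cnxk code.

/* Minimal sketch of the burst-splitting and cached flow-control logic,
 * loosely mirroring cn10k_sso_hws_enq_new_burst() and sso_lmt_aw_wait_fc().
 * All names and constants here are illustrative, not driver definitions.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct fake_event {
	uint8_t queue_id;
	uint64_t data;
};

static _Atomic int64_t hw_in_flight; /* simulated HW in-flight counter */
static _Atomic int64_t fc_cache;     /* cached credits shared by workers */
static const int64_t xaq_lmt = 64;   /* simulated XAQ limit */

static int64_t read_hw_space(void)
{
	return xaq_lmt - atomic_load_explicit(&hw_in_flight, memory_order_relaxed);
}

static void wait_for_credits(int64_t req)
{
	int64_t cached, refill;

	for (;;) {
		/* Optimistically consume credits from the local cache. */
		cached = atomic_fetch_sub_explicit(&fc_cache, req,
						   memory_order_acquire) - req;
		if (cached >= 0)
			return;
		/* Cache underflowed: poll the (simulated) HW counter until
		 * space shows up, publish the fresh value, then retry.
		 */
		do {
			refill = read_hw_space();
		} while (refill <= 0);
		atomic_compare_exchange_strong(&fc_cache, &cached, refill);
	}
}

/* Stand-in for one per-queue LMTST submission. */
static uint16_t submit_chunk(uint8_t queue_id, const struct fake_event *ev,
			     uint16_t n)
{
	(void)ev;
	wait_for_credits(n);
	atomic_fetch_add_explicit(&hw_in_flight, n, memory_order_relaxed);
	printf("queue %u: submitted %u events\n", (unsigned)queue_id, (unsigned)n);
	return n;
}

static uint16_t enqueue_new_burst(const struct fake_event ev[],
				  uint16_t nb_events)
{
	uint16_t idx = 0, done = 0, rc;
	uint8_t queue_id;

	while (done < nb_events) {
		/* Find the run of consecutive events for the same queue. */
		queue_id = ev[idx].queue_id;
		for (idx = idx + 1; idx < nb_events; idx++)
			if (ev[idx].queue_id != queue_id)
				break;
		rc = submit_chunk(queue_id, &ev[done], idx - done);
		if (rc != idx - done)
			return done + rc;
		done += rc;
	}
	return done;
}

int main(void)
{
	struct fake_event ev[] = { {0, 1}, {0, 2}, {0, 3}, {1, 4}, {2, 5} };

	printf("enqueued %u/5 events\n", (unsigned)enqueue_new_burst(ev, 5));
	return 0;
}

Caching the credits this way keeps the hot path on a process-local counter; the hardware-visible counter behind fc_mem is re-read only when the cached value underflows, which is what lets many workers push RTE_EVENT_OP_NEW bursts without serializing on the XAQ threshold check.
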
* Re: [PATCH v3] event/cnxk: use LMTST for enqueue new burst
2023-05-22 11:56 ` [PATCH v3] " pbhagavatula
@ 2023-05-23 6:09 ` Jerin Jacob
0 siblings, 0 replies; 12+ messages in thread
From: Jerin Jacob @ 2023-05-23 6:09 UTC (permalink / raw)
To: pbhagavatula
Cc: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
Satha Rao, Shijith Thotton, dev
On Mon, May 22, 2023 at 5:26 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Use LMTST when all events in the burst are enqueued with
> rte_event::op as RTE_EVENT_OP_NEW, i.e. events are enqueued
> with the `rte_event_enqueue_new_burst` API.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> Acked-by: Shijith Thotton <sthotton@marvell.com>
Applied to dpdk-next-net-eventdev/for-main. Thanks
> ---
> v3 Changes:
> - Fix checkpatch issues.
> v2 Changes:
> - Fix spell check.
>
> drivers/common/cnxk/hw/sso.h | 1 +
> drivers/common/cnxk/roc_sso.c | 10 +-
> drivers/common/cnxk/roc_sso.h | 3 +
> drivers/event/cnxk/cn10k_eventdev.c | 9 +-
> drivers/event/cnxk/cn10k_eventdev.h | 6 +-
> drivers/event/cnxk/cn10k_worker.c | 304 +++++++++++++++++++++++++++-
> drivers/event/cnxk/cn9k_eventdev.c | 2 +-
> drivers/event/cnxk/cnxk_eventdev.c | 15 +-
> drivers/event/cnxk/cnxk_eventdev.h | 4 +-
> 9 files changed, 338 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/common/cnxk/hw/sso.h b/drivers/common/cnxk/hw/sso.h
> index 25deaa4c14..09b8d4955f 100644
> --- a/drivers/common/cnxk/hw/sso.h
> +++ b/drivers/common/cnxk/hw/sso.h
> @@ -157,6 +157,7 @@
> #define SSO_LF_GGRP_AQ_CNT (0x1c0ull)
> #define SSO_LF_GGRP_AQ_THR (0x1e0ull)
> #define SSO_LF_GGRP_MISC_CNT (0x200ull)
> +#define SSO_LF_GGRP_OP_AW_LMTST (0x400ull)
>
> #define SSO_AF_IAQ_FREE_CNT_MASK 0x3FFFull
> #define SSO_AF_IAQ_RSVD_FREE_MASK 0x3FFFull
> diff --git a/drivers/common/cnxk/roc_sso.c b/drivers/common/cnxk/roc_sso.c
> index 4a6a5080f7..99a55e49b0 100644
> --- a/drivers/common/cnxk/roc_sso.c
> +++ b/drivers/common/cnxk/roc_sso.c
> @@ -6,6 +6,7 @@
> #include "roc_priv.h"
>
> #define SSO_XAQ_CACHE_CNT (0x7)
> +#define SSO_XAQ_SLACK (16)
>
> /* Private functions. */
> int
> @@ -493,9 +494,13 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct roc_sso_xaq_data *xaq,
>
> xaq->nb_xae = nb_xae;
>
> - /* Taken from HRM 14.3.3(4) */
> + /** SSO will reserve up to 0x4 XAQ buffers per group when GetWork engine
> + * is inactive and it might prefetch an additional 0x3 buffers due to
> + * pipelining.
> + */
> xaq->nb_xaq = (SSO_XAQ_CACHE_CNT * nb_hwgrp);
> xaq->nb_xaq += PLT_MAX(1 + ((xaq->nb_xae - 1) / xae_waes), xaq->nb_xaq);
> + xaq->nb_xaq += SSO_XAQ_SLACK;
>
> xaq->mem = plt_zmalloc(xaq_buf_size * xaq->nb_xaq, xaq_buf_size);
> if (xaq->mem == NULL) {
> @@ -537,7 +542,8 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct roc_sso_xaq_data *xaq,
> * There should be a minimum headroom of 7 XAQs per HWGRP for SSO
> * to request XAQ to cache them even before enqueue is called.
> */
> - xaq->xaq_lmt = xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT);
> + xaq->xaq_lmt =
> + xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT) - SSO_XAQ_SLACK;
>
> return 0;
> npa_fill_fail:
> diff --git a/drivers/common/cnxk/roc_sso.h b/drivers/common/cnxk/roc_sso.h
> index e67797b046..a2bb6fcb22 100644
> --- a/drivers/common/cnxk/roc_sso.h
> +++ b/drivers/common/cnxk/roc_sso.h
> @@ -7,6 +7,9 @@
>
> #include "hw/ssow.h"
>
> +#define ROC_SSO_AW_PER_LMT_LINE_LOG2 3
> +#define ROC_SSO_XAE_PER_XAQ 352
> +
> struct roc_sso_hwgrp_qos {
> uint16_t hwgrp;
> uint8_t xaq_prcnt;
> diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
> index 071ea5a212..855c92da83 100644
> --- a/drivers/event/cnxk/cn10k_eventdev.c
> +++ b/drivers/event/cnxk/cn10k_eventdev.c
> @@ -91,8 +91,10 @@ cn10k_sso_hws_setup(void *arg, void *hws, uintptr_t grp_base)
> uint64_t val;
>
> ws->grp_base = grp_base;
> - ws->fc_mem = (uint64_t *)dev->fc_iova;
> + ws->fc_mem = (int64_t *)dev->fc_iova;
> ws->xaq_lmt = dev->xaq_lmt;
> + ws->fc_cache_space = dev->fc_cache_space;
> + ws->aw_lmt = ws->lmt_base;
>
> /* Set get_work timeout for HWS */
> val = NSEC2USEC(dev->deq_tmo_ns);
> @@ -624,6 +626,7 @@ cn10k_sso_info_get(struct rte_eventdev *event_dev,
>
> dev_info->driver_name = RTE_STR(EVENTDEV_NAME_CN10K_PMD);
> cnxk_sso_info_get(dev, dev_info);
> + dev_info->max_event_port_enqueue_depth = UINT32_MAX;
> }
>
> static int
> @@ -632,7 +635,7 @@ cn10k_sso_dev_configure(const struct rte_eventdev *event_dev)
> struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> int rc;
>
> - rc = cnxk_sso_dev_validate(event_dev);
> + rc = cnxk_sso_dev_validate(event_dev, 1, UINT32_MAX);
> if (rc < 0) {
> plt_err("Invalid event device configuration");
> return -EINVAL;
> @@ -871,7 +874,7 @@ cn10k_sso_set_priv_mem(const struct rte_eventdev *event_dev, void *lookup_mem)
> for (i = 0; i < dev->nb_event_ports; i++) {
> struct cn10k_sso_hws *ws = event_dev->data->ports[i];
> ws->xaq_lmt = dev->xaq_lmt;
> - ws->fc_mem = (uint64_t *)dev->fc_iova;
> + ws->fc_mem = (int64_t *)dev->fc_iova;
> ws->tstamp = dev->tstamp;
> if (lookup_mem)
> ws->lookup_mem = lookup_mem;
> diff --git a/drivers/event/cnxk/cn10k_eventdev.h b/drivers/event/cnxk/cn10k_eventdev.h
> index aaa01d1ec1..29567728cd 100644
> --- a/drivers/event/cnxk/cn10k_eventdev.h
> +++ b/drivers/event/cnxk/cn10k_eventdev.h
> @@ -19,9 +19,11 @@ struct cn10k_sso_hws {
> struct cnxk_timesync_info **tstamp;
> uint64_t meta_aura;
> /* Add Work Fastpath data */
> - uint64_t xaq_lmt __rte_cache_aligned;
> - uint64_t *fc_mem;
> + int64_t *fc_mem __rte_cache_aligned;
> + int64_t *fc_cache_space;
> + uintptr_t aw_lmt;
> uintptr_t grp_base;
> + int32_t xaq_lmt;
> /* Tx Fastpath data */
> uintptr_t lmt_base __rte_cache_aligned;
> uint64_t lso_tun_fmt;
> diff --git a/drivers/event/cnxk/cn10k_worker.c b/drivers/event/cnxk/cn10k_worker.c
> index 562d2fca13..9b5bf90159 100644
> --- a/drivers/event/cnxk/cn10k_worker.c
> +++ b/drivers/event/cnxk/cn10k_worker.c
> @@ -77,6 +77,36 @@ cn10k_sso_hws_forward_event(struct cn10k_sso_hws *ws,
> cn10k_sso_hws_fwd_group(ws, ev, grp);
> }
>
> +static inline int32_t
> +sso_read_xaq_space(struct cn10k_sso_hws *ws)
> +{
> + return (ws->xaq_lmt - __atomic_load_n(ws->fc_mem, __ATOMIC_RELAXED)) *
> + ROC_SSO_XAE_PER_XAQ;
> +}
> +
> +static inline void
> +sso_lmt_aw_wait_fc(struct cn10k_sso_hws *ws, int64_t req)
> +{
> + int64_t cached, refill;
> +
> +retry:
> + while (__atomic_load_n(ws->fc_cache_space, __ATOMIC_RELAXED) < 0)
> + ;
> +
> + cached = __atomic_fetch_sub(ws->fc_cache_space, req, __ATOMIC_ACQUIRE) - req;
> + /* Check if there is enough space, else update and retry. */
> + if (cached < 0) {
> + /* Check if we have space else retry. */
> + do {
> + refill = sso_read_xaq_space(ws);
> + } while (refill <= 0);
> + __atomic_compare_exchange(ws->fc_cache_space, &cached, &refill,
> + 0, __ATOMIC_RELEASE,
> + __ATOMIC_RELAXED);
> + goto retry;
> + }
> +}
> +
> uint16_t __rte_hot
> cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
> {
> @@ -103,6 +133,253 @@ cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
> return 1;
> }
>
> +#define VECTOR_SIZE_BITS 0xFFFFFFFFFFF80000ULL
> +#define VECTOR_GET_LINE_OFFSET(line) (19 + (3 * line))
> +
> +static uint64_t
> +vector_size_partial_mask(uint16_t off, uint16_t cnt)
> +{
> + return (VECTOR_SIZE_BITS & ~(~0x0ULL << off)) |
> + ((uint64_t)(cnt - 1) << off);
> +}
> +
> +static __rte_always_inline uint16_t
> +cn10k_sso_hws_new_event_lmtst(struct cn10k_sso_hws *ws, uint8_t queue_id,
> + const struct rte_event ev[], uint16_t n)
> +{
> + uint16_t lines, partial_line, burst, left;
> + uint64_t wdata[2], pa[2] = {0};
> + uintptr_t lmt_addr;
> + uint16_t sz0, sz1;
> + uint16_t lmt_id;
> +
> + sz0 = sz1 = 0;
> + lmt_addr = ws->lmt_base;
> + ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
> +
> + left = n;
> +again:
> + burst = RTE_MIN(
> + BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2 + ROC_LMT_LINES_PER_CORE_LOG2),
> + left);
> +
> + /* Set wdata */
> + lines = burst >> ROC_SSO_AW_PER_LMT_LINE_LOG2;
> + partial_line = burst & (BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2) - 1);
> + wdata[0] = wdata[1] = 0;
> + if (lines > BIT(ROC_LMT_LINES_PER_STR_LOG2)) {
> + wdata[0] = lmt_id;
> + wdata[0] |= 15ULL << 12;
> + wdata[0] |= VECTOR_SIZE_BITS;
> + pa[0] = (ws->grp_base + (queue_id << 12) +
> + SSO_LF_GGRP_OP_AW_LMTST) |
> + (0x7 << 4);
> + sz0 = 16 << ROC_SSO_AW_PER_LMT_LINE_LOG2;
> +
> + wdata[1] = lmt_id + 16;
> + pa[1] = (ws->grp_base + (queue_id << 12) +
> + SSO_LF_GGRP_OP_AW_LMTST) |
> + (0x7 << 4);
> +
> + lines -= 17;
> + wdata[1] |= partial_line ? (uint64_t)(lines + 1) << 12 :
> + (uint64_t)(lines << 12);
> + wdata[1] |= partial_line ?
> + vector_size_partial_mask(
> + VECTOR_GET_LINE_OFFSET(lines),
> + partial_line) :
> + VECTOR_SIZE_BITS;
> + sz1 = burst - sz0;
> + partial_line = 0;
> + } else if (lines) {
> + /* We need to handle two cases here:
> + * 1. Partial line spills over to wdata[1], i.e. lines == 16
> + * 2. Partial line with spill lines < 16.
> + */
> + wdata[0] = lmt_id;
> + pa[0] = (ws->grp_base + (queue_id << 12) +
> + SSO_LF_GGRP_OP_AW_LMTST) |
> + (0x7 << 4);
> + sz0 = lines << ROC_SSO_AW_PER_LMT_LINE_LOG2;
> + if (lines == 16) {
> + wdata[0] |= 15ULL << 12;
> + wdata[0] |= VECTOR_SIZE_BITS;
> + if (partial_line) {
> + wdata[1] = lmt_id + 16;
> + pa[1] = (ws->grp_base + (queue_id << 12) +
> + SSO_LF_GGRP_OP_AW_LMTST) |
> + ((partial_line - 1) << 4);
> + }
> + } else {
> + lines -= 1;
> + wdata[0] |= partial_line ? (uint64_t)(lines + 1) << 12 :
> + (uint64_t)(lines << 12);
> + wdata[0] |=
> + partial_line ?
> + vector_size_partial_mask(
> + VECTOR_GET_LINE_OFFSET(lines),
> + partial_line) :
> + VECTOR_SIZE_BITS;
> + sz0 += partial_line;
> + }
> + sz1 = burst - sz0;
> + partial_line = 0;
> + }
> +
> + /* Only partial lines */
> + if (partial_line) {
> + wdata[0] = lmt_id;
> + pa[0] = (ws->grp_base + (queue_id << 12) +
> + SSO_LF_GGRP_OP_AW_LMTST) |
> + ((partial_line - 1) << 4);
> + sz0 = partial_line;
> + sz1 = burst - sz0;
> + }
> +
> +#if defined(RTE_ARCH_ARM64)
> + uint64x2_t aw_mask = {0xC0FFFFFFFFULL, ~0x0ULL};
> + uint64x2_t tt_mask = {0x300000000ULL, 0};
> + uint16_t parts;
> +
> + while (burst) {
> + parts = burst > 7 ? 8 : plt_align32prevpow2(burst);
> + burst -= parts;
> + /* Let's try to fill at least one line per burst. */
> + switch (parts) {
> + case 8: {
> + uint64x2_t aw0, aw1, aw2, aw3, aw4, aw5, aw6, aw7;
> +
> + aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
> + aw_mask);
> + aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
> + aw_mask);
> + aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
> + aw_mask);
> + aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
> + aw_mask);
> + aw4 = vandq_u64(vld1q_u64((const uint64_t *)&ev[4]),
> + aw_mask);
> + aw5 = vandq_u64(vld1q_u64((const uint64_t *)&ev[5]),
> + aw_mask);
> + aw6 = vandq_u64(vld1q_u64((const uint64_t *)&ev[6]),
> + aw_mask);
> + aw7 = vandq_u64(vld1q_u64((const uint64_t *)&ev[7]),
> + aw_mask);
> +
> + aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
> + aw0);
> + aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
> + aw1);
> + aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6), tt_mask),
> + aw2);
> + aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6), tt_mask),
> + aw3);
> + aw4 = vorrq_u64(vandq_u64(vshrq_n_u64(aw4, 6), tt_mask),
> + aw4);
> + aw5 = vorrq_u64(vandq_u64(vshrq_n_u64(aw5, 6), tt_mask),
> + aw5);
> + aw6 = vorrq_u64(vandq_u64(vshrq_n_u64(aw6, 6), tt_mask),
> + aw6);
> + aw7 = vorrq_u64(vandq_u64(vshrq_n_u64(aw7, 6), tt_mask),
> + aw7);
> +
> + vst1q_u64((void *)lmt_addr, aw0);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 64), aw4);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 80), aw5);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 96), aw6);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 112), aw7);
> + lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 128);
> + } break;
> + case 4: {
> + uint64x2_t aw0, aw1, aw2, aw3;
> + aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
> + aw_mask);
> + aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
> + aw_mask);
> + aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
> + aw_mask);
> + aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
> + aw_mask);
> +
> + aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
> + aw0);
> + aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
> + aw1);
> + aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6), tt_mask),
> + aw2);
> + aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6), tt_mask),
> + aw3);
> +
> + vst1q_u64((void *)lmt_addr, aw0);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
> + lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 64);
> + } break;
> + case 2: {
> + uint64x2_t aw0, aw1;
> +
> + aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
> + aw_mask);
> + aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
> + aw_mask);
> +
> + aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
> + aw0);
> + aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
> + aw1);
> +
> + vst1q_u64((void *)lmt_addr, aw0);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
> + lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 32);
> + } break;
> + case 1: {
> + __uint128_t aw0;
> +
> + aw0 = ev[0].u64;
> + aw0 <<= 64;
> + aw0 |= ev[0].event & (BIT_ULL(32) - 1);
> + aw0 |= (uint64_t)ev[0].sched_type << 32;
> +
> + *((__uint128_t *)lmt_addr) = aw0;
> + lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
> + } break;
> + }
> + ev += parts;
> + }
> +#else
> + uint16_t i;
> +
> + for (i = 0; i < burst; i++) {
> + __uint128_t aw0;
> +
> + aw0 = ev[0].u64;
> + aw0 <<= 64;
> + aw0 |= ev[0].event & (BIT_ULL(32) - 1);
> + aw0 |= (uint64_t)ev[0].sched_type << 32;
> + *((__uint128_t *)lmt_addr) = aw0;
> + lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
> + }
> +#endif
> +
> + /* wdata[0] will always be valid */
> + sso_lmt_aw_wait_fc(ws, sz0);
> + roc_lmt_submit_steorl(wdata[0], pa[0]);
> + if (wdata[1]) {
> + sso_lmt_aw_wait_fc(ws, sz1);
> + roc_lmt_submit_steorl(wdata[1], pa[1]);
> + }
> +
> + left -= (sz0 + sz1);
> + if (left)
> + goto again;
> +
> + return n;
> +}
> +
> uint16_t __rte_hot
> cn10k_sso_hws_enq_burst(void *port, const struct rte_event ev[],
> uint16_t nb_events)
> @@ -115,13 +392,32 @@ uint16_t __rte_hot
> cn10k_sso_hws_enq_new_burst(void *port, const struct rte_event ev[],
> uint16_t nb_events)
> {
> + uint16_t idx = 0, done = 0, rc = 0;
> struct cn10k_sso_hws *ws = port;
> - uint16_t i, rc = 1;
> + uint8_t queue_id;
> + int32_t space;
> +
> + /* Do a common back-pressure check and return early when out of space */
> + space = sso_read_xaq_space(ws) - ROC_SSO_XAE_PER_XAQ;
> + if (space <= 0)
> + return 0;
> + nb_events = space < nb_events ? space : nb_events;
> +
> + do {
> + queue_id = ev[idx].queue_id;
> + for (idx = idx + 1; idx < nb_events; idx++)
> + if (queue_id != ev[idx].queue_id)
> + break;
> +
> + rc = cn10k_sso_hws_new_event_lmtst(ws, queue_id, &ev[done],
> + idx - done);
> + if (rc != (idx - done))
> + return rc + done;
> + done += rc;
>
> - for (i = 0; i < nb_events && rc; i++)
> - rc = cn10k_sso_hws_new_event(ws, &ev[i]);
> + } while (done < nb_events);
>
> - return nb_events;
> + return done;
> }
>
> uint16_t __rte_hot
> diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
> index 7e8339bd3a..e59e537311 100644
> --- a/drivers/event/cnxk/cn9k_eventdev.c
> +++ b/drivers/event/cnxk/cn9k_eventdev.c
> @@ -753,7 +753,7 @@ cn9k_sso_dev_configure(const struct rte_eventdev *event_dev)
> struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> int rc;
>
> - rc = cnxk_sso_dev_validate(event_dev);
> + rc = cnxk_sso_dev_validate(event_dev, 1, 1);
> if (rc < 0) {
> plt_err("Invalid event device configuration");
> return -EINVAL;
> diff --git a/drivers/event/cnxk/cnxk_eventdev.c b/drivers/event/cnxk/cnxk_eventdev.c
> index cb9ba5d353..99f9cdcd0d 100644
> --- a/drivers/event/cnxk/cnxk_eventdev.c
> +++ b/drivers/event/cnxk/cnxk_eventdev.c
> @@ -145,7 +145,8 @@ cnxk_sso_restore_links(const struct rte_eventdev *event_dev,
> }
>
> int
> -cnxk_sso_dev_validate(const struct rte_eventdev *event_dev)
> +cnxk_sso_dev_validate(const struct rte_eventdev *event_dev, uint32_t deq_depth,
> + uint32_t enq_depth)
> {
> struct rte_event_dev_config *conf = &event_dev->data->dev_conf;
> struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> @@ -173,12 +174,12 @@ cnxk_sso_dev_validate(const struct rte_eventdev *event_dev)
> return -EINVAL;
> }
>
> - if (conf->nb_event_port_dequeue_depth > 1) {
> + if (conf->nb_event_port_dequeue_depth > deq_depth) {
> plt_err("Unsupported event port deq depth requested");
> return -EINVAL;
> }
>
> - if (conf->nb_event_port_enqueue_depth > 1) {
> + if (conf->nb_event_port_enqueue_depth > enq_depth) {
> plt_err("Unsupported event port enq depth requested");
> return -EINVAL;
> }
> @@ -630,6 +631,14 @@ cnxk_sso_init(struct rte_eventdev *event_dev)
> }
>
> dev = cnxk_sso_pmd_priv(event_dev);
> + dev->fc_cache_space = rte_zmalloc("fc_cache", PLT_CACHE_LINE_SIZE,
> + PLT_CACHE_LINE_SIZE);
> + if (dev->fc_cache_space == NULL) {
> + plt_memzone_free(mz);
> + plt_err("Failed to reserve memory for XAQ fc cache");
> + return -ENOMEM;
> + }
> +
> pci_dev = container_of(event_dev->dev, struct rte_pci_device, device);
> dev->sso.pci_dev = pci_dev;
>
> diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
> index c7cbd722ab..a2f30bfe5f 100644
> --- a/drivers/event/cnxk/cnxk_eventdev.h
> +++ b/drivers/event/cnxk/cnxk_eventdev.h
> @@ -90,6 +90,7 @@ struct cnxk_sso_evdev {
> uint32_t max_dequeue_timeout_ns;
> int32_t max_num_events;
> uint64_t xaq_lmt;
> + int64_t *fc_cache_space;
> rte_iova_t fc_iova;
> uint64_t rx_offloads;
> uint64_t tx_offloads;
> @@ -206,7 +207,8 @@ int cnxk_sso_fini(struct rte_eventdev *event_dev);
> int cnxk_sso_remove(struct rte_pci_device *pci_dev);
> void cnxk_sso_info_get(struct cnxk_sso_evdev *dev,
> struct rte_event_dev_info *dev_info);
> -int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev);
> +int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev,
> + uint32_t deq_depth, uint32_t enq_depth);
> int cnxk_setup_event_ports(const struct rte_eventdev *event_dev,
> cnxk_sso_init_hws_mem_t init_hws_mem,
> cnxk_sso_hws_setup_t hws_setup);
> --
> 2.25.1
>
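
As a side note on the data layout, the scalar (#else) path quoted above builds one 16-byte add-work word per event: the event payload goes in the upper 64 bits, the low 32 bits of the metadata word are kept as-is, and the two sched_type bits are moved down to bits [33:32]. The sketch below reproduces that packing with a simplified event structure; struct toy_event and pack_add_work() are made up for the example, the bit positions assume the standard rte_event field layout, and, like the patch itself, it relies on the compiler's __uint128_t extension.

/* Illustrative only: how one event becomes the 16-byte add-work word the
 * scalar fallback writes to the LMT line. The struct and helper are
 * invented for this sketch; bit positions follow the rte_event layout.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct toy_event {
	uint64_t event; /* metadata word: flow id, type, op, sched_type, ... */
	uint64_t u64;   /* user payload */
};

static __uint128_t pack_add_work(const struct toy_event *ev)
{
	uint64_t sched_type = (ev->event >> 38) & 0x3; /* bits [39:38] */
	__uint128_t aw;

	aw = ev->u64;                    /* payload in the upper 64 bits */
	aw <<= 64;
	aw |= ev->event & 0xFFFFFFFFULL; /* tag/flow/type in the low 32 bits */
	aw |= sched_type << 32;          /* sched_type relocated to [33:32] */
	return aw;
}

int main(void)
{
	struct toy_event ev = {
		.event = (2ULL << 38) | 0xabcULL, /* sched_type = 2, tag = 0xabc */
		.u64 = 0xdeadbeefULL,
	};
	__uint128_t aw = pack_add_work(&ev);

	printf("aw hi = 0x%016" PRIx64 " lo = 0x%016" PRIx64 "\n",
	       (uint64_t)(aw >> 64), (uint64_t)aw);
	return 0;
}

The NEON path builds the same word per event in a vector register: aw_mask keeps the low 32 metadata bits plus the sched_type bits at [39:38], and the shift-by-6 combined with tt_mask relocates those two bits to [33:32], with 2, 4 or 8 events handled per loop iteration before the words are stored to the LMT line.
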
end of thread, other threads:[~2023-05-23 6:09 UTC | newest]
Thread overview: 12+ messages
2023-04-19 20:01 [PATCH 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
2023-04-19 20:01 ` [PATCH 2/3] app/eventdev: use enqueue new event burst routine pbhagavatula
2023-04-19 20:01 ` [PATCH 3/3] app/eventdev: prevent mempool exhaustion pbhagavatula
2023-04-25 19:51 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
2023-04-25 19:51 ` [PATCH v2 2/3] app/eventdev: use enqueue new event burst routine pbhagavatula
2023-05-18 15:47 ` [EXT] " Shijith Thotton
2023-04-25 19:51 ` [PATCH v2 3/3] app/eventdev: prevent mempool exhaustion pbhagavatula
2023-05-18 15:47 ` [EXT] " Shijith Thotton
2023-05-18 15:42 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst Shijith Thotton
2023-05-22 7:23 ` Jerin Jacob
2023-05-22 11:56 ` [PATCH v3] " pbhagavatula
2023-05-23 6:09 ` Jerin Jacob