* [PATCH 1/3] event/cnxk: use LMTST for enqueue new burst
@ 2023-04-19 20:01 pbhagavatula
2023-04-19 20:01 ` [PATCH 2/3] app/eventdev: use enqueue new event burst routine pbhagavatula
` (2 more replies)
0 siblings, 3 replies; 12+ messages in thread
From: pbhagavatula @ 2023-04-19 20:01 UTC (permalink / raw)
To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
Satha Rao, Pavan Nikhilesh, Shijith Thotton
Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Use LMTST when all events in the burst are enqueued with
rte_event::op as RTE_EVENT_OP_NEW, i.e. events are enqueued
with the `rte_event_enqueue_new_burst` API.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
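For context, a minimal application-side sketch (not part of the patch) of the
fast path this change targets: every event in the burst carries
RTE_EVENT_OP_NEW and is submitted through rte_event_enqueue_new_burst().
dev_id, port_id, queue_id and objs[] are placeholders assumed to come from the
application's own setup.

#include <rte_common.h>
#include <rte_eventdev.h>

/* Build a burst where every event is RTE_EVENT_OP_NEW and hand it to
 * rte_event_enqueue_new_burst(); this is the path the LMTST enqueue
 * below accelerates.
 */
static uint16_t
enqueue_new_burst(uint8_t dev_id, uint8_t port_id, uint8_t queue_id,
                  void *objs[], uint16_t n)
{
        struct rte_event ev[32];
        uint16_t i;

        n = RTE_MIN(n, (uint16_t)RTE_DIM(ev));
        for (i = 0; i < n; i++) {
                ev[i] = (struct rte_event){0};
                ev[i].op = RTE_EVENT_OP_NEW;    /* all-NEW burst */
                ev[i].queue_id = queue_id;
                ev[i].sched_type = RTE_SCHED_TYPE_ATOMIC;
                ev[i].flow_id = i;
                ev[i].event_ptr = objs[i];
        }

        /* May accept fewer than n events if the device back-pressures;
         * the caller is expected to retry the unsent tail.
         */
        return rte_event_enqueue_new_burst(dev_id, port_id, ev, n);
}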
drivers/common/cnxk/hw/sso.h | 1 +
drivers/common/cnxk/roc_sso.c | 10 +-
drivers/common/cnxk/roc_sso.h | 3 +
drivers/event/cnxk/cn10k_eventdev.c | 9 +-
drivers/event/cnxk/cn10k_eventdev.h | 6 +-
drivers/event/cnxk/cn10k_worker.c | 304 +++++++++++++++++++++++++++-
drivers/event/cnxk/cn9k_eventdev.c | 2 +-
drivers/event/cnxk/cnxk_eventdev.c | 15 +-
drivers/event/cnxk/cnxk_eventdev.h | 4 +-
9 files changed, 338 insertions(+), 16 deletions(-)
diff --git a/drivers/common/cnxk/hw/sso.h b/drivers/common/cnxk/hw/sso.h
index 25deaa4c14..09b8d4955f 100644
--- a/drivers/common/cnxk/hw/sso.h
+++ b/drivers/common/cnxk/hw/sso.h
@@ -157,6 +157,7 @@
#define SSO_LF_GGRP_AQ_CNT (0x1c0ull)
#define SSO_LF_GGRP_AQ_THR (0x1e0ull)
#define SSO_LF_GGRP_MISC_CNT (0x200ull)
+#define SSO_LF_GGRP_OP_AW_LMTST (0x400ull)
#define SSO_AF_IAQ_FREE_CNT_MASK 0x3FFFull
#define SSO_AF_IAQ_RSVD_FREE_MASK 0x3FFFull
diff --git a/drivers/common/cnxk/roc_sso.c b/drivers/common/cnxk/roc_sso.c
index 4a6a5080f7..78e98bbdd6 100644
--- a/drivers/common/cnxk/roc_sso.c
+++ b/drivers/common/cnxk/roc_sso.c
@@ -6,6 +6,7 @@
#include "roc_priv.h"
#define SSO_XAQ_CACHE_CNT (0x7)
+#define SSO_XAQ_SLACK (16)
/* Private functions. */
int
@@ -493,9 +494,13 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct roc_sso_xaq_data *xaq,
xaq->nb_xae = nb_xae;
- /* Taken from HRM 14.3.3(4) */
+ /** SSO will reserve upto 0x4 XAQ buffers per group when GetWork engine
+ * is inactive and it might prefetch an additional 0x3 buffers due to
+ * pipelining.
+ */
xaq->nb_xaq = (SSO_XAQ_CACHE_CNT * nb_hwgrp);
xaq->nb_xaq += PLT_MAX(1 + ((xaq->nb_xae - 1) / xae_waes), xaq->nb_xaq);
+ xaq->nb_xaq += SSO_XAQ_SLACK;
xaq->mem = plt_zmalloc(xaq_buf_size * xaq->nb_xaq, xaq_buf_size);
if (xaq->mem == NULL) {
@@ -537,7 +542,8 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct roc_sso_xaq_data *xaq,
* There should be a minimum headroom of 7 XAQs per HWGRP for SSO
* to request XAQ to cache them even before enqueue is called.
*/
- xaq->xaq_lmt = xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT);
+ xaq->xaq_lmt =
+ xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT) - SSO_XAQ_SLACK;
return 0;
npa_fill_fail:
diff --git a/drivers/common/cnxk/roc_sso.h b/drivers/common/cnxk/roc_sso.h
index e67797b046..a2bb6fcb22 100644
--- a/drivers/common/cnxk/roc_sso.h
+++ b/drivers/common/cnxk/roc_sso.h
@@ -7,6 +7,9 @@
#include "hw/ssow.h"
+#define ROC_SSO_AW_PER_LMT_LINE_LOG2 3
+#define ROC_SSO_XAE_PER_XAQ 352
+
struct roc_sso_hwgrp_qos {
uint16_t hwgrp;
uint8_t xaq_prcnt;
diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index 071ea5a212..855c92da83 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -91,8 +91,10 @@ cn10k_sso_hws_setup(void *arg, void *hws, uintptr_t grp_base)
uint64_t val;
ws->grp_base = grp_base;
- ws->fc_mem = (uint64_t *)dev->fc_iova;
+ ws->fc_mem = (int64_t *)dev->fc_iova;
ws->xaq_lmt = dev->xaq_lmt;
+ ws->fc_cache_space = dev->fc_cache_space;
+ ws->aw_lmt = ws->lmt_base;
/* Set get_work timeout for HWS */
val = NSEC2USEC(dev->deq_tmo_ns);
@@ -624,6 +626,7 @@ cn10k_sso_info_get(struct rte_eventdev *event_dev,
dev_info->driver_name = RTE_STR(EVENTDEV_NAME_CN10K_PMD);
cnxk_sso_info_get(dev, dev_info);
+ dev_info->max_event_port_enqueue_depth = UINT32_MAX;
}
static int
@@ -632,7 +635,7 @@ cn10k_sso_dev_configure(const struct rte_eventdev *event_dev)
struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
int rc;
- rc = cnxk_sso_dev_validate(event_dev);
+ rc = cnxk_sso_dev_validate(event_dev, 1, UINT32_MAX);
if (rc < 0) {
plt_err("Invalid event device configuration");
return -EINVAL;
@@ -871,7 +874,7 @@ cn10k_sso_set_priv_mem(const struct rte_eventdev *event_dev, void *lookup_mem)
for (i = 0; i < dev->nb_event_ports; i++) {
struct cn10k_sso_hws *ws = event_dev->data->ports[i];
ws->xaq_lmt = dev->xaq_lmt;
- ws->fc_mem = (uint64_t *)dev->fc_iova;
+ ws->fc_mem = (int64_t *)dev->fc_iova;
ws->tstamp = dev->tstamp;
if (lookup_mem)
ws->lookup_mem = lookup_mem;
diff --git a/drivers/event/cnxk/cn10k_eventdev.h b/drivers/event/cnxk/cn10k_eventdev.h
index aaa01d1ec1..29567728cd 100644
--- a/drivers/event/cnxk/cn10k_eventdev.h
+++ b/drivers/event/cnxk/cn10k_eventdev.h
@@ -19,9 +19,11 @@ struct cn10k_sso_hws {
struct cnxk_timesync_info **tstamp;
uint64_t meta_aura;
/* Add Work Fastpath data */
- uint64_t xaq_lmt __rte_cache_aligned;
- uint64_t *fc_mem;
+ int64_t *fc_mem __rte_cache_aligned;
+ int64_t *fc_cache_space;
+ uintptr_t aw_lmt;
uintptr_t grp_base;
+ int32_t xaq_lmt;
/* Tx Fastpath data */
uintptr_t lmt_base __rte_cache_aligned;
uint64_t lso_tun_fmt;
diff --git a/drivers/event/cnxk/cn10k_worker.c b/drivers/event/cnxk/cn10k_worker.c
index 562d2fca13..ba3ea4bb35 100644
--- a/drivers/event/cnxk/cn10k_worker.c
+++ b/drivers/event/cnxk/cn10k_worker.c
@@ -77,6 +77,36 @@ cn10k_sso_hws_forward_event(struct cn10k_sso_hws *ws,
cn10k_sso_hws_fwd_group(ws, ev, grp);
}
+static inline int32_t
+sso_read_xaq_space(struct cn10k_sso_hws *ws)
+{
+ return (ws->xaq_lmt - __atomic_load_n(ws->fc_mem, __ATOMIC_RELAXED)) *
+ ROC_SSO_XAE_PER_XAQ;
+}
+
+static inline void
+sso_lmt_aw_wait_fc(struct cn10k_sso_hws *ws, int64_t req)
+{
+ int64_t cached, refill;
+
+retry:
+ while (__atomic_load_n(ws->fc_cache_space, __ATOMIC_RELAXED) < 0)
+ ;
+
+ cached = __atomic_sub_fetch(ws->fc_cache_space, req, __ATOMIC_ACQUIRE);
+ /* Check if there is enough space, else update and retry. */
+ if (cached < 0) {
+ /* Check if we have space else retry. */
+ do {
+ refill = sso_read_xaq_space(ws);
+ } while (refill <= 0);
+ __atomic_compare_exchange(ws->fc_cache_space, &cached, &refill,
+ 0, __ATOMIC_RELEASE,
+ __ATOMIC_RELAXED);
+ goto retry;
+ }
+}
+
uint16_t __rte_hot
cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
{
@@ -103,6 +133,253 @@ cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
return 1;
}
+#define VECTOR_SIZE_BITS 0xFFFFFFFFFFF80000ULL
+#define VECTOR_GET_LINE_OFFSET(line) (19 + (3 * line))
+
+static uint64_t
+vector_size_partial_mask(uint16_t off, uint16_t cnt)
+{
+ return (VECTOR_SIZE_BITS & ~(~0x0ULL << off)) |
+ ((uint64_t)(cnt - 1) << off);
+}
+
+static __rte_always_inline uint16_t
+cn10k_sso_hws_new_event_lmtst(struct cn10k_sso_hws *ws, uint8_t queue_id,
+ const struct rte_event ev[], uint16_t n)
+{
+ uint16_t lines, partial_line, burst, left;
+ uint64_t wdata[2], pa[2] = {0};
+ uintptr_t lmt_addr;
+ uint16_t sz0, sz1;
+ uint16_t lmt_id;
+
+ sz0 = sz1 = 0;
+ lmt_addr = ws->lmt_base;
+ ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
+
+ left = n;
+again:
+ burst = RTE_MIN(
+ BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2 + ROC_LMT_LINES_PER_CORE_LOG2),
+ left);
+
+ /* Set wdata */
+ lines = burst >> ROC_SSO_AW_PER_LMT_LINE_LOG2;
+ partial_line = burst & (BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2) - 1);
+ wdata[0] = wdata[1] = 0;
+ if (lines > BIT(ROC_LMT_LINES_PER_STR_LOG2)) {
+ wdata[0] = lmt_id;
+ wdata[0] |= 15ULL << 12;
+ wdata[0] |= VECTOR_SIZE_BITS;
+ pa[0] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ (0x7 << 4);
+ sz0 = 16 << ROC_SSO_AW_PER_LMT_LINE_LOG2;
+
+ wdata[1] = lmt_id + 16;
+ pa[1] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ (0x7 << 4);
+
+ lines -= 17;
+ wdata[1] |= partial_line ? (uint64_t)(lines + 1) << 12 :
+ (uint64_t)(lines << 12);
+ wdata[1] |= partial_line ?
+ vector_size_partial_mask(
+ VECTOR_GET_LINE_OFFSET(lines),
+ partial_line) :
+ VECTOR_SIZE_BITS;
+ sz1 = burst - sz0;
+ partial_line = 0;
+ } else if (lines) {
+ /* We need to handle two cases here:
+ * 1. Partial line spill over to wdata[1] i.e. lines == 16
+ * 2. Partial line with spill lines < 16.
+ */
+ wdata[0] = lmt_id;
+ pa[0] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ (0x7 << 4);
+ sz0 = lines << ROC_SSO_AW_PER_LMT_LINE_LOG2;
+ if (lines == 16) {
+ wdata[0] |= 15ULL << 12;
+ wdata[0] |= VECTOR_SIZE_BITS;
+ if (partial_line) {
+ wdata[1] = lmt_id + 16;
+ pa[1] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ ((partial_line - 1) << 4);
+ }
+ } else {
+ lines -= 1;
+ wdata[0] |= partial_line ? (uint64_t)(lines + 1) << 12 :
+ (uint64_t)(lines << 12);
+ wdata[0] |=
+ partial_line ?
+ vector_size_partial_mask(
+ VECTOR_GET_LINE_OFFSET(lines),
+ partial_line) :
+ VECTOR_SIZE_BITS;
+ sz0 += partial_line;
+ }
+ sz1 = burst - sz0;
+ partial_line = 0;
+ }
+
+ /* Only partial lines */
+ if (partial_line) {
+ wdata[0] = lmt_id;
+ pa[0] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ ((partial_line - 1) << 4);
+ sz0 = partial_line;
+ sz1 = burst - sz0;
+ }
+
+#if defined(RTE_ARCH_ARM64)
+ uint64x2_t aw_mask = {0xC0FFFFFFFFULL, ~0x0ULL};
+ uint64x2_t tt_mask = {0x300000000ULL, 0};
+ uint16_t parts;
+
+ while (burst) {
+ parts = burst > 7 ? 8 : plt_align32prevpow2(burst);
+ burst -= parts;
+ /* Lets try to fill atleast one line per burst. */
+ switch (parts) {
+ case 8: {
+ uint64x2_t aw0, aw1, aw2, aw3, aw4, aw5, aw6, aw7;
+
+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
+ aw_mask);
+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
+ aw_mask);
+ aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
+ aw_mask);
+ aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
+ aw_mask);
+ aw4 = vandq_u64(vld1q_u64((const uint64_t *)&ev[4]),
+ aw_mask);
+ aw5 = vandq_u64(vld1q_u64((const uint64_t *)&ev[5]),
+ aw_mask);
+ aw6 = vandq_u64(vld1q_u64((const uint64_t *)&ev[6]),
+ aw_mask);
+ aw7 = vandq_u64(vld1q_u64((const uint64_t *)&ev[7]),
+ aw_mask);
+
+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
+ aw0);
+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
+ aw1);
+ aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6), tt_mask),
+ aw2);
+ aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6), tt_mask),
+ aw3);
+ aw4 = vorrq_u64(vandq_u64(vshrq_n_u64(aw4, 6), tt_mask),
+ aw4);
+ aw5 = vorrq_u64(vandq_u64(vshrq_n_u64(aw5, 6), tt_mask),
+ aw5);
+ aw6 = vorrq_u64(vandq_u64(vshrq_n_u64(aw6, 6), tt_mask),
+ aw6);
+ aw7 = vorrq_u64(vandq_u64(vshrq_n_u64(aw7, 6), tt_mask),
+ aw7);
+
+ vst1q_u64((void *)lmt_addr, aw0);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 64), aw4);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 80), aw5);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 96), aw6);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 112), aw7);
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 128);
+ } break;
+ case 4: {
+ uint64x2_t aw0, aw1, aw2, aw3;
+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
+ aw_mask);
+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
+ aw_mask);
+ aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
+ aw_mask);
+ aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
+ aw_mask);
+
+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
+ aw0);
+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
+ aw1);
+ aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6), tt_mask),
+ aw2);
+ aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6), tt_mask),
+ aw3);
+
+ vst1q_u64((void *)lmt_addr, aw0);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 64);
+ } break;
+ case 2: {
+ uint64x2_t aw0, aw1;
+
+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
+ aw_mask);
+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
+ aw_mask);
+
+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
+ aw0);
+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
+ aw1);
+
+ vst1q_u64((void *)lmt_addr, aw0);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 32);
+ } break;
+ case 1: {
+ __uint128_t aw0;
+
+ aw0 = ev[0].u64;
+ aw0 <<= 64;
+ aw0 |= ev[0].event & (BIT_ULL(32) - 1);
+ aw0 |= (uint64_t)ev[0].sched_type << 32;
+
+ *((__uint128_t *)lmt_addr) = aw0;
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
+ } break;
+ }
+ ev += parts;
+ }
+#else
+ uint16_t i;
+
+ for (i = 0; i < burst; i++) {
+ __uint128_t aw0;
+
+ aw0 = ev[0].u64;
+ aw0 <<= 64;
+ aw0 |= ev[0].event & (BIT_ULL(32) - 1);
+ aw0 |= (uint64_t)ev[0].sched_type << 32;
+ *((__uint128_t *)lmt_addr) = aw0;
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
+ }
+#endif
+
+ /* wdata[0] will be always valid */
+ sso_lmt_aw_wait_fc(ws, sz0);
+ roc_lmt_submit_steorl(wdata[0], pa[0]);
+ if (wdata[1]) {
+ sso_lmt_aw_wait_fc(ws, sz1);
+ roc_lmt_submit_steorl(wdata[1], pa[1]);
+ }
+
+ left -= (sz0 + sz1);
+ if (left)
+ goto again;
+
+ return n;
+}
+
uint16_t __rte_hot
cn10k_sso_hws_enq_burst(void *port, const struct rte_event ev[],
uint16_t nb_events)
@@ -115,13 +392,32 @@ uint16_t __rte_hot
cn10k_sso_hws_enq_new_burst(void *port, const struct rte_event ev[],
uint16_t nb_events)
{
+ uint16_t idx = 0, done = 0, rc = 0;
struct cn10k_sso_hws *ws = port;
- uint16_t i, rc = 1;
+ uint8_t queue_id;
+ int32_t space;
+
+ /* Do a common back-pressure check and return */
+ space = sso_read_xaq_space(ws) - ROC_SSO_XAE_PER_XAQ;
+ if (space <= 0)
+ return 0;
+ nb_events = space < nb_events ? space : nb_events;
+
+ do {
+ queue_id = ev[idx].queue_id;
+ for (idx = idx + 1; idx < nb_events; idx++)
+ if (queue_id != ev[idx].queue_id)
+ break;
+
+ rc = cn10k_sso_hws_new_event_lmtst(ws, queue_id, &ev[done],
+ idx - done);
+ if (rc != (idx - done))
+ return rc + done;
+ done += rc;
- for (i = 0; i < nb_events && rc; i++)
- rc = cn10k_sso_hws_new_event(ws, &ev[i]);
+ } while (done < nb_events);
- return nb_events;
+ return done;
}
uint16_t __rte_hot
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index 7e8339bd3a..e59e537311 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -753,7 +753,7 @@ cn9k_sso_dev_configure(const struct rte_eventdev *event_dev)
struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
int rc;
- rc = cnxk_sso_dev_validate(event_dev);
+ rc = cnxk_sso_dev_validate(event_dev, 1, 1);
if (rc < 0) {
plt_err("Invalid event device configuration");
return -EINVAL;
diff --git a/drivers/event/cnxk/cnxk_eventdev.c b/drivers/event/cnxk/cnxk_eventdev.c
index cb9ba5d353..99f9cdcd0d 100644
--- a/drivers/event/cnxk/cnxk_eventdev.c
+++ b/drivers/event/cnxk/cnxk_eventdev.c
@@ -145,7 +145,8 @@ cnxk_sso_restore_links(const struct rte_eventdev *event_dev,
}
int
-cnxk_sso_dev_validate(const struct rte_eventdev *event_dev)
+cnxk_sso_dev_validate(const struct rte_eventdev *event_dev, uint32_t deq_depth,
+ uint32_t enq_depth)
{
struct rte_event_dev_config *conf = &event_dev->data->dev_conf;
struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
@@ -173,12 +174,12 @@ cnxk_sso_dev_validate(const struct rte_eventdev *event_dev)
return -EINVAL;
}
- if (conf->nb_event_port_dequeue_depth > 1) {
+ if (conf->nb_event_port_dequeue_depth > deq_depth) {
plt_err("Unsupported event port deq depth requested");
return -EINVAL;
}
- if (conf->nb_event_port_enqueue_depth > 1) {
+ if (conf->nb_event_port_enqueue_depth > enq_depth) {
plt_err("Unsupported event port enq depth requested");
return -EINVAL;
}
@@ -630,6 +631,14 @@ cnxk_sso_init(struct rte_eventdev *event_dev)
}
dev = cnxk_sso_pmd_priv(event_dev);
+ dev->fc_cache_space = rte_zmalloc("fc_cache", PLT_CACHE_LINE_SIZE,
+ PLT_CACHE_LINE_SIZE);
+ if (dev->fc_cache_space == NULL) {
+ plt_memzone_free(mz);
+ plt_err("Failed to reserve memory for XAQ fc cache");
+ return -ENOMEM;
+ }
+
pci_dev = container_of(event_dev->dev, struct rte_pci_device, device);
dev->sso.pci_dev = pci_dev;
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index c7cbd722ab..a2f30bfe5f 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -90,6 +90,7 @@ struct cnxk_sso_evdev {
uint32_t max_dequeue_timeout_ns;
int32_t max_num_events;
uint64_t xaq_lmt;
+ int64_t *fc_cache_space;
rte_iova_t fc_iova;
uint64_t rx_offloads;
uint64_t tx_offloads;
@@ -206,7 +207,8 @@ int cnxk_sso_fini(struct rte_eventdev *event_dev);
int cnxk_sso_remove(struct rte_pci_device *pci_dev);
void cnxk_sso_info_get(struct cnxk_sso_evdev *dev,
struct rte_event_dev_info *dev_info);
-int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev);
+int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev,
+ uint32_t deq_depth, uint32_t enq_depth);
int cnxk_setup_event_ports(const struct rte_eventdev *event_dev,
cnxk_sso_init_hws_mem_t init_hws_mem,
cnxk_sso_hws_setup_t hws_setup);
--
2.25.1
* [PATCH 2/3] app/eventdev: use enqueue new event burst routine
2023-04-19 20:01 [PATCH 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
@ 2023-04-19 20:01 ` pbhagavatula
2023-04-19 20:01 ` [PATCH 3/3] app/eventdev: prevent mempool exhaustion pbhagavatula
2023-04-25 19:51 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
2 siblings, 0 replies; 12+ messages in thread
From: pbhagavatula @ 2023-04-19 20:01 UTC (permalink / raw)
To: jerinj; +Cc: dev, Pavan Nikhilesh
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Use the `rte_event_enqueue_new_burst` routine to enqueue events
with rte_event::op as RTE_EVENT_OP_NEW. This allows PMDs to use
optimized enqueue routines.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
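A simplified sketch of the producer retry pattern after this change (names
trimmed, not the exact test-eventdev code): the burst is pushed with
rte_event_enqueue_new_burst() and only the unaccepted tail is re-submitted
until the whole burst lands or the test is stopped.

#include <stdbool.h>
#include <rte_eventdev.h>
#include <rte_pause.h>

/* Sketch of the producer-side retry loop; dev_id, port, ev[],
 * burst_size and the stop flag are assumed to come from the harness.
 */
static inline void
produce_new_burst(uint8_t dev_id, uint8_t port, struct rte_event ev[],
                  uint16_t burst_size, volatile bool *stop)
{
        uint16_t enq;

        enq = rte_event_enqueue_new_burst(dev_id, port, ev, burst_size);
        while (enq < burst_size) {
                /* Retry only the tail that was not accepted yet. */
                enq += rte_event_enqueue_new_burst(dev_id, port, ev + enq,
                                                   burst_size - enq);
                if (*stop)
                        break;
                rte_pause();
        }
}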
app/test-eventdev/evt_options.c | 2 +-
app/test-eventdev/test_perf_common.c | 58 +++++++++++++++++-----------
2 files changed, 37 insertions(+), 23 deletions(-)
diff --git a/app/test-eventdev/evt_options.c b/app/test-eventdev/evt_options.c
index b175c067cd..03fb3bfce0 100644
--- a/app/test-eventdev/evt_options.c
+++ b/app/test-eventdev/evt_options.c
@@ -27,7 +27,7 @@ evt_options_default(struct evt_options *opt)
opt->nb_flows = 1024;
opt->socket_id = SOCKET_ID_ANY;
opt->pool_sz = 16 * 1024;
- opt->prod_enq_burst_sz = 1;
+ opt->prod_enq_burst_sz = 0;
opt->wkr_deq_dep = 16;
opt->nb_pkts = (1ULL << 26); /* do ~64M packets */
opt->nb_timers = 1E8;
diff --git a/app/test-eventdev/test_perf_common.c b/app/test-eventdev/test_perf_common.c
index fd434666cb..68af3cb346 100644
--- a/app/test-eventdev/test_perf_common.c
+++ b/app/test-eventdev/test_perf_common.c
@@ -131,8 +131,10 @@ perf_producer(void *arg)
uint32_t flow_counter = 0;
uint64_t count = 0;
struct perf_elt *m[BURST_SIZE + 1] = {NULL};
+ uint8_t enable_fwd_latency;
struct rte_event ev;
+ enable_fwd_latency = opt->fwd_latency;
if (opt->verbose_level > 1)
printf("%s(): lcore %d dev_id %d port=%d queue %d\n", __func__,
rte_lcore_id(), dev_id, port, p->queue_id);
@@ -151,13 +153,16 @@ perf_producer(void *arg)
for (i = 0; i < BURST_SIZE; i++) {
ev.flow_id = flow_counter++ % nb_flows;
ev.event_ptr = m[i];
- m[i]->timestamp = rte_get_timer_cycles();
- while (rte_event_enqueue_burst(dev_id,
- port, &ev, 1) != 1) {
+ if (enable_fwd_latency)
+ m[i]->timestamp = rte_get_timer_cycles();
+ while (rte_event_enqueue_new_burst(dev_id, port, &ev,
+ 1) != 1) {
if (t->done)
break;
rte_pause();
- m[i]->timestamp = rte_get_timer_cycles();
+ if (enable_fwd_latency)
+ m[i]->timestamp =
+ rte_get_timer_cycles();
}
}
count += BURST_SIZE;
@@ -171,7 +176,6 @@ perf_producer_burst(void *arg)
{
uint32_t i;
uint64_t timestamp;
- struct rte_event_dev_info dev_info;
struct prod_data *p = arg;
struct test_perf *t = p->t;
struct evt_options *opt = t->opt;
@@ -183,15 +187,13 @@ perf_producer_burst(void *arg)
uint32_t flow_counter = 0;
uint16_t enq = 0;
uint64_t count = 0;
- struct perf_elt *m[MAX_PROD_ENQ_BURST_SIZE + 1];
- struct rte_event ev[MAX_PROD_ENQ_BURST_SIZE + 1];
+ struct perf_elt *m[opt->prod_enq_burst_sz + 1];
+ struct rte_event ev[opt->prod_enq_burst_sz + 1];
uint32_t burst_size = opt->prod_enq_burst_sz;
+ uint8_t enable_fwd_latency;
- memset(m, 0, sizeof(*m) * (MAX_PROD_ENQ_BURST_SIZE + 1));
- rte_event_dev_info_get(dev_id, &dev_info);
- if (dev_info.max_event_port_enqueue_depth < burst_size)
- burst_size = dev_info.max_event_port_enqueue_depth;
-
+ enable_fwd_latency = opt->fwd_latency;
+ memset(m, 0, sizeof(*m) * (opt->prod_enq_burst_sz + 1));
if (opt->verbose_level > 1)
printf("%s(): lcore %d dev_id %d port=%d queue %d\n", __func__,
rte_lcore_id(), dev_id, port, p->queue_id);
@@ -212,19 +214,21 @@ perf_producer_burst(void *arg)
for (i = 0; i < burst_size; i++) {
ev[i].flow_id = flow_counter++ % nb_flows;
ev[i].event_ptr = m[i];
- m[i]->timestamp = timestamp;
+ if (enable_fwd_latency)
+ m[i]->timestamp = timestamp;
}
- enq = rte_event_enqueue_burst(dev_id, port, ev, burst_size);
+ enq = rte_event_enqueue_new_burst(dev_id, port, ev, burst_size);
while (enq < burst_size) {
- enq += rte_event_enqueue_burst(dev_id, port,
- ev + enq,
- burst_size - enq);
+ enq += rte_event_enqueue_new_burst(
+ dev_id, port, ev + enq, burst_size - enq);
if (t->done)
break;
rte_pause();
- timestamp = rte_get_timer_cycles();
- for (i = enq; i < burst_size; i++)
- m[i]->timestamp = timestamp;
+ if (enable_fwd_latency) {
+ timestamp = rte_get_timer_cycles();
+ for (i = enq; i < burst_size; i++)
+ m[i]->timestamp = timestamp;
+ }
}
count += burst_size;
}
@@ -799,9 +803,19 @@ perf_event_crypto_producer_burst(void *arg)
static int
perf_producer_wrapper(void *arg)
{
+ struct rte_event_dev_info dev_info;
struct prod_data *p = arg;
struct test_perf *t = p->t;
- bool burst = evt_has_burst_mode(p->dev_id);
+
+ rte_event_dev_info_get(p->dev_id, &dev_info);
+ if (!t->opt->prod_enq_burst_sz) {
+ t->opt->prod_enq_burst_sz = MAX_PROD_ENQ_BURST_SIZE;
+ if (dev_info.max_event_port_enqueue_depth > 0 &&
+ (uint32_t)dev_info.max_event_port_enqueue_depth <
+ t->opt->prod_enq_burst_sz)
+ t->opt->prod_enq_burst_sz =
+ dev_info.max_event_port_enqueue_depth;
+ }
/* In case of synthetic producer, launch perf_producer or
* perf_producer_burst depending on producer enqueue burst size
@@ -811,7 +825,7 @@ perf_producer_wrapper(void *arg)
return perf_producer(arg);
else if (t->opt->prod_type == EVT_PROD_TYPE_SYNT &&
t->opt->prod_enq_burst_sz > 1) {
- if (!burst)
+ if (dev_info.max_event_port_enqueue_depth == 1)
evt_err("This event device does not support burst mode");
else
return perf_producer_burst(arg);
--
2.25.1
* [PATCH 3/3] app/eventdev: prevent mempool exhaustion
2023-04-19 20:01 [PATCH 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
2023-04-19 20:01 ` [PATCH 2/3] app/eventdev: use enqueue new event burst routine pbhagavatula
@ 2023-04-19 20:01 ` pbhagavatula
2023-04-25 19:51 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
2 siblings, 0 replies; 12+ messages in thread
From: pbhagavatula @ 2023-04-19 20:01 UTC (permalink / raw)
To: jerinj; +Cc: dev, Pavan Nikhilesh
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Prevent mempool exhaustion due to elements being stuck in lcore
local caches.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
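The sizing idea, shown as a stand-alone sketch (helper name is made up): each
worker lcore cache can transiently hold up to roughly 1.5x its configured size
before it flushes, so the per-lcore cache is capped such that all caches
together cannot park the whole pool.

#include <rte_common.h>
#include <rte_mempool.h>

/* Cap the per-lcore mempool cache so that the worker caches together
 * cannot hold the entire pool and starve the producers.
 */
static unsigned int
perf_cache_size(unsigned int pool_sz, unsigned int nb_workers)
{
        unsigned int cache_sz;

        cache_sz = (pool_sz / 1.5) / nb_workers;
        return RTE_MIN((unsigned int)RTE_MEMPOOL_CACHE_MAX_SIZE, cache_sz);
}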
app/test-eventdev/test_perf_common.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/app/test-eventdev/test_perf_common.c b/app/test-eventdev/test_perf_common.c
index 68af3cb346..5e0255cfeb 100644
--- a/app/test-eventdev/test_perf_common.c
+++ b/app/test-eventdev/test_perf_common.c
@@ -1859,34 +1859,35 @@ int
perf_mempool_setup(struct evt_test *test, struct evt_options *opt)
{
struct test_perf *t = evt_test_priv(test);
+ unsigned int cache_sz;
+ cache_sz = RTE_MIN(RTE_MEMPOOL_CACHE_MAX_SIZE, (opt->pool_sz / 1.5) / t->nb_workers);
if (opt->prod_type == EVT_PROD_TYPE_SYNT ||
opt->prod_type == EVT_PROD_TYPE_EVENT_TIMER_ADPTR) {
t->pool = rte_mempool_create(test->name, /* mempool name */
opt->pool_sz, /* number of elements*/
sizeof(struct perf_elt), /* element size*/
- 512, /* cache size*/
+ cache_sz, /* cache size*/
0, NULL, NULL,
perf_elt_init, /* obj constructor */
NULL, opt->socket_id, 0); /* flags */
} else if (opt->prod_type == EVT_PROD_TYPE_EVENT_CRYPTO_ADPTR &&
- opt->crypto_op_type == RTE_CRYPTO_OP_TYPE_ASYMMETRIC) {
+ opt->crypto_op_type == RTE_CRYPTO_OP_TYPE_ASYMMETRIC) {
t->pool = rte_mempool_create(test->name, /* mempool name */
opt->pool_sz, /* number of elements*/
sizeof(struct perf_elt) + modex_test_case.result_len,
/* element size*/
- 512, /* cache size*/
+ cache_sz, /* cache size*/
0, NULL, NULL,
NULL, /* obj constructor */
NULL, opt->socket_id, 0); /* flags */
} else {
t->pool = rte_pktmbuf_pool_create(test->name, /* mempool name */
opt->pool_sz, /* number of elements*/
- 512, /* cache size*/
+ cache_sz, /* cache size*/
0,
RTE_MBUF_DEFAULT_BUF_SIZE,
opt->socket_id); /* flags */
-
}
if (t->pool == NULL) {
--
2.25.1
* [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst
2023-04-19 20:01 [PATCH 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
2023-04-19 20:01 ` [PATCH 2/3] app/eventdev: use enqueue new event burst routine pbhagavatula
2023-04-19 20:01 ` [PATCH 3/3] app/eventdev: prevent mempool exhaustion pbhagavatula
@ 2023-04-25 19:51 ` pbhagavatula
2023-04-25 19:51 ` [PATCH v2 2/3] app/eventdev: use enqueue new event burst routine pbhagavatula
` (3 more replies)
2 siblings, 4 replies; 12+ messages in thread
From: pbhagavatula @ 2023-04-25 19:51 UTC (permalink / raw)
To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
Satha Rao, Pavan Nikhilesh, Shijith Thotton
Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Use LMTST when all events in the burst are enqueued with
rte_event::op as RTE_EVENT_OP_NEW, i.e. events are enqueued
with the `rte_event_enqueue_new_burst` API.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
v2 Changes:
- Fix spell check.
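A rough sketch of the batching math used by the new LMTST enqueue (placeholder
macro names, not the ROC definitions; the 32-lines-per-burst figure is an
assumption): each event becomes a 16-byte add-work word, eight of them fill
one 128-byte LMT line, and a burst is carved into full lines plus an optional
partial line that is encoded separately in the LMTST write data.

#include <rte_common.h>

#define AW_PER_LMT_LINE_LOG2     3  /* 8 x 16 B add-work words per 128 B line */
#define LMT_LINES_PER_BURST_LOG2 5  /* assumption: up to 32 LMT lines per pass */

/* Split an n-event burst the way the enqueue routine does: as many full
 * LMT lines as possible, then one partial line for the remainder.
 */
static void
split_burst(unsigned int n, unsigned int *full_lines, unsigned int *partial)
{
        unsigned int burst;

        /* One pass can carry at most (lines per pass) * 8 events. */
        burst = RTE_MIN(1u << (AW_PER_LMT_LINE_LOG2 + LMT_LINES_PER_BURST_LOG2), n);
        *full_lines = burst >> AW_PER_LMT_LINE_LOG2;            /* burst / 8 */
        *partial = burst & ((1u << AW_PER_LMT_LINE_LOG2) - 1);  /* burst % 8 */
}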
drivers/common/cnxk/hw/sso.h | 1 +
drivers/common/cnxk/roc_sso.c | 10 +-
drivers/common/cnxk/roc_sso.h | 3 +
drivers/event/cnxk/cn10k_eventdev.c | 9 +-
drivers/event/cnxk/cn10k_eventdev.h | 6 +-
drivers/event/cnxk/cn10k_worker.c | 304 +++++++++++++++++++++++++++-
drivers/event/cnxk/cn9k_eventdev.c | 2 +-
drivers/event/cnxk/cnxk_eventdev.c | 15 +-
drivers/event/cnxk/cnxk_eventdev.h | 4 +-
9 files changed, 338 insertions(+), 16 deletions(-)
diff --git a/drivers/common/cnxk/hw/sso.h b/drivers/common/cnxk/hw/sso.h
index 25deaa4c14..09b8d4955f 100644
--- a/drivers/common/cnxk/hw/sso.h
+++ b/drivers/common/cnxk/hw/sso.h
@@ -157,6 +157,7 @@
#define SSO_LF_GGRP_AQ_CNT (0x1c0ull)
#define SSO_LF_GGRP_AQ_THR (0x1e0ull)
#define SSO_LF_GGRP_MISC_CNT (0x200ull)
+#define SSO_LF_GGRP_OP_AW_LMTST (0x400ull)
#define SSO_AF_IAQ_FREE_CNT_MASK 0x3FFFull
#define SSO_AF_IAQ_RSVD_FREE_MASK 0x3FFFull
diff --git a/drivers/common/cnxk/roc_sso.c b/drivers/common/cnxk/roc_sso.c
index 4a6a5080f7..99a55e49b0 100644
--- a/drivers/common/cnxk/roc_sso.c
+++ b/drivers/common/cnxk/roc_sso.c
@@ -6,6 +6,7 @@
#include "roc_priv.h"
#define SSO_XAQ_CACHE_CNT (0x7)
+#define SSO_XAQ_SLACK (16)
/* Private functions. */
int
@@ -493,9 +494,13 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct roc_sso_xaq_data *xaq,
xaq->nb_xae = nb_xae;
- /* Taken from HRM 14.3.3(4) */
+ /** SSO will reserve up to 0x4 XAQ buffers per group when GetWork engine
+ * is inactive and it might prefetch an additional 0x3 buffers due to
+ * pipelining.
+ */
xaq->nb_xaq = (SSO_XAQ_CACHE_CNT * nb_hwgrp);
xaq->nb_xaq += PLT_MAX(1 + ((xaq->nb_xae - 1) / xae_waes), xaq->nb_xaq);
+ xaq->nb_xaq += SSO_XAQ_SLACK;
xaq->mem = plt_zmalloc(xaq_buf_size * xaq->nb_xaq, xaq_buf_size);
if (xaq->mem == NULL) {
@@ -537,7 +542,8 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct roc_sso_xaq_data *xaq,
* There should be a minimum headroom of 7 XAQs per HWGRP for SSO
* to request XAQ to cache them even before enqueue is called.
*/
- xaq->xaq_lmt = xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT);
+ xaq->xaq_lmt =
+ xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT) - SSO_XAQ_SLACK;
return 0;
npa_fill_fail:
diff --git a/drivers/common/cnxk/roc_sso.h b/drivers/common/cnxk/roc_sso.h
index e67797b046..a2bb6fcb22 100644
--- a/drivers/common/cnxk/roc_sso.h
+++ b/drivers/common/cnxk/roc_sso.h
@@ -7,6 +7,9 @@
#include "hw/ssow.h"
+#define ROC_SSO_AW_PER_LMT_LINE_LOG2 3
+#define ROC_SSO_XAE_PER_XAQ 352
+
struct roc_sso_hwgrp_qos {
uint16_t hwgrp;
uint8_t xaq_prcnt;
diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index 071ea5a212..855c92da83 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -91,8 +91,10 @@ cn10k_sso_hws_setup(void *arg, void *hws, uintptr_t grp_base)
uint64_t val;
ws->grp_base = grp_base;
- ws->fc_mem = (uint64_t *)dev->fc_iova;
+ ws->fc_mem = (int64_t *)dev->fc_iova;
ws->xaq_lmt = dev->xaq_lmt;
+ ws->fc_cache_space = dev->fc_cache_space;
+ ws->aw_lmt = ws->lmt_base;
/* Set get_work timeout for HWS */
val = NSEC2USEC(dev->deq_tmo_ns);
@@ -624,6 +626,7 @@ cn10k_sso_info_get(struct rte_eventdev *event_dev,
dev_info->driver_name = RTE_STR(EVENTDEV_NAME_CN10K_PMD);
cnxk_sso_info_get(dev, dev_info);
+ dev_info->max_event_port_enqueue_depth = UINT32_MAX;
}
static int
@@ -632,7 +635,7 @@ cn10k_sso_dev_configure(const struct rte_eventdev *event_dev)
struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
int rc;
- rc = cnxk_sso_dev_validate(event_dev);
+ rc = cnxk_sso_dev_validate(event_dev, 1, UINT32_MAX);
if (rc < 0) {
plt_err("Invalid event device configuration");
return -EINVAL;
@@ -871,7 +874,7 @@ cn10k_sso_set_priv_mem(const struct rte_eventdev *event_dev, void *lookup_mem)
for (i = 0; i < dev->nb_event_ports; i++) {
struct cn10k_sso_hws *ws = event_dev->data->ports[i];
ws->xaq_lmt = dev->xaq_lmt;
- ws->fc_mem = (uint64_t *)dev->fc_iova;
+ ws->fc_mem = (int64_t *)dev->fc_iova;
ws->tstamp = dev->tstamp;
if (lookup_mem)
ws->lookup_mem = lookup_mem;
diff --git a/drivers/event/cnxk/cn10k_eventdev.h b/drivers/event/cnxk/cn10k_eventdev.h
index aaa01d1ec1..29567728cd 100644
--- a/drivers/event/cnxk/cn10k_eventdev.h
+++ b/drivers/event/cnxk/cn10k_eventdev.h
@@ -19,9 +19,11 @@ struct cn10k_sso_hws {
struct cnxk_timesync_info **tstamp;
uint64_t meta_aura;
/* Add Work Fastpath data */
- uint64_t xaq_lmt __rte_cache_aligned;
- uint64_t *fc_mem;
+ int64_t *fc_mem __rte_cache_aligned;
+ int64_t *fc_cache_space;
+ uintptr_t aw_lmt;
uintptr_t grp_base;
+ int32_t xaq_lmt;
/* Tx Fastpath data */
uintptr_t lmt_base __rte_cache_aligned;
uint64_t lso_tun_fmt;
diff --git a/drivers/event/cnxk/cn10k_worker.c b/drivers/event/cnxk/cn10k_worker.c
index 562d2fca13..1028a12c64 100644
--- a/drivers/event/cnxk/cn10k_worker.c
+++ b/drivers/event/cnxk/cn10k_worker.c
@@ -77,6 +77,36 @@ cn10k_sso_hws_forward_event(struct cn10k_sso_hws *ws,
cn10k_sso_hws_fwd_group(ws, ev, grp);
}
+static inline int32_t
+sso_read_xaq_space(struct cn10k_sso_hws *ws)
+{
+ return (ws->xaq_lmt - __atomic_load_n(ws->fc_mem, __ATOMIC_RELAXED)) *
+ ROC_SSO_XAE_PER_XAQ;
+}
+
+static inline void
+sso_lmt_aw_wait_fc(struct cn10k_sso_hws *ws, int64_t req)
+{
+ int64_t cached, refill;
+
+retry:
+ while (__atomic_load_n(ws->fc_cache_space, __ATOMIC_RELAXED) < 0)
+ ;
+
+ cached = __atomic_sub_fetch(ws->fc_cache_space, req, __ATOMIC_ACQUIRE);
+ /* Check if there is enough space, else update and retry. */
+ if (cached < 0) {
+ /* Check if we have space else retry. */
+ do {
+ refill = sso_read_xaq_space(ws);
+ } while (refill <= 0);
+ __atomic_compare_exchange(ws->fc_cache_space, &cached, &refill,
+ 0, __ATOMIC_RELEASE,
+ __ATOMIC_RELAXED);
+ goto retry;
+ }
+}
+
uint16_t __rte_hot
cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
{
@@ -103,6 +133,253 @@ cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
return 1;
}
+#define VECTOR_SIZE_BITS 0xFFFFFFFFFFF80000ULL
+#define VECTOR_GET_LINE_OFFSET(line) (19 + (3 * line))
+
+static uint64_t
+vector_size_partial_mask(uint16_t off, uint16_t cnt)
+{
+ return (VECTOR_SIZE_BITS & ~(~0x0ULL << off)) |
+ ((uint64_t)(cnt - 1) << off);
+}
+
+static __rte_always_inline uint16_t
+cn10k_sso_hws_new_event_lmtst(struct cn10k_sso_hws *ws, uint8_t queue_id,
+ const struct rte_event ev[], uint16_t n)
+{
+ uint16_t lines, partial_line, burst, left;
+ uint64_t wdata[2], pa[2] = {0};
+ uintptr_t lmt_addr;
+ uint16_t sz0, sz1;
+ uint16_t lmt_id;
+
+ sz0 = sz1 = 0;
+ lmt_addr = ws->lmt_base;
+ ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
+
+ left = n;
+again:
+ burst = RTE_MIN(
+ BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2 + ROC_LMT_LINES_PER_CORE_LOG2),
+ left);
+
+ /* Set wdata */
+ lines = burst >> ROC_SSO_AW_PER_LMT_LINE_LOG2;
+ partial_line = burst & (BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2) - 1);
+ wdata[0] = wdata[1] = 0;
+ if (lines > BIT(ROC_LMT_LINES_PER_STR_LOG2)) {
+ wdata[0] = lmt_id;
+ wdata[0] |= 15ULL << 12;
+ wdata[0] |= VECTOR_SIZE_BITS;
+ pa[0] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ (0x7 << 4);
+ sz0 = 16 << ROC_SSO_AW_PER_LMT_LINE_LOG2;
+
+ wdata[1] = lmt_id + 16;
+ pa[1] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ (0x7 << 4);
+
+ lines -= 17;
+ wdata[1] |= partial_line ? (uint64_t)(lines + 1) << 12 :
+ (uint64_t)(lines << 12);
+ wdata[1] |= partial_line ?
+ vector_size_partial_mask(
+ VECTOR_GET_LINE_OFFSET(lines),
+ partial_line) :
+ VECTOR_SIZE_BITS;
+ sz1 = burst - sz0;
+ partial_line = 0;
+ } else if (lines) {
+ /* We need to handle two cases here:
+ * 1. Partial line spill over to wdata[1] i.e. lines == 16
+ * 2. Partial line with spill lines < 16.
+ */
+ wdata[0] = lmt_id;
+ pa[0] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ (0x7 << 4);
+ sz0 = lines << ROC_SSO_AW_PER_LMT_LINE_LOG2;
+ if (lines == 16) {
+ wdata[0] |= 15ULL << 12;
+ wdata[0] |= VECTOR_SIZE_BITS;
+ if (partial_line) {
+ wdata[1] = lmt_id + 16;
+ pa[1] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ ((partial_line - 1) << 4);
+ }
+ } else {
+ lines -= 1;
+ wdata[0] |= partial_line ? (uint64_t)(lines + 1) << 12 :
+ (uint64_t)(lines << 12);
+ wdata[0] |=
+ partial_line ?
+ vector_size_partial_mask(
+ VECTOR_GET_LINE_OFFSET(lines),
+ partial_line) :
+ VECTOR_SIZE_BITS;
+ sz0 += partial_line;
+ }
+ sz1 = burst - sz0;
+ partial_line = 0;
+ }
+
+ /* Only partial lines */
+ if (partial_line) {
+ wdata[0] = lmt_id;
+ pa[0] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ ((partial_line - 1) << 4);
+ sz0 = partial_line;
+ sz1 = burst - sz0;
+ }
+
+#if defined(RTE_ARCH_ARM64)
+ uint64x2_t aw_mask = {0xC0FFFFFFFFULL, ~0x0ULL};
+ uint64x2_t tt_mask = {0x300000000ULL, 0};
+ uint16_t parts;
+
+ while (burst) {
+ parts = burst > 7 ? 8 : plt_align32prevpow2(burst);
+ burst -= parts;
+ /* Lets try to fill at least one line per burst. */
+ switch (parts) {
+ case 8: {
+ uint64x2_t aw0, aw1, aw2, aw3, aw4, aw5, aw6, aw7;
+
+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
+ aw_mask);
+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
+ aw_mask);
+ aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
+ aw_mask);
+ aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
+ aw_mask);
+ aw4 = vandq_u64(vld1q_u64((const uint64_t *)&ev[4]),
+ aw_mask);
+ aw5 = vandq_u64(vld1q_u64((const uint64_t *)&ev[5]),
+ aw_mask);
+ aw6 = vandq_u64(vld1q_u64((const uint64_t *)&ev[6]),
+ aw_mask);
+ aw7 = vandq_u64(vld1q_u64((const uint64_t *)&ev[7]),
+ aw_mask);
+
+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
+ aw0);
+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
+ aw1);
+ aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6), tt_mask),
+ aw2);
+ aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6), tt_mask),
+ aw3);
+ aw4 = vorrq_u64(vandq_u64(vshrq_n_u64(aw4, 6), tt_mask),
+ aw4);
+ aw5 = vorrq_u64(vandq_u64(vshrq_n_u64(aw5, 6), tt_mask),
+ aw5);
+ aw6 = vorrq_u64(vandq_u64(vshrq_n_u64(aw6, 6), tt_mask),
+ aw6);
+ aw7 = vorrq_u64(vandq_u64(vshrq_n_u64(aw7, 6), tt_mask),
+ aw7);
+
+ vst1q_u64((void *)lmt_addr, aw0);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 64), aw4);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 80), aw5);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 96), aw6);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 112), aw7);
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 128);
+ } break;
+ case 4: {
+ uint64x2_t aw0, aw1, aw2, aw3;
+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
+ aw_mask);
+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
+ aw_mask);
+ aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
+ aw_mask);
+ aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
+ aw_mask);
+
+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
+ aw0);
+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
+ aw1);
+ aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6), tt_mask),
+ aw2);
+ aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6), tt_mask),
+ aw3);
+
+ vst1q_u64((void *)lmt_addr, aw0);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 64);
+ } break;
+ case 2: {
+ uint64x2_t aw0, aw1;
+
+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
+ aw_mask);
+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
+ aw_mask);
+
+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
+ aw0);
+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
+ aw1);
+
+ vst1q_u64((void *)lmt_addr, aw0);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 32);
+ } break;
+ case 1: {
+ __uint128_t aw0;
+
+ aw0 = ev[0].u64;
+ aw0 <<= 64;
+ aw0 |= ev[0].event & (BIT_ULL(32) - 1);
+ aw0 |= (uint64_t)ev[0].sched_type << 32;
+
+ *((__uint128_t *)lmt_addr) = aw0;
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
+ } break;
+ }
+ ev += parts;
+ }
+#else
+ uint16_t i;
+
+ for (i = 0; i < burst; i++) {
+ __uint128_t aw0;
+
+ aw0 = ev[0].u64;
+ aw0 <<= 64;
+ aw0 |= ev[0].event & (BIT_ULL(32) - 1);
+ aw0 |= (uint64_t)ev[0].sched_type << 32;
+ *((__uint128_t *)lmt_addr) = aw0;
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
+ }
+#endif
+
+ /* wdata[0] will be always valid */
+ sso_lmt_aw_wait_fc(ws, sz0);
+ roc_lmt_submit_steorl(wdata[0], pa[0]);
+ if (wdata[1]) {
+ sso_lmt_aw_wait_fc(ws, sz1);
+ roc_lmt_submit_steorl(wdata[1], pa[1]);
+ }
+
+ left -= (sz0 + sz1);
+ if (left)
+ goto again;
+
+ return n;
+}
+
uint16_t __rte_hot
cn10k_sso_hws_enq_burst(void *port, const struct rte_event ev[],
uint16_t nb_events)
@@ -115,13 +392,32 @@ uint16_t __rte_hot
cn10k_sso_hws_enq_new_burst(void *port, const struct rte_event ev[],
uint16_t nb_events)
{
+ uint16_t idx = 0, done = 0, rc = 0;
struct cn10k_sso_hws *ws = port;
- uint16_t i, rc = 1;
+ uint8_t queue_id;
+ int32_t space;
+
+ /* Do a common back-pressure check and return */
+ space = sso_read_xaq_space(ws) - ROC_SSO_XAE_PER_XAQ;
+ if (space <= 0)
+ return 0;
+ nb_events = space < nb_events ? space : nb_events;
+
+ do {
+ queue_id = ev[idx].queue_id;
+ for (idx = idx + 1; idx < nb_events; idx++)
+ if (queue_id != ev[idx].queue_id)
+ break;
+
+ rc = cn10k_sso_hws_new_event_lmtst(ws, queue_id, &ev[done],
+ idx - done);
+ if (rc != (idx - done))
+ return rc + done;
+ done += rc;
- for (i = 0; i < nb_events && rc; i++)
- rc = cn10k_sso_hws_new_event(ws, &ev[i]);
+ } while (done < nb_events);
- return nb_events;
+ return done;
}
uint16_t __rte_hot
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index 7e8339bd3a..e59e537311 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -753,7 +753,7 @@ cn9k_sso_dev_configure(const struct rte_eventdev *event_dev)
struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
int rc;
- rc = cnxk_sso_dev_validate(event_dev);
+ rc = cnxk_sso_dev_validate(event_dev, 1, 1);
if (rc < 0) {
plt_err("Invalid event device configuration");
return -EINVAL;
diff --git a/drivers/event/cnxk/cnxk_eventdev.c b/drivers/event/cnxk/cnxk_eventdev.c
index cb9ba5d353..99f9cdcd0d 100644
--- a/drivers/event/cnxk/cnxk_eventdev.c
+++ b/drivers/event/cnxk/cnxk_eventdev.c
@@ -145,7 +145,8 @@ cnxk_sso_restore_links(const struct rte_eventdev *event_dev,
}
int
-cnxk_sso_dev_validate(const struct rte_eventdev *event_dev)
+cnxk_sso_dev_validate(const struct rte_eventdev *event_dev, uint32_t deq_depth,
+ uint32_t enq_depth)
{
struct rte_event_dev_config *conf = &event_dev->data->dev_conf;
struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
@@ -173,12 +174,12 @@ cnxk_sso_dev_validate(const struct rte_eventdev *event_dev)
return -EINVAL;
}
- if (conf->nb_event_port_dequeue_depth > 1) {
+ if (conf->nb_event_port_dequeue_depth > deq_depth) {
plt_err("Unsupported event port deq depth requested");
return -EINVAL;
}
- if (conf->nb_event_port_enqueue_depth > 1) {
+ if (conf->nb_event_port_enqueue_depth > enq_depth) {
plt_err("Unsupported event port enq depth requested");
return -EINVAL;
}
@@ -630,6 +631,14 @@ cnxk_sso_init(struct rte_eventdev *event_dev)
}
dev = cnxk_sso_pmd_priv(event_dev);
+ dev->fc_cache_space = rte_zmalloc("fc_cache", PLT_CACHE_LINE_SIZE,
+ PLT_CACHE_LINE_SIZE);
+ if (dev->fc_cache_space == NULL) {
+ plt_memzone_free(mz);
+ plt_err("Failed to reserve memory for XAQ fc cache");
+ return -ENOMEM;
+ }
+
pci_dev = container_of(event_dev->dev, struct rte_pci_device, device);
dev->sso.pci_dev = pci_dev;
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index c7cbd722ab..a2f30bfe5f 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -90,6 +90,7 @@ struct cnxk_sso_evdev {
uint32_t max_dequeue_timeout_ns;
int32_t max_num_events;
uint64_t xaq_lmt;
+ int64_t *fc_cache_space;
rte_iova_t fc_iova;
uint64_t rx_offloads;
uint64_t tx_offloads;
@@ -206,7 +207,8 @@ int cnxk_sso_fini(struct rte_eventdev *event_dev);
int cnxk_sso_remove(struct rte_pci_device *pci_dev);
void cnxk_sso_info_get(struct cnxk_sso_evdev *dev,
struct rte_event_dev_info *dev_info);
-int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev);
+int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev,
+ uint32_t deq_depth, uint32_t enq_depth);
int cnxk_setup_event_ports(const struct rte_eventdev *event_dev,
cnxk_sso_init_hws_mem_t init_hws_mem,
cnxk_sso_hws_setup_t hws_setup);
--
2.25.1
* [PATCH v2 2/3] app/eventdev: use enqueue new event burst routine
2023-04-25 19:51 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
@ 2023-04-25 19:51 ` pbhagavatula
2023-05-18 15:47 ` [EXT] " Shijith Thotton
2023-04-25 19:51 ` [PATCH v2 3/3] app/eventdev: prevent mempool exhaustion pbhagavatula
` (2 subsequent siblings)
3 siblings, 1 reply; 12+ messages in thread
From: pbhagavatula @ 2023-04-25 19:51 UTC (permalink / raw)
To: jerinj; +Cc: dev, Pavan Nikhilesh
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Use the `rte_event_enqueue_new_burst` routine to enqueue events
with rte_event::op as RTE_EVENT_OP_NEW. This allows PMDs to use
optimized enqueue routines.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
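The burst-size defaulting added to perf_producer_wrapper(), shown as a
stand-alone sketch (names simplified; the MAX_PROD_ENQ_BURST_SIZE value here is
an assumption): when --prod_enq_burst_sz is not given, the application maximum
is used and then clamped to the device's reported max_event_port_enqueue_depth.

#include <rte_eventdev.h>

#define MAX_PROD_ENQ_BURST_SIZE 128 /* assumption: app-level upper bound */

/* Pick the producer enqueue burst size: the user value if set, otherwise
 * the app maximum clamped to what the event device accepts per call.
 */
static uint32_t
pick_prod_burst_sz(uint8_t dev_id, uint32_t user_burst_sz)
{
        struct rte_event_dev_info info;

        if (user_burst_sz)
                return user_burst_sz;

        rte_event_dev_info_get(dev_id, &info);
        if (info.max_event_port_enqueue_depth > 0 &&
            (uint32_t)info.max_event_port_enqueue_depth < MAX_PROD_ENQ_BURST_SIZE)
                return info.max_event_port_enqueue_depth;

        return MAX_PROD_ENQ_BURST_SIZE;
}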
app/test-eventdev/evt_options.c | 2 +-
app/test-eventdev/test_perf_common.c | 58 +++++++++++++++++-----------
2 files changed, 37 insertions(+), 23 deletions(-)
diff --git a/app/test-eventdev/evt_options.c b/app/test-eventdev/evt_options.c
index b175c067cd..03fb3bfce0 100644
--- a/app/test-eventdev/evt_options.c
+++ b/app/test-eventdev/evt_options.c
@@ -27,7 +27,7 @@ evt_options_default(struct evt_options *opt)
opt->nb_flows = 1024;
opt->socket_id = SOCKET_ID_ANY;
opt->pool_sz = 16 * 1024;
- opt->prod_enq_burst_sz = 1;
+ opt->prod_enq_burst_sz = 0;
opt->wkr_deq_dep = 16;
opt->nb_pkts = (1ULL << 26); /* do ~64M packets */
opt->nb_timers = 1E8;
diff --git a/app/test-eventdev/test_perf_common.c b/app/test-eventdev/test_perf_common.c
index fd434666cb..68af3cb346 100644
--- a/app/test-eventdev/test_perf_common.c
+++ b/app/test-eventdev/test_perf_common.c
@@ -131,8 +131,10 @@ perf_producer(void *arg)
uint32_t flow_counter = 0;
uint64_t count = 0;
struct perf_elt *m[BURST_SIZE + 1] = {NULL};
+ uint8_t enable_fwd_latency;
struct rte_event ev;
+ enable_fwd_latency = opt->fwd_latency;
if (opt->verbose_level > 1)
printf("%s(): lcore %d dev_id %d port=%d queue %d\n", __func__,
rte_lcore_id(), dev_id, port, p->queue_id);
@@ -151,13 +153,16 @@ perf_producer(void *arg)
for (i = 0; i < BURST_SIZE; i++) {
ev.flow_id = flow_counter++ % nb_flows;
ev.event_ptr = m[i];
- m[i]->timestamp = rte_get_timer_cycles();
- while (rte_event_enqueue_burst(dev_id,
- port, &ev, 1) != 1) {
+ if (enable_fwd_latency)
+ m[i]->timestamp = rte_get_timer_cycles();
+ while (rte_event_enqueue_new_burst(dev_id, port, &ev,
+ 1) != 1) {
if (t->done)
break;
rte_pause();
- m[i]->timestamp = rte_get_timer_cycles();
+ if (enable_fwd_latency)
+ m[i]->timestamp =
+ rte_get_timer_cycles();
}
}
count += BURST_SIZE;
@@ -171,7 +176,6 @@ perf_producer_burst(void *arg)
{
uint32_t i;
uint64_t timestamp;
- struct rte_event_dev_info dev_info;
struct prod_data *p = arg;
struct test_perf *t = p->t;
struct evt_options *opt = t->opt;
@@ -183,15 +187,13 @@ perf_producer_burst(void *arg)
uint32_t flow_counter = 0;
uint16_t enq = 0;
uint64_t count = 0;
- struct perf_elt *m[MAX_PROD_ENQ_BURST_SIZE + 1];
- struct rte_event ev[MAX_PROD_ENQ_BURST_SIZE + 1];
+ struct perf_elt *m[opt->prod_enq_burst_sz + 1];
+ struct rte_event ev[opt->prod_enq_burst_sz + 1];
uint32_t burst_size = opt->prod_enq_burst_sz;
+ uint8_t enable_fwd_latency;
- memset(m, 0, sizeof(*m) * (MAX_PROD_ENQ_BURST_SIZE + 1));
- rte_event_dev_info_get(dev_id, &dev_info);
- if (dev_info.max_event_port_enqueue_depth < burst_size)
- burst_size = dev_info.max_event_port_enqueue_depth;
-
+ enable_fwd_latency = opt->fwd_latency;
+ memset(m, 0, sizeof(*m) * (opt->prod_enq_burst_sz + 1));
if (opt->verbose_level > 1)
printf("%s(): lcore %d dev_id %d port=%d queue %d\n", __func__,
rte_lcore_id(), dev_id, port, p->queue_id);
@@ -212,19 +214,21 @@ perf_producer_burst(void *arg)
for (i = 0; i < burst_size; i++) {
ev[i].flow_id = flow_counter++ % nb_flows;
ev[i].event_ptr = m[i];
- m[i]->timestamp = timestamp;
+ if (enable_fwd_latency)
+ m[i]->timestamp = timestamp;
}
- enq = rte_event_enqueue_burst(dev_id, port, ev, burst_size);
+ enq = rte_event_enqueue_new_burst(dev_id, port, ev, burst_size);
while (enq < burst_size) {
- enq += rte_event_enqueue_burst(dev_id, port,
- ev + enq,
- burst_size - enq);
+ enq += rte_event_enqueue_new_burst(
+ dev_id, port, ev + enq, burst_size - enq);
if (t->done)
break;
rte_pause();
- timestamp = rte_get_timer_cycles();
- for (i = enq; i < burst_size; i++)
- m[i]->timestamp = timestamp;
+ if (enable_fwd_latency) {
+ timestamp = rte_get_timer_cycles();
+ for (i = enq; i < burst_size; i++)
+ m[i]->timestamp = timestamp;
+ }
}
count += burst_size;
}
@@ -799,9 +803,19 @@ perf_event_crypto_producer_burst(void *arg)
static int
perf_producer_wrapper(void *arg)
{
+ struct rte_event_dev_info dev_info;
struct prod_data *p = arg;
struct test_perf *t = p->t;
- bool burst = evt_has_burst_mode(p->dev_id);
+
+ rte_event_dev_info_get(p->dev_id, &dev_info);
+ if (!t->opt->prod_enq_burst_sz) {
+ t->opt->prod_enq_burst_sz = MAX_PROD_ENQ_BURST_SIZE;
+ if (dev_info.max_event_port_enqueue_depth > 0 &&
+ (uint32_t)dev_info.max_event_port_enqueue_depth <
+ t->opt->prod_enq_burst_sz)
+ t->opt->prod_enq_burst_sz =
+ dev_info.max_event_port_enqueue_depth;
+ }
/* In case of synthetic producer, launch perf_producer or
* perf_producer_burst depending on producer enqueue burst size
@@ -811,7 +825,7 @@ perf_producer_wrapper(void *arg)
return perf_producer(arg);
else if (t->opt->prod_type == EVT_PROD_TYPE_SYNT &&
t->opt->prod_enq_burst_sz > 1) {
- if (!burst)
+ if (dev_info.max_event_port_enqueue_depth == 1)
evt_err("This event device does not support burst mode");
else
return perf_producer_burst(arg);
--
2.25.1
* [PATCH v2 3/3] app/eventdev: prevent mempool exhaustion
2023-04-25 19:51 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
2023-04-25 19:51 ` [PATCH v2 2/3] app/eventdev: use enqueue new event burst routine pbhagavatula
@ 2023-04-25 19:51 ` pbhagavatula
2023-05-18 15:47 ` [EXT] " Shijith Thotton
2023-05-18 15:42 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst Shijith Thotton
2023-05-22 11:56 ` [PATCH v3] " pbhagavatula
3 siblings, 1 reply; 12+ messages in thread
From: pbhagavatula @ 2023-04-25 19:51 UTC (permalink / raw)
To: jerinj; +Cc: dev, Pavan Nikhilesh
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Prevent mempool exhaustion due to elements being stuck in lcore
local caches.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
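To put numbers on this (assuming, for illustration, 64 worker lcores and the
default pool_sz of 16 * 1024): the previous fixed cache of 512 lets the lcore
caches alone hold 64 * 512 = 32768 elements, twice the pool, so the producer
can find the pool empty even though nothing is in flight. With the cap below,
cache_sz = (16384 / 1.5) / 64 ~= 170, so at most roughly two thirds of the pool
can sit in lcore caches at any time.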
app/test-eventdev/test_perf_common.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/app/test-eventdev/test_perf_common.c b/app/test-eventdev/test_perf_common.c
index 68af3cb346..5e0255cfeb 100644
--- a/app/test-eventdev/test_perf_common.c
+++ b/app/test-eventdev/test_perf_common.c
@@ -1859,34 +1859,35 @@ int
perf_mempool_setup(struct evt_test *test, struct evt_options *opt)
{
struct test_perf *t = evt_test_priv(test);
+ unsigned int cache_sz;
+ cache_sz = RTE_MIN(RTE_MEMPOOL_CACHE_MAX_SIZE, (opt->pool_sz / 1.5) / t->nb_workers);
if (opt->prod_type == EVT_PROD_TYPE_SYNT ||
opt->prod_type == EVT_PROD_TYPE_EVENT_TIMER_ADPTR) {
t->pool = rte_mempool_create(test->name, /* mempool name */
opt->pool_sz, /* number of elements*/
sizeof(struct perf_elt), /* element size*/
- 512, /* cache size*/
+ cache_sz, /* cache size*/
0, NULL, NULL,
perf_elt_init, /* obj constructor */
NULL, opt->socket_id, 0); /* flags */
} else if (opt->prod_type == EVT_PROD_TYPE_EVENT_CRYPTO_ADPTR &&
- opt->crypto_op_type == RTE_CRYPTO_OP_TYPE_ASYMMETRIC) {
+ opt->crypto_op_type == RTE_CRYPTO_OP_TYPE_ASYMMETRIC) {
t->pool = rte_mempool_create(test->name, /* mempool name */
opt->pool_sz, /* number of elements*/
sizeof(struct perf_elt) + modex_test_case.result_len,
/* element size*/
- 512, /* cache size*/
+ cache_sz, /* cache size*/
0, NULL, NULL,
NULL, /* obj constructor */
NULL, opt->socket_id, 0); /* flags */
} else {
t->pool = rte_pktmbuf_pool_create(test->name, /* mempool name */
opt->pool_sz, /* number of elements*/
- 512, /* cache size*/
+ cache_sz, /* cache size*/
0,
RTE_MBUF_DEFAULT_BUF_SIZE,
opt->socket_id); /* flags */
-
}
if (t->pool == NULL) {
--
2.25.1
* RE: [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst
2023-04-25 19:51 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
2023-04-25 19:51 ` [PATCH v2 2/3] app/eventdev: use enqueue new event burst routine pbhagavatula
2023-04-25 19:51 ` [PATCH v2 3/3] app/eventdev: prevent mempool exhaustion pbhagavatula
@ 2023-05-18 15:42 ` Shijith Thotton
2023-05-22 7:23 ` Jerin Jacob
2023-05-22 11:56 ` [PATCH v3] " pbhagavatula
3 siblings, 1 reply; 12+ messages in thread
From: Shijith Thotton @ 2023-05-18 15:42 UTC (permalink / raw)
To: Pavan Nikhilesh Bhagavatula, Jerin Jacob Kollanukkaran,
Nithin Kumar Dabilpuram, Kiran Kumar Kokkilagadda,
Sunil Kumar Kori, Satha Koteswara Rao Kottidi,
Pavan Nikhilesh Bhagavatula
Cc: dev
>From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
>Use LMTST when all events in the burst are enqueue with
>rte_event:op as RTE_EVENT_OP_NEW i.e. events are enqueued
>with the `rte_event_enqueue_new_burst` API.
>
>Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Acked-by: Shijith Thotton <sthotton@marvell.com>
>---
>v2 Changes:
>- Fix spell check.
>
> drivers/common/cnxk/hw/sso.h | 1 +
> drivers/common/cnxk/roc_sso.c | 10 +-
> drivers/common/cnxk/roc_sso.h | 3 +
> drivers/event/cnxk/cn10k_eventdev.c | 9 +-
> drivers/event/cnxk/cn10k_eventdev.h | 6 +-
> drivers/event/cnxk/cn10k_worker.c | 304 +++++++++++++++++++++++++++-
> drivers/event/cnxk/cn9k_eventdev.c | 2 +-
> drivers/event/cnxk/cnxk_eventdev.c | 15 +-
> drivers/event/cnxk/cnxk_eventdev.h | 4 +-
> 9 files changed, 338 insertions(+), 16 deletions(-)
>
>diff --git a/drivers/common/cnxk/hw/sso.h b/drivers/common/cnxk/hw/sso.h
>index 25deaa4c14..09b8d4955f 100644
>--- a/drivers/common/cnxk/hw/sso.h
>+++ b/drivers/common/cnxk/hw/sso.h
>@@ -157,6 +157,7 @@
> #define SSO_LF_GGRP_AQ_CNT (0x1c0ull)
> #define SSO_LF_GGRP_AQ_THR (0x1e0ull)
> #define SSO_LF_GGRP_MISC_CNT (0x200ull)
>+#define SSO_LF_GGRP_OP_AW_LMTST (0x400ull)
>
> #define SSO_AF_IAQ_FREE_CNT_MASK 0x3FFFull
> #define SSO_AF_IAQ_RSVD_FREE_MASK 0x3FFFull
>diff --git a/drivers/common/cnxk/roc_sso.c b/drivers/common/cnxk/roc_sso.c
>index 4a6a5080f7..99a55e49b0 100644
>--- a/drivers/common/cnxk/roc_sso.c
>+++ b/drivers/common/cnxk/roc_sso.c
>@@ -6,6 +6,7 @@
> #include "roc_priv.h"
>
> #define SSO_XAQ_CACHE_CNT (0x7)
>+#define SSO_XAQ_SLACK (16)
>
> /* Private functions. */
> int
>@@ -493,9 +494,13 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct
>roc_sso_xaq_data *xaq,
>
> xaq->nb_xae = nb_xae;
>
>- /* Taken from HRM 14.3.3(4) */
>+ /** SSO will reserve up to 0x4 XAQ buffers per group when GetWork
>engine
>+ * is inactive and it might prefetch an additional 0x3 buffers due to
>+ * pipelining.
>+ */
> xaq->nb_xaq = (SSO_XAQ_CACHE_CNT * nb_hwgrp);
> xaq->nb_xaq += PLT_MAX(1 + ((xaq->nb_xae - 1) / xae_waes), xaq-
>>nb_xaq);
>+ xaq->nb_xaq += SSO_XAQ_SLACK;
>
> xaq->mem = plt_zmalloc(xaq_buf_size * xaq->nb_xaq, xaq_buf_size);
> if (xaq->mem == NULL) {
>@@ -537,7 +542,8 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct
>roc_sso_xaq_data *xaq,
> * There should be a minimum headroom of 7 XAQs per HWGRP for SSO
> * to request XAQ to cache them even before enqueue is called.
> */
>- xaq->xaq_lmt = xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT);
>+ xaq->xaq_lmt =
>+ xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT) -
>SSO_XAQ_SLACK;
>
> return 0;
> npa_fill_fail:
>diff --git a/drivers/common/cnxk/roc_sso.h b/drivers/common/cnxk/roc_sso.h
>index e67797b046..a2bb6fcb22 100644
>--- a/drivers/common/cnxk/roc_sso.h
>+++ b/drivers/common/cnxk/roc_sso.h
>@@ -7,6 +7,9 @@
>
> #include "hw/ssow.h"
>
>+#define ROC_SSO_AW_PER_LMT_LINE_LOG2 3
>+#define ROC_SSO_XAE_PER_XAQ 352
>+
> struct roc_sso_hwgrp_qos {
> uint16_t hwgrp;
> uint8_t xaq_prcnt;
>diff --git a/drivers/event/cnxk/cn10k_eventdev.c
>b/drivers/event/cnxk/cn10k_eventdev.c
>index 071ea5a212..855c92da83 100644
>--- a/drivers/event/cnxk/cn10k_eventdev.c
>+++ b/drivers/event/cnxk/cn10k_eventdev.c
>@@ -91,8 +91,10 @@ cn10k_sso_hws_setup(void *arg, void *hws, uintptr_t
>grp_base)
> uint64_t val;
>
> ws->grp_base = grp_base;
>- ws->fc_mem = (uint64_t *)dev->fc_iova;
>+ ws->fc_mem = (int64_t *)dev->fc_iova;
> ws->xaq_lmt = dev->xaq_lmt;
>+ ws->fc_cache_space = dev->fc_cache_space;
>+ ws->aw_lmt = ws->lmt_base;
>
> /* Set get_work timeout for HWS */
> val = NSEC2USEC(dev->deq_tmo_ns);
>@@ -624,6 +626,7 @@ cn10k_sso_info_get(struct rte_eventdev *event_dev,
>
> dev_info->driver_name = RTE_STR(EVENTDEV_NAME_CN10K_PMD);
> cnxk_sso_info_get(dev, dev_info);
>+ dev_info->max_event_port_enqueue_depth = UINT32_MAX;
> }
>
> static int
>@@ -632,7 +635,7 @@ cn10k_sso_dev_configure(const struct rte_eventdev
>*event_dev)
> struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> int rc;
>
>- rc = cnxk_sso_dev_validate(event_dev);
>+ rc = cnxk_sso_dev_validate(event_dev, 1, UINT32_MAX);
> if (rc < 0) {
> plt_err("Invalid event device configuration");
> return -EINVAL;
>@@ -871,7 +874,7 @@ cn10k_sso_set_priv_mem(const struct rte_eventdev
>*event_dev, void *lookup_mem)
> for (i = 0; i < dev->nb_event_ports; i++) {
> struct cn10k_sso_hws *ws = event_dev->data->ports[i];
> ws->xaq_lmt = dev->xaq_lmt;
>- ws->fc_mem = (uint64_t *)dev->fc_iova;
>+ ws->fc_mem = (int64_t *)dev->fc_iova;
> ws->tstamp = dev->tstamp;
> if (lookup_mem)
> ws->lookup_mem = lookup_mem;
>diff --git a/drivers/event/cnxk/cn10k_eventdev.h
>b/drivers/event/cnxk/cn10k_eventdev.h
>index aaa01d1ec1..29567728cd 100644
>--- a/drivers/event/cnxk/cn10k_eventdev.h
>+++ b/drivers/event/cnxk/cn10k_eventdev.h
>@@ -19,9 +19,11 @@ struct cn10k_sso_hws {
> struct cnxk_timesync_info **tstamp;
> uint64_t meta_aura;
> /* Add Work Fastpath data */
>- uint64_t xaq_lmt __rte_cache_aligned;
>- uint64_t *fc_mem;
>+ int64_t *fc_mem __rte_cache_aligned;
>+ int64_t *fc_cache_space;
>+ uintptr_t aw_lmt;
> uintptr_t grp_base;
>+ int32_t xaq_lmt;
> /* Tx Fastpath data */
> uintptr_t lmt_base __rte_cache_aligned;
> uint64_t lso_tun_fmt;
>diff --git a/drivers/event/cnxk/cn10k_worker.c
>b/drivers/event/cnxk/cn10k_worker.c
>index 562d2fca13..1028a12c64 100644
>--- a/drivers/event/cnxk/cn10k_worker.c
>+++ b/drivers/event/cnxk/cn10k_worker.c
>@@ -77,6 +77,36 @@ cn10k_sso_hws_forward_event(struct cn10k_sso_hws *ws,
> cn10k_sso_hws_fwd_group(ws, ev, grp);
> }
>
>+static inline int32_t
>+sso_read_xaq_space(struct cn10k_sso_hws *ws)
>+{
>+ return (ws->xaq_lmt - __atomic_load_n(ws->fc_mem, __ATOMIC_RELAXED)) *
>+ ROC_SSO_XAE_PER_XAQ;
>+}
>+
>+static inline void
>+sso_lmt_aw_wait_fc(struct cn10k_sso_hws *ws, int64_t req)
>+{
>+ int64_t cached, refill;
>+
>+retry:
>+ while (__atomic_load_n(ws->fc_cache_space, __ATOMIC_RELAXED) < 0)
>+ ;
>+
>+ cached = __atomic_sub_fetch(ws->fc_cache_space, req, __ATOMIC_ACQUIRE);
>+ /* Check if there is enough space, else update and retry. */
>+ if (cached < 0) {
>+ /* Check if we have space else retry. */
>+ do {
>+ refill = sso_read_xaq_space(ws);
>+ } while (refill <= 0);
>+ __atomic_compare_exchange(ws->fc_cache_space, &cached, &refill,
>+ 0, __ATOMIC_RELEASE,
>+ __ATOMIC_RELAXED);
>+ goto retry;
>+ }
>+}
>+
> uint16_t __rte_hot
> cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
> {
>@@ -103,6 +133,253 @@ cn10k_sso_hws_enq(void *port, const struct rte_event
>*ev)
> return 1;
> }
>
>+#define VECTOR_SIZE_BITS 0xFFFFFFFFFFF80000ULL
>+#define VECTOR_GET_LINE_OFFSET(line) (19 + (3 * line))
>+
>+static uint64_t
>+vector_size_partial_mask(uint16_t off, uint16_t cnt)
>+{
>+ return (VECTOR_SIZE_BITS & ~(~0x0ULL << off)) |
>+ ((uint64_t)(cnt - 1) << off);
>+}
>+
>+static __rte_always_inline uint16_t
>+cn10k_sso_hws_new_event_lmtst(struct cn10k_sso_hws *ws, uint8_t queue_id,
>+ const struct rte_event ev[], uint16_t n)
>+{
>+ uint16_t lines, partial_line, burst, left;
>+ uint64_t wdata[2], pa[2] = {0};
>+ uintptr_t lmt_addr;
>+ uint16_t sz0, sz1;
>+ uint16_t lmt_id;
>+
>+ sz0 = sz1 = 0;
>+ lmt_addr = ws->lmt_base;
>+ ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
>+
>+ left = n;
>+again:
>+ burst = RTE_MIN(
>+ BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2 + ROC_LMT_LINES_PER_CORE_LOG2),
>+ left);
>+
>+ /* Set wdata */
>+ lines = burst >> ROC_SSO_AW_PER_LMT_LINE_LOG2;
>+ partial_line = burst & (BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2) - 1);
>+ wdata[0] = wdata[1] = 0;
>+ if (lines > BIT(ROC_LMT_LINES_PER_STR_LOG2)) {
>+ wdata[0] = lmt_id;
>+ wdata[0] |= 15ULL << 12;
>+ wdata[0] |= VECTOR_SIZE_BITS;
>+ pa[0] = (ws->grp_base + (queue_id << 12) +
>+ SSO_LF_GGRP_OP_AW_LMTST) |
>+ (0x7 << 4);
>+ sz0 = 16 << ROC_SSO_AW_PER_LMT_LINE_LOG2;
>+
>+ wdata[1] = lmt_id + 16;
>+ pa[1] = (ws->grp_base + (queue_id << 12) +
>+ SSO_LF_GGRP_OP_AW_LMTST) |
>+ (0x7 << 4);
>+
>+ lines -= 17;
>+ wdata[1] |= partial_line ? (uint64_t)(lines + 1) << 12 :
>+ (uint64_t)(lines << 12);
>+ wdata[1] |= partial_line ?
>+ vector_size_partial_mask(
>+ VECTOR_GET_LINE_OFFSET(lines),
>+ partial_line) :
>+ VECTOR_SIZE_BITS;
>+ sz1 = burst - sz0;
>+ partial_line = 0;
>+ } else if (lines) {
>+ /* We need to handle two cases here:
>+ * 1. Partial line spill over to wdata[1] i.e. lines == 16
>+ * 2. Partial line with spill lines < 16.
>+ */
>+ wdata[0] = lmt_id;
>+ pa[0] = (ws->grp_base + (queue_id << 12) +
>+ SSO_LF_GGRP_OP_AW_LMTST) |
>+ (0x7 << 4);
>+ sz0 = lines << ROC_SSO_AW_PER_LMT_LINE_LOG2;
>+ if (lines == 16) {
>+ wdata[0] |= 15ULL << 12;
>+ wdata[0] |= VECTOR_SIZE_BITS;
>+ if (partial_line) {
>+ wdata[1] = lmt_id + 16;
>+ pa[1] = (ws->grp_base + (queue_id << 12) +
>+ SSO_LF_GGRP_OP_AW_LMTST) |
>+ ((partial_line - 1) << 4);
>+ }
>+ } else {
>+ lines -= 1;
>+ wdata[0] |= partial_line ? (uint64_t)(lines + 1) << 12 :
>+ (uint64_t)(lines << 12);
>+ wdata[0] |=
>+ partial_line ?
>+ vector_size_partial_mask(
>+ VECTOR_GET_LINE_OFFSET(lines),
>+ partial_line) :
>+ VECTOR_SIZE_BITS;
>+ sz0 += partial_line;
>+ }
>+ sz1 = burst - sz0;
>+ partial_line = 0;
>+ }
>+
>+ /* Only partial lines */
>+ if (partial_line) {
>+ wdata[0] = lmt_id;
>+ pa[0] = (ws->grp_base + (queue_id << 12) +
>+ SSO_LF_GGRP_OP_AW_LMTST) |
>+ ((partial_line - 1) << 4);
>+ sz0 = partial_line;
>+ sz1 = burst - sz0;
>+ }
>+
>+#if defined(RTE_ARCH_ARM64)
>+ uint64x2_t aw_mask = {0xC0FFFFFFFFULL, ~0x0ULL};
>+ uint64x2_t tt_mask = {0x300000000ULL, 0};
>+ uint16_t parts;
>+
>+ while (burst) {
>+ parts = burst > 7 ? 8 : plt_align32prevpow2(burst);
>+ burst -= parts;
>+ /* Lets try to fill at least one line per burst. */
>+ switch (parts) {
>+ case 8: {
>+ uint64x2_t aw0, aw1, aw2, aw3, aw4, aw5, aw6, aw7;
>+
>+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
>+ aw_mask);
>+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
>+ aw_mask);
>+ aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
>+ aw_mask);
>+ aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
>+ aw_mask);
>+ aw4 = vandq_u64(vld1q_u64((const uint64_t *)&ev[4]),
>+ aw_mask);
>+ aw5 = vandq_u64(vld1q_u64((const uint64_t *)&ev[5]),
>+ aw_mask);
>+ aw6 = vandq_u64(vld1q_u64((const uint64_t *)&ev[6]),
>+ aw_mask);
>+ aw7 = vandq_u64(vld1q_u64((const uint64_t *)&ev[7]),
>+ aw_mask);
>+
>+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6),
>tt_mask),
>+ aw0);
>+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6),
>tt_mask),
>+ aw1);
>+ aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6),
>tt_mask),
>+ aw2);
>+ aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6),
>tt_mask),
>+ aw3);
>+ aw4 = vorrq_u64(vandq_u64(vshrq_n_u64(aw4, 6),
>tt_mask),
>+ aw4);
>+ aw5 = vorrq_u64(vandq_u64(vshrq_n_u64(aw5, 6),
>tt_mask),
>+ aw5);
>+ aw6 = vorrq_u64(vandq_u64(vshrq_n_u64(aw6, 6),
>tt_mask),
>+ aw6);
>+ aw7 = vorrq_u64(vandq_u64(vshrq_n_u64(aw7, 6),
>tt_mask),
>+ aw7);
>+
>+ vst1q_u64((void *)lmt_addr, aw0);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 64), aw4);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 80), aw5);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 96), aw6);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 112), aw7);
>+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 128);
>+ } break;
>+ case 4: {
>+ uint64x2_t aw0, aw1, aw2, aw3;
>+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
>+ aw_mask);
>+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
>+ aw_mask);
>+ aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
>+ aw_mask);
>+ aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
>+ aw_mask);
>+
>+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6),
>tt_mask),
>+ aw0);
>+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6),
>tt_mask),
>+ aw1);
>+ aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6),
>tt_mask),
>+ aw2);
>+ aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6),
>tt_mask),
>+ aw3);
>+
>+ vst1q_u64((void *)lmt_addr, aw0);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
>+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 64);
>+ } break;
>+ case 2: {
>+ uint64x2_t aw0, aw1;
>+
>+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
>+ aw_mask);
>+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
>+ aw_mask);
>+
>+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6),
>tt_mask),
>+ aw0);
>+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6),
>tt_mask),
>+ aw1);
>+
>+ vst1q_u64((void *)lmt_addr, aw0);
>+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
>+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 32);
>+ } break;
>+ case 1: {
>+ __uint128_t aw0;
>+
>+ aw0 = ev[0].u64;
>+ aw0 <<= 64;
>+ aw0 |= ev[0].event & (BIT_ULL(32) - 1);
>+ aw0 |= (uint64_t)ev[0].sched_type << 32;
>+
>+ *((__uint128_t *)lmt_addr) = aw0;
>+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
>+ } break;
>+ }
>+ ev += parts;
>+ }
>+#else
>+ uint16_t i;
>+
>+ for (i = 0; i < burst; i++) {
>+ __uint128_t aw0;
>+
>+ aw0 = ev[0].u64;
>+ aw0 <<= 64;
>+ aw0 |= ev[0].event & (BIT_ULL(32) - 1);
>+ aw0 |= (uint64_t)ev[0].sched_type << 32;
>+ *((__uint128_t *)lmt_addr) = aw0;
>+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
>+ }
>+#endif
>+
>+ /* wdata[0] will be always valid */
>+ sso_lmt_aw_wait_fc(ws, sz0);
>+ roc_lmt_submit_steorl(wdata[0], pa[0]);
>+ if (wdata[1]) {
>+ sso_lmt_aw_wait_fc(ws, sz1);
>+ roc_lmt_submit_steorl(wdata[1], pa[1]);
>+ }
>+
>+ left -= (sz0 + sz1);
>+ if (left)
>+ goto again;
>+
>+ return n;
>+}
>+
> uint16_t __rte_hot
> cn10k_sso_hws_enq_burst(void *port, const struct rte_event ev[],
> uint16_t nb_events)
>@@ -115,13 +392,32 @@ uint16_t __rte_hot
> cn10k_sso_hws_enq_new_burst(void *port, const struct rte_event ev[],
> uint16_t nb_events)
> {
>+ uint16_t idx = 0, done = 0, rc = 0;
> struct cn10k_sso_hws *ws = port;
>- uint16_t i, rc = 1;
>+ uint8_t queue_id;
>+ int32_t space;
>+
>+ /* Do a common back-pressure check and return */
>+ space = sso_read_xaq_space(ws) - ROC_SSO_XAE_PER_XAQ;
>+ if (space <= 0)
>+ return 0;
>+ nb_events = space < nb_events ? space : nb_events;
>+
>+ do {
>+ queue_id = ev[idx].queue_id;
>+ for (idx = idx + 1; idx < nb_events; idx++)
>+ if (queue_id != ev[idx].queue_id)
>+ break;
>+
>+ rc = cn10k_sso_hws_new_event_lmtst(ws, queue_id, &ev[done],
>+ idx - done);
>+ if (rc != (idx - done))
>+ return rc + done;
>+ done += rc;
>
>- for (i = 0; i < nb_events && rc; i++)
>- rc = cn10k_sso_hws_new_event(ws, &ev[i]);
>+ } while (done < nb_events);
>
>- return nb_events;
>+ return done;
> }
>
> uint16_t __rte_hot
>diff --git a/drivers/event/cnxk/cn9k_eventdev.c
>b/drivers/event/cnxk/cn9k_eventdev.c
>index 7e8339bd3a..e59e537311 100644
>--- a/drivers/event/cnxk/cn9k_eventdev.c
>+++ b/drivers/event/cnxk/cn9k_eventdev.c
>@@ -753,7 +753,7 @@ cn9k_sso_dev_configure(const struct rte_eventdev
>*event_dev)
> struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> int rc;
>
>- rc = cnxk_sso_dev_validate(event_dev);
>+ rc = cnxk_sso_dev_validate(event_dev, 1, 1);
> if (rc < 0) {
> plt_err("Invalid event device configuration");
> return -EINVAL;
>diff --git a/drivers/event/cnxk/cnxk_eventdev.c
>b/drivers/event/cnxk/cnxk_eventdev.c
>index cb9ba5d353..99f9cdcd0d 100644
>--- a/drivers/event/cnxk/cnxk_eventdev.c
>+++ b/drivers/event/cnxk/cnxk_eventdev.c
>@@ -145,7 +145,8 @@ cnxk_sso_restore_links(const struct rte_eventdev
>*event_dev,
> }
>
> int
>-cnxk_sso_dev_validate(const struct rte_eventdev *event_dev)
>+cnxk_sso_dev_validate(const struct rte_eventdev *event_dev, uint32_t deq_depth,
>+ uint32_t enq_depth)
> {
> struct rte_event_dev_config *conf = &event_dev->data->dev_conf;
> struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
>@@ -173,12 +174,12 @@ cnxk_sso_dev_validate(const struct rte_eventdev
>*event_dev)
> return -EINVAL;
> }
>
>- if (conf->nb_event_port_dequeue_depth > 1) {
>+ if (conf->nb_event_port_dequeue_depth > deq_depth) {
> plt_err("Unsupported event port deq depth requested");
> return -EINVAL;
> }
>
>- if (conf->nb_event_port_enqueue_depth > 1) {
>+ if (conf->nb_event_port_enqueue_depth > enq_depth) {
> plt_err("Unsupported event port enq depth requested");
> return -EINVAL;
> }
>@@ -630,6 +631,14 @@ cnxk_sso_init(struct rte_eventdev *event_dev)
> }
>
> dev = cnxk_sso_pmd_priv(event_dev);
>+ dev->fc_cache_space = rte_zmalloc("fc_cache", PLT_CACHE_LINE_SIZE,
>+ PLT_CACHE_LINE_SIZE);
>+ if (dev->fc_cache_space == NULL) {
>+ plt_memzone_free(mz);
>+ plt_err("Failed to reserve memory for XAQ fc cache");
>+ return -ENOMEM;
>+ }
>+
> pci_dev = container_of(event_dev->dev, struct rte_pci_device, device);
> dev->sso.pci_dev = pci_dev;
>
>diff --git a/drivers/event/cnxk/cnxk_eventdev.h
>b/drivers/event/cnxk/cnxk_eventdev.h
>index c7cbd722ab..a2f30bfe5f 100644
>--- a/drivers/event/cnxk/cnxk_eventdev.h
>+++ b/drivers/event/cnxk/cnxk_eventdev.h
>@@ -90,6 +90,7 @@ struct cnxk_sso_evdev {
> uint32_t max_dequeue_timeout_ns;
> int32_t max_num_events;
> uint64_t xaq_lmt;
>+ int64_t *fc_cache_space;
> rte_iova_t fc_iova;
> uint64_t rx_offloads;
> uint64_t tx_offloads;
>@@ -206,7 +207,8 @@ int cnxk_sso_fini(struct rte_eventdev *event_dev);
> int cnxk_sso_remove(struct rte_pci_device *pci_dev);
> void cnxk_sso_info_get(struct cnxk_sso_evdev *dev,
> struct rte_event_dev_info *dev_info);
>-int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev);
>+int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev,
>+ uint32_t deq_depth, uint32_t enq_depth);
> int cnxk_setup_event_ports(const struct rte_eventdev *event_dev,
> cnxk_sso_init_hws_mem_t init_hws_mem,
> cnxk_sso_hws_setup_t hws_setup);
>--
>2.25.1
* RE: [EXT] [PATCH v2 2/3] app/eventdev: use enqueue new event burst routine
2023-04-25 19:51 ` [PATCH v2 2/3] app/eventdev: use enqueue new event burst routine pbhagavatula
@ 2023-05-18 15:47 ` Shijith Thotton
0 siblings, 0 replies; 12+ messages in thread
From: Shijith Thotton @ 2023-05-18 15:47 UTC (permalink / raw)
To: Pavan Nikhilesh Bhagavatula, Jerin Jacob Kollanukkaran
Cc: dev, Pavan Nikhilesh Bhagavatula
>From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
>Use the `rte_event_enqueue_new_burst` routine to enqueue events
>with rte_event::op as RTE_EVENT_OP_NEW. This allows PMDs to use
>optimized enqueue routines.
>
>Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Acked-by: Shijith Thotton <sthotton@marvell.com>
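
For context, a minimal, hypothetical producer sketch of the pattern this patch
switches to: every event in the burst carries RTE_EVENT_OP_NEW and is submitted
through rte_event_enqueue_new_burst(). The device/port ids, the scheduling type
and the busy retry loop are illustrative assumptions, not taken from the patch.

#include <rte_eventdev.h>

/* Hedged sketch: enqueue a burst in which every event is RTE_EVENT_OP_NEW so
 * the PMD may take its optimized new-event enqueue path. Event payloads are
 * assumed to be filled in elsewhere.
 */
static uint16_t
enqueue_all_new(uint8_t dev_id, uint8_t port_id, uint8_t queue_id,
		struct rte_event *ev, uint16_t nb_events)
{
	uint16_t i, enq = 0;

	for (i = 0; i < nb_events; i++) {
		ev[i].op = RTE_EVENT_OP_NEW;
		ev[i].queue_id = queue_id;
		ev[i].sched_type = RTE_SCHED_TYPE_ATOMIC;
	}

	while (enq < nb_events)
		enq += rte_event_enqueue_new_burst(dev_id, port_id, ev + enq,
						   nb_events - enq);
	return enq;
}
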
>---
> app/test-eventdev/evt_options.c | 2 +-
> app/test-eventdev/test_perf_common.c | 58 +++++++++++++++++-----------
> 2 files changed, 37 insertions(+), 23 deletions(-)
>
>diff --git a/app/test-eventdev/evt_options.c b/app/test-eventdev/evt_options.c
>index b175c067cd..03fb3bfce0 100644
>--- a/app/test-eventdev/evt_options.c
>+++ b/app/test-eventdev/evt_options.c
>@@ -27,7 +27,7 @@ evt_options_default(struct evt_options *opt)
> opt->nb_flows = 1024;
> opt->socket_id = SOCKET_ID_ANY;
> opt->pool_sz = 16 * 1024;
>- opt->prod_enq_burst_sz = 1;
>+ opt->prod_enq_burst_sz = 0;
> opt->wkr_deq_dep = 16;
> opt->nb_pkts = (1ULL << 26); /* do ~64M packets */
> opt->nb_timers = 1E8;
>diff --git a/app/test-eventdev/test_perf_common.c b/app/test-
>eventdev/test_perf_common.c
>index fd434666cb..68af3cb346 100644
>--- a/app/test-eventdev/test_perf_common.c
>+++ b/app/test-eventdev/test_perf_common.c
>@@ -131,8 +131,10 @@ perf_producer(void *arg)
> uint32_t flow_counter = 0;
> uint64_t count = 0;
> struct perf_elt *m[BURST_SIZE + 1] = {NULL};
>+ uint8_t enable_fwd_latency;
> struct rte_event ev;
>
>+ enable_fwd_latency = opt->fwd_latency;
> if (opt->verbose_level > 1)
> printf("%s(): lcore %d dev_id %d port=%d queue %d\n",
>__func__,
> rte_lcore_id(), dev_id, port, p->queue_id);
>@@ -151,13 +153,16 @@ perf_producer(void *arg)
> for (i = 0; i < BURST_SIZE; i++) {
> ev.flow_id = flow_counter++ % nb_flows;
> ev.event_ptr = m[i];
>- m[i]->timestamp = rte_get_timer_cycles();
>- while (rte_event_enqueue_burst(dev_id,
>- port, &ev, 1) != 1) {
>+ if (enable_fwd_latency)
>+ m[i]->timestamp = rte_get_timer_cycles();
>+ while (rte_event_enqueue_new_burst(dev_id, port, &ev,
>+ 1) != 1) {
> if (t->done)
> break;
> rte_pause();
>- m[i]->timestamp = rte_get_timer_cycles();
>+ if (enable_fwd_latency)
>+ m[i]->timestamp =
>+ rte_get_timer_cycles();
> }
> }
> count += BURST_SIZE;
>@@ -171,7 +176,6 @@ perf_producer_burst(void *arg)
> {
> uint32_t i;
> uint64_t timestamp;
>- struct rte_event_dev_info dev_info;
> struct prod_data *p = arg;
> struct test_perf *t = p->t;
> struct evt_options *opt = t->opt;
>@@ -183,15 +187,13 @@ perf_producer_burst(void *arg)
> uint32_t flow_counter = 0;
> uint16_t enq = 0;
> uint64_t count = 0;
>- struct perf_elt *m[MAX_PROD_ENQ_BURST_SIZE + 1];
>- struct rte_event ev[MAX_PROD_ENQ_BURST_SIZE + 1];
>+ struct perf_elt *m[opt->prod_enq_burst_sz + 1];
>+ struct rte_event ev[opt->prod_enq_burst_sz + 1];
> uint32_t burst_size = opt->prod_enq_burst_sz;
>+ uint8_t enable_fwd_latency;
>
>- memset(m, 0, sizeof(*m) * (MAX_PROD_ENQ_BURST_SIZE + 1));
>- rte_event_dev_info_get(dev_id, &dev_info);
>- if (dev_info.max_event_port_enqueue_depth < burst_size)
>- burst_size = dev_info.max_event_port_enqueue_depth;
>-
>+ enable_fwd_latency = opt->fwd_latency;
>+ memset(m, 0, sizeof(*m) * (opt->prod_enq_burst_sz + 1));
> if (opt->verbose_level > 1)
> printf("%s(): lcore %d dev_id %d port=%d queue %d\n",
>__func__,
> rte_lcore_id(), dev_id, port, p->queue_id);
>@@ -212,19 +214,21 @@ perf_producer_burst(void *arg)
> for (i = 0; i < burst_size; i++) {
> ev[i].flow_id = flow_counter++ % nb_flows;
> ev[i].event_ptr = m[i];
>- m[i]->timestamp = timestamp;
>+ if (enable_fwd_latency)
>+ m[i]->timestamp = timestamp;
> }
>- enq = rte_event_enqueue_burst(dev_id, port, ev, burst_size);
>+ enq = rte_event_enqueue_new_burst(dev_id, port, ev, burst_size);
> while (enq < burst_size) {
>- enq += rte_event_enqueue_burst(dev_id, port,
>- ev + enq,
>- burst_size - enq);
>+ enq += rte_event_enqueue_new_burst(
>+ dev_id, port, ev + enq, burst_size - enq);
> if (t->done)
> break;
> rte_pause();
>- timestamp = rte_get_timer_cycles();
>- for (i = enq; i < burst_size; i++)
>- m[i]->timestamp = timestamp;
>+ if (enable_fwd_latency) {
>+ timestamp = rte_get_timer_cycles();
>+ for (i = enq; i < burst_size; i++)
>+ m[i]->timestamp = timestamp;
>+ }
> }
> count += burst_size;
> }
>@@ -799,9 +803,19 @@ perf_event_crypto_producer_burst(void *arg)
> static int
> perf_producer_wrapper(void *arg)
> {
>+ struct rte_event_dev_info dev_info;
> struct prod_data *p = arg;
> struct test_perf *t = p->t;
>- bool burst = evt_has_burst_mode(p->dev_id);
>+
>+ rte_event_dev_info_get(p->dev_id, &dev_info);
>+ if (!t->opt->prod_enq_burst_sz) {
>+ t->opt->prod_enq_burst_sz = MAX_PROD_ENQ_BURST_SIZE;
>+ if (dev_info.max_event_port_enqueue_depth > 0 &&
>+ (uint32_t)dev_info.max_event_port_enqueue_depth <
>+ t->opt->prod_enq_burst_sz)
>+ t->opt->prod_enq_burst_sz =
>+ dev_info.max_event_port_enqueue_depth;
>+ }
>
> /* In case of synthetic producer, launch perf_producer or
> * perf_producer_burst depending on producer enqueue burst size
>@@ -811,7 +825,7 @@ perf_producer_wrapper(void *arg)
> return perf_producer(arg);
> else if (t->opt->prod_type == EVT_PROD_TYPE_SYNT &&
> t->opt->prod_enq_burst_sz > 1) {
>- if (!burst)
>+ if (dev_info.max_event_port_enqueue_depth == 1)
> evt_err("This event device does not support burst mode");
> else
> return perf_producer_burst(arg);
>--
>2.25.1
* RE: [EXT] [PATCH v2 3/3] app/eventdev: prevent mempool exhaustion
2023-04-25 19:51 ` [PATCH v2 3/3] app/eventdev: prevent mempool exhaustion pbhagavatula
@ 2023-05-18 15:47 ` Shijith Thotton
0 siblings, 0 replies; 12+ messages in thread
From: Shijith Thotton @ 2023-05-18 15:47 UTC (permalink / raw)
To: Pavan Nikhilesh Bhagavatula, Jerin Jacob Kollanukkaran
Cc: dev, Pavan Nikhilesh Bhagavatula
>From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
>Prevent mempool exhaustion due to elements being stuck in lcore
>local caches.
>
>Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Acked-by: Shijith Thotton <sthotton@marvell.com>
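
As a rough illustration (my sketch, not part of the patch), the sizing rule in
the diff below caps each lcore cache so that all worker caches together stay at
about two thirds of the pool, which is what keeps cached elements from
exhausting it:

#include <rte_common.h>
#include <rte_mempool.h>

/* Hedged sketch of:
 * cache_sz = min(RTE_MEMPOOL_CACHE_MAX_SIZE, (pool_sz / 1.5) / nb_workers)
 */
static unsigned int
calc_cache_sz(unsigned int pool_sz, unsigned int nb_workers)
{
	unsigned int per_worker = (unsigned int)((pool_sz / 1.5) / nb_workers);

	return RTE_MIN((unsigned int)RTE_MEMPOOL_CACHE_MAX_SIZE, per_worker);
}

/* e.g. with pool_sz = 16 * 1024: 4 workers -> 512 (capped), 64 workers -> 170 */
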
>---
> app/test-eventdev/test_perf_common.c | 11 ++++++-----
> 1 file changed, 6 insertions(+), 5 deletions(-)
>
>diff --git a/app/test-eventdev/test_perf_common.c b/app/test-
>eventdev/test_perf_common.c
>index 68af3cb346..5e0255cfeb 100644
>--- a/app/test-eventdev/test_perf_common.c
>+++ b/app/test-eventdev/test_perf_common.c
>@@ -1859,34 +1859,35 @@ int
> perf_mempool_setup(struct evt_test *test, struct evt_options *opt)
> {
> struct test_perf *t = evt_test_priv(test);
>+ unsigned int cache_sz;
>
>+ cache_sz = RTE_MIN(RTE_MEMPOOL_CACHE_MAX_SIZE, (opt->pool_sz / 1.5) / t->nb_workers);
> if (opt->prod_type == EVT_PROD_TYPE_SYNT ||
> opt->prod_type ==
>EVT_PROD_TYPE_EVENT_TIMER_ADPTR) {
> t->pool = rte_mempool_create(test->name, /* mempool name */
> opt->pool_sz, /* number of elements*/
> sizeof(struct perf_elt), /* element size*/
>- 512, /* cache size*/
>+ cache_sz, /* cache size*/
> 0, NULL, NULL,
> perf_elt_init, /* obj constructor */
> NULL, opt->socket_id, 0); /* flags */
> } else if (opt->prod_type == EVT_PROD_TYPE_EVENT_CRYPTO_ADPTR &&
>- opt->crypto_op_type ==
>RTE_CRYPTO_OP_TYPE_ASYMMETRIC) {
>+ opt->crypto_op_type == RTE_CRYPTO_OP_TYPE_ASYMMETRIC)
>{
> t->pool = rte_mempool_create(test->name, /* mempool name */
> opt->pool_sz, /* number of elements*/
> sizeof(struct perf_elt) +
>modex_test_case.result_len,
> /* element size*/
>- 512, /* cache size*/
>+ cache_sz, /* cache size*/
> 0, NULL, NULL,
> NULL, /* obj constructor */
> NULL, opt->socket_id, 0); /* flags */
> } else {
> t->pool = rte_pktmbuf_pool_create(test->name, /* mempool
>name */
> opt->pool_sz, /* number of elements*/
>- 512, /* cache size*/
>+ cache_sz, /* cache size*/
> 0,
> RTE_MBUF_DEFAULT_BUF_SIZE,
> opt->socket_id); /* flags */
>-
> }
>
> if (t->pool == NULL) {
>--
>2.25.1
* Re: [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst
2023-05-18 15:42 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst Shijith Thotton
@ 2023-05-22 7:23 ` Jerin Jacob
0 siblings, 0 replies; 12+ messages in thread
From: Jerin Jacob @ 2023-05-22 7:23 UTC (permalink / raw)
To: Shijith Thotton
Cc: Pavan Nikhilesh Bhagavatula, Jerin Jacob Kollanukkaran,
Nithin Kumar Dabilpuram, Kiran Kumar Kokkilagadda,
Sunil Kumar Kori, Satha Koteswara Rao Kottidi, dev
On Thu, May 18, 2023 at 9:12 PM Shijith Thotton <sthotton@marvell.com> wrote:
>
> >From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> >
> >Use LMTST when all events in the burst are enqueued with
> >rte_event::op as RTE_EVENT_OP_NEW, i.e. events are enqueued
> >with the `rte_event_enqueue_new_burst` API.
> >
> >Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Acked-by: Shijith Thotton <sthotton@marvell.com>
Applied 2/3 and 3/3. to dpdk-next-net-eventdev/for-main. Thanks
Please resend 1/3 to fix the following
### [PATCH] event/cnxk: use LMTST for enqueue new burst
Warning in drivers/event/cnxk/cn10k_worker.c:
Using __atomic_op_fetch, prefer __atomic_fetch_op
2/3 valid patches
checkpatch failed
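
The warning is only about the builtins' return-value convention:
__atomic_sub_fetch() returns the value after the subtraction, while
__atomic_fetch_sub() returns the value before it. A minimal sketch (mine, not
from the patch; the helper name is illustrative) of the equivalent rewrite that
v3 applies in sso_lmt_aw_wait_fc():

#include <stdint.h>

/* Equivalent forms: the preferred builtin returns the old value, so subtract
 * the request once more to get the post-reservation count.
 */
static inline int64_t
reserve_cached_space(int64_t *fc_cache_space, int64_t req)
{
	/* was: return __atomic_sub_fetch(fc_cache_space, req, __ATOMIC_ACQUIRE); */
	return __atomic_fetch_sub(fc_cache_space, req, __ATOMIC_ACQUIRE) - req;
}
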
>
> >---
> >v2 Changes:
> >- Fix spell check.
> >
> > drivers/common/cnxk/hw/sso.h | 1 +
> > drivers/common/cnxk/roc_sso.c | 10 +-
> > drivers/common/cnxk/roc_sso.h | 3 +
> > drivers/event/cnxk/cn10k_eventdev.c | 9 +-
> > drivers/event/cnxk/cn10k_eventdev.h | 6 +-
> > drivers/event/cnxk/cn10k_worker.c | 304 +++++++++++++++++++++++++++-
> > drivers/event/cnxk/cn9k_eventdev.c | 2 +-
> > drivers/event/cnxk/cnxk_eventdev.c | 15 +-
> > drivers/event/cnxk/cnxk_eventdev.h | 4 +-
> > 9 files changed, 338 insertions(+), 16 deletions(-)
> >
> >diff --git a/drivers/common/cnxk/hw/sso.h b/drivers/common/cnxk/hw/sso.h
> >index 25deaa4c14..09b8d4955f 100644
> >--- a/drivers/common/cnxk/hw/sso.h
> >+++ b/drivers/common/cnxk/hw/sso.h
> >@@ -157,6 +157,7 @@
> > #define SSO_LF_GGRP_AQ_CNT (0x1c0ull)
> > #define SSO_LF_GGRP_AQ_THR (0x1e0ull)
> > #define SSO_LF_GGRP_MISC_CNT (0x200ull)
> >+#define SSO_LF_GGRP_OP_AW_LMTST (0x400ull)
> >
> > #define SSO_AF_IAQ_FREE_CNT_MASK 0x3FFFull
> > #define SSO_AF_IAQ_RSVD_FREE_MASK 0x3FFFull
> >diff --git a/drivers/common/cnxk/roc_sso.c b/drivers/common/cnxk/roc_sso.c
> >index 4a6a5080f7..99a55e49b0 100644
> >--- a/drivers/common/cnxk/roc_sso.c
> >+++ b/drivers/common/cnxk/roc_sso.c
> >@@ -6,6 +6,7 @@
> > #include "roc_priv.h"
> >
> > #define SSO_XAQ_CACHE_CNT (0x7)
> >+#define SSO_XAQ_SLACK (16)
> >
> > /* Private functions. */
> > int
> >@@ -493,9 +494,13 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct
> >roc_sso_xaq_data *xaq,
> >
> > xaq->nb_xae = nb_xae;
> >
> >- /* Taken from HRM 14.3.3(4) */
> >+ /** SSO will reserve up to 0x4 XAQ buffers per group when GetWork
> >engine
> >+ * is inactive and it might prefetch an additional 0x3 buffers due to
> >+ * pipelining.
> >+ */
> > xaq->nb_xaq = (SSO_XAQ_CACHE_CNT * nb_hwgrp);
> > xaq->nb_xaq += PLT_MAX(1 + ((xaq->nb_xae - 1) / xae_waes), xaq-
> >>nb_xaq);
> >+ xaq->nb_xaq += SSO_XAQ_SLACK;
> >
> > xaq->mem = plt_zmalloc(xaq_buf_size * xaq->nb_xaq, xaq_buf_size);
> > if (xaq->mem == NULL) {
> >@@ -537,7 +542,8 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct
> >roc_sso_xaq_data *xaq,
> > * There should be a minimum headroom of 7 XAQs per HWGRP for SSO
> > * to request XAQ to cache them even before enqueue is called.
> > */
> >- xaq->xaq_lmt = xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT);
> >+ xaq->xaq_lmt =
> >+ xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT) -
> >SSO_XAQ_SLACK;
> >
> > return 0;
> > npa_fill_fail:
> >diff --git a/drivers/common/cnxk/roc_sso.h b/drivers/common/cnxk/roc_sso.h
> >index e67797b046..a2bb6fcb22 100644
> >--- a/drivers/common/cnxk/roc_sso.h
> >+++ b/drivers/common/cnxk/roc_sso.h
> >@@ -7,6 +7,9 @@
> >
> > #include "hw/ssow.h"
> >
> >+#define ROC_SSO_AW_PER_LMT_LINE_LOG2 3
> >+#define ROC_SSO_XAE_PER_XAQ 352
> >+
> > struct roc_sso_hwgrp_qos {
> > uint16_t hwgrp;
> > uint8_t xaq_prcnt;
> >diff --git a/drivers/event/cnxk/cn10k_eventdev.c
> >b/drivers/event/cnxk/cn10k_eventdev.c
> >index 071ea5a212..855c92da83 100644
> >--- a/drivers/event/cnxk/cn10k_eventdev.c
> >+++ b/drivers/event/cnxk/cn10k_eventdev.c
> >@@ -91,8 +91,10 @@ cn10k_sso_hws_setup(void *arg, void *hws, uintptr_t
> >grp_base)
> > uint64_t val;
> >
> > ws->grp_base = grp_base;
> >- ws->fc_mem = (uint64_t *)dev->fc_iova;
> >+ ws->fc_mem = (int64_t *)dev->fc_iova;
> > ws->xaq_lmt = dev->xaq_lmt;
> >+ ws->fc_cache_space = dev->fc_cache_space;
> >+ ws->aw_lmt = ws->lmt_base;
> >
> > /* Set get_work timeout for HWS */
> > val = NSEC2USEC(dev->deq_tmo_ns);
> >@@ -624,6 +626,7 @@ cn10k_sso_info_get(struct rte_eventdev *event_dev,
> >
> > dev_info->driver_name = RTE_STR(EVENTDEV_NAME_CN10K_PMD);
> > cnxk_sso_info_get(dev, dev_info);
> >+ dev_info->max_event_port_enqueue_depth = UINT32_MAX;
> > }
> >
> > static int
> >@@ -632,7 +635,7 @@ cn10k_sso_dev_configure(const struct rte_eventdev
> >*event_dev)
> > struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> > int rc;
> >
> >- rc = cnxk_sso_dev_validate(event_dev);
> >+ rc = cnxk_sso_dev_validate(event_dev, 1, UINT32_MAX);
> > if (rc < 0) {
> > plt_err("Invalid event device configuration");
> > return -EINVAL;
> >@@ -871,7 +874,7 @@ cn10k_sso_set_priv_mem(const struct rte_eventdev
> >*event_dev, void *lookup_mem)
> > for (i = 0; i < dev->nb_event_ports; i++) {
> > struct cn10k_sso_hws *ws = event_dev->data->ports[i];
> > ws->xaq_lmt = dev->xaq_lmt;
> >- ws->fc_mem = (uint64_t *)dev->fc_iova;
> >+ ws->fc_mem = (int64_t *)dev->fc_iova;
> > ws->tstamp = dev->tstamp;
> > if (lookup_mem)
> > ws->lookup_mem = lookup_mem;
> >diff --git a/drivers/event/cnxk/cn10k_eventdev.h
> >b/drivers/event/cnxk/cn10k_eventdev.h
> >index aaa01d1ec1..29567728cd 100644
> >--- a/drivers/event/cnxk/cn10k_eventdev.h
> >+++ b/drivers/event/cnxk/cn10k_eventdev.h
> >@@ -19,9 +19,11 @@ struct cn10k_sso_hws {
> > struct cnxk_timesync_info **tstamp;
> > uint64_t meta_aura;
> > /* Add Work Fastpath data */
> >- uint64_t xaq_lmt __rte_cache_aligned;
> >- uint64_t *fc_mem;
> >+ int64_t *fc_mem __rte_cache_aligned;
> >+ int64_t *fc_cache_space;
> >+ uintptr_t aw_lmt;
> > uintptr_t grp_base;
> >+ int32_t xaq_lmt;
> > /* Tx Fastpath data */
> > uintptr_t lmt_base __rte_cache_aligned;
> > uint64_t lso_tun_fmt;
> >diff --git a/drivers/event/cnxk/cn10k_worker.c
> >b/drivers/event/cnxk/cn10k_worker.c
> >index 562d2fca13..1028a12c64 100644
> >--- a/drivers/event/cnxk/cn10k_worker.c
> >+++ b/drivers/event/cnxk/cn10k_worker.c
> >@@ -77,6 +77,36 @@ cn10k_sso_hws_forward_event(struct cn10k_sso_hws *ws,
> > cn10k_sso_hws_fwd_group(ws, ev, grp);
> > }
> >
> >+static inline int32_t
> >+sso_read_xaq_space(struct cn10k_sso_hws *ws)
> >+{
> >+ return (ws->xaq_lmt - __atomic_load_n(ws->fc_mem,
> >__ATOMIC_RELAXED)) *
> >+ ROC_SSO_XAE_PER_XAQ;
> >+}
> >+
> >+static inline void
> >+sso_lmt_aw_wait_fc(struct cn10k_sso_hws *ws, int64_t req)
> >+{
> >+ int64_t cached, refill;
> >+
> >+retry:
> >+ while (__atomic_load_n(ws->fc_cache_space, __ATOMIC_RELAXED) < 0)
> >+ ;
> >+
> >+ cached = __atomic_sub_fetch(ws->fc_cache_space, req,
> >__ATOMIC_ACQUIRE);
> >+ /* Check if there is enough space, else update and retry. */
> >+ if (cached < 0) {
> >+ /* Check if we have space else retry. */
> >+ do {
> >+ refill = sso_read_xaq_space(ws);
> >+ } while (refill <= 0);
> >+ __atomic_compare_exchange(ws->fc_cache_space, &cached,
> >&refill,
> >+ 0, __ATOMIC_RELEASE,
> >+ __ATOMIC_RELAXED);
> >+ goto retry;
> >+ }
> >+}
> >+
> > uint16_t __rte_hot
> > cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
> > {
> >@@ -103,6 +133,253 @@ cn10k_sso_hws_enq(void *port, const struct rte_event
> >*ev)
> > return 1;
> > }
> >
> >+#define VECTOR_SIZE_BITS 0xFFFFFFFFFFF80000ULL
> >+#define VECTOR_GET_LINE_OFFSET(line) (19 + (3 * line))
> >+
> >+static uint64_t
> >+vector_size_partial_mask(uint16_t off, uint16_t cnt)
> >+{
> >+ return (VECTOR_SIZE_BITS & ~(~0x0ULL << off)) |
> >+ ((uint64_t)(cnt - 1) << off);
> >+}
> >+
> >+static __rte_always_inline uint16_t
> >+cn10k_sso_hws_new_event_lmtst(struct cn10k_sso_hws *ws, uint8_t
> >queue_id,
> >+ const struct rte_event ev[], uint16_t n)
> >+{
> >+ uint16_t lines, partial_line, burst, left;
> >+ uint64_t wdata[2], pa[2] = {0};
> >+ uintptr_t lmt_addr;
> >+ uint16_t sz0, sz1;
> >+ uint16_t lmt_id;
> >+
> >+ sz0 = sz1 = 0;
> >+ lmt_addr = ws->lmt_base;
> >+ ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
> >+
> >+ left = n;
> >+again:
> >+ burst = RTE_MIN(
> >+ BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2 +
> >ROC_LMT_LINES_PER_CORE_LOG2),
> >+ left);
> >+
> >+ /* Set wdata */
> >+ lines = burst >> ROC_SSO_AW_PER_LMT_LINE_LOG2;
> >+ partial_line = burst & (BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2) - 1);
> >+ wdata[0] = wdata[1] = 0;
> >+ if (lines > BIT(ROC_LMT_LINES_PER_STR_LOG2)) {
> >+ wdata[0] = lmt_id;
> >+ wdata[0] |= 15ULL << 12;
> >+ wdata[0] |= VECTOR_SIZE_BITS;
> >+ pa[0] = (ws->grp_base + (queue_id << 12) +
> >+ SSO_LF_GGRP_OP_AW_LMTST) |
> >+ (0x7 << 4);
> >+ sz0 = 16 << ROC_SSO_AW_PER_LMT_LINE_LOG2;
> >+
> >+ wdata[1] = lmt_id + 16;
> >+ pa[1] = (ws->grp_base + (queue_id << 12) +
> >+ SSO_LF_GGRP_OP_AW_LMTST) |
> >+ (0x7 << 4);
> >+
> >+ lines -= 17;
> >+ wdata[1] |= partial_line ? (uint64_t)(lines + 1) << 12 :
> >+ (uint64_t)(lines << 12);
> >+ wdata[1] |= partial_line ?
> >+ vector_size_partial_mask(
> >+ VECTOR_GET_LINE_OFFSET(lines),
> >+ partial_line) :
> >+ VECTOR_SIZE_BITS;
> >+ sz1 = burst - sz0;
> >+ partial_line = 0;
> >+ } else if (lines) {
> >+ /* We need to handle two cases here:
> >+ * 1. Partial line spill over to wdata[1] i.e. lines == 16
> >+ * 2. Partial line with spill lines < 16.
> >+ */
> >+ wdata[0] = lmt_id;
> >+ pa[0] = (ws->grp_base + (queue_id << 12) +
> >+ SSO_LF_GGRP_OP_AW_LMTST) |
> >+ (0x7 << 4);
> >+ sz0 = lines << ROC_SSO_AW_PER_LMT_LINE_LOG2;
> >+ if (lines == 16) {
> >+ wdata[0] |= 15ULL << 12;
> >+ wdata[0] |= VECTOR_SIZE_BITS;
> >+ if (partial_line) {
> >+ wdata[1] = lmt_id + 16;
> >+ pa[1] = (ws->grp_base + (queue_id << 12) +
> >+ SSO_LF_GGRP_OP_AW_LMTST) |
> >+ ((partial_line - 1) << 4);
> >+ }
> >+ } else {
> >+ lines -= 1;
> >+ wdata[0] |= partial_line ? (uint64_t)(lines + 1) << 12 :
> >+ (uint64_t)(lines << 12);
> >+ wdata[0] |=
> >+ partial_line ?
> >+ vector_size_partial_mask(
> >+ VECTOR_GET_LINE_OFFSET(lines),
> >+ partial_line) :
> >+ VECTOR_SIZE_BITS;
> >+ sz0 += partial_line;
> >+ }
> >+ sz1 = burst - sz0;
> >+ partial_line = 0;
> >+ }
> >+
> >+ /* Only partial lines */
> >+ if (partial_line) {
> >+ wdata[0] = lmt_id;
> >+ pa[0] = (ws->grp_base + (queue_id << 12) +
> >+ SSO_LF_GGRP_OP_AW_LMTST) |
> >+ ((partial_line - 1) << 4);
> >+ sz0 = partial_line;
> >+ sz1 = burst - sz0;
> >+ }
> >+
> >+#if defined(RTE_ARCH_ARM64)
> >+ uint64x2_t aw_mask = {0xC0FFFFFFFFULL, ~0x0ULL};
> >+ uint64x2_t tt_mask = {0x300000000ULL, 0};
> >+ uint16_t parts;
> >+
> >+ while (burst) {
> >+ parts = burst > 7 ? 8 : plt_align32prevpow2(burst);
> >+ burst -= parts;
> >+ /* Lets try to fill at least one line per burst. */
> >+ switch (parts) {
> >+ case 8: {
> >+ uint64x2_t aw0, aw1, aw2, aw3, aw4, aw5, aw6, aw7;
> >+
> >+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
> >+ aw_mask);
> >+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
> >+ aw_mask);
> >+ aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
> >+ aw_mask);
> >+ aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
> >+ aw_mask);
> >+ aw4 = vandq_u64(vld1q_u64((const uint64_t *)&ev[4]),
> >+ aw_mask);
> >+ aw5 = vandq_u64(vld1q_u64((const uint64_t *)&ev[5]),
> >+ aw_mask);
> >+ aw6 = vandq_u64(vld1q_u64((const uint64_t *)&ev[6]),
> >+ aw_mask);
> >+ aw7 = vandq_u64(vld1q_u64((const uint64_t *)&ev[7]),
> >+ aw_mask);
> >+
> >+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6),
> >tt_mask),
> >+ aw0);
> >+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6),
> >tt_mask),
> >+ aw1);
> >+ aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6),
> >tt_mask),
> >+ aw2);
> >+ aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6),
> >tt_mask),
> >+ aw3);
> >+ aw4 = vorrq_u64(vandq_u64(vshrq_n_u64(aw4, 6),
> >tt_mask),
> >+ aw4);
> >+ aw5 = vorrq_u64(vandq_u64(vshrq_n_u64(aw5, 6),
> >tt_mask),
> >+ aw5);
> >+ aw6 = vorrq_u64(vandq_u64(vshrq_n_u64(aw6, 6),
> >tt_mask),
> >+ aw6);
> >+ aw7 = vorrq_u64(vandq_u64(vshrq_n_u64(aw7, 6),
> >tt_mask),
> >+ aw7);
> >+
> >+ vst1q_u64((void *)lmt_addr, aw0);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 64), aw4);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 80), aw5);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 96), aw6);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 112), aw7);
> >+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 128);
> >+ } break;
> >+ case 4: {
> >+ uint64x2_t aw0, aw1, aw2, aw3;
> >+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
> >+ aw_mask);
> >+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
> >+ aw_mask);
> >+ aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
> >+ aw_mask);
> >+ aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
> >+ aw_mask);
> >+
> >+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6),
> >tt_mask),
> >+ aw0);
> >+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6),
> >tt_mask),
> >+ aw1);
> >+ aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6),
> >tt_mask),
> >+ aw2);
> >+ aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6),
> >tt_mask),
> >+ aw3);
> >+
> >+ vst1q_u64((void *)lmt_addr, aw0);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
> >+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 64);
> >+ } break;
> >+ case 2: {
> >+ uint64x2_t aw0, aw1;
> >+
> >+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
> >+ aw_mask);
> >+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
> >+ aw_mask);
> >+
> >+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6),
> >tt_mask),
> >+ aw0);
> >+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6),
> >tt_mask),
> >+ aw1);
> >+
> >+ vst1q_u64((void *)lmt_addr, aw0);
> >+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
> >+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 32);
> >+ } break;
> >+ case 1: {
> >+ __uint128_t aw0;
> >+
> >+ aw0 = ev[0].u64;
> >+ aw0 <<= 64;
> >+ aw0 |= ev[0].event & (BIT_ULL(32) - 1);
> >+ aw0 |= (uint64_t)ev[0].sched_type << 32;
> >+
> >+ *((__uint128_t *)lmt_addr) = aw0;
> >+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
> >+ } break;
> >+ }
> >+ ev += parts;
> >+ }
> >+#else
> >+ uint16_t i;
> >+
> >+ for (i = 0; i < burst; i++) {
> >+ __uint128_t aw0;
> >+
> >+ aw0 = ev[0].u64;
> >+ aw0 <<= 64;
> >+ aw0 |= ev[0].event & (BIT_ULL(32) - 1);
> >+ aw0 |= (uint64_t)ev[0].sched_type << 32;
> >+ *((__uint128_t *)lmt_addr) = aw0;
> >+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
> >+ }
> >+#endif
> >+
> >+ /* wdata[0] will be always valid */
> >+ sso_lmt_aw_wait_fc(ws, sz0);
> >+ roc_lmt_submit_steorl(wdata[0], pa[0]);
> >+ if (wdata[1]) {
> >+ sso_lmt_aw_wait_fc(ws, sz1);
> >+ roc_lmt_submit_steorl(wdata[1], pa[1]);
> >+ }
> >+
> >+ left -= (sz0 + sz1);
> >+ if (left)
> >+ goto again;
> >+
> >+ return n;
> >+}
> >+
> > uint16_t __rte_hot
> > cn10k_sso_hws_enq_burst(void *port, const struct rte_event ev[],
> > uint16_t nb_events)
> >@@ -115,13 +392,32 @@ uint16_t __rte_hot
> > cn10k_sso_hws_enq_new_burst(void *port, const struct rte_event ev[],
> > uint16_t nb_events)
> > {
> >+ uint16_t idx = 0, done = 0, rc = 0;
> > struct cn10k_sso_hws *ws = port;
> >- uint16_t i, rc = 1;
> >+ uint8_t queue_id;
> >+ int32_t space;
> >+
> >+ /* Do a common back-pressure check and return */
> >+ space = sso_read_xaq_space(ws) - ROC_SSO_XAE_PER_XAQ;
> >+ if (space <= 0)
> >+ return 0;
> >+ nb_events = space < nb_events ? space : nb_events;
> >+
> >+ do {
> >+ queue_id = ev[idx].queue_id;
> >+ for (idx = idx + 1; idx < nb_events; idx++)
> >+ if (queue_id != ev[idx].queue_id)
> >+ break;
> >+
> >+ rc = cn10k_sso_hws_new_event_lmtst(ws, queue_id, &ev[done],
> >+ idx - done);
> >+ if (rc != (idx - done))
> >+ return rc + done;
> >+ done += rc;
> >
> >- for (i = 0; i < nb_events && rc; i++)
> >- rc = cn10k_sso_hws_new_event(ws, &ev[i]);
> >+ } while (done < nb_events);
> >
> >- return nb_events;
> >+ return done;
> > }
> >
> > uint16_t __rte_hot
> >diff --git a/drivers/event/cnxk/cn9k_eventdev.c
> >b/drivers/event/cnxk/cn9k_eventdev.c
> >index 7e8339bd3a..e59e537311 100644
> >--- a/drivers/event/cnxk/cn9k_eventdev.c
> >+++ b/drivers/event/cnxk/cn9k_eventdev.c
> >@@ -753,7 +753,7 @@ cn9k_sso_dev_configure(const struct rte_eventdev
> >*event_dev)
> > struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> > int rc;
> >
> >- rc = cnxk_sso_dev_validate(event_dev);
> >+ rc = cnxk_sso_dev_validate(event_dev, 1, 1);
> > if (rc < 0) {
> > plt_err("Invalid event device configuration");
> > return -EINVAL;
> >diff --git a/drivers/event/cnxk/cnxk_eventdev.c
> >b/drivers/event/cnxk/cnxk_eventdev.c
> >index cb9ba5d353..99f9cdcd0d 100644
> >--- a/drivers/event/cnxk/cnxk_eventdev.c
> >+++ b/drivers/event/cnxk/cnxk_eventdev.c
> >@@ -145,7 +145,8 @@ cnxk_sso_restore_links(const struct rte_eventdev
> >*event_dev,
> > }
> >
> > int
> >-cnxk_sso_dev_validate(const struct rte_eventdev *event_dev)
> >+cnxk_sso_dev_validate(const struct rte_eventdev *event_dev, uint32_t
> >deq_depth,
> >+ uint32_t enq_depth)
> > {
> > struct rte_event_dev_config *conf = &event_dev->data->dev_conf;
> > struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> >@@ -173,12 +174,12 @@ cnxk_sso_dev_validate(const struct rte_eventdev
> >*event_dev)
> > return -EINVAL;
> > }
> >
> >- if (conf->nb_event_port_dequeue_depth > 1) {
> >+ if (conf->nb_event_port_dequeue_depth > deq_depth) {
> > plt_err("Unsupported event port deq depth requested");
> > return -EINVAL;
> > }
> >
> >- if (conf->nb_event_port_enqueue_depth > 1) {
> >+ if (conf->nb_event_port_enqueue_depth > enq_depth) {
> > plt_err("Unsupported event port enq depth requested");
> > return -EINVAL;
> > }
> >@@ -630,6 +631,14 @@ cnxk_sso_init(struct rte_eventdev *event_dev)
> > }
> >
> > dev = cnxk_sso_pmd_priv(event_dev);
> >+ dev->fc_cache_space = rte_zmalloc("fc_cache", PLT_CACHE_LINE_SIZE,
> >+ PLT_CACHE_LINE_SIZE);
> >+ if (dev->fc_cache_space == NULL) {
> >+ plt_memzone_free(mz);
> >+ plt_err("Failed to reserve memory for XAQ fc cache");
> >+ return -ENOMEM;
> >+ }
> >+
> > pci_dev = container_of(event_dev->dev, struct rte_pci_device, device);
> > dev->sso.pci_dev = pci_dev;
> >
> >diff --git a/drivers/event/cnxk/cnxk_eventdev.h
> >b/drivers/event/cnxk/cnxk_eventdev.h
> >index c7cbd722ab..a2f30bfe5f 100644
> >--- a/drivers/event/cnxk/cnxk_eventdev.h
> >+++ b/drivers/event/cnxk/cnxk_eventdev.h
> >@@ -90,6 +90,7 @@ struct cnxk_sso_evdev {
> > uint32_t max_dequeue_timeout_ns;
> > int32_t max_num_events;
> > uint64_t xaq_lmt;
> >+ int64_t *fc_cache_space;
> > rte_iova_t fc_iova;
> > uint64_t rx_offloads;
> > uint64_t tx_offloads;
> >@@ -206,7 +207,8 @@ int cnxk_sso_fini(struct rte_eventdev *event_dev);
> > int cnxk_sso_remove(struct rte_pci_device *pci_dev);
> > void cnxk_sso_info_get(struct cnxk_sso_evdev *dev,
> > struct rte_event_dev_info *dev_info);
> >-int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev);
> >+int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev,
> >+ uint32_t deq_depth, uint32_t enq_depth);
> > int cnxk_setup_event_ports(const struct rte_eventdev *event_dev,
> > cnxk_sso_init_hws_mem_t init_hws_mem,
> > cnxk_sso_hws_setup_t hws_setup);
> >--
> >2.25.1
>
* [PATCH v3] event/cnxk: use LMTST for enqueue new burst
2023-04-25 19:51 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
` (2 preceding siblings ...)
2023-05-18 15:42 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst Shijith Thotton
@ 2023-05-22 11:56 ` pbhagavatula
2023-05-23 6:09 ` Jerin Jacob
3 siblings, 1 reply; 12+ messages in thread
From: pbhagavatula @ 2023-05-22 11:56 UTC (permalink / raw)
To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
Satha Rao, Pavan Nikhilesh, Shijith Thotton
Cc: dev
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Use LMTST when all events in the burst are enqueued with
rte_event::op as RTE_EVENT_OP_NEW, i.e. events are enqueued
with the `rte_event_enqueue_new_burst` API.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Acked-by: Shijith Thotton <sthotton@marvell.com>
---
v3 Changes:
- Fix checkpatch issues.
v2 Changes:
- Fix spell check.
drivers/common/cnxk/hw/sso.h | 1 +
drivers/common/cnxk/roc_sso.c | 10 +-
drivers/common/cnxk/roc_sso.h | 3 +
drivers/event/cnxk/cn10k_eventdev.c | 9 +-
drivers/event/cnxk/cn10k_eventdev.h | 6 +-
drivers/event/cnxk/cn10k_worker.c | 304 +++++++++++++++++++++++++++-
drivers/event/cnxk/cn9k_eventdev.c | 2 +-
drivers/event/cnxk/cnxk_eventdev.c | 15 +-
drivers/event/cnxk/cnxk_eventdev.h | 4 +-
9 files changed, 338 insertions(+), 16 deletions(-)
diff --git a/drivers/common/cnxk/hw/sso.h b/drivers/common/cnxk/hw/sso.h
index 25deaa4c14..09b8d4955f 100644
--- a/drivers/common/cnxk/hw/sso.h
+++ b/drivers/common/cnxk/hw/sso.h
@@ -157,6 +157,7 @@
#define SSO_LF_GGRP_AQ_CNT (0x1c0ull)
#define SSO_LF_GGRP_AQ_THR (0x1e0ull)
#define SSO_LF_GGRP_MISC_CNT (0x200ull)
+#define SSO_LF_GGRP_OP_AW_LMTST (0x400ull)
#define SSO_AF_IAQ_FREE_CNT_MASK 0x3FFFull
#define SSO_AF_IAQ_RSVD_FREE_MASK 0x3FFFull
diff --git a/drivers/common/cnxk/roc_sso.c b/drivers/common/cnxk/roc_sso.c
index 4a6a5080f7..99a55e49b0 100644
--- a/drivers/common/cnxk/roc_sso.c
+++ b/drivers/common/cnxk/roc_sso.c
@@ -6,6 +6,7 @@
#include "roc_priv.h"
#define SSO_XAQ_CACHE_CNT (0x7)
+#define SSO_XAQ_SLACK (16)
/* Private functions. */
int
@@ -493,9 +494,13 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct roc_sso_xaq_data *xaq,
xaq->nb_xae = nb_xae;
- /* Taken from HRM 14.3.3(4) */
+ /** SSO will reserve up to 0x4 XAQ buffers per group when GetWork engine
+ * is inactive and it might prefetch an additional 0x3 buffers due to
+ * pipelining.
+ */
xaq->nb_xaq = (SSO_XAQ_CACHE_CNT * nb_hwgrp);
xaq->nb_xaq += PLT_MAX(1 + ((xaq->nb_xae - 1) / xae_waes), xaq->nb_xaq);
+ xaq->nb_xaq += SSO_XAQ_SLACK;
xaq->mem = plt_zmalloc(xaq_buf_size * xaq->nb_xaq, xaq_buf_size);
if (xaq->mem == NULL) {
@@ -537,7 +542,8 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct roc_sso_xaq_data *xaq,
* There should be a minimum headroom of 7 XAQs per HWGRP for SSO
* to request XAQ to cache them even before enqueue is called.
*/
- xaq->xaq_lmt = xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT);
+ xaq->xaq_lmt =
+ xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT) - SSO_XAQ_SLACK;
return 0;
npa_fill_fail:
diff --git a/drivers/common/cnxk/roc_sso.h b/drivers/common/cnxk/roc_sso.h
index e67797b046..a2bb6fcb22 100644
--- a/drivers/common/cnxk/roc_sso.h
+++ b/drivers/common/cnxk/roc_sso.h
@@ -7,6 +7,9 @@
#include "hw/ssow.h"
+#define ROC_SSO_AW_PER_LMT_LINE_LOG2 3
+#define ROC_SSO_XAE_PER_XAQ 352
+
struct roc_sso_hwgrp_qos {
uint16_t hwgrp;
uint8_t xaq_prcnt;
diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index 071ea5a212..855c92da83 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -91,8 +91,10 @@ cn10k_sso_hws_setup(void *arg, void *hws, uintptr_t grp_base)
uint64_t val;
ws->grp_base = grp_base;
- ws->fc_mem = (uint64_t *)dev->fc_iova;
+ ws->fc_mem = (int64_t *)dev->fc_iova;
ws->xaq_lmt = dev->xaq_lmt;
+ ws->fc_cache_space = dev->fc_cache_space;
+ ws->aw_lmt = ws->lmt_base;
/* Set get_work timeout for HWS */
val = NSEC2USEC(dev->deq_tmo_ns);
@@ -624,6 +626,7 @@ cn10k_sso_info_get(struct rte_eventdev *event_dev,
dev_info->driver_name = RTE_STR(EVENTDEV_NAME_CN10K_PMD);
cnxk_sso_info_get(dev, dev_info);
+ dev_info->max_event_port_enqueue_depth = UINT32_MAX;
}
static int
@@ -632,7 +635,7 @@ cn10k_sso_dev_configure(const struct rte_eventdev *event_dev)
struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
int rc;
- rc = cnxk_sso_dev_validate(event_dev);
+ rc = cnxk_sso_dev_validate(event_dev, 1, UINT32_MAX);
if (rc < 0) {
plt_err("Invalid event device configuration");
return -EINVAL;
@@ -871,7 +874,7 @@ cn10k_sso_set_priv_mem(const struct rte_eventdev *event_dev, void *lookup_mem)
for (i = 0; i < dev->nb_event_ports; i++) {
struct cn10k_sso_hws *ws = event_dev->data->ports[i];
ws->xaq_lmt = dev->xaq_lmt;
- ws->fc_mem = (uint64_t *)dev->fc_iova;
+ ws->fc_mem = (int64_t *)dev->fc_iova;
ws->tstamp = dev->tstamp;
if (lookup_mem)
ws->lookup_mem = lookup_mem;
diff --git a/drivers/event/cnxk/cn10k_eventdev.h b/drivers/event/cnxk/cn10k_eventdev.h
index aaa01d1ec1..29567728cd 100644
--- a/drivers/event/cnxk/cn10k_eventdev.h
+++ b/drivers/event/cnxk/cn10k_eventdev.h
@@ -19,9 +19,11 @@ struct cn10k_sso_hws {
struct cnxk_timesync_info **tstamp;
uint64_t meta_aura;
/* Add Work Fastpath data */
- uint64_t xaq_lmt __rte_cache_aligned;
- uint64_t *fc_mem;
+ int64_t *fc_mem __rte_cache_aligned;
+ int64_t *fc_cache_space;
+ uintptr_t aw_lmt;
uintptr_t grp_base;
+ int32_t xaq_lmt;
/* Tx Fastpath data */
uintptr_t lmt_base __rte_cache_aligned;
uint64_t lso_tun_fmt;
diff --git a/drivers/event/cnxk/cn10k_worker.c b/drivers/event/cnxk/cn10k_worker.c
index 562d2fca13..9b5bf90159 100644
--- a/drivers/event/cnxk/cn10k_worker.c
+++ b/drivers/event/cnxk/cn10k_worker.c
@@ -77,6 +77,36 @@ cn10k_sso_hws_forward_event(struct cn10k_sso_hws *ws,
cn10k_sso_hws_fwd_group(ws, ev, grp);
}
+static inline int32_t
+sso_read_xaq_space(struct cn10k_sso_hws *ws)
+{
+ return (ws->xaq_lmt - __atomic_load_n(ws->fc_mem, __ATOMIC_RELAXED)) *
+ ROC_SSO_XAE_PER_XAQ;
+}
+
+static inline void
+sso_lmt_aw_wait_fc(struct cn10k_sso_hws *ws, int64_t req)
+{
+ int64_t cached, refill;
+
+retry:
+ while (__atomic_load_n(ws->fc_cache_space, __ATOMIC_RELAXED) < 0)
+ ;
+
+ cached = __atomic_fetch_sub(ws->fc_cache_space, req, __ATOMIC_ACQUIRE) - req;
+ /* Check if there is enough space, else update and retry. */
+ if (cached < 0) {
+ /* Check if we have space else retry. */
+ do {
+ refill = sso_read_xaq_space(ws);
+ } while (refill <= 0);
+ __atomic_compare_exchange(ws->fc_cache_space, &cached, &refill,
+ 0, __ATOMIC_RELEASE,
+ __ATOMIC_RELAXED);
+ goto retry;
+ }
+}
+
uint16_t __rte_hot
cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
{
@@ -103,6 +133,253 @@ cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
return 1;
}
+#define VECTOR_SIZE_BITS 0xFFFFFFFFFFF80000ULL
+#define VECTOR_GET_LINE_OFFSET(line) (19 + (3 * line))
+
+static uint64_t
+vector_size_partial_mask(uint16_t off, uint16_t cnt)
+{
+ return (VECTOR_SIZE_BITS & ~(~0x0ULL << off)) |
+ ((uint64_t)(cnt - 1) << off);
+}
+
+static __rte_always_inline uint16_t
+cn10k_sso_hws_new_event_lmtst(struct cn10k_sso_hws *ws, uint8_t queue_id,
+ const struct rte_event ev[], uint16_t n)
+{
+ uint16_t lines, partial_line, burst, left;
+ uint64_t wdata[2], pa[2] = {0};
+ uintptr_t lmt_addr;
+ uint16_t sz0, sz1;
+ uint16_t lmt_id;
+
+ sz0 = sz1 = 0;
+ lmt_addr = ws->lmt_base;
+ ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
+
+ left = n;
+again:
+ burst = RTE_MIN(
+ BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2 + ROC_LMT_LINES_PER_CORE_LOG2),
+ left);
+
+ /* Set wdata */
+ lines = burst >> ROC_SSO_AW_PER_LMT_LINE_LOG2;
+ partial_line = burst & (BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2) - 1);
+ wdata[0] = wdata[1] = 0;
+ if (lines > BIT(ROC_LMT_LINES_PER_STR_LOG2)) {
+ wdata[0] = lmt_id;
+ wdata[0] |= 15ULL << 12;
+ wdata[0] |= VECTOR_SIZE_BITS;
+ pa[0] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ (0x7 << 4);
+ sz0 = 16 << ROC_SSO_AW_PER_LMT_LINE_LOG2;
+
+ wdata[1] = lmt_id + 16;
+ pa[1] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ (0x7 << 4);
+
+ lines -= 17;
+ wdata[1] |= partial_line ? (uint64_t)(lines + 1) << 12 :
+ (uint64_t)(lines << 12);
+ wdata[1] |= partial_line ?
+ vector_size_partial_mask(
+ VECTOR_GET_LINE_OFFSET(lines),
+ partial_line) :
+ VECTOR_SIZE_BITS;
+ sz1 = burst - sz0;
+ partial_line = 0;
+ } else if (lines) {
+ /* We need to handle two cases here:
+ * 1. Partial line spill over to wdata[1] i.e. lines == 16
+ * 2. Partial line with spill lines < 16.
+ */
+ wdata[0] = lmt_id;
+ pa[0] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ (0x7 << 4);
+ sz0 = lines << ROC_SSO_AW_PER_LMT_LINE_LOG2;
+ if (lines == 16) {
+ wdata[0] |= 15ULL << 12;
+ wdata[0] |= VECTOR_SIZE_BITS;
+ if (partial_line) {
+ wdata[1] = lmt_id + 16;
+ pa[1] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ ((partial_line - 1) << 4);
+ }
+ } else {
+ lines -= 1;
+ wdata[0] |= partial_line ? (uint64_t)(lines + 1) << 12 :
+ (uint64_t)(lines << 12);
+ wdata[0] |=
+ partial_line ?
+ vector_size_partial_mask(
+ VECTOR_GET_LINE_OFFSET(lines),
+ partial_line) :
+ VECTOR_SIZE_BITS;
+ sz0 += partial_line;
+ }
+ sz1 = burst - sz0;
+ partial_line = 0;
+ }
+
+ /* Only partial lines */
+ if (partial_line) {
+ wdata[0] = lmt_id;
+ pa[0] = (ws->grp_base + (queue_id << 12) +
+ SSO_LF_GGRP_OP_AW_LMTST) |
+ ((partial_line - 1) << 4);
+ sz0 = partial_line;
+ sz1 = burst - sz0;
+ }
+
+#if defined(RTE_ARCH_ARM64)
+ uint64x2_t aw_mask = {0xC0FFFFFFFFULL, ~0x0ULL};
+ uint64x2_t tt_mask = {0x300000000ULL, 0};
+ uint16_t parts;
+
+ while (burst) {
+ parts = burst > 7 ? 8 : plt_align32prevpow2(burst);
+ burst -= parts;
+ /* Lets try to fill at least one line per burst. */
+ switch (parts) {
+ case 8: {
+ uint64x2_t aw0, aw1, aw2, aw3, aw4, aw5, aw6, aw7;
+
+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
+ aw_mask);
+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
+ aw_mask);
+ aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
+ aw_mask);
+ aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
+ aw_mask);
+ aw4 = vandq_u64(vld1q_u64((const uint64_t *)&ev[4]),
+ aw_mask);
+ aw5 = vandq_u64(vld1q_u64((const uint64_t *)&ev[5]),
+ aw_mask);
+ aw6 = vandq_u64(vld1q_u64((const uint64_t *)&ev[6]),
+ aw_mask);
+ aw7 = vandq_u64(vld1q_u64((const uint64_t *)&ev[7]),
+ aw_mask);
+
+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
+ aw0);
+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
+ aw1);
+ aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6), tt_mask),
+ aw2);
+ aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6), tt_mask),
+ aw3);
+ aw4 = vorrq_u64(vandq_u64(vshrq_n_u64(aw4, 6), tt_mask),
+ aw4);
+ aw5 = vorrq_u64(vandq_u64(vshrq_n_u64(aw5, 6), tt_mask),
+ aw5);
+ aw6 = vorrq_u64(vandq_u64(vshrq_n_u64(aw6, 6), tt_mask),
+ aw6);
+ aw7 = vorrq_u64(vandq_u64(vshrq_n_u64(aw7, 6), tt_mask),
+ aw7);
+
+ vst1q_u64((void *)lmt_addr, aw0);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 64), aw4);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 80), aw5);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 96), aw6);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 112), aw7);
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 128);
+ } break;
+ case 4: {
+ uint64x2_t aw0, aw1, aw2, aw3;
+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
+ aw_mask);
+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
+ aw_mask);
+ aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
+ aw_mask);
+ aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
+ aw_mask);
+
+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
+ aw0);
+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
+ aw1);
+ aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6), tt_mask),
+ aw2);
+ aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6), tt_mask),
+ aw3);
+
+ vst1q_u64((void *)lmt_addr, aw0);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 64);
+ } break;
+ case 2: {
+ uint64x2_t aw0, aw1;
+
+ aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
+ aw_mask);
+ aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
+ aw_mask);
+
+ aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
+ aw0);
+ aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
+ aw1);
+
+ vst1q_u64((void *)lmt_addr, aw0);
+ vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 32);
+ } break;
+ case 1: {
+ __uint128_t aw0;
+
+ aw0 = ev[0].u64;
+ aw0 <<= 64;
+ aw0 |= ev[0].event & (BIT_ULL(32) - 1);
+ aw0 |= (uint64_t)ev[0].sched_type << 32;
+
+ *((__uint128_t *)lmt_addr) = aw0;
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
+ } break;
+ }
+ ev += parts;
+ }
+#else
+ uint16_t i;
+
+ for (i = 0; i < burst; i++) {
+ __uint128_t aw0;
+
+ aw0 = ev[0].u64;
+ aw0 <<= 64;
+ aw0 |= ev[0].event & (BIT_ULL(32) - 1);
+ aw0 |= (uint64_t)ev[0].sched_type << 32;
+ *((__uint128_t *)lmt_addr) = aw0;
+ lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
+ }
+#endif
+
+ /* wdata[0] will be always valid */
+ sso_lmt_aw_wait_fc(ws, sz0);
+ roc_lmt_submit_steorl(wdata[0], pa[0]);
+ if (wdata[1]) {
+ sso_lmt_aw_wait_fc(ws, sz1);
+ roc_lmt_submit_steorl(wdata[1], pa[1]);
+ }
+
+ left -= (sz0 + sz1);
+ if (left)
+ goto again;
+
+ return n;
+}
+
uint16_t __rte_hot
cn10k_sso_hws_enq_burst(void *port, const struct rte_event ev[],
uint16_t nb_events)
@@ -115,13 +392,32 @@ uint16_t __rte_hot
cn10k_sso_hws_enq_new_burst(void *port, const struct rte_event ev[],
uint16_t nb_events)
{
+ uint16_t idx = 0, done = 0, rc = 0;
struct cn10k_sso_hws *ws = port;
- uint16_t i, rc = 1;
+ uint8_t queue_id;
+ int32_t space;
+
+ /* Do a common back-pressure check and return */
+ space = sso_read_xaq_space(ws) - ROC_SSO_XAE_PER_XAQ;
+ if (space <= 0)
+ return 0;
+ nb_events = space < nb_events ? space : nb_events;
+
+ do {
+ queue_id = ev[idx].queue_id;
+ for (idx = idx + 1; idx < nb_events; idx++)
+ if (queue_id != ev[idx].queue_id)
+ break;
+
+ rc = cn10k_sso_hws_new_event_lmtst(ws, queue_id, &ev[done],
+ idx - done);
+ if (rc != (idx - done))
+ return rc + done;
+ done += rc;
- for (i = 0; i < nb_events && rc; i++)
- rc = cn10k_sso_hws_new_event(ws, &ev[i]);
+ } while (done < nb_events);
- return nb_events;
+ return done;
}
uint16_t __rte_hot
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index 7e8339bd3a..e59e537311 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -753,7 +753,7 @@ cn9k_sso_dev_configure(const struct rte_eventdev *event_dev)
struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
int rc;
- rc = cnxk_sso_dev_validate(event_dev);
+ rc = cnxk_sso_dev_validate(event_dev, 1, 1);
if (rc < 0) {
plt_err("Invalid event device configuration");
return -EINVAL;
diff --git a/drivers/event/cnxk/cnxk_eventdev.c b/drivers/event/cnxk/cnxk_eventdev.c
index cb9ba5d353..99f9cdcd0d 100644
--- a/drivers/event/cnxk/cnxk_eventdev.c
+++ b/drivers/event/cnxk/cnxk_eventdev.c
@@ -145,7 +145,8 @@ cnxk_sso_restore_links(const struct rte_eventdev *event_dev,
}
int
-cnxk_sso_dev_validate(const struct rte_eventdev *event_dev)
+cnxk_sso_dev_validate(const struct rte_eventdev *event_dev, uint32_t deq_depth,
+ uint32_t enq_depth)
{
struct rte_event_dev_config *conf = &event_dev->data->dev_conf;
struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
@@ -173,12 +174,12 @@ cnxk_sso_dev_validate(const struct rte_eventdev *event_dev)
return -EINVAL;
}
- if (conf->nb_event_port_dequeue_depth > 1) {
+ if (conf->nb_event_port_dequeue_depth > deq_depth) {
plt_err("Unsupported event port deq depth requested");
return -EINVAL;
}
- if (conf->nb_event_port_enqueue_depth > 1) {
+ if (conf->nb_event_port_enqueue_depth > enq_depth) {
plt_err("Unsupported event port enq depth requested");
return -EINVAL;
}
@@ -630,6 +631,14 @@ cnxk_sso_init(struct rte_eventdev *event_dev)
}
dev = cnxk_sso_pmd_priv(event_dev);
+ dev->fc_cache_space = rte_zmalloc("fc_cache", PLT_CACHE_LINE_SIZE,
+ PLT_CACHE_LINE_SIZE);
+ if (dev->fc_cache_space == NULL) {
+ plt_memzone_free(mz);
+ plt_err("Failed to reserve memory for XAQ fc cache");
+ return -ENOMEM;
+ }
+
pci_dev = container_of(event_dev->dev, struct rte_pci_device, device);
dev->sso.pci_dev = pci_dev;
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index c7cbd722ab..a2f30bfe5f 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -90,6 +90,7 @@ struct cnxk_sso_evdev {
uint32_t max_dequeue_timeout_ns;
int32_t max_num_events;
uint64_t xaq_lmt;
+ int64_t *fc_cache_space;
rte_iova_t fc_iova;
uint64_t rx_offloads;
uint64_t tx_offloads;
@@ -206,7 +207,8 @@ int cnxk_sso_fini(struct rte_eventdev *event_dev);
int cnxk_sso_remove(struct rte_pci_device *pci_dev);
void cnxk_sso_info_get(struct cnxk_sso_evdev *dev,
struct rte_event_dev_info *dev_info);
-int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev);
+int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev,
+ uint32_t deq_depth, uint32_t enq_depth);
int cnxk_setup_event_ports(const struct rte_eventdev *event_dev,
cnxk_sso_init_hws_mem_t init_hws_mem,
cnxk_sso_hws_setup_t hws_setup);
--
2.25.1
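
The enqueue path added above combines two ideas: events destined for the same queue are grouped so that each group can be pushed through a single LMTST submission, and XAQ back-pressure is tracked through a per-process cached credit counter that is refilled from the hardware-visible counter only when the cache underflows. The standalone C sketch below illustrates both mechanisms outside the driver; struct fake_event, read_hw_space(), submit_chunk() and the xaq_lmt value are invented stand-ins for this example and are not part of the cnxk code.

/* Minimal sketch of the burst-splitting and cached flow-control logic,
 * loosely mirroring cn10k_sso_hws_enq_new_burst() and sso_lmt_aw_wait_fc().
 * All names and constants here are illustrative, not driver definitions.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct fake_event {
	uint8_t queue_id;
	uint64_t data;
};

static _Atomic int64_t hw_in_flight; /* simulated HW in-flight counter */
static _Atomic int64_t fc_cache;     /* cached credits shared by workers */
static const int64_t xaq_lmt = 64;   /* simulated XAQ limit */

static int64_t read_hw_space(void)
{
	return xaq_lmt - atomic_load_explicit(&hw_in_flight, memory_order_relaxed);
}

static void wait_for_credits(int64_t req)
{
	int64_t cached, refill;

	for (;;) {
		/* Optimistically consume credits from the local cache. */
		cached = atomic_fetch_sub_explicit(&fc_cache, req,
						   memory_order_acquire) - req;
		if (cached >= 0)
			return;
		/* Cache underflowed: poll the (simulated) HW counter until
		 * space shows up, publish the fresh value, then retry.
		 */
		do {
			refill = read_hw_space();
		} while (refill <= 0);
		atomic_compare_exchange_strong(&fc_cache, &cached, refill);
	}
}

/* Stand-in for one per-queue LMTST submission. */
static uint16_t submit_chunk(uint8_t queue_id, const struct fake_event *ev,
			     uint16_t n)
{
	(void)ev;
	wait_for_credits(n);
	atomic_fetch_add_explicit(&hw_in_flight, n, memory_order_relaxed);
	printf("queue %u: submitted %u events\n", (unsigned)queue_id, (unsigned)n);
	return n;
}

static uint16_t enqueue_new_burst(const struct fake_event ev[],
				  uint16_t nb_events)
{
	uint16_t idx = 0, done = 0, rc;
	uint8_t queue_id;

	while (done < nb_events) {
		/* Find the run of consecutive events for the same queue. */
		queue_id = ev[idx].queue_id;
		for (idx = idx + 1; idx < nb_events; idx++)
			if (ev[idx].queue_id != queue_id)
				break;
		rc = submit_chunk(queue_id, &ev[done], idx - done);
		if (rc != idx - done)
			return done + rc;
		done += rc;
	}
	return done;
}

int main(void)
{
	struct fake_event ev[] = { {0, 1}, {0, 2}, {0, 3}, {1, 4}, {2, 5} };

	printf("enqueued %u/5 events\n", (unsigned)enqueue_new_burst(ev, 5));
	return 0;
}

Caching the credits this way keeps the hot path on a process-local counter; the hardware-visible counter behind fc_mem is re-read only when the cached value underflows, which is what lets many workers push RTE_EVENT_OP_NEW bursts without serializing on the XAQ threshold check.
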
* Re: [PATCH v3] event/cnxk: use LMTST for enqueue new burst
2023-05-22 11:56 ` [PATCH v3] " pbhagavatula
@ 2023-05-23 6:09 ` Jerin Jacob
0 siblings, 0 replies; 12+ messages in thread
From: Jerin Jacob @ 2023-05-23 6:09 UTC (permalink / raw)
To: pbhagavatula
Cc: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
Satha Rao, Shijith Thotton, dev
On Mon, May 22, 2023 at 5:26 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Use LMTST when all events in the burst are enqueued with
> rte_event::op as RTE_EVENT_OP_NEW, i.e. events are enqueued
> with the `rte_event_enqueue_new_burst` API.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> Acked-by: Shijith Thotton <sthotton@marvell.com>
Applied to dpdk-next-net-eventdev/for-main. Thanks
> ---
> v3 Changes:
> - Fix checkpatch issues.
> v2 Changes:
> - Fix spell check.
>
> drivers/common/cnxk/hw/sso.h | 1 +
> drivers/common/cnxk/roc_sso.c | 10 +-
> drivers/common/cnxk/roc_sso.h | 3 +
> drivers/event/cnxk/cn10k_eventdev.c | 9 +-
> drivers/event/cnxk/cn10k_eventdev.h | 6 +-
> drivers/event/cnxk/cn10k_worker.c | 304 +++++++++++++++++++++++++++-
> drivers/event/cnxk/cn9k_eventdev.c | 2 +-
> drivers/event/cnxk/cnxk_eventdev.c | 15 +-
> drivers/event/cnxk/cnxk_eventdev.h | 4 +-
> 9 files changed, 338 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/common/cnxk/hw/sso.h b/drivers/common/cnxk/hw/sso.h
> index 25deaa4c14..09b8d4955f 100644
> --- a/drivers/common/cnxk/hw/sso.h
> +++ b/drivers/common/cnxk/hw/sso.h
> @@ -157,6 +157,7 @@
> #define SSO_LF_GGRP_AQ_CNT (0x1c0ull)
> #define SSO_LF_GGRP_AQ_THR (0x1e0ull)
> #define SSO_LF_GGRP_MISC_CNT (0x200ull)
> +#define SSO_LF_GGRP_OP_AW_LMTST (0x400ull)
>
> #define SSO_AF_IAQ_FREE_CNT_MASK 0x3FFFull
> #define SSO_AF_IAQ_RSVD_FREE_MASK 0x3FFFull
> diff --git a/drivers/common/cnxk/roc_sso.c b/drivers/common/cnxk/roc_sso.c
> index 4a6a5080f7..99a55e49b0 100644
> --- a/drivers/common/cnxk/roc_sso.c
> +++ b/drivers/common/cnxk/roc_sso.c
> @@ -6,6 +6,7 @@
> #include "roc_priv.h"
>
> #define SSO_XAQ_CACHE_CNT (0x7)
> +#define SSO_XAQ_SLACK (16)
>
> /* Private functions. */
> int
> @@ -493,9 +494,13 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct roc_sso_xaq_data *xaq,
>
> xaq->nb_xae = nb_xae;
>
> - /* Taken from HRM 14.3.3(4) */
> + /** SSO will reserve up to 0x4 XAQ buffers per group when GetWork engine
> + * is inactive and it might prefetch an additional 0x3 buffers due to
> + * pipelining.
> + */
> xaq->nb_xaq = (SSO_XAQ_CACHE_CNT * nb_hwgrp);
> xaq->nb_xaq += PLT_MAX(1 + ((xaq->nb_xae - 1) / xae_waes), xaq->nb_xaq);
> + xaq->nb_xaq += SSO_XAQ_SLACK;
>
> xaq->mem = plt_zmalloc(xaq_buf_size * xaq->nb_xaq, xaq_buf_size);
> if (xaq->mem == NULL) {
> @@ -537,7 +542,8 @@ sso_hwgrp_init_xaq_aura(struct dev *dev, struct roc_sso_xaq_data *xaq,
> * There should be a minimum headroom of 7 XAQs per HWGRP for SSO
> * to request XAQ to cache them even before enqueue is called.
> */
> - xaq->xaq_lmt = xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT);
> + xaq->xaq_lmt =
> + xaq->nb_xaq - (nb_hwgrp * SSO_XAQ_CACHE_CNT) - SSO_XAQ_SLACK;
>
> return 0;
> npa_fill_fail:
> diff --git a/drivers/common/cnxk/roc_sso.h b/drivers/common/cnxk/roc_sso.h
> index e67797b046..a2bb6fcb22 100644
> --- a/drivers/common/cnxk/roc_sso.h
> +++ b/drivers/common/cnxk/roc_sso.h
> @@ -7,6 +7,9 @@
>
> #include "hw/ssow.h"
>
> +#define ROC_SSO_AW_PER_LMT_LINE_LOG2 3
> +#define ROC_SSO_XAE_PER_XAQ 352
> +
> struct roc_sso_hwgrp_qos {
> uint16_t hwgrp;
> uint8_t xaq_prcnt;
> diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
> index 071ea5a212..855c92da83 100644
> --- a/drivers/event/cnxk/cn10k_eventdev.c
> +++ b/drivers/event/cnxk/cn10k_eventdev.c
> @@ -91,8 +91,10 @@ cn10k_sso_hws_setup(void *arg, void *hws, uintptr_t grp_base)
> uint64_t val;
>
> ws->grp_base = grp_base;
> - ws->fc_mem = (uint64_t *)dev->fc_iova;
> + ws->fc_mem = (int64_t *)dev->fc_iova;
> ws->xaq_lmt = dev->xaq_lmt;
> + ws->fc_cache_space = dev->fc_cache_space;
> + ws->aw_lmt = ws->lmt_base;
>
> /* Set get_work timeout for HWS */
> val = NSEC2USEC(dev->deq_tmo_ns);
> @@ -624,6 +626,7 @@ cn10k_sso_info_get(struct rte_eventdev *event_dev,
>
> dev_info->driver_name = RTE_STR(EVENTDEV_NAME_CN10K_PMD);
> cnxk_sso_info_get(dev, dev_info);
> + dev_info->max_event_port_enqueue_depth = UINT32_MAX;
> }
>
> static int
> @@ -632,7 +635,7 @@ cn10k_sso_dev_configure(const struct rte_eventdev *event_dev)
> struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> int rc;
>
> - rc = cnxk_sso_dev_validate(event_dev);
> + rc = cnxk_sso_dev_validate(event_dev, 1, UINT32_MAX);
> if (rc < 0) {
> plt_err("Invalid event device configuration");
> return -EINVAL;
> @@ -871,7 +874,7 @@ cn10k_sso_set_priv_mem(const struct rte_eventdev *event_dev, void *lookup_mem)
> for (i = 0; i < dev->nb_event_ports; i++) {
> struct cn10k_sso_hws *ws = event_dev->data->ports[i];
> ws->xaq_lmt = dev->xaq_lmt;
> - ws->fc_mem = (uint64_t *)dev->fc_iova;
> + ws->fc_mem = (int64_t *)dev->fc_iova;
> ws->tstamp = dev->tstamp;
> if (lookup_mem)
> ws->lookup_mem = lookup_mem;
> diff --git a/drivers/event/cnxk/cn10k_eventdev.h b/drivers/event/cnxk/cn10k_eventdev.h
> index aaa01d1ec1..29567728cd 100644
> --- a/drivers/event/cnxk/cn10k_eventdev.h
> +++ b/drivers/event/cnxk/cn10k_eventdev.h
> @@ -19,9 +19,11 @@ struct cn10k_sso_hws {
> struct cnxk_timesync_info **tstamp;
> uint64_t meta_aura;
> /* Add Work Fastpath data */
> - uint64_t xaq_lmt __rte_cache_aligned;
> - uint64_t *fc_mem;
> + int64_t *fc_mem __rte_cache_aligned;
> + int64_t *fc_cache_space;
> + uintptr_t aw_lmt;
> uintptr_t grp_base;
> + int32_t xaq_lmt;
> /* Tx Fastpath data */
> uintptr_t lmt_base __rte_cache_aligned;
> uint64_t lso_tun_fmt;
> diff --git a/drivers/event/cnxk/cn10k_worker.c b/drivers/event/cnxk/cn10k_worker.c
> index 562d2fca13..9b5bf90159 100644
> --- a/drivers/event/cnxk/cn10k_worker.c
> +++ b/drivers/event/cnxk/cn10k_worker.c
> @@ -77,6 +77,36 @@ cn10k_sso_hws_forward_event(struct cn10k_sso_hws *ws,
> cn10k_sso_hws_fwd_group(ws, ev, grp);
> }
>
> +static inline int32_t
> +sso_read_xaq_space(struct cn10k_sso_hws *ws)
> +{
> + return (ws->xaq_lmt - __atomic_load_n(ws->fc_mem, __ATOMIC_RELAXED)) *
> + ROC_SSO_XAE_PER_XAQ;
> +}
> +
> +static inline void
> +sso_lmt_aw_wait_fc(struct cn10k_sso_hws *ws, int64_t req)
> +{
> + int64_t cached, refill;
> +
> +retry:
> + while (__atomic_load_n(ws->fc_cache_space, __ATOMIC_RELAXED) < 0)
> + ;
> +
> + cached = __atomic_fetch_sub(ws->fc_cache_space, req, __ATOMIC_ACQUIRE) - req;
> + /* Check if there is enough space, else update and retry. */
> + if (cached < 0) {
> + /* Check if we have space else retry. */
> + do {
> + refill = sso_read_xaq_space(ws);
> + } while (refill <= 0);
> + __atomic_compare_exchange(ws->fc_cache_space, &cached, &refill,
> + 0, __ATOMIC_RELEASE,
> + __ATOMIC_RELAXED);
> + goto retry;
> + }
> +}
> +
> uint16_t __rte_hot
> cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
> {
> @@ -103,6 +133,253 @@ cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
> return 1;
> }
>
> +#define VECTOR_SIZE_BITS 0xFFFFFFFFFFF80000ULL
> +#define VECTOR_GET_LINE_OFFSET(line) (19 + (3 * line))
> +
> +static uint64_t
> +vector_size_partial_mask(uint16_t off, uint16_t cnt)
> +{
> + return (VECTOR_SIZE_BITS & ~(~0x0ULL << off)) |
> + ((uint64_t)(cnt - 1) << off);
> +}
> +
> +static __rte_always_inline uint16_t
> +cn10k_sso_hws_new_event_lmtst(struct cn10k_sso_hws *ws, uint8_t queue_id,
> + const struct rte_event ev[], uint16_t n)
> +{
> + uint16_t lines, partial_line, burst, left;
> + uint64_t wdata[2], pa[2] = {0};
> + uintptr_t lmt_addr;
> + uint16_t sz0, sz1;
> + uint16_t lmt_id;
> +
> + sz0 = sz1 = 0;
> + lmt_addr = ws->lmt_base;
> + ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
> +
> + left = n;
> +again:
> + burst = RTE_MIN(
> + BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2 + ROC_LMT_LINES_PER_CORE_LOG2),
> + left);
> +
> + /* Set wdata */
> + lines = burst >> ROC_SSO_AW_PER_LMT_LINE_LOG2;
> + partial_line = burst & (BIT(ROC_SSO_AW_PER_LMT_LINE_LOG2) - 1);
> + wdata[0] = wdata[1] = 0;
> + if (lines > BIT(ROC_LMT_LINES_PER_STR_LOG2)) {
> + wdata[0] = lmt_id;
> + wdata[0] |= 15ULL << 12;
> + wdata[0] |= VECTOR_SIZE_BITS;
> + pa[0] = (ws->grp_base + (queue_id << 12) +
> + SSO_LF_GGRP_OP_AW_LMTST) |
> + (0x7 << 4);
> + sz0 = 16 << ROC_SSO_AW_PER_LMT_LINE_LOG2;
> +
> + wdata[1] = lmt_id + 16;
> + pa[1] = (ws->grp_base + (queue_id << 12) +
> + SSO_LF_GGRP_OP_AW_LMTST) |
> + (0x7 << 4);
> +
> + lines -= 17;
> + wdata[1] |= partial_line ? (uint64_t)(lines + 1) << 12 :
> + (uint64_t)(lines << 12);
> + wdata[1] |= partial_line ?
> + vector_size_partial_mask(
> + VECTOR_GET_LINE_OFFSET(lines),
> + partial_line) :
> + VECTOR_SIZE_BITS;
> + sz1 = burst - sz0;
> + partial_line = 0;
> + } else if (lines) {
> + /* We need to handle two cases here:
> + * 1. Partial line spills over to wdata[1], i.e. lines == 16
> + * 2. Partial line with spill lines < 16.
> + */
> + wdata[0] = lmt_id;
> + pa[0] = (ws->grp_base + (queue_id << 12) +
> + SSO_LF_GGRP_OP_AW_LMTST) |
> + (0x7 << 4);
> + sz0 = lines << ROC_SSO_AW_PER_LMT_LINE_LOG2;
> + if (lines == 16) {
> + wdata[0] |= 15ULL << 12;
> + wdata[0] |= VECTOR_SIZE_BITS;
> + if (partial_line) {
> + wdata[1] = lmt_id + 16;
> + pa[1] = (ws->grp_base + (queue_id << 12) +
> + SSO_LF_GGRP_OP_AW_LMTST) |
> + ((partial_line - 1) << 4);
> + }
> + } else {
> + lines -= 1;
> + wdata[0] |= partial_line ? (uint64_t)(lines + 1) << 12 :
> + (uint64_t)(lines << 12);
> + wdata[0] |=
> + partial_line ?
> + vector_size_partial_mask(
> + VECTOR_GET_LINE_OFFSET(lines),
> + partial_line) :
> + VECTOR_SIZE_BITS;
> + sz0 += partial_line;
> + }
> + sz1 = burst - sz0;
> + partial_line = 0;
> + }
> +
> + /* Only partial lines */
> + if (partial_line) {
> + wdata[0] = lmt_id;
> + pa[0] = (ws->grp_base + (queue_id << 12) +
> + SSO_LF_GGRP_OP_AW_LMTST) |
> + ((partial_line - 1) << 4);
> + sz0 = partial_line;
> + sz1 = burst - sz0;
> + }
> +
> +#if defined(RTE_ARCH_ARM64)
> + uint64x2_t aw_mask = {0xC0FFFFFFFFULL, ~0x0ULL};
> + uint64x2_t tt_mask = {0x300000000ULL, 0};
> + uint16_t parts;
> +
> + while (burst) {
> + parts = burst > 7 ? 8 : plt_align32prevpow2(burst);
> + burst -= parts;
> + /* Let's try to fill at least one line per burst. */
> + switch (parts) {
> + case 8: {
> + uint64x2_t aw0, aw1, aw2, aw3, aw4, aw5, aw6, aw7;
> +
> + aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
> + aw_mask);
> + aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
> + aw_mask);
> + aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
> + aw_mask);
> + aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
> + aw_mask);
> + aw4 = vandq_u64(vld1q_u64((const uint64_t *)&ev[4]),
> + aw_mask);
> + aw5 = vandq_u64(vld1q_u64((const uint64_t *)&ev[5]),
> + aw_mask);
> + aw6 = vandq_u64(vld1q_u64((const uint64_t *)&ev[6]),
> + aw_mask);
> + aw7 = vandq_u64(vld1q_u64((const uint64_t *)&ev[7]),
> + aw_mask);
> +
> + aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
> + aw0);
> + aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
> + aw1);
> + aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6), tt_mask),
> + aw2);
> + aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6), tt_mask),
> + aw3);
> + aw4 = vorrq_u64(vandq_u64(vshrq_n_u64(aw4, 6), tt_mask),
> + aw4);
> + aw5 = vorrq_u64(vandq_u64(vshrq_n_u64(aw5, 6), tt_mask),
> + aw5);
> + aw6 = vorrq_u64(vandq_u64(vshrq_n_u64(aw6, 6), tt_mask),
> + aw6);
> + aw7 = vorrq_u64(vandq_u64(vshrq_n_u64(aw7, 6), tt_mask),
> + aw7);
> +
> + vst1q_u64((void *)lmt_addr, aw0);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 64), aw4);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 80), aw5);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 96), aw6);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 112), aw7);
> + lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 128);
> + } break;
> + case 4: {
> + uint64x2_t aw0, aw1, aw2, aw3;
> + aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
> + aw_mask);
> + aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
> + aw_mask);
> + aw2 = vandq_u64(vld1q_u64((const uint64_t *)&ev[2]),
> + aw_mask);
> + aw3 = vandq_u64(vld1q_u64((const uint64_t *)&ev[3]),
> + aw_mask);
> +
> + aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
> + aw0);
> + aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
> + aw1);
> + aw2 = vorrq_u64(vandq_u64(vshrq_n_u64(aw2, 6), tt_mask),
> + aw2);
> + aw3 = vorrq_u64(vandq_u64(vshrq_n_u64(aw3, 6), tt_mask),
> + aw3);
> +
> + vst1q_u64((void *)lmt_addr, aw0);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 32), aw2);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 48), aw3);
> + lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 64);
> + } break;
> + case 2: {
> + uint64x2_t aw0, aw1;
> +
> + aw0 = vandq_u64(vld1q_u64((const uint64_t *)&ev[0]),
> + aw_mask);
> + aw1 = vandq_u64(vld1q_u64((const uint64_t *)&ev[1]),
> + aw_mask);
> +
> + aw0 = vorrq_u64(vandq_u64(vshrq_n_u64(aw0, 6), tt_mask),
> + aw0);
> + aw1 = vorrq_u64(vandq_u64(vshrq_n_u64(aw1, 6), tt_mask),
> + aw1);
> +
> + vst1q_u64((void *)lmt_addr, aw0);
> + vst1q_u64((void *)PLT_PTR_ADD(lmt_addr, 16), aw1);
> + lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 32);
> + } break;
> + case 1: {
> + __uint128_t aw0;
> +
> + aw0 = ev[0].u64;
> + aw0 <<= 64;
> + aw0 |= ev[0].event & (BIT_ULL(32) - 1);
> + aw0 |= (uint64_t)ev[0].sched_type << 32;
> +
> + *((__uint128_t *)lmt_addr) = aw0;
> + lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
> + } break;
> + }
> + ev += parts;
> + }
> +#else
> + uint16_t i;
> +
> + for (i = 0; i < burst; i++) {
> + __uint128_t aw0;
> +
> + aw0 = ev[0].u64;
> + aw0 <<= 64;
> + aw0 |= ev[0].event & (BIT_ULL(32) - 1);
> + aw0 |= (uint64_t)ev[0].sched_type << 32;
> + *((__uint128_t *)lmt_addr) = aw0;
> + lmt_addr = (uintptr_t)PLT_PTR_ADD(lmt_addr, 16);
> + }
> +#endif
> +
> + /* wdata[0] will always be valid */
> + sso_lmt_aw_wait_fc(ws, sz0);
> + roc_lmt_submit_steorl(wdata[0], pa[0]);
> + if (wdata[1]) {
> + sso_lmt_aw_wait_fc(ws, sz1);
> + roc_lmt_submit_steorl(wdata[1], pa[1]);
> + }
> +
> + left -= (sz0 + sz1);
> + if (left)
> + goto again;
> +
> + return n;
> +}
> +
> uint16_t __rte_hot
> cn10k_sso_hws_enq_burst(void *port, const struct rte_event ev[],
> uint16_t nb_events)
> @@ -115,13 +392,32 @@ uint16_t __rte_hot
> cn10k_sso_hws_enq_new_burst(void *port, const struct rte_event ev[],
> uint16_t nb_events)
> {
> + uint16_t idx = 0, done = 0, rc = 0;
> struct cn10k_sso_hws *ws = port;
> - uint16_t i, rc = 1;
> + uint8_t queue_id;
> + int32_t space;
> +
> + /* Do a common back-pressure check and return early when out of space */
> + space = sso_read_xaq_space(ws) - ROC_SSO_XAE_PER_XAQ;
> + if (space <= 0)
> + return 0;
> + nb_events = space < nb_events ? space : nb_events;
> +
> + do {
> + queue_id = ev[idx].queue_id;
> + for (idx = idx + 1; idx < nb_events; idx++)
> + if (queue_id != ev[idx].queue_id)
> + break;
> +
> + rc = cn10k_sso_hws_new_event_lmtst(ws, queue_id, &ev[done],
> + idx - done);
> + if (rc != (idx - done))
> + return rc + done;
> + done += rc;
>
> - for (i = 0; i < nb_events && rc; i++)
> - rc = cn10k_sso_hws_new_event(ws, &ev[i]);
> + } while (done < nb_events);
>
> - return nb_events;
> + return done;
> }
>
> uint16_t __rte_hot
> diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
> index 7e8339bd3a..e59e537311 100644
> --- a/drivers/event/cnxk/cn9k_eventdev.c
> +++ b/drivers/event/cnxk/cn9k_eventdev.c
> @@ -753,7 +753,7 @@ cn9k_sso_dev_configure(const struct rte_eventdev *event_dev)
> struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> int rc;
>
> - rc = cnxk_sso_dev_validate(event_dev);
> + rc = cnxk_sso_dev_validate(event_dev, 1, 1);
> if (rc < 0) {
> plt_err("Invalid event device configuration");
> return -EINVAL;
> diff --git a/drivers/event/cnxk/cnxk_eventdev.c b/drivers/event/cnxk/cnxk_eventdev.c
> index cb9ba5d353..99f9cdcd0d 100644
> --- a/drivers/event/cnxk/cnxk_eventdev.c
> +++ b/drivers/event/cnxk/cnxk_eventdev.c
> @@ -145,7 +145,8 @@ cnxk_sso_restore_links(const struct rte_eventdev *event_dev,
> }
>
> int
> -cnxk_sso_dev_validate(const struct rte_eventdev *event_dev)
> +cnxk_sso_dev_validate(const struct rte_eventdev *event_dev, uint32_t deq_depth,
> + uint32_t enq_depth)
> {
> struct rte_event_dev_config *conf = &event_dev->data->dev_conf;
> struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> @@ -173,12 +174,12 @@ cnxk_sso_dev_validate(const struct rte_eventdev *event_dev)
> return -EINVAL;
> }
>
> - if (conf->nb_event_port_dequeue_depth > 1) {
> + if (conf->nb_event_port_dequeue_depth > deq_depth) {
> plt_err("Unsupported event port deq depth requested");
> return -EINVAL;
> }
>
> - if (conf->nb_event_port_enqueue_depth > 1) {
> + if (conf->nb_event_port_enqueue_depth > enq_depth) {
> plt_err("Unsupported event port enq depth requested");
> return -EINVAL;
> }
> @@ -630,6 +631,14 @@ cnxk_sso_init(struct rte_eventdev *event_dev)
> }
>
> dev = cnxk_sso_pmd_priv(event_dev);
> + dev->fc_cache_space = rte_zmalloc("fc_cache", PLT_CACHE_LINE_SIZE,
> + PLT_CACHE_LINE_SIZE);
> + if (dev->fc_cache_space == NULL) {
> + plt_memzone_free(mz);
> + plt_err("Failed to reserve memory for XAQ fc cache");
> + return -ENOMEM;
> + }
> +
> pci_dev = container_of(event_dev->dev, struct rte_pci_device, device);
> dev->sso.pci_dev = pci_dev;
>
> diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
> index c7cbd722ab..a2f30bfe5f 100644
> --- a/drivers/event/cnxk/cnxk_eventdev.h
> +++ b/drivers/event/cnxk/cnxk_eventdev.h
> @@ -90,6 +90,7 @@ struct cnxk_sso_evdev {
> uint32_t max_dequeue_timeout_ns;
> int32_t max_num_events;
> uint64_t xaq_lmt;
> + int64_t *fc_cache_space;
> rte_iova_t fc_iova;
> uint64_t rx_offloads;
> uint64_t tx_offloads;
> @@ -206,7 +207,8 @@ int cnxk_sso_fini(struct rte_eventdev *event_dev);
> int cnxk_sso_remove(struct rte_pci_device *pci_dev);
> void cnxk_sso_info_get(struct cnxk_sso_evdev *dev,
> struct rte_event_dev_info *dev_info);
> -int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev);
> +int cnxk_sso_dev_validate(const struct rte_eventdev *event_dev,
> + uint32_t deq_depth, uint32_t enq_depth);
> int cnxk_setup_event_ports(const struct rte_eventdev *event_dev,
> cnxk_sso_init_hws_mem_t init_hws_mem,
> cnxk_sso_hws_setup_t hws_setup);
> --
> 2.25.1
>
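
As a side note on the data layout, the scalar (#else) path quoted above builds one 16-byte add-work word per event: the event payload goes in the upper 64 bits, the low 32 bits of the metadata word are kept as-is, and the two sched_type bits are moved down to bits [33:32]. The sketch below reproduces that packing with a simplified event structure; struct toy_event and pack_add_work() are made up for the example, the bit positions assume the standard rte_event field layout, and, like the patch itself, it relies on the compiler's __uint128_t extension.

/* Illustrative only: how one event becomes the 16-byte add-work word the
 * scalar fallback writes to the LMT line. The struct and helper are
 * invented for this sketch; bit positions follow the rte_event layout.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct toy_event {
	uint64_t event; /* metadata word: flow id, type, op, sched_type, ... */
	uint64_t u64;   /* user payload */
};

static __uint128_t pack_add_work(const struct toy_event *ev)
{
	uint64_t sched_type = (ev->event >> 38) & 0x3; /* bits [39:38] */
	__uint128_t aw;

	aw = ev->u64;                    /* payload in the upper 64 bits */
	aw <<= 64;
	aw |= ev->event & 0xFFFFFFFFULL; /* tag/flow/type in the low 32 bits */
	aw |= sched_type << 32;          /* sched_type relocated to [33:32] */
	return aw;
}

int main(void)
{
	struct toy_event ev = {
		.event = (2ULL << 38) | 0xabcULL, /* sched_type = 2, tag = 0xabc */
		.u64 = 0xdeadbeefULL,
	};
	__uint128_t aw = pack_add_work(&ev);

	printf("aw hi = 0x%016" PRIx64 " lo = 0x%016" PRIx64 "\n",
	       (uint64_t)(aw >> 64), (uint64_t)aw);
	return 0;
}

The NEON path builds the same word per event in a vector register: aw_mask keeps the low 32 metadata bits plus the sched_type bits at [39:38], and the shift-by-6 combined with tt_mask relocates those two bits to [33:32], with 2, 4 or 8 events handled per loop iteration before the words are stored to the LMT line.
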
end of thread, other threads:[~2023-05-23 6:09 UTC | newest]
Thread overview: 12+ messages
2023-04-19 20:01 [PATCH 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
2023-04-19 20:01 ` [PATCH 2/3] app/eventdev: use enqueue new event burst routine pbhagavatula
2023-04-19 20:01 ` [PATCH 3/3] app/eventdev: prevent mempool exhaustion pbhagavatula
2023-04-25 19:51 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst pbhagavatula
2023-04-25 19:51 ` [PATCH v2 2/3] app/eventdev: use enqueue new event burst routine pbhagavatula
2023-05-18 15:47 ` [EXT] " Shijith Thotton
2023-04-25 19:51 ` [PATCH v2 3/3] app/eventdev: prevent mempool exhaustion pbhagavatula
2023-05-18 15:47 ` [EXT] " Shijith Thotton
2023-05-18 15:42 ` [PATCH v2 1/3] event/cnxk: use LMTST for enqueue new burst Shijith Thotton
2023-05-22 7:23 ` Jerin Jacob
2023-05-22 11:56 ` [PATCH v3] " pbhagavatula
2023-05-23 6:09 ` Jerin Jacob