DPDK patches and discussions
 help / color / mirror / Atom feed
* [PATCH 1/2] net/cnxk: optimize Rx pktsize extraction
@ 2022-02-24 13:52 pbhagavatula
  2022-02-24 13:52 ` [PATCH 2/2] event/cnxk: align prefetches to CN10K cache model pbhagavatula
  2022-02-24 16:10 ` [PATCH v2 1/2] net/cnxk: optimize Rx pktsize extraction pbhagavatula
  0 siblings, 2 replies; 7+ messages in thread
From: pbhagavatula @ 2022-02-24 13:52 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

In vWQE mode, the mbuf address is calculated without using the
iova list.
Packet length can also be calculated by using NIX_PARSE_S by
which we can completely eliminate reading 2nd cache line
depending on the offloads enabled.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/net/cnxk/cn10k_rx.h | 75 +++++++++++++++++++++++++++----------
 1 file changed, 55 insertions(+), 20 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index abf280102b..65a08e379b 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -590,7 +590,7 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 							*(uint64_t *)args :
 							rxq->mbuf_initializer;
 	const uint64x2_t data_off = flags & NIX_RX_VWQE_F ?
-						  vdupq_n_u64(0x80ULL) :
+					    vdupq_n_u64(RTE_PKTMBUF_HEADROOM) :
 						  vdupq_n_u64(rxq->data_off);
 	const uint32_t qmask = flags & NIX_RX_VWQE_F ? 0 : rxq->qmask;
 	const uint64_t wdata = flags & NIX_RX_VWQE_F ? 0 : rxq->wdata;
@@ -687,6 +687,12 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 		cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));
 
 		if (!(flags & NIX_RX_VWQE_F)) {
+			/* Get NIX_RX_SG_S for size and buffer pointer */
+			cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
+			cq1_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 1, 64, flags));
+			cq2_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 2, 64, flags));
+			cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));
+
 			/* Extract mbuf from NIX_RX_SG_S */
 			mbuf01 = vzip2q_u64(cq0_w8, cq1_w8);
 			mbuf23 = vzip2q_u64(cq2_w8, cq3_w8);
@@ -705,21 +711,24 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 		mbuf2 = (struct rte_mbuf *)vgetq_lane_u64(mbuf23, 0);
 		mbuf3 = (struct rte_mbuf *)vgetq_lane_u64(mbuf23, 1);
 
-		/* Mask to get packet len from NIX_RX_SG_S */
-		const uint8x16_t shuf_msk = {
-			0xFF, 0xFF, /* pkt_type set as unknown */
-			0xFF, 0xFF, /* pkt_type set as unknown */
-			0,    1,    /* octet 1~0, low 16 bits pkt_len */
-			0xFF, 0xFF, /* skip high 16 bits pkt_len, zero out */
-			0,    1,    /* octet 1~0, 16 bits data_len */
-			0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
-
-		/* Form the rx_descriptor_fields1 with pkt_len and data_len */
-		f0 = vqtbl1q_u8(cq0_w8, shuf_msk);
-		f1 = vqtbl1q_u8(cq1_w8, shuf_msk);
-		f2 = vqtbl1q_u8(cq2_w8, shuf_msk);
-		f3 = vqtbl1q_u8(cq3_w8, shuf_msk);
-
+		if (!(flags & NIX_RX_VWQE_F)) {
+			/* Mask to get packet len from NIX_RX_SG_S */
+			const uint8x16_t shuf_msk = {
+				0xFF, 0xFF, /* pkt_type set as unknown */
+				0xFF, 0xFF, /* pkt_type set as unknown */
+				0,    1,    /* octet 1~0, low 16 bits pkt_len */
+				0xFF, 0xFF, /* skip high 16 bits pkt_len, zero
+					       out */
+				0,    1,    /* octet 1~0, 16 bits data_len */
+				0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
+
+			/* Form the rx_descriptor_fields1 with pkt_len and
+			 * data_len */
+			f0 = vqtbl1q_u8(cq0_w8, shuf_msk);
+			f1 = vqtbl1q_u8(cq1_w8, shuf_msk);
+			f2 = vqtbl1q_u8(cq2_w8, shuf_msk);
+			f3 = vqtbl1q_u8(cq3_w8, shuf_msk);
+		}
 		if (flags & NIX_RX_OFFLOAD_SECURITY_F) {
 			/* Prefetch probable CPT parse header area */
 			rte_prefetch_non_temporal(RTE_PTR_ADD(mbuf0, d_off));
@@ -731,12 +740,42 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 		/* Load CQE word0 and word 1 */
 		const uint64_t cq0_w0 = *CQE_PTR_OFF(cq0, 0, 0, flags);
 		const uint64_t cq0_w1 = *CQE_PTR_OFF(cq0, 0, 8, flags);
+		const uint64_t cq0_w2 = *CQE_PTR_OFF(cq0, 0, 16, flags);
 		const uint64_t cq1_w0 = *CQE_PTR_OFF(cq0, 1, 0, flags);
 		const uint64_t cq1_w1 = *CQE_PTR_OFF(cq0, 1, 8, flags);
+		const uint64_t cq1_w2 = *CQE_PTR_OFF(cq0, 1, 16, flags);
 		const uint64_t cq2_w0 = *CQE_PTR_OFF(cq0, 2, 0, flags);
 		const uint64_t cq2_w1 = *CQE_PTR_OFF(cq0, 2, 8, flags);
+		const uint64_t cq2_w2 = *CQE_PTR_OFF(cq0, 2, 16, flags);
 		const uint64_t cq3_w0 = *CQE_PTR_OFF(cq0, 3, 0, flags);
 		const uint64_t cq3_w1 = *CQE_PTR_OFF(cq0, 3, 8, flags);
+		const uint64_t cq3_w2 = *CQE_PTR_OFF(cq0, 3, 16, flags);
+
+		if (flags & NIX_RX_VWQE_F) {
+			uint16_t psize0, psize1, psize2, psize3;
+
+			psize0 = (cq0_w2 & 0xFFFF) + 1;
+			psize1 = (cq1_w2 & 0xFFFF) + 1;
+			psize2 = (cq2_w2 & 0xFFFF) + 1;
+			psize3 = (cq3_w2 & 0xFFFF) + 1;
+
+			f0 = vdupq_n_u64(0);
+			f1 = vdupq_n_u64(0);
+			f2 = vdupq_n_u64(0);
+			f3 = vdupq_n_u64(0);
+
+			f0 = vsetq_lane_u16(psize0, f0, 2);
+			f0 = vsetq_lane_u16(psize0, f0, 4);
+
+			f1 = vsetq_lane_u16(psize1, f1, 2);
+			f1 = vsetq_lane_u16(psize1, f1, 4);
+
+			f2 = vsetq_lane_u16(psize2, f2, 2);
+			f2 = vsetq_lane_u16(psize2, f2, 4);
+
+			f3 = vsetq_lane_u16(psize3, f3, 2);
+			f3 = vsetq_lane_u16(psize3, f3, 4);
+		}
 
 		if (flags & NIX_RX_OFFLOAD_RSS_F) {
 			/* Fill rss in the rx_descriptor_fields1 */
@@ -805,10 +844,6 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 		}
 
 		if (flags & NIX_RX_OFFLOAD_VLAN_STRIP_F) {
-			uint64_t cq0_w2 = *(uint64_t *)(cq0 + CQE_SZ(0) + 16);
-			uint64_t cq1_w2 = *(uint64_t *)(cq0 + CQE_SZ(1) + 16);
-			uint64_t cq2_w2 = *(uint64_t *)(cq0 + CQE_SZ(2) + 16);
-			uint64_t cq3_w2 = *(uint64_t *)(cq0 + CQE_SZ(3) + 16);
 
 			ol_flags0 = nix_vlan_update(cq0_w2, ol_flags0, &f0);
 			ol_flags1 = nix_vlan_update(cq1_w2, ol_flags1, &f1);
-- 
2.17.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 2/2] event/cnxk: align prefetches to CN10K cache model
  2022-02-24 13:52 [PATCH 1/2] net/cnxk: optimize Rx pktsize extraction pbhagavatula
@ 2022-02-24 13:52 ` pbhagavatula
  2022-02-24 16:10 ` [PATCH v2 1/2] net/cnxk: optimize Rx pktsize extraction pbhagavatula
  1 sibling, 0 replies; 7+ messages in thread
From: pbhagavatula @ 2022-02-24 13:52 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton, Nithin Dabilpuram,
	Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Align prefetches with the CN10K cache model for vWQE in Rx and Tx.
Move the mbuf->next NULL assignment to the Tx path and enable it only
when the multi-segment offload is enabled, to reduce L1 cache pressure.
Add macros to detect corrupted mbuf->next values when
MEMPOOL_DEBUG is set.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/event/cnxk/cn10k_worker.h |  13 ++--
 drivers/net/cnxk/cn10k_rx.h       | 115 ++++++++++++++++++++++++------
 drivers/net/cnxk/cn10k_tx.h       |   7 ++
 3 files changed, 107 insertions(+), 28 deletions(-)

diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index d288c66cac..a827a1e422 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -118,23 +118,23 @@ cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags,
 	uint64_t aura_handle, laddr;
 	uint16_t nb_mbufs, non_vec;
 	uint16_t lmt_id, d_off;
+	struct rte_mbuf **wqe;
 	struct rte_mbuf *mbuf;
 	uint8_t loff = 0;
 	uint64_t sa_base;
-	uint64_t **wqe;
 	int i;
 
 	mbuf_init |= ((uint64_t)port_id) << 48;
 	vec = (struct rte_event_vector *)vwqe;
-	wqe = vec->u64s;
+	wqe = vec->mbufs;
 
-	rte_prefetch_non_temporal(&vec->ptrs[0]);
+	rte_prefetch0(&vec->ptrs[0]);
 #define OBJS_PER_CLINE (RTE_CACHE_LINE_SIZE / sizeof(void *))
 	for (i = OBJS_PER_CLINE; i < vec->nb_elem; i += OBJS_PER_CLINE)
-		rte_prefetch_non_temporal(&vec->ptrs[i]);
+		rte_prefetch0(&vec->ptrs[i]);
 
 	nb_mbufs = RTE_ALIGN_FLOOR(vec->nb_elem, NIX_DESCS_PER_LOOP);
-	nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, vec->mbufs, nb_mbufs,
+	nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, wqe, nb_mbufs,
 					      flags | NIX_RX_VWQE_F, lookup_mem,
 					      tstamp, lbase);
 	wqe += nb_mbufs;
@@ -182,7 +182,7 @@ cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags,
 		cn10k_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf, tstamp,
 					flags & NIX_RX_OFFLOAD_TSTAMP_F,
 					(uint64_t *)tstamp_ptr);
-		wqe[0] = (uint64_t *)mbuf;
+		wqe[0] = (struct rte_mbuf *)mbuf;
 		non_vec--;
 		wqe++;
 	}
@@ -612,6 +612,7 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
 				ev->sched_type, txq_data, flags);
 		}
 		rte_mempool_put(rte_mempool_from_obj(ev->vec), ev->vec);
+		rte_prefetch0(ws);
 		return (meta & 0xFFFF);
 	}
 
diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index 65a08e379b..66a35c69f9 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -36,6 +36,22 @@
 	(((f) & NIX_RX_VWQE_F) ?                                               \
 		       (uint64_t *)(((uintptr_t)((uint64_t *)(b))[i]) + (o)) : \
 		       (uint64_t *)(((uintptr_t)(b)) + CQE_SZ(i) + (o)))
+#define CQE_PTR_DIFF(b, i, o, f)                                               \
+	(((f) & NIX_RX_VWQE_F) ?                                               \
+		 (uint64_t *)(((uintptr_t)((uint64_t *)(b))[i]) - (o)) :       \
+		       (uint64_t *)(((uintptr_t)(b)) + CQE_SZ(i) - (o)))
+
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#define NIX_MBUF_VALIDATE_NEXT(m)                                              \
+	if (m->nb_segs == 1 && mbuf->next) {                                   \
+		rte_panic("mbuf->next[%p] valid when mbuf->nb_segs is %d",     \
+			  m->next, m->nb_segs);                                \
+	}
+#else
+#define NIX_MBUF_VALIDATE_NEXT(m)                                              \
+	do {                                                                   \
+	} while (0)
+#endif
 
 union mbuf_initializer {
 	struct {
@@ -674,17 +690,73 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 			cq0 = (uintptr_t)&mbufs[packets];
 		}
 
-		/* Prefetch N desc ahead */
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 4, 64, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 5, 64, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 6, 64, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 7, 64, flags));
-
-		/* Get NIX_RX_SG_S for size and buffer pointer */
-		cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
-		cq1_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 1, 64, flags));
-		cq2_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 2, 64, flags));
-		cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));
+		if (flags & NIX_RX_VWQE_F) {
+			if (pkts - packets > 4) {
+				rte_prefetch_non_temporal(
+					CQE_PTR_OFF(cq0, 4, 0, flags));
+				rte_prefetch_non_temporal(
+					CQE_PTR_OFF(cq0, 5, 0, flags));
+				rte_prefetch_non_temporal(
+					CQE_PTR_OFF(cq0, 6, 0, flags));
+				rte_prefetch_non_temporal(
+					CQE_PTR_OFF(cq0, 7, 0, flags));
+
+				if (likely(pkts - packets > 8)) {
+					rte_prefetch1(
+						CQE_PTR_OFF(cq0, 8, 0, flags));
+					rte_prefetch1(
+						CQE_PTR_OFF(cq0, 9, 0, flags));
+					rte_prefetch1(
+						CQE_PTR_OFF(cq0, 10, 0, flags));
+					rte_prefetch1(
+						CQE_PTR_OFF(cq0, 11, 0, flags));
+					if (pkts - packets > 12) {
+						rte_prefetch1(CQE_PTR_OFF(
+							cq0, 12, 0, flags));
+						rte_prefetch1(CQE_PTR_OFF(
+							cq0, 13, 0, flags));
+						rte_prefetch1(CQE_PTR_OFF(
+							cq0, 14, 0, flags));
+						rte_prefetch1(CQE_PTR_OFF(
+							cq0, 15, 0, flags));
+					}
+				}
+
+				rte_prefetch0(CQE_PTR_DIFF(
+					cq0, 4, RTE_PKTMBUF_HEADROOM, flags));
+				rte_prefetch0(CQE_PTR_DIFF(
+					cq0, 5, RTE_PKTMBUF_HEADROOM, flags));
+				rte_prefetch0(CQE_PTR_DIFF(
+					cq0, 6, RTE_PKTMBUF_HEADROOM, flags));
+				rte_prefetch0(CQE_PTR_DIFF(
+					cq0, 7, RTE_PKTMBUF_HEADROOM, flags));
+				if (likely(pkts - packets > 8)) {
+					rte_prefetch0(CQE_PTR_DIFF(
+						cq0, 8, RTE_PKTMBUF_HEADROOM,
+						flags));
+					rte_prefetch0(CQE_PTR_DIFF(
+						cq0, 9, RTE_PKTMBUF_HEADROOM,
+						flags));
+					rte_prefetch0(CQE_PTR_DIFF(
+						cq0, 10, RTE_PKTMBUF_HEADROOM,
+						flags));
+					rte_prefetch0(CQE_PTR_DIFF(
+						cq0, 11, RTE_PKTMBUF_HEADROOM,
+						flags));
+				}
+			}
+		} else {
+			if (pkts - packets > 4) {
+				rte_prefetch_non_temporal(
+					CQE_PTR_OFF(cq0, 4, 64, flags));
+				rte_prefetch_non_temporal(
+					CQE_PTR_OFF(cq0, 5, 64, flags));
+				rte_prefetch_non_temporal(
+					CQE_PTR_OFF(cq0, 6, 64, flags));
+				rte_prefetch_non_temporal(
+					CQE_PTR_OFF(cq0, 7, 64, flags));
+			}
+		}
 
 		if (!(flags & NIX_RX_VWQE_F)) {
 			/* Get NIX_RX_SG_S for size and buffer pointer */
@@ -997,19 +1069,18 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
 					    (CQE_PTR_OFF(cq0, 3, 8, flags)),
 					    mbuf3, mbuf_initializer, flags);
-		} else {
-			/* Update that no more segments */
-			mbuf0->next = NULL;
-			mbuf1->next = NULL;
-			mbuf2->next = NULL;
-			mbuf3->next = NULL;
 		}
 
-		/* Prefetch mbufs */
-		roc_prefetch_store_keep(mbuf0);
-		roc_prefetch_store_keep(mbuf1);
-		roc_prefetch_store_keep(mbuf2);
-		roc_prefetch_store_keep(mbuf3);
+		/* Mark mempool obj as "get" as it is alloc'ed by NIX */
+		RTE_MEMPOOL_CHECK_COOKIES(mbuf0->pool, (void **)&mbuf0, 1, 1);
+		RTE_MEMPOOL_CHECK_COOKIES(mbuf1->pool, (void **)&mbuf1, 1, 1);
+		RTE_MEMPOOL_CHECK_COOKIES(mbuf2->pool, (void **)&mbuf2, 1, 1);
+		RTE_MEMPOOL_CHECK_COOKIES(mbuf3->pool, (void **)&mbuf3, 1, 1);
+
+		NIX_MBUF_VALIDATE_NEXT(mbuf0);
+		NIX_MBUF_VALIDATE_NEXT(mbuf1);
+		NIX_MBUF_VALIDATE_NEXT(mbuf2);
+		NIX_MBUF_VALIDATE_NEXT(mbuf3);
 
 		packets += NIX_DESCS_PER_LOOP;
 
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index ec6366168c..695e3ed354 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -2569,6 +2569,13 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 			lnum += 1;
 		}
 
+		if (flags & NIX_TX_MULTI_SEG_F) {
+			tx_pkts[0]->next = NULL;
+			tx_pkts[1]->next = NULL;
+			tx_pkts[2]->next = NULL;
+			tx_pkts[3]->next = NULL;
+		}
+
 		tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
 	}
 
-- 
2.17.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v2 1/2] net/cnxk: optimize Rx pktsize extraction
  2022-02-24 13:52 [PATCH 1/2] net/cnxk: optimize Rx pktsize extraction pbhagavatula
  2022-02-24 13:52 ` [PATCH 2/2] event/cnxk: align prefetches to CN10K cache model pbhagavatula
@ 2022-02-24 16:10 ` pbhagavatula
  2022-02-24 16:10   ` [PATCH v2 2/2] net/cnxk: align prefetches to CN10K cache model pbhagavatula
  2022-02-24 18:40   ` [dpdk-dev] [PATCH v3 1/2] net/cnxk: optimize Rx packet size extraction jerinj
  1 sibling, 2 replies; 7+ messages in thread
From: pbhagavatula @ 2022-02-24 16:10 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

In vWQE mode, the mbuf address is calculated without using the
iova list.
Packet length can also be calculated by using NIX_PARSE_S by
which we can completely eliminate reading 2nd cache line
depending on the offloads enabled.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 v2 Changes:
 - Reword commit message.

 drivers/net/cnxk/cn10k_rx.h | 75 +++++++++++++++++++++++++++----------
 1 file changed, 55 insertions(+), 20 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index abf280102b..65a08e379b 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -590,7 +590,7 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 							*(uint64_t *)args :
 							rxq->mbuf_initializer;
 	const uint64x2_t data_off = flags & NIX_RX_VWQE_F ?
-						  vdupq_n_u64(0x80ULL) :
+					    vdupq_n_u64(RTE_PKTMBUF_HEADROOM) :
 						  vdupq_n_u64(rxq->data_off);
 	const uint32_t qmask = flags & NIX_RX_VWQE_F ? 0 : rxq->qmask;
 	const uint64_t wdata = flags & NIX_RX_VWQE_F ? 0 : rxq->wdata;
@@ -687,6 +687,12 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 		cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));

 		if (!(flags & NIX_RX_VWQE_F)) {
+			/* Get NIX_RX_SG_S for size and buffer pointer */
+			cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
+			cq1_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 1, 64, flags));
+			cq2_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 2, 64, flags));
+			cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));
+
 			/* Extract mbuf from NIX_RX_SG_S */
 			mbuf01 = vzip2q_u64(cq0_w8, cq1_w8);
 			mbuf23 = vzip2q_u64(cq2_w8, cq3_w8);
@@ -705,21 +711,24 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 		mbuf2 = (struct rte_mbuf *)vgetq_lane_u64(mbuf23, 0);
 		mbuf3 = (struct rte_mbuf *)vgetq_lane_u64(mbuf23, 1);

-		/* Mask to get packet len from NIX_RX_SG_S */
-		const uint8x16_t shuf_msk = {
-			0xFF, 0xFF, /* pkt_type set as unknown */
-			0xFF, 0xFF, /* pkt_type set as unknown */
-			0,    1,    /* octet 1~0, low 16 bits pkt_len */
-			0xFF, 0xFF, /* skip high 16 bits pkt_len, zero out */
-			0,    1,    /* octet 1~0, 16 bits data_len */
-			0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
-
-		/* Form the rx_descriptor_fields1 with pkt_len and data_len */
-		f0 = vqtbl1q_u8(cq0_w8, shuf_msk);
-		f1 = vqtbl1q_u8(cq1_w8, shuf_msk);
-		f2 = vqtbl1q_u8(cq2_w8, shuf_msk);
-		f3 = vqtbl1q_u8(cq3_w8, shuf_msk);
-
+		if (!(flags & NIX_RX_VWQE_F)) {
+			/* Mask to get packet len from NIX_RX_SG_S */
+			const uint8x16_t shuf_msk = {
+				0xFF, 0xFF, /* pkt_type set as unknown */
+				0xFF, 0xFF, /* pkt_type set as unknown */
+				0,    1,    /* octet 1~0, low 16 bits pkt_len */
+				0xFF, 0xFF, /* skip high 16 bits pkt_len, zero
+					       out */
+				0,    1,    /* octet 1~0, 16 bits data_len */
+				0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
+
+			/* Form the rx_descriptor_fields1 with pkt_len and
+			 * data_len */
+			f0 = vqtbl1q_u8(cq0_w8, shuf_msk);
+			f1 = vqtbl1q_u8(cq1_w8, shuf_msk);
+			f2 = vqtbl1q_u8(cq2_w8, shuf_msk);
+			f3 = vqtbl1q_u8(cq3_w8, shuf_msk);
+		}
 		if (flags & NIX_RX_OFFLOAD_SECURITY_F) {
 			/* Prefetch probable CPT parse header area */
 			rte_prefetch_non_temporal(RTE_PTR_ADD(mbuf0, d_off));
@@ -731,12 +740,42 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 		/* Load CQE word0 and word 1 */
 		const uint64_t cq0_w0 = *CQE_PTR_OFF(cq0, 0, 0, flags);
 		const uint64_t cq0_w1 = *CQE_PTR_OFF(cq0, 0, 8, flags);
+		const uint64_t cq0_w2 = *CQE_PTR_OFF(cq0, 0, 16, flags);
 		const uint64_t cq1_w0 = *CQE_PTR_OFF(cq0, 1, 0, flags);
 		const uint64_t cq1_w1 = *CQE_PTR_OFF(cq0, 1, 8, flags);
+		const uint64_t cq1_w2 = *CQE_PTR_OFF(cq0, 1, 16, flags);
 		const uint64_t cq2_w0 = *CQE_PTR_OFF(cq0, 2, 0, flags);
 		const uint64_t cq2_w1 = *CQE_PTR_OFF(cq0, 2, 8, flags);
+		const uint64_t cq2_w2 = *CQE_PTR_OFF(cq0, 2, 16, flags);
 		const uint64_t cq3_w0 = *CQE_PTR_OFF(cq0, 3, 0, flags);
 		const uint64_t cq3_w1 = *CQE_PTR_OFF(cq0, 3, 8, flags);
+		const uint64_t cq3_w2 = *CQE_PTR_OFF(cq0, 3, 16, flags);
+
+		if (flags & NIX_RX_VWQE_F) {
+			uint16_t psize0, psize1, psize2, psize3;
+
+			psize0 = (cq0_w2 & 0xFFFF) + 1;
+			psize1 = (cq1_w2 & 0xFFFF) + 1;
+			psize2 = (cq2_w2 & 0xFFFF) + 1;
+			psize3 = (cq3_w2 & 0xFFFF) + 1;
+
+			f0 = vdupq_n_u64(0);
+			f1 = vdupq_n_u64(0);
+			f2 = vdupq_n_u64(0);
+			f3 = vdupq_n_u64(0);
+
+			f0 = vsetq_lane_u16(psize0, f0, 2);
+			f0 = vsetq_lane_u16(psize0, f0, 4);
+
+			f1 = vsetq_lane_u16(psize1, f1, 2);
+			f1 = vsetq_lane_u16(psize1, f1, 4);
+
+			f2 = vsetq_lane_u16(psize2, f2, 2);
+			f2 = vsetq_lane_u16(psize2, f2, 4);
+
+			f3 = vsetq_lane_u16(psize3, f3, 2);
+			f3 = vsetq_lane_u16(psize3, f3, 4);
+		}

 		if (flags & NIX_RX_OFFLOAD_RSS_F) {
 			/* Fill rss in the rx_descriptor_fields1 */
@@ -805,10 +844,6 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 		}

 		if (flags & NIX_RX_OFFLOAD_VLAN_STRIP_F) {
-			uint64_t cq0_w2 = *(uint64_t *)(cq0 + CQE_SZ(0) + 16);
-			uint64_t cq1_w2 = *(uint64_t *)(cq0 + CQE_SZ(1) + 16);
-			uint64_t cq2_w2 = *(uint64_t *)(cq0 + CQE_SZ(2) + 16);
-			uint64_t cq3_w2 = *(uint64_t *)(cq0 + CQE_SZ(3) + 16);

 			ol_flags0 = nix_vlan_update(cq0_w2, ol_flags0, &f0);
 			ol_flags1 = nix_vlan_update(cq1_w2, ol_flags1, &f1);
--
2.17.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v2 2/2] net/cnxk: align prefetches to CN10K cache model
  2022-02-24 16:10 ` [PATCH v2 1/2] net/cnxk: optimize Rx pktsize extraction pbhagavatula
@ 2022-02-24 16:10   ` pbhagavatula
  2022-02-24 18:40   ` [dpdk-dev] [PATCH v3 1/2] net/cnxk: optimize Rx packet size extraction jerinj
  1 sibling, 0 replies; 7+ messages in thread
From: pbhagavatula @ 2022-02-24 16:10 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton, Nithin Dabilpuram,
	Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Align prefetches with the CN10K cache model for vWQE in Rx and Tx.
Move the mbuf->next NULL assignment to the Tx path and enable it only
when the multi-segment offload is enabled, to reduce L1 cache pressure.
Add macros to detect corrupted mbuf->next values when
MEMPOOL_DEBUG is set.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/event/cnxk/cn10k_worker.h |  13 ++--
 drivers/net/cnxk/cn10k_rx.h       | 115 ++++++++++++++++++++++++------
 drivers/net/cnxk/cn10k_tx.h       |   7 ++
 3 files changed, 107 insertions(+), 28 deletions(-)

diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index d288c66cac..a827a1e422 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -118,23 +118,23 @@ cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags,
 	uint64_t aura_handle, laddr;
 	uint16_t nb_mbufs, non_vec;
 	uint16_t lmt_id, d_off;
+	struct rte_mbuf **wqe;
 	struct rte_mbuf *mbuf;
 	uint8_t loff = 0;
 	uint64_t sa_base;
-	uint64_t **wqe;
 	int i;
 
 	mbuf_init |= ((uint64_t)port_id) << 48;
 	vec = (struct rte_event_vector *)vwqe;
-	wqe = vec->u64s;
+	wqe = vec->mbufs;
 
-	rte_prefetch_non_temporal(&vec->ptrs[0]);
+	rte_prefetch0(&vec->ptrs[0]);
 #define OBJS_PER_CLINE (RTE_CACHE_LINE_SIZE / sizeof(void *))
 	for (i = OBJS_PER_CLINE; i < vec->nb_elem; i += OBJS_PER_CLINE)
-		rte_prefetch_non_temporal(&vec->ptrs[i]);
+		rte_prefetch0(&vec->ptrs[i]);
 
 	nb_mbufs = RTE_ALIGN_FLOOR(vec->nb_elem, NIX_DESCS_PER_LOOP);
-	nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, vec->mbufs, nb_mbufs,
+	nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, wqe, nb_mbufs,
 					      flags | NIX_RX_VWQE_F, lookup_mem,
 					      tstamp, lbase);
 	wqe += nb_mbufs;
@@ -182,7 +182,7 @@ cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags,
 		cn10k_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf, tstamp,
 					flags & NIX_RX_OFFLOAD_TSTAMP_F,
 					(uint64_t *)tstamp_ptr);
-		wqe[0] = (uint64_t *)mbuf;
+		wqe[0] = (struct rte_mbuf *)mbuf;
 		non_vec--;
 		wqe++;
 	}
@@ -612,6 +612,7 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
 				ev->sched_type, txq_data, flags);
 		}
 		rte_mempool_put(rte_mempool_from_obj(ev->vec), ev->vec);
+		rte_prefetch0(ws);
 		return (meta & 0xFFFF);
 	}
 
diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index 65a08e379b..66a35c69f9 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -36,6 +36,22 @@
 	(((f) & NIX_RX_VWQE_F) ?                                               \
 		       (uint64_t *)(((uintptr_t)((uint64_t *)(b))[i]) + (o)) : \
 		       (uint64_t *)(((uintptr_t)(b)) + CQE_SZ(i) + (o)))
+#define CQE_PTR_DIFF(b, i, o, f)                                               \
+	(((f) & NIX_RX_VWQE_F) ?                                               \
+		 (uint64_t *)(((uintptr_t)((uint64_t *)(b))[i]) - (o)) :       \
+		       (uint64_t *)(((uintptr_t)(b)) + CQE_SZ(i) - (o)))
+
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#define NIX_MBUF_VALIDATE_NEXT(m)                                              \
+	if (m->nb_segs == 1 && mbuf->next) {                                   \
+		rte_panic("mbuf->next[%p] valid when mbuf->nb_segs is %d",     \
+			  m->next, m->nb_segs);                                \
+	}
+#else
+#define NIX_MBUF_VALIDATE_NEXT(m)                                              \
+	do {                                                                   \
+	} while (0)
+#endif
 
 union mbuf_initializer {
 	struct {
@@ -674,17 +690,73 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 			cq0 = (uintptr_t)&mbufs[packets];
 		}
 
-		/* Prefetch N desc ahead */
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 4, 64, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 5, 64, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 6, 64, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 7, 64, flags));
-
-		/* Get NIX_RX_SG_S for size and buffer pointer */
-		cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
-		cq1_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 1, 64, flags));
-		cq2_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 2, 64, flags));
-		cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));
+		if (flags & NIX_RX_VWQE_F) {
+			if (pkts - packets > 4) {
+				rte_prefetch_non_temporal(
+					CQE_PTR_OFF(cq0, 4, 0, flags));
+				rte_prefetch_non_temporal(
+					CQE_PTR_OFF(cq0, 5, 0, flags));
+				rte_prefetch_non_temporal(
+					CQE_PTR_OFF(cq0, 6, 0, flags));
+				rte_prefetch_non_temporal(
+					CQE_PTR_OFF(cq0, 7, 0, flags));
+
+				if (likely(pkts - packets > 8)) {
+					rte_prefetch1(
+						CQE_PTR_OFF(cq0, 8, 0, flags));
+					rte_prefetch1(
+						CQE_PTR_OFF(cq0, 9, 0, flags));
+					rte_prefetch1(
+						CQE_PTR_OFF(cq0, 10, 0, flags));
+					rte_prefetch1(
+						CQE_PTR_OFF(cq0, 11, 0, flags));
+					if (pkts - packets > 12) {
+						rte_prefetch1(CQE_PTR_OFF(
+							cq0, 12, 0, flags));
+						rte_prefetch1(CQE_PTR_OFF(
+							cq0, 13, 0, flags));
+						rte_prefetch1(CQE_PTR_OFF(
+							cq0, 14, 0, flags));
+						rte_prefetch1(CQE_PTR_OFF(
+							cq0, 15, 0, flags));
+					}
+				}
+
+				rte_prefetch0(CQE_PTR_DIFF(
+					cq0, 4, RTE_PKTMBUF_HEADROOM, flags));
+				rte_prefetch0(CQE_PTR_DIFF(
+					cq0, 5, RTE_PKTMBUF_HEADROOM, flags));
+				rte_prefetch0(CQE_PTR_DIFF(
+					cq0, 6, RTE_PKTMBUF_HEADROOM, flags));
+				rte_prefetch0(CQE_PTR_DIFF(
+					cq0, 7, RTE_PKTMBUF_HEADROOM, flags));
+				if (likely(pkts - packets > 8)) {
+					rte_prefetch0(CQE_PTR_DIFF(
+						cq0, 8, RTE_PKTMBUF_HEADROOM,
+						flags));
+					rte_prefetch0(CQE_PTR_DIFF(
+						cq0, 9, RTE_PKTMBUF_HEADROOM,
+						flags));
+					rte_prefetch0(CQE_PTR_DIFF(
+						cq0, 10, RTE_PKTMBUF_HEADROOM,
+						flags));
+					rte_prefetch0(CQE_PTR_DIFF(
+						cq0, 11, RTE_PKTMBUF_HEADROOM,
+						flags));
+				}
+			}
+		} else {
+			if (pkts - packets > 4) {
+				rte_prefetch_non_temporal(
+					CQE_PTR_OFF(cq0, 4, 64, flags));
+				rte_prefetch_non_temporal(
+					CQE_PTR_OFF(cq0, 5, 64, flags));
+				rte_prefetch_non_temporal(
+					CQE_PTR_OFF(cq0, 6, 64, flags));
+				rte_prefetch_non_temporal(
+					CQE_PTR_OFF(cq0, 7, 64, flags));
+			}
+		}
 
 		if (!(flags & NIX_RX_VWQE_F)) {
 			/* Get NIX_RX_SG_S for size and buffer pointer */
@@ -997,19 +1069,18 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
 					    (CQE_PTR_OFF(cq0, 3, 8, flags)),
 					    mbuf3, mbuf_initializer, flags);
-		} else {
-			/* Update that no more segments */
-			mbuf0->next = NULL;
-			mbuf1->next = NULL;
-			mbuf2->next = NULL;
-			mbuf3->next = NULL;
 		}
 
-		/* Prefetch mbufs */
-		roc_prefetch_store_keep(mbuf0);
-		roc_prefetch_store_keep(mbuf1);
-		roc_prefetch_store_keep(mbuf2);
-		roc_prefetch_store_keep(mbuf3);
+		/* Mark mempool obj as "get" as it is alloc'ed by NIX */
+		RTE_MEMPOOL_CHECK_COOKIES(mbuf0->pool, (void **)&mbuf0, 1, 1);
+		RTE_MEMPOOL_CHECK_COOKIES(mbuf1->pool, (void **)&mbuf1, 1, 1);
+		RTE_MEMPOOL_CHECK_COOKIES(mbuf2->pool, (void **)&mbuf2, 1, 1);
+		RTE_MEMPOOL_CHECK_COOKIES(mbuf3->pool, (void **)&mbuf3, 1, 1);
+
+		NIX_MBUF_VALIDATE_NEXT(mbuf0);
+		NIX_MBUF_VALIDATE_NEXT(mbuf1);
+		NIX_MBUF_VALIDATE_NEXT(mbuf2);
+		NIX_MBUF_VALIDATE_NEXT(mbuf3);
 
 		packets += NIX_DESCS_PER_LOOP;
 
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index ec6366168c..695e3ed354 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -2569,6 +2569,13 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 			lnum += 1;
 		}
 
+		if (flags & NIX_TX_MULTI_SEG_F) {
+			tx_pkts[0]->next = NULL;
+			tx_pkts[1]->next = NULL;
+			tx_pkts[2]->next = NULL;
+			tx_pkts[3]->next = NULL;
+		}
+
 		tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
 	}
 
-- 
2.17.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [dpdk-dev] [PATCH v3 1/2] net/cnxk: optimize Rx packet size extraction
  2022-02-24 16:10 ` [PATCH v2 1/2] net/cnxk: optimize Rx pktsize extraction pbhagavatula
  2022-02-24 16:10   ` [PATCH v2 2/2] net/cnxk: align prefetches to CN10K cache model pbhagavatula
@ 2022-02-24 18:40   ` jerinj
  2022-02-24 18:40     ` [dpdk-dev] [PATCH v3 2/2] net/cnxk: align prefetches to CN10K cache model jerinj
  2022-02-24 20:39     ` [dpdk-dev] [PATCH v3 1/2] net/cnxk: optimize Rx packet size extraction Jerin Jacob
  1 sibling, 2 replies; 7+ messages in thread
From: jerinj @ 2022-02-24 18:40 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: Pavan Nikhilesh, Jerin Jacob

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

In vWQE mode, the mbuf address is calculated without using the
iova list.

Packet length can also be calculated by using NIX_PARSE_S by
which we can completely eliminate reading 2nd cache line
depending on the offloads enabled.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
---
v3:
- Change NIX_MBUF_VALIDATE_NEXT macro to inline function
- Fixed the relevant checkpatch warning at
http://mails.dpdk.org/archives/test-report/2022-February/264235.html

v2 :
 - Reword commit message.

 drivers/net/cnxk/cn10k_rx.h | 73 +++++++++++++++++++++++++++----------
 1 file changed, 53 insertions(+), 20 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index abf280102b..236a1dca6e 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -590,8 +590,8 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 							*(uint64_t *)args :
 							rxq->mbuf_initializer;
 	const uint64x2_t data_off = flags & NIX_RX_VWQE_F ?
-						  vdupq_n_u64(0x80ULL) :
-						  vdupq_n_u64(rxq->data_off);
+					vdupq_n_u64(RTE_PKTMBUF_HEADROOM) :
+					vdupq_n_u64(rxq->data_off);
 	const uint32_t qmask = flags & NIX_RX_VWQE_F ? 0 : rxq->qmask;
 	const uint64_t wdata = flags & NIX_RX_VWQE_F ? 0 : rxq->wdata;
 	const uintptr_t desc = flags & NIX_RX_VWQE_F ? 0 : rxq->desc;
@@ -687,6 +687,12 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 		cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));
 
 		if (!(flags & NIX_RX_VWQE_F)) {
+			/* Get NIX_RX_SG_S for size and buffer pointer */
+			cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
+			cq1_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 1, 64, flags));
+			cq2_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 2, 64, flags));
+			cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));
+
 			/* Extract mbuf from NIX_RX_SG_S */
 			mbuf01 = vzip2q_u64(cq0_w8, cq1_w8);
 			mbuf23 = vzip2q_u64(cq2_w8, cq3_w8);
@@ -705,21 +711,22 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 		mbuf2 = (struct rte_mbuf *)vgetq_lane_u64(mbuf23, 0);
 		mbuf3 = (struct rte_mbuf *)vgetq_lane_u64(mbuf23, 1);
 
-		/* Mask to get packet len from NIX_RX_SG_S */
-		const uint8x16_t shuf_msk = {
-			0xFF, 0xFF, /* pkt_type set as unknown */
-			0xFF, 0xFF, /* pkt_type set as unknown */
-			0,    1,    /* octet 1~0, low 16 bits pkt_len */
-			0xFF, 0xFF, /* skip high 16 bits pkt_len, zero out */
-			0,    1,    /* octet 1~0, 16 bits data_len */
-			0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
-
-		/* Form the rx_descriptor_fields1 with pkt_len and data_len */
-		f0 = vqtbl1q_u8(cq0_w8, shuf_msk);
-		f1 = vqtbl1q_u8(cq1_w8, shuf_msk);
-		f2 = vqtbl1q_u8(cq2_w8, shuf_msk);
-		f3 = vqtbl1q_u8(cq3_w8, shuf_msk);
+		if (!(flags & NIX_RX_VWQE_F)) {
+			/* Mask to get packet len from NIX_RX_SG_S */
+			const uint8x16_t shuf_msk = {
+				0xFF, 0xFF, /* pkt_type set as unknown */
+				0xFF, 0xFF, /* pkt_type set as unknown */
+				0,    1,    /* octet 1~0, low 16 bits pkt_len */
+				0xFF, 0xFF, /* skip high 16it pkt_len, zero out */
+				0,    1,    /* octet 1~0, 16 bits data_len */
+				0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
 
+			/* Form the rx_descriptor_fields1 with pkt_len and data_len */
+			f0 = vqtbl1q_u8(cq0_w8, shuf_msk);
+			f1 = vqtbl1q_u8(cq1_w8, shuf_msk);
+			f2 = vqtbl1q_u8(cq2_w8, shuf_msk);
+			f3 = vqtbl1q_u8(cq3_w8, shuf_msk);
+		}
 		if (flags & NIX_RX_OFFLOAD_SECURITY_F) {
 			/* Prefetch probable CPT parse header area */
 			rte_prefetch_non_temporal(RTE_PTR_ADD(mbuf0, d_off));
@@ -731,12 +738,42 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 		/* Load CQE word0 and word 1 */
 		const uint64_t cq0_w0 = *CQE_PTR_OFF(cq0, 0, 0, flags);
 		const uint64_t cq0_w1 = *CQE_PTR_OFF(cq0, 0, 8, flags);
+		const uint64_t cq0_w2 = *CQE_PTR_OFF(cq0, 0, 16, flags);
 		const uint64_t cq1_w0 = *CQE_PTR_OFF(cq0, 1, 0, flags);
 		const uint64_t cq1_w1 = *CQE_PTR_OFF(cq0, 1, 8, flags);
+		const uint64_t cq1_w2 = *CQE_PTR_OFF(cq0, 1, 16, flags);
 		const uint64_t cq2_w0 = *CQE_PTR_OFF(cq0, 2, 0, flags);
 		const uint64_t cq2_w1 = *CQE_PTR_OFF(cq0, 2, 8, flags);
+		const uint64_t cq2_w2 = *CQE_PTR_OFF(cq0, 2, 16, flags);
 		const uint64_t cq3_w0 = *CQE_PTR_OFF(cq0, 3, 0, flags);
 		const uint64_t cq3_w1 = *CQE_PTR_OFF(cq0, 3, 8, flags);
+		const uint64_t cq3_w2 = *CQE_PTR_OFF(cq0, 3, 16, flags);
+
+		if (flags & NIX_RX_VWQE_F) {
+			uint16_t psize0, psize1, psize2, psize3;
+
+			psize0 = (cq0_w2 & 0xFFFF) + 1;
+			psize1 = (cq1_w2 & 0xFFFF) + 1;
+			psize2 = (cq2_w2 & 0xFFFF) + 1;
+			psize3 = (cq3_w2 & 0xFFFF) + 1;
+
+			f0 = vdupq_n_u64(0);
+			f1 = vdupq_n_u64(0);
+			f2 = vdupq_n_u64(0);
+			f3 = vdupq_n_u64(0);
+
+			f0 = vsetq_lane_u16(psize0, f0, 2);
+			f0 = vsetq_lane_u16(psize0, f0, 4);
+
+			f1 = vsetq_lane_u16(psize1, f1, 2);
+			f1 = vsetq_lane_u16(psize1, f1, 4);
+
+			f2 = vsetq_lane_u16(psize2, f2, 2);
+			f2 = vsetq_lane_u16(psize2, f2, 4);
+
+			f3 = vsetq_lane_u16(psize3, f3, 2);
+			f3 = vsetq_lane_u16(psize3, f3, 4);
+		}
 
 		if (flags & NIX_RX_OFFLOAD_RSS_F) {
 			/* Fill rss in the rx_descriptor_fields1 */
@@ -805,10 +842,6 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 		}
 
 		if (flags & NIX_RX_OFFLOAD_VLAN_STRIP_F) {
-			uint64_t cq0_w2 = *(uint64_t *)(cq0 + CQE_SZ(0) + 16);
-			uint64_t cq1_w2 = *(uint64_t *)(cq0 + CQE_SZ(1) + 16);
-			uint64_t cq2_w2 = *(uint64_t *)(cq0 + CQE_SZ(2) + 16);
-			uint64_t cq3_w2 = *(uint64_t *)(cq0 + CQE_SZ(3) + 16);
 
 			ol_flags0 = nix_vlan_update(cq0_w2, ol_flags0, &f0);
 			ol_flags1 = nix_vlan_update(cq1_w2, ol_flags1, &f1);
-- 
2.35.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [dpdk-dev] [PATCH v3 2/2] net/cnxk: align perfetchs to CN10K cache model
  2022-02-24 18:40   ` [dpdk-dev] [PATCH v3 1/2] net/cnxk: optimize Rx packet size extraction jerinj
@ 2022-02-24 18:40     ` jerinj
  2022-02-24 20:39     ` [dpdk-dev] [PATCH v3 1/2] net/cnxk: optimize Rx packet size extraction Jerin Jacob
  1 sibling, 0 replies; 7+ messages in thread
From: jerinj @ 2022-02-24 18:40 UTC (permalink / raw)
  To: dev, Pavan Nikhilesh, Shijith Thotton, Nithin Dabilpuram,
	Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: Jerin Jacob

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Align prefetches to the CN10K cache model for vWQE in Rx and Tx.
Move the mbuf->next NULL assignment to the Tx path and enable it only
when the multi-segment offload is enabled, to reduce L1 pressure.
Add helpers to detect corrupted mbuf->next values when
MEMPOOL_DEBUG is set.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
---
 drivers/event/cnxk/cn10k_worker.h |  13 ++--
 drivers/net/cnxk/cn10k_rx.h       | 111 ++++++++++++++++++++++++------
 drivers/net/cnxk/cn10k_tx.h       |   7 ++
 3 files changed, 104 insertions(+), 27 deletions(-)

diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index d288c66cac..a827a1e422 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -118,23 +118,23 @@ cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags,
 	uint64_t aura_handle, laddr;
 	uint16_t nb_mbufs, non_vec;
 	uint16_t lmt_id, d_off;
+	struct rte_mbuf **wqe;
 	struct rte_mbuf *mbuf;
 	uint8_t loff = 0;
 	uint64_t sa_base;
-	uint64_t **wqe;
 	int i;
 
 	mbuf_init |= ((uint64_t)port_id) << 48;
 	vec = (struct rte_event_vector *)vwqe;
-	wqe = vec->u64s;
+	wqe = vec->mbufs;
 
-	rte_prefetch_non_temporal(&vec->ptrs[0]);
+	rte_prefetch0(&vec->ptrs[0]);
 #define OBJS_PER_CLINE (RTE_CACHE_LINE_SIZE / sizeof(void *))
 	for (i = OBJS_PER_CLINE; i < vec->nb_elem; i += OBJS_PER_CLINE)
-		rte_prefetch_non_temporal(&vec->ptrs[i]);
+		rte_prefetch0(&vec->ptrs[i]);
 
 	nb_mbufs = RTE_ALIGN_FLOOR(vec->nb_elem, NIX_DESCS_PER_LOOP);
-	nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, vec->mbufs, nb_mbufs,
+	nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, wqe, nb_mbufs,
 					      flags | NIX_RX_VWQE_F, lookup_mem,
 					      tstamp, lbase);
 	wqe += nb_mbufs;
@@ -182,7 +182,7 @@ cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags,
 		cn10k_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf, tstamp,
 					flags & NIX_RX_OFFLOAD_TSTAMP_F,
 					(uint64_t *)tstamp_ptr);
-		wqe[0] = (uint64_t *)mbuf;
+		wqe[0] = (struct rte_mbuf *)mbuf;
 		non_vec--;
 		wqe++;
 	}
@@ -612,6 +612,7 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
 				ev->sched_type, txq_data, flags);
 		}
 		rte_mempool_put(rte_mempool_from_obj(ev->vec), ev->vec);
+		rte_prefetch0(ws);
 		return (meta & 0xFFFF);
 	}
 
diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index 236a1dca6e..de5e41483b 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -36,6 +36,27 @@
 	(((f) & NIX_RX_VWQE_F) ?                                               \
 		       (uint64_t *)(((uintptr_t)((uint64_t *)(b))[i]) + (o)) : \
 		       (uint64_t *)(((uintptr_t)(b)) + CQE_SZ(i) + (o)))
+#define CQE_PTR_DIFF(b, i, o, f)                                               \
+	(((f) & NIX_RX_VWQE_F) ?                                               \
+		 (uint64_t *)(((uintptr_t)((uint64_t *)(b))[i]) - (o)) :       \
+		       (uint64_t *)(((uintptr_t)(b)) + CQE_SZ(i) - (o)))
+
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+static inline void
+nix_mbuf_validate_next(struct rte_mbuf *m)
+{
+	if (m->nb_segs == 1 && m->next) {
+		rte_panic("mbuf->next[%p] valid when mbuf->nb_segs is %d",
+			m->next, m->nb_segs);
+	}
+}
+#else
+static inline void
+nix_mbuf_validate_next(struct rte_mbuf *m)
+{
+	RTE_SET_USED(m);
+}
+#endif
 
 union mbuf_initializer {
 	struct {
@@ -674,17 +695,66 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 			cq0 = (uintptr_t)&mbufs[packets];
 		}
 
-		/* Prefetch N desc ahead */
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 4, 64, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 5, 64, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 6, 64, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 7, 64, flags));
+		if (flags & NIX_RX_VWQE_F) {
+			if (pkts - packets > 4) {
+				rte_prefetch_non_temporal(CQE_PTR_OFF(cq0,
+					4, 0, flags));
+				rte_prefetch_non_temporal(CQE_PTR_OFF(cq0,
+					5, 0, flags));
+				rte_prefetch_non_temporal(CQE_PTR_OFF(cq0,
+					6, 0, flags));
+				rte_prefetch_non_temporal(CQE_PTR_OFF(cq0,
+					7, 0, flags));
 
-		/* Get NIX_RX_SG_S for size and buffer pointer */
-		cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
-		cq1_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 1, 64, flags));
-		cq2_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 2, 64, flags));
-		cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));
+				if (likely(pkts - packets > 8)) {
+					rte_prefetch1(CQE_PTR_OFF(cq0,
+						8, 0, flags));
+					rte_prefetch1(CQE_PTR_OFF(cq0,
+						9, 0, flags));
+					rte_prefetch1(CQE_PTR_OFF(cq0,
+						10, 0, flags));
+					rte_prefetch1(CQE_PTR_OFF(cq0,
+						11, 0, flags));
+					if (pkts - packets > 12) {
+						rte_prefetch1(CQE_PTR_OFF(cq0,
+							12, 0, flags));
+						rte_prefetch1(CQE_PTR_OFF(cq0,
+							13, 0, flags));
+						rte_prefetch1(CQE_PTR_OFF(cq0,
+							14, 0, flags));
+						rte_prefetch1(CQE_PTR_OFF(cq0,
+							15, 0, flags));
+					}
+				}
+
+				rte_prefetch0(CQE_PTR_DIFF(cq0,
+					4, RTE_PKTMBUF_HEADROOM, flags));
+				rte_prefetch0(CQE_PTR_DIFF(cq0,
+					5, RTE_PKTMBUF_HEADROOM, flags));
+				rte_prefetch0(CQE_PTR_DIFF(cq0,
+					6, RTE_PKTMBUF_HEADROOM, flags));
+				rte_prefetch0(CQE_PTR_DIFF(cq0,
+					7, RTE_PKTMBUF_HEADROOM, flags));
+
+				if (likely(pkts - packets > 8)) {
+					rte_prefetch0(CQE_PTR_DIFF(cq0,
+						8, RTE_PKTMBUF_HEADROOM, flags));
+					rte_prefetch0(CQE_PTR_DIFF(cq0,
+						9, RTE_PKTMBUF_HEADROOM, flags));
+					rte_prefetch0(CQE_PTR_DIFF(cq0,
+						10, RTE_PKTMBUF_HEADROOM, flags));
+					rte_prefetch0(CQE_PTR_DIFF(cq0,
+						11, RTE_PKTMBUF_HEADROOM, flags));
+				}
+			}
+		} else {
+			if (pkts - packets > 4) {
+				rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 4, 64, flags));
+				rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 5, 64, flags));
+				rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 6, 64, flags));
+				rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 7, 64, flags));
+			}
+		}
 
 		if (!(flags & NIX_RX_VWQE_F)) {
 			/* Get NIX_RX_SG_S for size and buffer pointer */
@@ -995,19 +1065,18 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
 					    (CQE_PTR_OFF(cq0, 3, 8, flags)),
 					    mbuf3, mbuf_initializer, flags);
-		} else {
-			/* Update that no more segments */
-			mbuf0->next = NULL;
-			mbuf1->next = NULL;
-			mbuf2->next = NULL;
-			mbuf3->next = NULL;
 		}
 
-		/* Prefetch mbufs */
-		roc_prefetch_store_keep(mbuf0);
-		roc_prefetch_store_keep(mbuf1);
-		roc_prefetch_store_keep(mbuf2);
-		roc_prefetch_store_keep(mbuf3);
+		/* Mark mempool obj as "get" as it is alloc'ed by NIX */
+		RTE_MEMPOOL_CHECK_COOKIES(mbuf0->pool, (void **)&mbuf0, 1, 1);
+		RTE_MEMPOOL_CHECK_COOKIES(mbuf1->pool, (void **)&mbuf1, 1, 1);
+		RTE_MEMPOOL_CHECK_COOKIES(mbuf2->pool, (void **)&mbuf2, 1, 1);
+		RTE_MEMPOOL_CHECK_COOKIES(mbuf3->pool, (void **)&mbuf3, 1, 1);
+
+		nix_mbuf_validate_next(mbuf0);
+		nix_mbuf_validate_next(mbuf1);
+		nix_mbuf_validate_next(mbuf2);
+		nix_mbuf_validate_next(mbuf3);
 
 		packets += NIX_DESCS_PER_LOOP;
 
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index ec6366168c..695e3ed354 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -2569,6 +2569,13 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 			lnum += 1;
 		}
 
+		if (flags & NIX_TX_MULTI_SEG_F) {
+			tx_pkts[0]->next = NULL;
+			tx_pkts[1]->next = NULL;
+			tx_pkts[2]->next = NULL;
+			tx_pkts[3]->next = NULL;
+		}
+
 		tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
 	}
 
-- 
2.35.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/2] net/cnxk: optimize Rx packet size extraction
  2022-02-24 18:40   ` [dpdk-dev] [PATCH v3 1/2] net/cnxk: optimize Rx packet size extraction jerinj
  2022-02-24 18:40     ` [dpdk-dev] [PATCH v3 2/2] net/cnxk: align perfetchs to CN10K cache model jerinj
@ 2022-02-24 20:39     ` Jerin Jacob
  1 sibling, 0 replies; 7+ messages in thread
From: Jerin Jacob @ 2022-02-24 20:39 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: dpdk-dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Pavan Nikhilesh

On Fri, Feb 25, 2022 at 12:09 AM <jerinj@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> In vWQE mode, the mbuf address is calculated without using the
> iova list.
>
> Packet length can also be calculated by using NIX_PARSE_S by
> which we can completely eliminate reading 2nd cache line
> depending on the offloads enabled.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> Acked-by: Jerin Jacob <jerinj@marvell.com>
> ---
> v3:
> - Change NIX_MBUF_VALIDATE_NEXT macro to inline function
> - Fixed the relevant checkpatch warning at
> http://mails.dpdk.org/archives/test-report/2022-February/264235.html

Series applied to dpdk-next-net-mrvl/for-next-net. Thanks.


>
> v2 :
>  - Reword commit message.
>
>  drivers/net/cnxk/cn10k_rx.h | 73 +++++++++++++++++++++++++++----------
>  1 file changed, 53 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
> index abf280102b..236a1dca6e 100644
> --- a/drivers/net/cnxk/cn10k_rx.h
> +++ b/drivers/net/cnxk/cn10k_rx.h
> @@ -590,8 +590,8 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
>                                                         *(uint64_t *)args :
>                                                         rxq->mbuf_initializer;
>         const uint64x2_t data_off = flags & NIX_RX_VWQE_F ?
> -                                                 vdupq_n_u64(0x80ULL) :
> -                                                 vdupq_n_u64(rxq->data_off);
> +                                       vdupq_n_u64(RTE_PKTMBUF_HEADROOM) :
> +                                       vdupq_n_u64(rxq->data_off);
>         const uint32_t qmask = flags & NIX_RX_VWQE_F ? 0 : rxq->qmask;
>         const uint64_t wdata = flags & NIX_RX_VWQE_F ? 0 : rxq->wdata;
>         const uintptr_t desc = flags & NIX_RX_VWQE_F ? 0 : rxq->desc;
> @@ -687,6 +687,12 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
>                 cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));
>
>                 if (!(flags & NIX_RX_VWQE_F)) {
> +                       /* Get NIX_RX_SG_S for size and buffer pointer */
> +                       cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
> +                       cq1_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 1, 64, flags));
> +                       cq2_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 2, 64, flags));
> +                       cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));
> +
>                         /* Extract mbuf from NIX_RX_SG_S */
>                         mbuf01 = vzip2q_u64(cq0_w8, cq1_w8);
>                         mbuf23 = vzip2q_u64(cq2_w8, cq3_w8);
> @@ -705,21 +711,22 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
>                 mbuf2 = (struct rte_mbuf *)vgetq_lane_u64(mbuf23, 0);
>                 mbuf3 = (struct rte_mbuf *)vgetq_lane_u64(mbuf23, 1);
>
> -               /* Mask to get packet len from NIX_RX_SG_S */
> -               const uint8x16_t shuf_msk = {
> -                       0xFF, 0xFF, /* pkt_type set as unknown */
> -                       0xFF, 0xFF, /* pkt_type set as unknown */
> -                       0,    1,    /* octet 1~0, low 16 bits pkt_len */
> -                       0xFF, 0xFF, /* skip high 16 bits pkt_len, zero out */
> -                       0,    1,    /* octet 1~0, 16 bits data_len */
> -                       0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
> -
> -               /* Form the rx_descriptor_fields1 with pkt_len and data_len */
> -               f0 = vqtbl1q_u8(cq0_w8, shuf_msk);
> -               f1 = vqtbl1q_u8(cq1_w8, shuf_msk);
> -               f2 = vqtbl1q_u8(cq2_w8, shuf_msk);
> -               f3 = vqtbl1q_u8(cq3_w8, shuf_msk);
> +               if (!(flags & NIX_RX_VWQE_F)) {
> +                       /* Mask to get packet len from NIX_RX_SG_S */
> +                       const uint8x16_t shuf_msk = {
> +                               0xFF, 0xFF, /* pkt_type set as unknown */
> +                               0xFF, 0xFF, /* pkt_type set as unknown */
> +                               0,    1,    /* octet 1~0, low 16 bits pkt_len */
> +                               0xFF, 0xFF, /* skip high 16it pkt_len, zero out */
> +                               0,    1,    /* octet 1~0, 16 bits data_len */
> +                               0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
>
> +                       /* Form the rx_descriptor_fields1 with pkt_len and data_len */
> +                       f0 = vqtbl1q_u8(cq0_w8, shuf_msk);
> +                       f1 = vqtbl1q_u8(cq1_w8, shuf_msk);
> +                       f2 = vqtbl1q_u8(cq2_w8, shuf_msk);
> +                       f3 = vqtbl1q_u8(cq3_w8, shuf_msk);
> +               }
>                 if (flags & NIX_RX_OFFLOAD_SECURITY_F) {
>                         /* Prefetch probable CPT parse header area */
>                         rte_prefetch_non_temporal(RTE_PTR_ADD(mbuf0, d_off));
> @@ -731,12 +738,42 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
>                 /* Load CQE word0 and word 1 */
>                 const uint64_t cq0_w0 = *CQE_PTR_OFF(cq0, 0, 0, flags);
>                 const uint64_t cq0_w1 = *CQE_PTR_OFF(cq0, 0, 8, flags);
> +               const uint64_t cq0_w2 = *CQE_PTR_OFF(cq0, 0, 16, flags);
>                 const uint64_t cq1_w0 = *CQE_PTR_OFF(cq0, 1, 0, flags);
>                 const uint64_t cq1_w1 = *CQE_PTR_OFF(cq0, 1, 8, flags);
> +               const uint64_t cq1_w2 = *CQE_PTR_OFF(cq0, 1, 16, flags);
>                 const uint64_t cq2_w0 = *CQE_PTR_OFF(cq0, 2, 0, flags);
>                 const uint64_t cq2_w1 = *CQE_PTR_OFF(cq0, 2, 8, flags);
> +               const uint64_t cq2_w2 = *CQE_PTR_OFF(cq0, 2, 16, flags);
>                 const uint64_t cq3_w0 = *CQE_PTR_OFF(cq0, 3, 0, flags);
>                 const uint64_t cq3_w1 = *CQE_PTR_OFF(cq0, 3, 8, flags);
> +               const uint64_t cq3_w2 = *CQE_PTR_OFF(cq0, 3, 16, flags);
> +
> +               if (flags & NIX_RX_VWQE_F) {
> +                       uint16_t psize0, psize1, psize2, psize3;
> +
> +                       psize0 = (cq0_w2 & 0xFFFF) + 1;
> +                       psize1 = (cq1_w2 & 0xFFFF) + 1;
> +                       psize2 = (cq2_w2 & 0xFFFF) + 1;
> +                       psize3 = (cq3_w2 & 0xFFFF) + 1;
> +
> +                       f0 = vdupq_n_u64(0);
> +                       f1 = vdupq_n_u64(0);
> +                       f2 = vdupq_n_u64(0);
> +                       f3 = vdupq_n_u64(0);
> +
> +                       f0 = vsetq_lane_u16(psize0, f0, 2);
> +                       f0 = vsetq_lane_u16(psize0, f0, 4);
> +
> +                       f1 = vsetq_lane_u16(psize1, f1, 2);
> +                       f1 = vsetq_lane_u16(psize1, f1, 4);
> +
> +                       f2 = vsetq_lane_u16(psize2, f2, 2);
> +                       f2 = vsetq_lane_u16(psize2, f2, 4);
> +
> +                       f3 = vsetq_lane_u16(psize3, f3, 2);
> +                       f3 = vsetq_lane_u16(psize3, f3, 4);
> +               }
>
>                 if (flags & NIX_RX_OFFLOAD_RSS_F) {
>                         /* Fill rss in the rx_descriptor_fields1 */
> @@ -805,10 +842,6 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
>                 }
>
>                 if (flags & NIX_RX_OFFLOAD_VLAN_STRIP_F) {
> -                       uint64_t cq0_w2 = *(uint64_t *)(cq0 + CQE_SZ(0) + 16);
> -                       uint64_t cq1_w2 = *(uint64_t *)(cq0 + CQE_SZ(1) + 16);
> -                       uint64_t cq2_w2 = *(uint64_t *)(cq0 + CQE_SZ(2) + 16);
> -                       uint64_t cq3_w2 = *(uint64_t *)(cq0 + CQE_SZ(3) + 16);
>
>                         ol_flags0 = nix_vlan_update(cq0_w2, ol_flags0, &f0);
>                         ol_flags1 = nix_vlan_update(cq1_w2, ol_flags1, &f1);
> --
> 2.35.1
>

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-02-24 20:40 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-24 13:52 [PATCH 1/2] net/cnxk: optimize Rx pktsize extraction pbhagavatula
2022-02-24 13:52 ` [PATCH 2/2] event/cnxk: align perfetchs to CN10K cache model pbhagavatula
2022-02-24 16:10 ` [PATCH v2 1/2] net/cnxk: optimize Rx pktsize extraction pbhagavatula
2022-02-24 16:10   ` [PATCH v2 2/2] net/cnxk: align perfetchs to CN10K cache model pbhagavatula
2022-02-24 18:40   ` [dpdk-dev] [PATCH v3 1/2] net/cnxk: optimize Rx packet size extraction jerinj
2022-02-24 18:40     ` [dpdk-dev] [PATCH v3 2/2] net/cnxk: align perfetchs to CN10K cache model jerinj
2022-02-24 20:39     ` [dpdk-dev] [PATCH v3 1/2] net/cnxk: optimize Rx packet size extraction Jerin Jacob

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).