DPDK patches and discussions
* [PATCH 1/2] net/octeon_ep: improve Rx performance
@ 2024-01-21 16:43 pbhagavatula
  2024-01-21 16:43 ` [PATCH 2/2] net/octeon_ep: add Rx NEON routine pbhagavatula
  2024-02-01 22:23 ` [PATCH v2 1/2] net/octeon_ep: improve Rx performance pbhagavatula
  0 siblings, 2 replies; 16+ messages in thread
From: pbhagavatula @ 2024-01-21 16:43 UTC (permalink / raw)
  To: jerinj, Vamsi Attunuru, Bruce Richardson, Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use the mempool API instead of pktmbuf alloc to avoid resetting each
mbuf, since the reset is done anyway by rearm on receive.
Reorder the refill to avoid unnecessary write commits on mbuf data.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
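 For illustration, the difference between the two allocation paths in a
 minimal sketch (only the rte_* calls are real DPDK APIs; the wrapper
 names are hypothetical):

    #include <rte_mbuf.h>
    #include <rte_mempool.h>

    /* Old path: every mbuf is fully reset (refcnt, nb_segs, data_off, ...)
     * even though the Rx rearm step rewrites those fields anyway. */
    static int
    refill_with_reset(struct rte_mempool *mp, struct rte_mbuf **bufs,
                      unsigned int n)
    {
        return rte_pktmbuf_alloc_bulk(mp, bufs, n);
    }

    /* New path: raw objects from the pool; the driver rewrites rearm_data
     * and the length fields itself when the packet is received. */
    static int
    refill_raw(struct rte_mempool *mp, struct rte_mbuf **bufs,
               unsigned int n)
    {
        return rte_mempool_get_bulk(mp, (void **)bufs, n);
    }
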
 drivers/net/octeon_ep/cnxk_ep_rx.c     |  4 +--
 drivers/net/octeon_ep/cnxk_ep_rx.h     | 13 ++++++---
 drivers/net/octeon_ep/cnxk_ep_rx_avx.c | 20 +++++++-------
 drivers/net/octeon_ep/cnxk_ep_rx_sse.c | 38 ++++++++++++++------------
 drivers/net/octeon_ep/otx_ep_rxtx.h    |  2 +-
 5 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.c b/drivers/net/octeon_ep/cnxk_ep_rx.c
index f3e4fb27d1..7465e0a017 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx.c
+++ b/drivers/net/octeon_ep/cnxk_ep_rx.c
@@ -76,12 +76,12 @@ cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint16_t new_pkts;
 
 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
-	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
-
 	/* Refill RX buffers */
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
 		cnxk_ep_rx_refill(droq);
 
+	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
+
 	return new_pkts;
 }
 
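 The reorder runs the refill (descriptor-ring stores and, when the
 threshold is crossed, the doorbell write) before packet processing, so
 those write commits are not interleaved with the per-mbuf header stores
 that processing performs. Compactly, the receive path after this change
 (a simplified restatement of the function above, not additional code):

    static uint16_t
    recv_burst_sketch(struct otx_ep_droq *droq, struct rte_mbuf **pkts,
                      uint16_t nb_pkts)
    {
        uint16_t avail = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);

        if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
            cnxk_ep_rx_refill(droq);      /* descriptor writes first */

        cnxk_ep_process_pkts_scalar(pkts, droq, avail); /* mbuf writes last */
        return avail;
    }
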
diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.h b/drivers/net/octeon_ep/cnxk_ep_rx.h
index e71fc0de5c..61263e651e 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx.h
+++ b/drivers/net/octeon_ep/cnxk_ep_rx.h
@@ -21,13 +21,16 @@ cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
 	uint32_t i;
 	int rc;
 
-	rc = rte_pktmbuf_alloc_bulk(droq->mpool, &recv_buf_list[refill_idx], count);
+	rc = rte_mempool_get_bulk(droq->mpool, (void **)&recv_buf_list[refill_idx], count);
 	if (unlikely(rc)) {
 		droq->stats.rx_alloc_failure++;
 		return rc;
 	}
 
 	for (i = 0; i < count; i++) {
+		rte_prefetch_non_temporal(&desc_ring[(refill_idx + 1) & 3]);
+		if (i < count - 1)
+			rte_prefetch_non_temporal(recv_buf_list[refill_idx + 1]);
 		buf = recv_buf_list[refill_idx];
 		desc_ring[refill_idx].buffer_ptr = rte_mbuf_data_iova_default(buf);
 		refill_idx++;
@@ -42,9 +45,9 @@ cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
 static inline void
 cnxk_ep_rx_refill(struct otx_ep_droq *droq)
 {
-	uint32_t desc_refilled = 0, count;
-	uint32_t nb_desc = droq->nb_desc;
+	const uint32_t nb_desc = droq->nb_desc;
 	uint32_t refill_idx = droq->refill_idx;
+	uint32_t desc_refilled = 0, count;
 	int rc;
 
 	if (unlikely(droq->read_idx == refill_idx))
@@ -128,6 +131,8 @@ cnxk_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)
 	return RTE_MIN(nb_pkts, droq->pkts_pending);
 }
 
+#define cnxk_pktmbuf_mtod(m, t) ((t)(void *)((char *)(m)->buf_addr + RTE_PKTMBUF_HEADROOM))
+
 static __rte_always_inline void
 cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
 {
@@ -147,7 +152,7 @@ cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
 			      void *));
 
 		mbuf = recv_buf_list[read_idx];
-		info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+		info = cnxk_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
 		read_idx = otx_ep_incr_index(read_idx, 1, nb_desc);
 		pkt_len = rte_bswap16(info->length >> 48);
 		mbuf->pkt_len = pkt_len;
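 The new cnxk_pktmbuf_mtod() exists because rte_pktmbuf_mtod() reads the
 data offset from the mbuf itself, which is only valid on a reset mbuf;
 the raw objects returned by rte_mempool_get_bulk() have not been reset
 yet. Roughly (the first line paraphrases the standard macro):

    /* rte_pktmbuf_mtod(m, t) ~ (t)((char *)(m)->buf_addr + (m)->data_off) */

    /* The driver variant assumes the default headroom instead, matching
     * rte_mbuf_data_iova_default() used to program the descriptor ring: */
    #define cnxk_pktmbuf_mtod(m, t) \
        ((t)(void *)((char *)(m)->buf_addr + RTE_PKTMBUF_HEADROOM))
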
diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_avx.c b/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
index ae4615e6da..47eb1d2ef7 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
@@ -49,7 +49,7 @@ cnxk_ep_process_pkts_vec_avx(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
 		/* Load rearm data and packet length for shuffle. */
 		for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++)
 			data[i] = _mm256_set_epi64x(0,
-				rte_pktmbuf_mtod(m[i], struct otx_ep_droq_info *)->length >> 16,
+				cnxk_pktmbuf_mtod(m[i], struct otx_ep_droq_info *)->length >> 16,
 				0, rearm_data);
 
 		/* Shuffle data to its place and sum the packet length. */
@@ -81,15 +81,15 @@ cnxk_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;
 
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
 	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
 	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
 	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
 
-	/* Refill RX buffers */
-	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
-		cnxk_ep_rx_refill(droq);
-
 	return new_pkts;
 }
 
@@ -99,11 +99,6 @@ cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;
 
-	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
-	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
-	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
-	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
-
 	/* Refill RX buffers */
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
 		cnxk_ep_rx_refill(droq);
@@ -119,5 +114,10 @@ cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 		rte_write32(0, droq->pkts_credit_reg);
 	}
 
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
+	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
 	return new_pkts;
 }
diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_sse.c b/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
index 67c0c1c862..308c8b2288 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
@@ -18,13 +18,15 @@ static __rte_always_inline void
 cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
 {
 	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
-	uint32_t bytes_rsvd = 0, read_idx = droq->read_idx;
-	uint32_t idx0, idx1, idx2, idx3;
+	uint32_t read_idx = droq->read_idx;
 	struct rte_mbuf *m0, *m1, *m2, *m3;
 	uint16_t nb_desc = droq->nb_desc;
+	uint32_t idx0, idx1, idx2, idx3;
 	uint16_t pkts = 0;
+	__m128i bytes;
 
 	idx0 = read_idx;
+	bytes = _mm_setzero_si128();
 	while (pkts < new_pkts) {
 		const __m128i bswap_mask = _mm_set_epi8(0xFF, 0xFF, 12, 13, 0xFF, 0xFF, 8, 9, 0xFF,
 							0xFF, 4, 5, 0xFF, 0xFF, 0, 1);
@@ -42,14 +44,14 @@ cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
 		m3 = recv_buf_list[idx3];
 
 		/* Load packet size big-endian. */
-		s01 = _mm_set_epi32(rte_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
-				    rte_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
-				    rte_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
-				    rte_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48);
+		s01 = _mm_set_epi32(cnxk_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
+				    cnxk_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
+				    cnxk_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
+				    cnxk_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48);
 		/* Convert to little-endian. */
 		s01 = _mm_shuffle_epi8(s01, bswap_mask);
-		/* Horizontal add. */
-		bytes_rsvd += hadd(s01);
+		/* Vertical add, consolidate outside loop */
+		bytes = _mm_add_epi32(bytes, s01);
 		/* Segregate to packet length and data length. */
 		s23 = _mm_shuffle_epi32(s01, _MM_SHUFFLE(3, 3, 1, 1));
 		s01 = _mm_shuffle_epi8(s01, cpy_mask);
@@ -79,7 +81,7 @@ cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
 	droq->pkts_pending -= new_pkts;
 	/* Stats */
 	droq->stats.pkts_received += new_pkts;
-	droq->stats.bytes_received += bytes_rsvd;
+	droq->stats.bytes_received += hadd(bytes);
 }
 
 uint16_t __rte_noinline __rte_hot
@@ -88,15 +90,15 @@ cnxk_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;
 
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
 	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
 	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
 	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
 
-	/* Refill RX buffers */
-	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
-		cnxk_ep_rx_refill(droq);
-
 	return new_pkts;
 }
 
@@ -106,11 +108,6 @@ cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;
 
-	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
-	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
-	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
-	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
-
 	/* Refill RX buffers */
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
 		cnxk_ep_rx_refill(droq);
@@ -126,5 +123,10 @@ cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 		rte_write32(0, droq->pkts_credit_reg);
 	}
 
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
+	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
 	return new_pkts;
 }
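
 Moving from a horizontal add per iteration to a vector accumulator
 defers the lane reduction to a single hadd() after the loop. The same
 idea as a standalone sketch (hypothetical helper, SSE2 intrinsics):

    #include <stdint.h>
    #include <emmintrin.h>

    /* Sum n 32-bit lengths (n a multiple of 4), reducing lanes once. */
    static uint32_t
    sum_lengths(const uint32_t *len, unsigned int n)
    {
        __m128i acc = _mm_setzero_si128();
        unsigned int i;

        for (i = 0; i < n; i += 4)
            acc = _mm_add_epi32(acc,
                    _mm_loadu_si128((const __m128i *)&len[i]));

        /* Horizontal reduction, done once outside the loop. */
        acc = _mm_add_epi32(acc, _mm_shuffle_epi32(acc, _MM_SHUFFLE(1, 0, 3, 2)));
        acc = _mm_add_epi32(acc, _mm_shuffle_epi32(acc, _MM_SHUFFLE(2, 3, 0, 1)));
        return (uint32_t)_mm_cvtsi128_si32(acc);
    }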
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
index 0adcbc7814..8f306bd94e 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.h
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
@@ -21,7 +21,7 @@
 
 /* SDP_LENGTH_S specifies packet length and is of 8-byte size */
 #define OTX_EP_INFO_SIZE 8
-#define DROQ_REFILL_THRESHOLD 16
+#define DROQ_REFILL_THRESHOLD  64
 #define OTX2_SDP_REQUEST_ISM   (0x1ULL << 63)
 
 static inline uint32_t
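
 Raising DROQ_REFILL_THRESHOLD from 16 to 64 batches more buffers into
 each rte_mempool_get_bulk() call. With a hypothetical 4096-entry ring,
 for example:

    4096 / 16 = 256 refill invocations per full ring turn
    4096 / 64 =  64 refill invocations per full ring turn

 and a 64-buffer burst still fits comfortably in a typical per-lcore
 mempool cache.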
-- 
2.25.1



* [PATCH 2/2] net/octeon_ep: add Rx NEON routine
  2024-01-21 16:43 [PATCH 1/2] net/octeon_ep: improve Rx performance pbhagavatula
@ 2024-01-21 16:43 ` pbhagavatula
  2024-02-01 16:38   ` Jerin Jacob
  2024-02-01 22:23 ` [PATCH v2 1/2] net/octeon_ep: improve Rx performance pbhagavatula
  1 sibling, 1 reply; 16+ messages in thread
From: pbhagavatula @ 2024-01-21 16:43 UTC (permalink / raw)
  To: jerinj, Ruifeng Wang, Vamsi Attunuru; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add Rx ARM NEON SIMD routine.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/net/octeon_ep/cnxk_ep_rx_neon.c | 140 ++++++++++++++++++++++++
 drivers/net/octeon_ep/meson.build       |   6 +-
 drivers/net/octeon_ep/otx_ep_ethdev.c   |   5 +-
 drivers/net/octeon_ep/otx_ep_rxtx.h     |   6 +
 4 files changed, 155 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/octeon_ep/cnxk_ep_rx_neon.c

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_neon.c b/drivers/net/octeon_ep/cnxk_ep_rx_neon.c
new file mode 100644
index 0000000000..b13a5897f9
--- /dev/null
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_neon.c
@@ -0,0 +1,140 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell.
+ */
+
+#include "cnxk_ep_rx.h"
+
+static __rte_always_inline void
+cnxk_ep_process_pkts_vec_neon(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
+			      uint16_t new_pkts)
+{
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t pidx0, pidx1, pidx2, pidx3;
+	struct rte_mbuf *m0, *m1, *m2, *m3;
+	uint32_t read_idx = droq->read_idx;
+	uint16_t nb_desc = droq->nb_desc;
+	uint32_t idx0, idx1, idx2, idx3;
+	uint32x4_t bytes;
+	uint16_t pkts = 0;
+
+	idx0 = read_idx;
+	bytes = vdupq_n_u32(0);
+	while (pkts < new_pkts) {
+		const uint8x16_t mask0 = {0, 1, 0xff, 0xff, 0, 1, 0xff, 0xff,
+					  4, 5, 0xff, 0xff, 4, 5, 0xff, 0xff};
+		const uint8x16_t mask1 = {8,  9,  0xff, 0xff, 8,  9,  0xff, 0xff,
+					  12, 13, 0xff, 0xff, 12, 13, 0xff, 0xff};
+		uint64x2_t s01, s23;
+
+		idx1 = otx_ep_incr_index(idx0, 1, nb_desc);
+		idx2 = otx_ep_incr_index(idx1, 1, nb_desc);
+		idx3 = otx_ep_incr_index(idx2, 1, nb_desc);
+
+		if (new_pkts - pkts > 4) {
+			pidx0 = otx_ep_incr_index(idx3, 1, nb_desc);
+			pidx1 = otx_ep_incr_index(pidx0, 1, nb_desc);
+			pidx2 = otx_ep_incr_index(pidx1, 1, nb_desc);
+			pidx3 = otx_ep_incr_index(pidx2, 1, nb_desc);
+
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx0], void *));
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx1], void *));
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx2], void *));
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx3], void *));
+		}
+
+		m0 = recv_buf_list[idx0];
+		m1 = recv_buf_list[idx1];
+		m2 = recv_buf_list[idx2];
+		m3 = recv_buf_list[idx3];
+
+		/* Load packet size big-endian. */
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 0);
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 1);
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 2);
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 3);
+		/* Convert to little-endian. */
+		s01 = vrev16q_u8(s01);
+
+		/* Vertical add, consolidate outside the loop. */
+		bytes += vaddq_u32(bytes, s01);
+		/* Segregate to packet length and data length. */
+		s23 = vqtbl1q_u8(s01, mask1);
+		s01 = vqtbl1q_u8(s01, mask0);
+
+		/* Store packet length and data length to mbuf. */
+		*(uint64_t *)&m0->pkt_len = vgetq_lane_u64(s01, 0);
+		*(uint64_t *)&m1->pkt_len = vgetq_lane_u64(s01, 1);
+		*(uint64_t *)&m2->pkt_len = vgetq_lane_u64(s23, 0);
+		*(uint64_t *)&m3->pkt_len = vgetq_lane_u64(s23, 1);
+
+		/* Reset rearm data. */
+		*(uint64_t *)&m0->rearm_data = droq->rearm_data;
+		*(uint64_t *)&m1->rearm_data = droq->rearm_data;
+		*(uint64_t *)&m2->rearm_data = droq->rearm_data;
+		*(uint64_t *)&m3->rearm_data = droq->rearm_data;
+
+		rx_pkts[pkts++] = m0;
+		rx_pkts[pkts++] = m1;
+		rx_pkts[pkts++] = m2;
+		rx_pkts[pkts++] = m3;
+		idx0 = otx_ep_incr_index(idx3, 1, nb_desc);
+	}
+	droq->read_idx = idx0;
+
+	droq->refill_count += new_pkts;
+	droq->pkts_pending -= new_pkts;
+	/* Stats */
+	droq->stats.pkts_received += new_pkts;
+	droq->stats.bytes_received += vaddvq_u32(bytes);
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts, vpkts;
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
+	cnxk_ep_process_pkts_vec_neon(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cn9k_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts, vpkts;
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
+		cnxk_ep_rx_refill(droq);
+	} else {
+		/* SDP output goes into DROP state when output doorbell count
+		 * goes below drop count. When doorbell count is written with
+		 * a value greater than drop count SDP output should come out
+		 * of DROP state. Due to a race condition this is not happening.
+		 * Writing doorbell register with 0 again may make SDP output
+		 * come out of this state.
+		 */
+
+		rte_write32(0, droq->pkts_credit_reg);
+	}
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
+	cnxk_ep_process_pkts_vec_neon(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
+	return new_pkts;
+}
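
 Once the lengths are byte-swapped, the two table lookups (mask0/mask1)
 copy each 16-bit length into both the pkt_len and data_len positions of
 the 64-bit word later stored at &m->pkt_len. A scalar equivalent for a
 single mbuf (hypothetical helper; the vector routine handles four per
 iteration):

    static void
    set_lens(struct rte_mbuf *m, const struct otx_ep_droq_info *info)
    {
        uint16_t len = rte_bswap16((uint16_t)(info->length >> 48));

        m->pkt_len  = len; /* 32-bit field; the mask zeroes the upper bytes */
        m->data_len = len; /* 16-bit field */
    }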
diff --git a/drivers/net/octeon_ep/meson.build b/drivers/net/octeon_ep/meson.build
index e8ae56018d..d5d40b23a1 100644
--- a/drivers/net/octeon_ep/meson.build
+++ b/drivers/net/octeon_ep/meson.build
@@ -29,7 +29,11 @@ if arch_subdir == 'x86'
     endif
 endif
 
-extra_flags = ['-Wno-strict-aliasing']
+if arch_subdir == 'arm'
+    sources += files('cnxk_ep_rx_neon.c')
+endif
+
+extra_flags = ['-Wno-strict-aliasing', '-flax-vector-conversions']
 foreach flag: extra_flags
     if cc.has_argument(flag)
         cflags += flag
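
 The extra -flax-vector-conversions flag lets the NEON code assign
 between differently typed vectors, e.g. the uint8x16_t result of
 vrev16q_u8() into a uint64x2_t. Without it, each such assignment would
 need an explicit reinterpret along the lines of:

    s01 = vreinterpretq_u64_u8(vrev16q_u8(vreinterpretq_u8_u64(s01)));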
diff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c
index 42a97ea110..8daa7d225c 100644
--- a/drivers/net/octeon_ep/otx_ep_ethdev.c
+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c
@@ -59,6 +59,8 @@ otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
 			eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_avx;
 #endif
+#elif defined(RTE_ARCH_ARM64)
+		eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_neon;
 #endif
 		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
 			eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_mseg;
@@ -71,8 +73,9 @@ otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
 			eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_avx;
 #endif
+#elif defined(RTE_ARCH_ARM64)
+		eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_neon;
 #endif
-
 		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
 			eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_mseg;
 	} else {
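
 Note that the Arm selection is purely compile-time: NEON is a mandatory
 part of ARMv8-A, so no runtime CPU-flag check is needed, unlike the
 AVX2 path on x86. In outline (simplified from the function above):

    #if defined(RTE_ARCH_X86)
        if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
            eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_avx; /* runtime check */
    #elif defined(RTE_ARCH_ARM64)
        eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_neon;    /* always valid */
    #endif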
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
index 8f306bd94e..f5bc807dc0 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.h
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
@@ -60,12 +60,18 @@ cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budge
 uint16_t
 cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 
+uint16_t
+cnxk_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
+
 uint16_t
 cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 
 uint16_t
 cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 
+uint16_t
+cn9k_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
+
 uint16_t
 cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 #endif /* _OTX_EP_RXTX_H_ */
-- 
2.25.1



* Re: [PATCH 2/2] net/octeon_ep: add Rx NEON routine
  2024-01-21 16:43 ` [PATCH 2/2] net/octeon_ep: add Rx NEON routine pbhagavatula
@ 2024-02-01 16:38   ` Jerin Jacob
  0 siblings, 0 replies; 16+ messages in thread
From: Jerin Jacob @ 2024-02-01 16:38 UTC (permalink / raw)
  To: pbhagavatula; +Cc: jerinj, Ruifeng Wang, Vamsi Attunuru, dev

On Sun, Jan 21, 2024 at 10:13 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Add Rx ARM NEON SIMD routine.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>

Please fix https://mails.dpdk.org/archives/test-report/2024-January/559746.html
https://patches.dpdk.org/project/dpdk/patch/20240121164334.9269-2-pbhagavatula@marvell.com/


* [PATCH v2 1/2] net/octeon_ep: improve Rx performance
  2024-01-21 16:43 [PATCH 1/2] net/octeon_ep: improve Rx performance pbhagavatula
  2024-01-21 16:43 ` [PATCH 2/2] net/octeon_ep: add Rx NEON routine pbhagavatula
@ 2024-02-01 22:23 ` pbhagavatula
  2024-02-01 22:23   ` [PATCH v2 2/2] net/octeon_ep: add Rx NEON routine pbhagavatula
  2024-02-02  8:43   ` [PATCH v3 1/2] net/octeon_ep: improve Rx performance pbhagavatula
  1 sibling, 2 replies; 16+ messages in thread
From: pbhagavatula @ 2024-02-01 22:23 UTC (permalink / raw)
  To: jerinj, Vamsi Attunuru, Bruce Richardson, Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use the mempool API instead of pktmbuf alloc to avoid resetting each
mbuf, since the reset is done anyway by rearm on receive.
Reorder the refill to avoid unnecessary write commits on mbuf data.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 v2 Changes:
 - Fix compilation.

 drivers/net/octeon_ep/cnxk_ep_rx.c     |  4 +--
 drivers/net/octeon_ep/cnxk_ep_rx.h     | 13 ++++++---
 drivers/net/octeon_ep/cnxk_ep_rx_avx.c | 20 +++++++-------
 drivers/net/octeon_ep/cnxk_ep_rx_sse.c | 38 ++++++++++++++------------
 drivers/net/octeon_ep/otx_ep_rxtx.h    |  2 +-
 5 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.c b/drivers/net/octeon_ep/cnxk_ep_rx.c
index f3e4fb27d1..7465e0a017 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx.c
+++ b/drivers/net/octeon_ep/cnxk_ep_rx.c
@@ -76,12 +76,12 @@ cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint16_t new_pkts;

 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
-	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
-
 	/* Refill RX buffers */
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
 		cnxk_ep_rx_refill(droq);

+	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
+
 	return new_pkts;
 }

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.h b/drivers/net/octeon_ep/cnxk_ep_rx.h
index e71fc0de5c..61263e651e 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx.h
+++ b/drivers/net/octeon_ep/cnxk_ep_rx.h
@@ -21,13 +21,16 @@ cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
 	uint32_t i;
 	int rc;

-	rc = rte_pktmbuf_alloc_bulk(droq->mpool, &recv_buf_list[refill_idx], count);
+	rc = rte_mempool_get_bulk(droq->mpool, (void **)&recv_buf_list[refill_idx], count);
 	if (unlikely(rc)) {
 		droq->stats.rx_alloc_failure++;
 		return rc;
 	}

 	for (i = 0; i < count; i++) {
+		rte_prefetch_non_temporal(&desc_ring[(refill_idx + 1) & 3]);
+		if (i < count - 1)
+			rte_prefetch_non_temporal(recv_buf_list[refill_idx + 1]);
 		buf = recv_buf_list[refill_idx];
 		desc_ring[refill_idx].buffer_ptr = rte_mbuf_data_iova_default(buf);
 		refill_idx++;
@@ -42,9 +45,9 @@ cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
 static inline void
 cnxk_ep_rx_refill(struct otx_ep_droq *droq)
 {
-	uint32_t desc_refilled = 0, count;
-	uint32_t nb_desc = droq->nb_desc;
+	const uint32_t nb_desc = droq->nb_desc;
 	uint32_t refill_idx = droq->refill_idx;
+	uint32_t desc_refilled = 0, count;
 	int rc;

 	if (unlikely(droq->read_idx == refill_idx))
@@ -128,6 +131,8 @@ cnxk_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)
 	return RTE_MIN(nb_pkts, droq->pkts_pending);
 }

+#define cnxk_pktmbuf_mtod(m, t) ((t)(void *)((char *)(m)->buf_addr + RTE_PKTMBUF_HEADROOM))
+
 static __rte_always_inline void
 cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
 {
@@ -147,7 +152,7 @@ cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
 			      void *));

 		mbuf = recv_buf_list[read_idx];
-		info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+		info = cnxk_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
 		read_idx = otx_ep_incr_index(read_idx, 1, nb_desc);
 		pkt_len = rte_bswap16(info->length >> 48);
 		mbuf->pkt_len = pkt_len;
diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_avx.c b/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
index ae4615e6da..47eb1d2ef7 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
@@ -49,7 +49,7 @@ cnxk_ep_process_pkts_vec_avx(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
 		/* Load rearm data and packet length for shuffle. */
 		for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++)
 			data[i] = _mm256_set_epi64x(0,
-				rte_pktmbuf_mtod(m[i], struct otx_ep_droq_info *)->length >> 16,
+				cnxk_pktmbuf_mtod(m[i], struct otx_ep_droq_info *)->length >> 16,
 				0, rearm_data);

 		/* Shuffle data to its place and sum the packet length. */
@@ -81,15 +81,15 @@ cnxk_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
 	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
 	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
 	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);

-	/* Refill RX buffers */
-	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
-		cnxk_ep_rx_refill(droq);
-
 	return new_pkts;
 }

@@ -99,11 +99,6 @@ cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

-	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
-	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
-	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
-	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
-
 	/* Refill RX buffers */
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
 		cnxk_ep_rx_refill(droq);
@@ -119,5 +114,10 @@ cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 		rte_write32(0, droq->pkts_credit_reg);
 	}

+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
+	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
 	return new_pkts;
 }
diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_sse.c b/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
index 67c0c1c862..308c8b2288 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
@@ -18,13 +18,15 @@ static __rte_always_inline void
 cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
 {
 	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
-	uint32_t bytes_rsvd = 0, read_idx = droq->read_idx;
-	uint32_t idx0, idx1, idx2, idx3;
+	uint32_t read_idx = droq->read_idx;
 	struct rte_mbuf *m0, *m1, *m2, *m3;
 	uint16_t nb_desc = droq->nb_desc;
+	uint32_t idx0, idx1, idx2, idx3;
 	uint16_t pkts = 0;
+	__m128i bytes;

 	idx0 = read_idx;
+	bytes = _mm_setzero_si128();
 	while (pkts < new_pkts) {
 		const __m128i bswap_mask = _mm_set_epi8(0xFF, 0xFF, 12, 13, 0xFF, 0xFF, 8, 9, 0xFF,
 							0xFF, 4, 5, 0xFF, 0xFF, 0, 1);
@@ -42,14 +44,14 @@ cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
 		m3 = recv_buf_list[idx3];

 		/* Load packet size big-endian. */
-		s01 = _mm_set_epi32(rte_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
-				    rte_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
-				    rte_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
-				    rte_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48);
+		s01 = _mm_set_epi32(cnxk_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
+				    cnxk_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
+				    cnxk_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
+				    cnxk_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48);
 		/* Convert to little-endian. */
 		s01 = _mm_shuffle_epi8(s01, bswap_mask);
-		/* Horizontal add. */
-		bytes_rsvd += hadd(s01);
+		/* Vertical add, consolidate outside loop */
+		bytes = _mm_add_epi32(bytes, s01);
 		/* Segregate to packet length and data length. */
 		s23 = _mm_shuffle_epi32(s01, _MM_SHUFFLE(3, 3, 1, 1));
 		s01 = _mm_shuffle_epi8(s01, cpy_mask);
@@ -79,7 +81,7 @@ cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
 	droq->pkts_pending -= new_pkts;
 	/* Stats */
 	droq->stats.pkts_received += new_pkts;
-	droq->stats.bytes_received += bytes_rsvd;
+	droq->stats.bytes_received += hadd(bytes);
 }

 uint16_t __rte_noinline __rte_hot
@@ -88,15 +90,15 @@ cnxk_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
 	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
 	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
 	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);

-	/* Refill RX buffers */
-	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
-		cnxk_ep_rx_refill(droq);
-
 	return new_pkts;
 }

@@ -106,11 +108,6 @@ cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

-	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
-	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
-	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
-	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
-
 	/* Refill RX buffers */
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
 		cnxk_ep_rx_refill(droq);
@@ -126,5 +123,10 @@ cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 		rte_write32(0, droq->pkts_credit_reg);
 	}

+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
+	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
 	return new_pkts;
 }
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
index 0adcbc7814..8f306bd94e 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.h
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
@@ -21,7 +21,7 @@

 /* SDP_LENGTH_S specifies packet length and is of 8-byte size */
 #define OTX_EP_INFO_SIZE 8
-#define DROQ_REFILL_THRESHOLD 16
+#define DROQ_REFILL_THRESHOLD  64
 #define OTX2_SDP_REQUEST_ISM   (0x1ULL << 63)

 static inline uint32_t
--
2.25.1



* [PATCH v2 2/2] net/octeon_ep: add Rx NEON routine
  2024-02-01 22:23 ` [PATCH v2 1/2] net/octeon_ep: improve Rx performance pbhagavatula
@ 2024-02-01 22:23   ` pbhagavatula
  2024-02-02  8:11     ` Jerin Jacob
  2024-02-02  8:43   ` [PATCH v3 1/2] net/octeon_ep: improve Rx performance pbhagavatula
  1 sibling, 1 reply; 16+ messages in thread
From: pbhagavatula @ 2024-02-01 22:23 UTC (permalink / raw)
  To: jerinj, Ruifeng Wang, Vamsi Attunuru; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add Rx ARM NEON SIMD routine.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/net/octeon_ep/cnxk_ep_rx_neon.c | 141 ++++++++++++++++++++++++
 drivers/net/octeon_ep/meson.build       |   6 +-
 drivers/net/octeon_ep/otx_ep_ethdev.c   |   5 +-
 drivers/net/octeon_ep/otx_ep_rxtx.h     |   6 +
 4 files changed, 156 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/octeon_ep/cnxk_ep_rx_neon.c

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_neon.c b/drivers/net/octeon_ep/cnxk_ep_rx_neon.c
new file mode 100644
index 0000000000..1f6f27689b
--- /dev/null
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_neon.c
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell.
+ */
+
+#include "cnxk_ep_rx.h"
+
+static __rte_always_inline void
+cnxk_ep_process_pkts_vec_neon(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
+			      uint16_t new_pkts)
+{
+	const uint8x16_t mask0 = {0, 1, 0xff, 0xff, 0, 1, 0xff, 0xff,
+				  4, 5, 0xff, 0xff, 4, 5, 0xff, 0xff};
+	const uint8x16_t mask1 = {8,  9,  0xff, 0xff, 8,  9,  0xff, 0xff,
+				  12, 13, 0xff, 0xff, 12, 13, 0xff, 0xff};
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t pidx0, pidx1, pidx2, pidx3;
+	struct rte_mbuf *m0, *m1, *m2, *m3;
+	uint32_t read_idx = droq->read_idx;
+	uint16_t nb_desc = droq->nb_desc;
+	uint32_t idx0, idx1, idx2, idx3;
+	uint64x2_t s01, s23;
+	uint32x4_t bytes;
+	uint16_t pkts = 0;
+
+	idx0 = read_idx;
+	s01 = vdupq_n_u64(0);
+	bytes = vdupq_n_u32(0);
+	while (pkts < new_pkts) {
+
+		idx1 = otx_ep_incr_index(idx0, 1, nb_desc);
+		idx2 = otx_ep_incr_index(idx1, 1, nb_desc);
+		idx3 = otx_ep_incr_index(idx2, 1, nb_desc);
+
+		if (new_pkts - pkts > 4) {
+			pidx0 = otx_ep_incr_index(idx3, 1, nb_desc);
+			pidx1 = otx_ep_incr_index(pidx0, 1, nb_desc);
+			pidx2 = otx_ep_incr_index(pidx1, 1, nb_desc);
+			pidx3 = otx_ep_incr_index(pidx2, 1, nb_desc);
+
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx0], void *));
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx1], void *));
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx2], void *));
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx3], void *));
+		}
+
+		m0 = recv_buf_list[idx0];
+		m1 = recv_buf_list[idx1];
+		m2 = recv_buf_list[idx2];
+		m3 = recv_buf_list[idx3];
+
+		/* Load packet size big-endian. */
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 0);
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 1);
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 2);
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 3);
+		/* Convert to little-endian. */
+		s01 = vrev16q_u8(s01);
+
+		/* Vertical add, consolidate outside the loop. */
+		bytes += vaddq_u32(bytes, s01);
+		/* Segregate to packet length and data length. */
+		s23 = vqtbl1q_u8(s01, mask1);
+		s01 = vqtbl1q_u8(s01, mask0);
+
+		/* Store packet length and data length to mbuf. */
+		*(uint64_t *)&m0->pkt_len = vgetq_lane_u64(s01, 0);
+		*(uint64_t *)&m1->pkt_len = vgetq_lane_u64(s01, 1);
+		*(uint64_t *)&m2->pkt_len = vgetq_lane_u64(s23, 0);
+		*(uint64_t *)&m3->pkt_len = vgetq_lane_u64(s23, 1);
+
+		/* Reset rearm data. */
+		*(uint64_t *)&m0->rearm_data = droq->rearm_data;
+		*(uint64_t *)&m1->rearm_data = droq->rearm_data;
+		*(uint64_t *)&m2->rearm_data = droq->rearm_data;
+		*(uint64_t *)&m3->rearm_data = droq->rearm_data;
+
+		rx_pkts[pkts++] = m0;
+		rx_pkts[pkts++] = m1;
+		rx_pkts[pkts++] = m2;
+		rx_pkts[pkts++] = m3;
+		idx0 = otx_ep_incr_index(idx3, 1, nb_desc);
+	}
+	droq->read_idx = idx0;
+
+	droq->refill_count += new_pkts;
+	droq->pkts_pending -= new_pkts;
+	/* Stats */
+	droq->stats.pkts_received += new_pkts;
+	droq->stats.bytes_received += vaddvq_u32(bytes);
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts, vpkts;
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
+	cnxk_ep_process_pkts_vec_neon(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cn9k_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts, vpkts;
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
+		cnxk_ep_rx_refill(droq);
+	} else {
+		/* SDP output goes into DROP state when output doorbell count
+		 * goes below drop count. When doorbell count is written with
+		 * a value greater than drop count SDP output should come out
+		 * of DROP state. Due to a race condition this is not happening.
+		 * Writing doorbell register with 0 again may make SDP output
+		 * come out of this state.
+		 */
+
+		rte_write32(0, droq->pkts_credit_reg);
+	}
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
+	cnxk_ep_process_pkts_vec_neon(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
+	return new_pkts;
+}
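
 Relative to the previous posting, s01 is now initialized with
 vdupq_n_u64(0) before the loop. vsetq_lane_u32() preserves the other
 lanes of its destination vector, so the first lane insert otherwise
 reads an uninitialized value; that is the likely source of the build
 failure reported against v1. The fix in isolation:

    uint64x2_t s01 = vdupq_n_u64(0); /* defined before the vsetq_lane_u32() chain */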
diff --git a/drivers/net/octeon_ep/meson.build b/drivers/net/octeon_ep/meson.build
index e8ae56018d..d5d40b23a1 100644
--- a/drivers/net/octeon_ep/meson.build
+++ b/drivers/net/octeon_ep/meson.build
@@ -29,7 +29,11 @@ if arch_subdir == 'x86'
     endif
 endif
 
-extra_flags = ['-Wno-strict-aliasing']
+if arch_subdir == 'arm'
+    sources += files('cnxk_ep_rx_neon.c')
+endif
+
+extra_flags = ['-Wno-strict-aliasing', '-flax-vector-conversions']
 foreach flag: extra_flags
     if cc.has_argument(flag)
         cflags += flag
diff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c
index 42a97ea110..8daa7d225c 100644
--- a/drivers/net/octeon_ep/otx_ep_ethdev.c
+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c
@@ -59,6 +59,8 @@ otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
 			eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_avx;
 #endif
+#elif defined(RTE_ARCH_ARM64)
+		eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_neon;
 #endif
 		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
 			eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_mseg;
@@ -71,8 +73,9 @@ otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
 			eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_avx;
 #endif
+#elif defined(RTE_ARCH_ARM64)
+		eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_neon;
 #endif
-
 		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
 			eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_mseg;
 	} else {
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
index 8f306bd94e..f5bc807dc0 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.h
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
@@ -60,12 +60,18 @@ cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budge
 uint16_t
 cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 
+uint16_t
+cnxk_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
+
 uint16_t
 cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 
 uint16_t
 cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 
+uint16_t
+cn9k_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
+
 uint16_t
 cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 #endif /* _OTX_EP_RXTX_H_ */
-- 
2.25.1



* Re: [PATCH v2 2/2] net/octeon_ep: add Rx NEON routine
  2024-02-01 22:23   ` [PATCH v2 2/2] net/octeon_ep: add Rx NEON routine pbhagavatula
@ 2024-02-02  8:11     ` Jerin Jacob
  0 siblings, 0 replies; 16+ messages in thread
From: Jerin Jacob @ 2024-02-02  8:11 UTC (permalink / raw)
  To: pbhagavatula; +Cc: jerinj, Ruifeng Wang, Vamsi Attunuru, dev

On Fri, Feb 2, 2024 at 7:29 AM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Add Rx ARM NEON SIMD routine.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>

Please fix https://mails.dpdk.org/archives/test-report/2024-February/568395.html


* [PATCH v3 1/2] net/octeon_ep: improve Rx performance
  2024-02-01 22:23 ` [PATCH v2 1/2] net/octeon_ep: improve Rx performance pbhagavatula
  2024-02-01 22:23   ` [PATCH v2 2/2] net/octeon_ep: add Rx NEON routine pbhagavatula
@ 2024-02-02  8:43   ` pbhagavatula
  2024-02-02  8:43     ` [PATCH v3 2/2] net/octeon_ep: add Rx NEON routine pbhagavatula
  2024-02-02 11:31     ` [PATCH v4 1/2] net/octeon_ep: improve Rx performance pbhagavatula
  1 sibling, 2 replies; 16+ messages in thread
From: pbhagavatula @ 2024-02-02  8:43 UTC (permalink / raw)
  To: jerinj, Vamsi Attunuru, Bruce Richardson, Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use the mempool API instead of pktmbuf alloc to avoid resetting each
mbuf, since the reset is done anyway by rearm on receive.
Reorder the refill to avoid unnecessary write commits on mbuf data.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 v2 Changes:
 - Fix compilation with distro gcc.
 v3 Changes:
 - Fix aarch32 compilation.

 drivers/net/octeon_ep/cnxk_ep_rx.c     |  4 +--
 drivers/net/octeon_ep/cnxk_ep_rx.h     | 13 ++++++---
 drivers/net/octeon_ep/cnxk_ep_rx_avx.c | 20 +++++++-------
 drivers/net/octeon_ep/cnxk_ep_rx_sse.c | 38 ++++++++++++++------------
 drivers/net/octeon_ep/otx_ep_rxtx.h    |  2 +-
 5 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.c b/drivers/net/octeon_ep/cnxk_ep_rx.c
index f3e4fb27d1..7465e0a017 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx.c
+++ b/drivers/net/octeon_ep/cnxk_ep_rx.c
@@ -76,12 +76,12 @@ cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint16_t new_pkts;

 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
-	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
-
 	/* Refill RX buffers */
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
 		cnxk_ep_rx_refill(droq);

+	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
+
 	return new_pkts;
 }

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.h b/drivers/net/octeon_ep/cnxk_ep_rx.h
index e71fc0de5c..61263e651e 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx.h
+++ b/drivers/net/octeon_ep/cnxk_ep_rx.h
@@ -21,13 +21,16 @@ cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
 	uint32_t i;
 	int rc;

-	rc = rte_pktmbuf_alloc_bulk(droq->mpool, &recv_buf_list[refill_idx], count);
+	rc = rte_mempool_get_bulk(droq->mpool, (void **)&recv_buf_list[refill_idx], count);
 	if (unlikely(rc)) {
 		droq->stats.rx_alloc_failure++;
 		return rc;
 	}

 	for (i = 0; i < count; i++) {
+		rte_prefetch_non_temporal(&desc_ring[(refill_idx + 1) & 3]);
+		if (i < count - 1)
+			rte_prefetch_non_temporal(recv_buf_list[refill_idx + 1]);
 		buf = recv_buf_list[refill_idx];
 		desc_ring[refill_idx].buffer_ptr = rte_mbuf_data_iova_default(buf);
 		refill_idx++;
@@ -42,9 +45,9 @@ cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
 static inline void
 cnxk_ep_rx_refill(struct otx_ep_droq *droq)
 {
-	uint32_t desc_refilled = 0, count;
-	uint32_t nb_desc = droq->nb_desc;
+	const uint32_t nb_desc = droq->nb_desc;
 	uint32_t refill_idx = droq->refill_idx;
+	uint32_t desc_refilled = 0, count;
 	int rc;

 	if (unlikely(droq->read_idx == refill_idx))
@@ -128,6 +131,8 @@ cnxk_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)
 	return RTE_MIN(nb_pkts, droq->pkts_pending);
 }

+#define cnxk_pktmbuf_mtod(m, t) ((t)(void *)((char *)(m)->buf_addr + RTE_PKTMBUF_HEADROOM))
+
 static __rte_always_inline void
 cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
 {
@@ -147,7 +152,7 @@ cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
 			      void *));

 		mbuf = recv_buf_list[read_idx];
-		info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+		info = cnxk_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
 		read_idx = otx_ep_incr_index(read_idx, 1, nb_desc);
 		pkt_len = rte_bswap16(info->length >> 48);
 		mbuf->pkt_len = pkt_len;
diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_avx.c b/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
index ae4615e6da..47eb1d2ef7 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
@@ -49,7 +49,7 @@ cnxk_ep_process_pkts_vec_avx(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
 		/* Load rearm data and packet length for shuffle. */
 		for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++)
 			data[i] = _mm256_set_epi64x(0,
-				rte_pktmbuf_mtod(m[i], struct otx_ep_droq_info *)->length >> 16,
+				cnxk_pktmbuf_mtod(m[i], struct otx_ep_droq_info *)->length >> 16,
 				0, rearm_data);

 		/* Shuffle data to its place and sum the packet length. */
@@ -81,15 +81,15 @@ cnxk_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
 	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
 	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
 	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);

-	/* Refill RX buffers */
-	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
-		cnxk_ep_rx_refill(droq);
-
 	return new_pkts;
 }

@@ -99,11 +99,6 @@ cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

-	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
-	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
-	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
-	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
-
 	/* Refill RX buffers */
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
 		cnxk_ep_rx_refill(droq);
@@ -119,5 +114,10 @@ cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 		rte_write32(0, droq->pkts_credit_reg);
 	}

+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
+	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
 	return new_pkts;
 }
diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_sse.c b/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
index 67c0c1c862..308c8b2288 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
@@ -18,13 +18,15 @@ static __rte_always_inline void
 cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
 {
 	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
-	uint32_t bytes_rsvd = 0, read_idx = droq->read_idx;
-	uint32_t idx0, idx1, idx2, idx3;
+	uint32_t read_idx = droq->read_idx;
 	struct rte_mbuf *m0, *m1, *m2, *m3;
 	uint16_t nb_desc = droq->nb_desc;
+	uint32_t idx0, idx1, idx2, idx3;
 	uint16_t pkts = 0;
+	__m128i bytes;

 	idx0 = read_idx;
+	bytes = _mm_setzero_si128();
 	while (pkts < new_pkts) {
 		const __m128i bswap_mask = _mm_set_epi8(0xFF, 0xFF, 12, 13, 0xFF, 0xFF, 8, 9, 0xFF,
 							0xFF, 4, 5, 0xFF, 0xFF, 0, 1);
@@ -42,14 +44,14 @@ cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
 		m3 = recv_buf_list[idx3];

 		/* Load packet size big-endian. */
-		s01 = _mm_set_epi32(rte_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
-				    rte_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
-				    rte_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
-				    rte_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48);
+		s01 = _mm_set_epi32(cnxk_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
+				    cnxk_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
+				    cnxk_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
+				    cnxk_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48);
 		/* Convert to little-endian. */
 		s01 = _mm_shuffle_epi8(s01, bswap_mask);
-		/* Horizontal add. */
-		bytes_rsvd += hadd(s01);
+		/* Vertical add, consolidate outside loop */
+		bytes = _mm_add_epi32(bytes, s01);
 		/* Segregate to packet length and data length. */
 		s23 = _mm_shuffle_epi32(s01, _MM_SHUFFLE(3, 3, 1, 1));
 		s01 = _mm_shuffle_epi8(s01, cpy_mask);
@@ -79,7 +81,7 @@ cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
 	droq->pkts_pending -= new_pkts;
 	/* Stats */
 	droq->stats.pkts_received += new_pkts;
-	droq->stats.bytes_received += bytes_rsvd;
+	droq->stats.bytes_received += hadd(bytes);
 }

 uint16_t __rte_noinline __rte_hot
@@ -88,15 +90,15 @@ cnxk_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
 	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
 	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
 	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);

-	/* Refill RX buffers */
-	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
-		cnxk_ep_rx_refill(droq);
-
 	return new_pkts;
 }

@@ -106,11 +108,6 @@ cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

-	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
-	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
-	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
-	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
-
 	/* Refill RX buffers */
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
 		cnxk_ep_rx_refill(droq);
@@ -126,5 +123,10 @@ cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 		rte_write32(0, droq->pkts_credit_reg);
 	}

+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
+	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
 	return new_pkts;
 }
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
index 0adcbc7814..8f306bd94e 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.h
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
@@ -21,7 +21,7 @@

 /* SDP_LENGTH_S specifies packet length and is of 8-byte size */
 #define OTX_EP_INFO_SIZE 8
-#define DROQ_REFILL_THRESHOLD 16
+#define DROQ_REFILL_THRESHOLD  64
 #define OTX2_SDP_REQUEST_ISM   (0x1ULL << 63)

 static inline uint32_t
--
2.25.1



* [PATCH v3 2/2] net/octeon_ep: add Rx NEON routine
  2024-02-02  8:43   ` [PATCH v3 1/2] net/octeon_ep: improve Rx performance pbhagavatula
@ 2024-02-02  8:43     ` pbhagavatula
  2024-02-02 11:09       ` Jerin Jacob
  2024-02-02 11:31     ` [PATCH v4 1/2] net/octeon_ep: improve Rx performance pbhagavatula
  1 sibling, 1 reply; 16+ messages in thread
From: pbhagavatula @ 2024-02-02  8:43 UTC (permalink / raw)
  To: jerinj, Ruifeng Wang, Vamsi Attunuru; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add Rx ARM NEON SIMD routine.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/net/octeon_ep/cnxk_ep_rx_neon.c | 148 ++++++++++++++++++++++++
 drivers/net/octeon_ep/meson.build       |   6 +-
 drivers/net/octeon_ep/otx_ep_ethdev.c   |   5 +-
 drivers/net/octeon_ep/otx_ep_rxtx.h     |   6 +
 4 files changed, 163 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/octeon_ep/cnxk_ep_rx_neon.c

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_neon.c b/drivers/net/octeon_ep/cnxk_ep_rx_neon.c
new file mode 100644
index 0000000000..8abd8711e1
--- /dev/null
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_neon.c
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell.
+ */
+
+#include "cnxk_ep_rx.h"
+
+static __rte_always_inline void
+cnxk_ep_process_pkts_vec_neon(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
+			      uint16_t new_pkts)
+{
+	const uint8x16_t mask0 = {0, 1, 0xff, 0xff, 0, 1, 0xff, 0xff,
+				  4, 5, 0xff, 0xff, 4, 5, 0xff, 0xff};
+	const uint8x16_t mask1 = {8,  9,  0xff, 0xff, 8,  9,  0xff, 0xff,
+				  12, 13, 0xff, 0xff, 12, 13, 0xff, 0xff};
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t pidx0, pidx1, pidx2, pidx3;
+	struct rte_mbuf *m0, *m1, *m2, *m3;
+	uint32_t read_idx = droq->read_idx;
+	uint16_t nb_desc = droq->nb_desc;
+	uint32_t idx0, idx1, idx2, idx3;
+	uint64x2_t s01, s23;
+	uint32x4_t bytes;
+	uint16_t pkts = 0;
+
+	idx0 = read_idx;
+	s01 = vdupq_n_u64(0);
+	bytes = vdupq_n_u32(0);
+	while (pkts < new_pkts) {
+
+		idx1 = otx_ep_incr_index(idx0, 1, nb_desc);
+		idx2 = otx_ep_incr_index(idx1, 1, nb_desc);
+		idx3 = otx_ep_incr_index(idx2, 1, nb_desc);
+
+		if (new_pkts - pkts > 4) {
+			pidx0 = otx_ep_incr_index(idx3, 1, nb_desc);
+			pidx1 = otx_ep_incr_index(pidx0, 1, nb_desc);
+			pidx2 = otx_ep_incr_index(pidx1, 1, nb_desc);
+			pidx3 = otx_ep_incr_index(pidx2, 1, nb_desc);
+
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx0], void *));
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx1], void *));
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx2], void *));
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx3], void *));
+		}
+
+		m0 = recv_buf_list[idx0];
+		m1 = recv_buf_list[idx1];
+		m2 = recv_buf_list[idx2];
+		m3 = recv_buf_list[idx3];
+
+		/* Load packet size big-endian. */
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 0);
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 1);
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 2);
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 3);
+		/* Convert to little-endian. */
+		s01 = vrev16q_u8(s01);
+
+		/* Vertical add, consolidate outside the loop. */
+		bytes += vaddq_u32(bytes, s01);
+		/* Segregate to packet length and data length. */
+		s23 = vqtbl1q_u8(s01, mask1);
+		s01 = vqtbl1q_u8(s01, mask0);
+
+		/* Store packet length and data length to mbuf. */
+		*(uint64_t *)&m0->pkt_len = vgetq_lane_u64(s01, 0);
+		*(uint64_t *)&m1->pkt_len = vgetq_lane_u64(s01, 1);
+		*(uint64_t *)&m2->pkt_len = vgetq_lane_u64(s23, 0);
+		*(uint64_t *)&m3->pkt_len = vgetq_lane_u64(s23, 1);
+
+		/* Reset rearm data. */
+		*(uint64_t *)&m0->rearm_data = droq->rearm_data;
+		*(uint64_t *)&m1->rearm_data = droq->rearm_data;
+		*(uint64_t *)&m2->rearm_data = droq->rearm_data;
+		*(uint64_t *)&m3->rearm_data = droq->rearm_data;
+
+		rx_pkts[pkts++] = m0;
+		rx_pkts[pkts++] = m1;
+		rx_pkts[pkts++] = m2;
+		rx_pkts[pkts++] = m3;
+		idx0 = otx_ep_incr_index(idx3, 1, nb_desc);
+	}
+	droq->read_idx = idx0;
+
+	droq->refill_count += new_pkts;
+	droq->pkts_pending -= new_pkts;
+	/* Stats */
+	droq->stats.pkts_received += new_pkts;
+#if defined(RTE_ARCH_32)
+	droq->stats.bytes_received += vgetq_lane_u32(bytes, 0);
+	droq->stats.bytes_received += vgetq_lane_u32(bytes, 1);
+	droq->stats.bytes_received += vgetq_lane_u32(bytes, 2);
+	droq->stats.bytes_received += vgetq_lane_u32(bytes, 3);
+#else
+	droq->stats.bytes_received += vaddvq_u32(bytes);
+#endif
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts, vpkts;
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
+	cnxk_ep_process_pkts_vec_neon(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cn9k_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts, vpkts;
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
+		cnxk_ep_rx_refill(droq);
+	} else {
+		/* SDP output goes into DROP state when the output doorbell count
+		 * goes below the drop count. When the doorbell count is written
+		 * with a value greater than the drop count, SDP output should
+		 * come out of DROP state. Due to a race condition this does not
+		 * happen. Writing the doorbell register with 0 again may make
+		 * SDP output come out of this state.
+		 */
+
+		rte_write32(0, droq->pkts_credit_reg);
+	}
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
+	cnxk_ep_process_pkts_vec_neon(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
+	return new_pkts;
+}
diff --git a/drivers/net/octeon_ep/meson.build b/drivers/net/octeon_ep/meson.build
index e8ae56018d..d5d40b23a1 100644
--- a/drivers/net/octeon_ep/meson.build
+++ b/drivers/net/octeon_ep/meson.build
@@ -29,7 +29,11 @@ if arch_subdir == 'x86'
     endif
 endif
 
-extra_flags = ['-Wno-strict-aliasing']
+if arch_subdir == 'arm'
+    sources += files('cnxk_ep_rx_neon.c')
+endif
+
+extra_flags = ['-Wno-strict-aliasing', '-flax-vector-conversions']
 foreach flag: extra_flags
     if cc.has_argument(flag)
         cflags += flag
diff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c
index 42a97ea110..8daa7d225c 100644
--- a/drivers/net/octeon_ep/otx_ep_ethdev.c
+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c
@@ -59,6 +59,8 @@ otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
 			eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_avx;
 #endif
+#elif defined(RTE_ARCH_ARM64)
+		eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_neon;
 #endif
 		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
 			eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_mseg;
@@ -71,8 +73,9 @@ otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
 			eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_avx;
 #endif
+#elif defined(RTE_ARCH_ARM64)
+		eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_neon;
 #endif
-
 		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
 			eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_mseg;
 	} else {
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
index 8f306bd94e..f5bc807dc0 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.h
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
@@ -60,12 +60,18 @@ cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budge
 uint16_t
 cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 
+uint16_t
+cnxk_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
+
 uint16_t
 cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 
 uint16_t
 cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 
+uint16_t
+cn9k_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
+
 uint16_t
 cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 #endif /* _OTX_EP_RXTX_H_ */
-- 
2.25.1
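
The two table-lookup masks in this routine are easiest to audit against a
scalar model. The helper below is introduced purely for illustration (it
assumes the standard struct rte_mbuf layout, where the 16-bit data_len
immediately follows the 32-bit pkt_len, which is also what the
*(uint64_t *)&m->pkt_len stores rely on):

#include <stdint.h>

/* Model of the mask0/mask1 table lookups for one mbuf: place the
 * 16-bit little-endian length at bytes 0-1 (pkt_len) and bytes 4-5
 * (data_len) of the 8-byte word stored to &m->pkt_len, zeroing the
 * remaining bytes.
 */
static inline uint64_t
ep_len_word(uint16_t le_len)
{
	return (uint64_t)le_len | ((uint64_t)le_len << 32);
}

After the shuffles, lanes 0 and 1 of s01 hold this word for m0 and m1, and
lanes 0 and 1 of s23 hold it for m2 and m3, so each of the four 64-bit
stores writes pkt_len and data_len (plus a zeroed vlan_tci) in one go.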


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 2/2] net/octeon_ep: add Rx NEON routine
  2024-02-02  8:43     ` [PATCH v3 2/2] net/octeon_ep: add Rx NEON routine pbhagavatula
@ 2024-02-02 11:09       ` Jerin Jacob
  0 siblings, 0 replies; 16+ messages in thread
From: Jerin Jacob @ 2024-02-02 11:09 UTC (permalink / raw)
  To: pbhagavatula; +Cc: jerinj, Ruifeng Wang, Vamsi Attunuru, dev

On Fri, Feb 2, 2024 at 2:54 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Add Rx ARM NEON SIMD routine.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
>  drivers/net/octeon_ep/cnxk_ep_rx_neon.c | 148 ++++++++++++++++++++++++
>  drivers/net/octeon_ep/meson.build       |   6 +-
>  drivers/net/octeon_ep/otx_ep_ethdev.c   |   5 +-
>  drivers/net/octeon_ep/otx_ep_rxtx.h     |   6 +
>  4 files changed, 163 insertions(+), 2 deletions(-)
>  create mode 100644 drivers/net/octeon_ep/cnxk_ep_rx_neon.c

Please fix


### [PATCH] net/octeon_ep: add Rx NEON routine

CHECK:BRACES: Blank lines aren't necessary after an open brace '{'
#44: FILE: drivers/net/octeon_ep/cnxk_ep_rx_neon.c:29:
+       while (pkts < new_pkts) {
+

total: 0 errors, 0 warnings, 1 checks, 189 lines checked

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v4 1/2] net/octeon_ep: improve Rx performance
  2024-02-02  8:43   ` [PATCH v3 1/2] net/octeon_ep: improve Rx performance pbhagavatula
  2024-02-02  8:43     ` [PATCH v3 2/2] net/octeon_ep: add Rx NEON routine pbhagavatula
@ 2024-02-02 11:31     ` pbhagavatula
  2024-02-02 11:31       ` [PATCH v4 2/2] net/octeon_ep: add Rx NEON routine pbhagavatula
  2024-02-02 15:06       ` [PATCH v5 1/2] net/octeon_ep: improve Rx performance pbhagavatula
  1 sibling, 2 replies; 16+ messages in thread
From: pbhagavatula @ 2024-02-02 11:31 UTC (permalink / raw)
  To: jerinj, Vamsi Attunuru, Bruce Richardson, Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use mempool API instead of pktmbuf alloc to avoid mbuf reset
as it will be done by rearm on receive.
Reorder refill to avoid unnecessary write commits on mbuf data.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 v2 Changes:
 - Fix compilation with distro gcc.
 v3 Changes:
 - Fix aarch32 compilation.
 v4 Changes:
 - Fix checkpatch.

 drivers/net/octeon_ep/cnxk_ep_rx.c     |  4 +--
 drivers/net/octeon_ep/cnxk_ep_rx.h     | 13 ++++++---
 drivers/net/octeon_ep/cnxk_ep_rx_avx.c | 20 +++++++-------
 drivers/net/octeon_ep/cnxk_ep_rx_sse.c | 38 ++++++++++++++------------
 drivers/net/octeon_ep/otx_ep_rxtx.h    |  2 +-
 5 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.c b/drivers/net/octeon_ep/cnxk_ep_rx.c
index f3e4fb27d1..7465e0a017 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx.c
+++ b/drivers/net/octeon_ep/cnxk_ep_rx.c
@@ -76,12 +76,12 @@ cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint16_t new_pkts;

 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
-	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
-
 	/* Refill RX buffers */
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
 		cnxk_ep_rx_refill(droq);

+	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
+
 	return new_pkts;
 }

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.h b/drivers/net/octeon_ep/cnxk_ep_rx.h
index e71fc0de5c..61263e651e 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx.h
+++ b/drivers/net/octeon_ep/cnxk_ep_rx.h
@@ -21,13 +21,16 @@ cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
 	uint32_t i;
 	int rc;

-	rc = rte_pktmbuf_alloc_bulk(droq->mpool, &recv_buf_list[refill_idx], count);
+	rc = rte_mempool_get_bulk(droq->mpool, (void **)&recv_buf_list[refill_idx], count);
 	if (unlikely(rc)) {
 		droq->stats.rx_alloc_failure++;
 		return rc;
 	}

 	for (i = 0; i < count; i++) {
+		rte_prefetch_non_temporal(&desc_ring[(refill_idx + 1) & 3]);
+		if (i < count - 1)
+			rte_prefetch_non_temporal(recv_buf_list[refill_idx + 1]);
 		buf = recv_buf_list[refill_idx];
 		desc_ring[refill_idx].buffer_ptr = rte_mbuf_data_iova_default(buf);
 		refill_idx++;
@@ -42,9 +45,9 @@ cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
 static inline void
 cnxk_ep_rx_refill(struct otx_ep_droq *droq)
 {
-	uint32_t desc_refilled = 0, count;
-	uint32_t nb_desc = droq->nb_desc;
+	const uint32_t nb_desc = droq->nb_desc;
 	uint32_t refill_idx = droq->refill_idx;
+	uint32_t desc_refilled = 0, count;
 	int rc;

 	if (unlikely(droq->read_idx == refill_idx))
@@ -128,6 +131,8 @@ cnxk_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)
 	return RTE_MIN(nb_pkts, droq->pkts_pending);
 }

+#define cnxk_pktmbuf_mtod(m, t) ((t)(void *)((char *)(m)->buf_addr + RTE_PKTMBUF_HEADROOM))
+
 static __rte_always_inline void
 cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
 {
@@ -147,7 +152,7 @@ cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
 			      void *));

 		mbuf = recv_buf_list[read_idx];
-		info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+		info = cnxk_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
 		read_idx = otx_ep_incr_index(read_idx, 1, nb_desc);
 		pkt_len = rte_bswap16(info->length >> 48);
 		mbuf->pkt_len = pkt_len;
diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_avx.c b/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
index ae4615e6da..47eb1d2ef7 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
@@ -49,7 +49,7 @@ cnxk_ep_process_pkts_vec_avx(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
 		/* Load rearm data and packet length for shuffle. */
 		for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++)
 			data[i] = _mm256_set_epi64x(0,
-				rte_pktmbuf_mtod(m[i], struct otx_ep_droq_info *)->length >> 16,
+				cnxk_pktmbuf_mtod(m[i], struct otx_ep_droq_info *)->length >> 16,
 				0, rearm_data);

 		/* Shuffle data to its place and sum the packet length. */
@@ -81,15 +81,15 @@ cnxk_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
 	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
 	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
 	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);

-	/* Refill RX buffers */
-	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
-		cnxk_ep_rx_refill(droq);
-
 	return new_pkts;
 }

@@ -99,11 +99,6 @@ cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

-	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
-	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
-	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
-	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
-
 	/* Refill RX buffers */
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
 		cnxk_ep_rx_refill(droq);
@@ -119,5 +114,10 @@ cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 		rte_write32(0, droq->pkts_credit_reg);
 	}

+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
+	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
 	return new_pkts;
 }
diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_sse.c b/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
index 67c0c1c862..308c8b2288 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
@@ -18,13 +18,15 @@ static __rte_always_inline void
 cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
 {
 	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
-	uint32_t bytes_rsvd = 0, read_idx = droq->read_idx;
-	uint32_t idx0, idx1, idx2, idx3;
+	uint32_t read_idx = droq->read_idx;
 	struct rte_mbuf *m0, *m1, *m2, *m3;
 	uint16_t nb_desc = droq->nb_desc;
+	uint32_t idx0, idx1, idx2, idx3;
 	uint16_t pkts = 0;
+	__m128i bytes;

 	idx0 = read_idx;
+	bytes = _mm_setzero_si128();
 	while (pkts < new_pkts) {
 		const __m128i bswap_mask = _mm_set_epi8(0xFF, 0xFF, 12, 13, 0xFF, 0xFF, 8, 9, 0xFF,
 							0xFF, 4, 5, 0xFF, 0xFF, 0, 1);
@@ -42,14 +44,14 @@ cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
 		m3 = recv_buf_list[idx3];

 		/* Load packet size big-endian. */
-		s01 = _mm_set_epi32(rte_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
-				    rte_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
-				    rte_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
-				    rte_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48);
+		s01 = _mm_set_epi32(cnxk_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
+				    cnxk_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
+				    cnxk_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
+				    cnxk_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48);
 		/* Convert to little-endian. */
 		s01 = _mm_shuffle_epi8(s01, bswap_mask);
-		/* Horizontal add. */
-		bytes_rsvd += hadd(s01);
+		/* Vertical add, consolidate outside the loop. */
+		bytes = _mm_add_epi32(bytes, s01);
 		/* Segregate to packet length and data length. */
 		s23 = _mm_shuffle_epi32(s01, _MM_SHUFFLE(3, 3, 1, 1));
 		s01 = _mm_shuffle_epi8(s01, cpy_mask);
@@ -79,7 +81,7 @@ cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
 	droq->pkts_pending -= new_pkts;
 	/* Stats */
 	droq->stats.pkts_received += new_pkts;
-	droq->stats.bytes_received += bytes_rsvd;
+	droq->stats.bytes_received += hadd(bytes);
 }

 uint16_t __rte_noinline __rte_hot
@@ -88,15 +90,15 @@ cnxk_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
 	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
 	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
 	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);

-	/* Refill RX buffers */
-	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
-		cnxk_ep_rx_refill(droq);
-
 	return new_pkts;
 }

@@ -106,11 +108,6 @@ cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

-	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
-	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
-	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
-	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
-
 	/* Refill RX buffers */
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
 		cnxk_ep_rx_refill(droq);
@@ -126,5 +123,10 @@ cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 		rte_write32(0, droq->pkts_credit_reg);
 	}

+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
+	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
 	return new_pkts;
 }
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
index 0adcbc7814..8f306bd94e 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.h
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
@@ -21,7 +21,7 @@

 /* SDP_LENGTH_S specifies packet length and is of 8-byte size */
 #define OTX_EP_INFO_SIZE 8
-#define DROQ_REFILL_THRESHOLD 16
+#define DROQ_REFILL_THRESHOLD  64
 #define OTX2_SDP_REQUEST_ISM   (0x1ULL << 63)

 static inline uint32_t
--
2.25.1
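
What the refill change boils down to: rte_pktmbuf_alloc_bulk() resets every
mbuf header, but these Rx routines overwrite rearm_data anyway, so raw
objects from rte_mempool_get_bulk() are enough. Below is a minimal sketch of
that contract; ep_refill_sketch and the bare ring array are introduced here
for illustration, and the real function also handles ring wrap-around and
the refill_count bookkeeping:

#include <rte_mbuf.h>
#include <rte_mempool.h>

/* Sketch of the refill idea (not the driver function itself): bulk-get
 * raw mbufs without any header reset and publish only the default data
 * IOVA. `ring` stands in for the DROQ descriptor ring's buffer_ptr
 * column; wrap-around handling is omitted.
 */
static inline int
ep_refill_sketch(struct rte_mempool *mp, struct rte_mbuf **bufs,
		 uint64_t *ring, uint32_t count)
{
	uint32_t i;

	if (rte_mempool_get_bulk(mp, (void **)bufs, count))
		return -ENOENT;
	for (i = 0; i < count; i++)
		ring[i] = rte_mbuf_data_iova_default(bufs[i]);
	return 0;
}

The other half of the contract is the single store
*(uint64_t *)&mbuf->rearm_data = droq->rearm_data; on receive, which
re-initialises data_off, refcnt, nb_segs and port in place of the skipped
reset.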


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v4 2/2] net/octeon_ep: add Rx NEON routine
  2024-02-02 11:31     ` [PATCH v4 1/2] net/octeon_ep: improve Rx performance pbhagavatula
@ 2024-02-02 11:31       ` pbhagavatula
  2024-02-02 15:06       ` [PATCH v5 1/2] net/octeon_ep: improve Rx performance pbhagavatula
  1 sibling, 0 replies; 16+ messages in thread
From: pbhagavatula @ 2024-02-02 11:31 UTC (permalink / raw)
  To: jerinj, Ruifeng Wang, Vamsi Attunuru; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add Rx ARM NEON SIMD routine.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/net/octeon_ep/cnxk_ep_rx_neon.c | 147 ++++++++++++++++++++++++
 drivers/net/octeon_ep/meson.build       |   6 +-
 drivers/net/octeon_ep/otx_ep_ethdev.c   |   5 +-
 drivers/net/octeon_ep/otx_ep_rxtx.h     |   6 +
 4 files changed, 162 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/octeon_ep/cnxk_ep_rx_neon.c

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_neon.c b/drivers/net/octeon_ep/cnxk_ep_rx_neon.c
new file mode 100644
index 0000000000..4c46a7ea08
--- /dev/null
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_neon.c
@@ -0,0 +1,147 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell.
+ */
+
+#include "cnxk_ep_rx.h"
+
+static __rte_always_inline void
+cnxk_ep_process_pkts_vec_neon(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
+			      uint16_t new_pkts)
+{
+	const uint8x16_t mask0 = {0, 1, 0xff, 0xff, 0, 1, 0xff, 0xff,
+				  4, 5, 0xff, 0xff, 4, 5, 0xff, 0xff};
+	const uint8x16_t mask1 = {8,  9,  0xff, 0xff, 8,  9,  0xff, 0xff,
+				  12, 13, 0xff, 0xff, 12, 13, 0xff, 0xff};
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t pidx0, pidx1, pidx2, pidx3;
+	struct rte_mbuf *m0, *m1, *m2, *m3;
+	uint32_t read_idx = droq->read_idx;
+	uint16_t nb_desc = droq->nb_desc;
+	uint32_t idx0, idx1, idx2, idx3;
+	uint64x2_t s01, s23;
+	uint32x4_t bytes;
+	uint16_t pkts = 0;
+
+	idx0 = read_idx;
+	s01 = vdupq_n_u64(0);
+	bytes = vdupq_n_u32(0);
+	while (pkts < new_pkts) {
+		idx1 = otx_ep_incr_index(idx0, 1, nb_desc);
+		idx2 = otx_ep_incr_index(idx1, 1, nb_desc);
+		idx3 = otx_ep_incr_index(idx2, 1, nb_desc);
+
+		if (new_pkts - pkts > 4) {
+			pidx0 = otx_ep_incr_index(idx3, 1, nb_desc);
+			pidx1 = otx_ep_incr_index(pidx0, 1, nb_desc);
+			pidx2 = otx_ep_incr_index(pidx1, 1, nb_desc);
+			pidx3 = otx_ep_incr_index(pidx2, 1, nb_desc);
+
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx0], void *));
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx1], void *));
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx2], void *));
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx3], void *));
+		}
+
+		m0 = recv_buf_list[idx0];
+		m1 = recv_buf_list[idx1];
+		m2 = recv_buf_list[idx2];
+		m3 = recv_buf_list[idx3];
+
+		/* Load packet size big-endian. */
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 0);
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 1);
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 2);
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 3);
+		/* Convert to little-endian. */
+		s01 = vrev16q_u8(s01);
+
+		/* Vertical add, consolidate outside the loop. */
+		bytes = vaddq_u32(bytes, s01);
+		/* Segregate to packet length and data length. */
+		s23 = vqtbl1q_u8(s01, mask1);
+		s01 = vqtbl1q_u8(s01, mask0);
+
+		/* Store packet length and data length to mbuf. */
+		*(uint64_t *)&m0->pkt_len = vgetq_lane_u64(s01, 0);
+		*(uint64_t *)&m1->pkt_len = vgetq_lane_u64(s01, 1);
+		*(uint64_t *)&m2->pkt_len = vgetq_lane_u64(s23, 0);
+		*(uint64_t *)&m3->pkt_len = vgetq_lane_u64(s23, 1);
+
+		/* Reset rearm data. */
+		*(uint64_t *)&m0->rearm_data = droq->rearm_data;
+		*(uint64_t *)&m1->rearm_data = droq->rearm_data;
+		*(uint64_t *)&m2->rearm_data = droq->rearm_data;
+		*(uint64_t *)&m3->rearm_data = droq->rearm_data;
+
+		rx_pkts[pkts++] = m0;
+		rx_pkts[pkts++] = m1;
+		rx_pkts[pkts++] = m2;
+		rx_pkts[pkts++] = m3;
+		idx0 = otx_ep_incr_index(idx3, 1, nb_desc);
+	}
+	droq->read_idx = idx0;
+
+	droq->refill_count += new_pkts;
+	droq->pkts_pending -= new_pkts;
+	/* Stats */
+	droq->stats.pkts_received += new_pkts;
+#if defined(RTE_ARCH_32)
+	droq->stats.bytes_received += vgetq_lane_u32(bytes, 0);
+	droq->stats.bytes_received += vgetq_lane_u32(bytes, 1);
+	droq->stats.bytes_received += vgetq_lane_u32(bytes, 2);
+	droq->stats.bytes_received += vgetq_lane_u32(bytes, 3);
+#else
+	droq->stats.bytes_received += vaddvq_u32(bytes);
+#endif
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts, vpkts;
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
+	cnxk_ep_process_pkts_vec_neon(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cn9k_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts, vpkts;
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
+		cnxk_ep_rx_refill(droq);
+	} else {
+		/* SDP output goes into DROP state when the output doorbell count
+		 * goes below the drop count. When the doorbell count is written
+		 * with a value greater than the drop count, SDP output should
+		 * come out of DROP state. Due to a race condition this does not
+		 * happen. Writing the doorbell register with 0 again may make
+		 * SDP output come out of this state.
+		 */
+
+		rte_write32(0, droq->pkts_credit_reg);
+	}
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
+	cnxk_ep_process_pkts_vec_neon(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
+	return new_pkts;
+}
diff --git a/drivers/net/octeon_ep/meson.build b/drivers/net/octeon_ep/meson.build
index e8ae56018d..d5d40b23a1 100644
--- a/drivers/net/octeon_ep/meson.build
+++ b/drivers/net/octeon_ep/meson.build
@@ -29,7 +29,11 @@ if arch_subdir == 'x86'
     endif
 endif

-extra_flags = ['-Wno-strict-aliasing']
+if arch_subdir == 'arm'
+    sources += files('cnxk_ep_rx_neon.c')
+endif
+
+extra_flags = ['-Wno-strict-aliasing', '-flax-vector-conversions']
 foreach flag: extra_flags
     if cc.has_argument(flag)
         cflags += flag
diff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c
index 42a97ea110..8daa7d225c 100644
--- a/drivers/net/octeon_ep/otx_ep_ethdev.c
+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c
@@ -59,6 +59,8 @@ otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
 			eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_avx;
 #endif
+#elif defined(RTE_ARCH_ARM64)
+		eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_neon;
 #endif
 		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
 			eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_mseg;
@@ -71,8 +73,9 @@ otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
 			eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_avx;
 #endif
+#elif defined(RTE_ARCH_ARM64)
+		eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_neon;
 #endif
-
 		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
 			eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_mseg;
 	} else {
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
index 8f306bd94e..f5bc807dc0 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.h
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
@@ -60,12 +60,18 @@ cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budge
 uint16_t
 cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);

+uint16_t
+cnxk_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
+
 uint16_t
 cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);

 uint16_t
 cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);

+uint16_t
+cn9k_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
+
 uint16_t
 cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 #endif /* _OTX_EP_RXTX_H_ */
--
2.25.1
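
The same accumulation trick appears in the SSE rework of patch 1/2 and in
the NEON routine here: keep a per-lane vector sum inside the loop (one
vector add per iteration) and do the horizontal reduction once at the end.
A scalar model of the pattern, with sum_pkt_bytes and len introduced here
for illustration:

#include <stdint.h>
#include <stddef.h>

/* Keep four independent per-lane sums in the loop and reduce once at
 * the end, instead of a multi-instruction horizontal add on every
 * iteration.
 */
static uint64_t
sum_pkt_bytes(const uint16_t *len, size_t n) /* n is a multiple of 4 */
{
	uint32_t lane[4] = {0, 0, 0, 0};
	size_t i;

	for (i = 0; i < n; i += 4) {
		lane[0] += len[i + 0];
		lane[1] += len[i + 1];
		lane[2] += len[i + 2];
		lane[3] += len[i + 3];
	}
	/* One reduction, mirroring the final hadd()/vaddvq_u32(). */
	return (uint64_t)lane[0] + lane[1] + lane[2] + lane[3];
}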


^ permalink raw reply	[flat|nested] 16+ messages in thread
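
The new -flax-vector-conversions flag in meson.build is what allows the
NEON file to assign uint8x16_t shuffle results straight to uint64x2_t
variables (for example s01 = vrev16q_u8(s01)). The strictly typed spelling,
shown here only for comparison with ep_rev16_u64 as a name introduced for
illustration, wraps each byte-level operation in reinterpret casts:

#include <arm_neon.h>

/* Strictly typed form of the routine's `s01 = vrev16q_u8(s01);` (byte
 * swap within each 16-bit halfword) that compiles without
 * -flax-vector-conversions.
 */
static inline uint64x2_t
ep_rev16_u64(uint64x2_t v)
{
	return vreinterpretq_u64_u8(vrev16q_u8(vreinterpretq_u8_u64(v)));
}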

* [PATCH v5 1/2] net/octeon_ep: improve Rx performance
  2024-02-02 11:31     ` [PATCH v4 1/2] net/octeon_ep: improve Rx performance pbhagavatula
  2024-02-02 11:31       ` [PATCH v4 2/2] net/octeon_ep: add Rx NEON routine pbhagavatula
@ 2024-02-02 15:06       ` pbhagavatula
  2024-02-02 15:06         ` [PATCH v5 2/2] net/octeon_ep: add Rx NEON routine pbhagavatula
  2024-02-02 15:09         ` [PATCH v6 1/2] net/octeon_ep: improve Rx performance pbhagavatula
  1 sibling, 2 replies; 16+ messages in thread
From: pbhagavatula @ 2024-02-02 15:06 UTC (permalink / raw)
  To: jerinj, Vamsi Attunuru, Bruce Richardson, Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use mempool API instead of pktmbuf alloc to avoid mbuf reset
as it will be done by rearm on receive.
Reorder refill to avoid unnecessary write commits on mbuf data.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 v2 Changes:
 - Fix compilation with distro gcc.
 v3 Changes:
 - Fix aarch32 compilation.
 v4 Changes:
 - Fix checkpatch.
 v5 Changes:
 - Update release notes.

 doc/guides/rel_notes/release_24_03.rst |  2 ++
 drivers/net/octeon_ep/cnxk_ep_rx.c     |  4 +--
 drivers/net/octeon_ep/cnxk_ep_rx.h     | 13 ++++++---
 drivers/net/octeon_ep/cnxk_ep_rx_avx.c | 20 +++++++-------
 drivers/net/octeon_ep/cnxk_ep_rx_sse.c | 38 ++++++++++++++------------
 drivers/net/octeon_ep/otx_ep_rxtx.h    |  2 +-
 6 files changed, 44 insertions(+), 35 deletions(-)

diff --git a/doc/guides/rel_notes/release_24_03.rst b/doc/guides/rel_notes/release_24_03.rst
index 282a3f9c8c..c8fcaaad6d 100644
--- a/doc/guides/rel_notes/release_24_03.rst
+++ b/doc/guides/rel_notes/release_24_03.rst
@@ -59,6 +59,8 @@ New Features

   * Optimize mbuf rearm sequence.
   * Updated Tx queue mbuf free thresholds from 128 to 256 for better performance.
+  * Updated the Rx queue mbuf refill routine to use the mempool API and
+    reordered it to avoid unnecessary mbuf write commits.
   * Added optimized SSE Rx routines.
   * Added optimized AVX2 Rx routines.

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.c b/drivers/net/octeon_ep/cnxk_ep_rx.c
index f3e4fb27d1..7465e0a017 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx.c
+++ b/drivers/net/octeon_ep/cnxk_ep_rx.c
@@ -76,12 +76,12 @@ cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint16_t new_pkts;

 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
-	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
-
 	/* Refill RX buffers */
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
 		cnxk_ep_rx_refill(droq);

+	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
+
 	return new_pkts;
 }

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.h b/drivers/net/octeon_ep/cnxk_ep_rx.h
index e71fc0de5c..61263e651e 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx.h
+++ b/drivers/net/octeon_ep/cnxk_ep_rx.h
@@ -21,13 +21,16 @@ cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
 	uint32_t i;
 	int rc;

-	rc = rte_pktmbuf_alloc_bulk(droq->mpool, &recv_buf_list[refill_idx], count);
+	rc = rte_mempool_get_bulk(droq->mpool, (void **)&recv_buf_list[refill_idx], count);
 	if (unlikely(rc)) {
 		droq->stats.rx_alloc_failure++;
 		return rc;
 	}

 	for (i = 0; i < count; i++) {
+		rte_prefetch_non_temporal(&desc_ring[(refill_idx + 1) & 3]);
+		if (i < count - 1)
+			rte_prefetch_non_temporal(recv_buf_list[refill_idx + 1]);
 		buf = recv_buf_list[refill_idx];
 		desc_ring[refill_idx].buffer_ptr = rte_mbuf_data_iova_default(buf);
 		refill_idx++;
@@ -42,9 +45,9 @@ cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
 static inline void
 cnxk_ep_rx_refill(struct otx_ep_droq *droq)
 {
-	uint32_t desc_refilled = 0, count;
-	uint32_t nb_desc = droq->nb_desc;
+	const uint32_t nb_desc = droq->nb_desc;
 	uint32_t refill_idx = droq->refill_idx;
+	uint32_t desc_refilled = 0, count;
 	int rc;

 	if (unlikely(droq->read_idx == refill_idx))
@@ -128,6 +131,8 @@ cnxk_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)
 	return RTE_MIN(nb_pkts, droq->pkts_pending);
 }

+#define cnxk_pktmbuf_mtod(m, t) ((t)(void *)((char *)(m)->buf_addr + RTE_PKTMBUF_HEADROOM))
+
 static __rte_always_inline void
 cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
 {
@@ -147,7 +152,7 @@ cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
 			      void *));

 		mbuf = recv_buf_list[read_idx];
-		info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+		info = cnxk_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
 		read_idx = otx_ep_incr_index(read_idx, 1, nb_desc);
 		pkt_len = rte_bswap16(info->length >> 48);
 		mbuf->pkt_len = pkt_len;
diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_avx.c b/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
index ae4615e6da..47eb1d2ef7 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
@@ -49,7 +49,7 @@ cnxk_ep_process_pkts_vec_avx(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
 		/* Load rearm data and packet length for shuffle. */
 		for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++)
 			data[i] = _mm256_set_epi64x(0,
-				rte_pktmbuf_mtod(m[i], struct otx_ep_droq_info *)->length >> 16,
+				cnxk_pktmbuf_mtod(m[i], struct otx_ep_droq_info *)->length >> 16,
 				0, rearm_data);

 		/* Shuffle data to its place and sum the packet length. */
@@ -81,15 +81,15 @@ cnxk_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
 	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
 	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
 	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);

-	/* Refill RX buffers */
-	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
-		cnxk_ep_rx_refill(droq);
-
 	return new_pkts;
 }

@@ -99,11 +99,6 @@ cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

-	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
-	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
-	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
-	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
-
 	/* Refill RX buffers */
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
 		cnxk_ep_rx_refill(droq);
@@ -119,5 +114,10 @@ cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 		rte_write32(0, droq->pkts_credit_reg);
 	}

+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
+	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
 	return new_pkts;
 }
diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_sse.c b/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
index 67c0c1c862..308c8b2288 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
@@ -18,13 +18,15 @@ static __rte_always_inline void
 cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
 {
 	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
-	uint32_t bytes_rsvd = 0, read_idx = droq->read_idx;
-	uint32_t idx0, idx1, idx2, idx3;
+	uint32_t read_idx = droq->read_idx;
 	struct rte_mbuf *m0, *m1, *m2, *m3;
 	uint16_t nb_desc = droq->nb_desc;
+	uint32_t idx0, idx1, idx2, idx3;
 	uint16_t pkts = 0;
+	__m128i bytes;

 	idx0 = read_idx;
+	bytes = _mm_setzero_si128();
 	while (pkts < new_pkts) {
 		const __m128i bswap_mask = _mm_set_epi8(0xFF, 0xFF, 12, 13, 0xFF, 0xFF, 8, 9, 0xFF,
 							0xFF, 4, 5, 0xFF, 0xFF, 0, 1);
@@ -42,14 +44,14 @@ cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
 		m3 = recv_buf_list[idx3];

 		/* Load packet size big-endian. */
-		s01 = _mm_set_epi32(rte_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
-				    rte_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
-				    rte_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
-				    rte_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48);
+		s01 = _mm_set_epi32(cnxk_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
+				    cnxk_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
+				    cnxk_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
+				    cnxk_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48);
 		/* Convert to little-endian. */
 		s01 = _mm_shuffle_epi8(s01, bswap_mask);
-		/* Horizontal add. */
-		bytes_rsvd += hadd(s01);
+		/* Vertical add, consolidate outside the loop. */
+		bytes = _mm_add_epi32(bytes, s01);
 		/* Segregate to packet length and data length. */
 		s23 = _mm_shuffle_epi32(s01, _MM_SHUFFLE(3, 3, 1, 1));
 		s01 = _mm_shuffle_epi8(s01, cpy_mask);
@@ -79,7 +81,7 @@ cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
 	droq->pkts_pending -= new_pkts;
 	/* Stats */
 	droq->stats.pkts_received += new_pkts;
-	droq->stats.bytes_received += bytes_rsvd;
+	droq->stats.bytes_received += hadd(bytes);
 }

 uint16_t __rte_noinline __rte_hot
@@ -88,15 +90,15 @@ cnxk_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
 	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
 	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
 	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);

-	/* Refill RX buffers */
-	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
-		cnxk_ep_rx_refill(droq);
-
 	return new_pkts;
 }

@@ -106,11 +108,6 @@ cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

-	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
-	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
-	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
-	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
-
 	/* Refill RX buffers */
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
 		cnxk_ep_rx_refill(droq);
@@ -126,5 +123,10 @@ cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 		rte_write32(0, droq->pkts_credit_reg);
 	}

+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
+	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
 	return new_pkts;
 }
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
index 0adcbc7814..8f306bd94e 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.h
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
@@ -21,7 +21,7 @@

 /* SDP_LENGTH_S specifies packet length and is of 8-byte size */
 #define OTX_EP_INFO_SIZE 8
-#define DROQ_REFILL_THRESHOLD 16
+#define DROQ_REFILL_THRESHOLD  64
 #define OTX2_SDP_REQUEST_ISM   (0x1ULL << 63)

 static inline uint32_t
--
2.25.1
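
The droq->rearm_data template that the Rx routines consume is set up
elsewhere in the driver; its construction is not part of this patch. The
conventional DPDK pattern for building such a template looks like the
sketch below (illustrative only; ep_build_rearm_data is a name introduced
here, and the field set mirrors what a single 64-bit store must cover):

#include <string.h>
#include <rte_mbuf.h>

/* Precompute, at queue setup time, one 64-bit value covering data_off,
 * refcnt, nb_segs and port, laid out per the rearm_data marker in
 * struct rte_mbuf.
 */
static uint64_t
ep_build_rearm_data(uint16_t port_id)
{
	struct rte_mbuf mb;

	memset(&mb, 0, sizeof(mb));
	mb.data_off = RTE_PKTMBUF_HEADROOM;
	rte_mbuf_refcnt_set(&mb, 1);
	mb.nb_segs = 1;
	mb.port = port_id;
	return *(uint64_t *)&mb.rearm_data;
}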


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v5 2/2] net/octeon_ep: add Rx NEON routine
  2024-02-02 15:06       ` [PATCH v5 1/2] net/octeon_ep: improve Rx performance pbhagavatula
@ 2024-02-02 15:06         ` pbhagavatula
  2024-02-05  6:16           ` Jerin Jacob
  2024-02-02 15:09         ` [PATCH v6 1/2] net/octeon_ep: improve Rx performance pbhagavatula
  1 sibling, 1 reply; 16+ messages in thread
From: pbhagavatula @ 2024-02-02 15:06 UTC (permalink / raw)
  To: jerinj, Ruifeng Wang, Vamsi Attunuru; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add Rx ARM NEON SIMD routine.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 doc/guides/rel_notes/release_24_03.rst  |   1 +
 drivers/net/octeon_ep/cnxk_ep_rx_neon.c | 148 ++++++++++++++++++++++++
 drivers/net/octeon_ep/meson.build       |   6 +-
 drivers/net/octeon_ep/otx_ep_ethdev.c   |   5 +-
 drivers/net/octeon_ep/otx_ep_rxtx.h     |   6 +
 5 files changed, 164 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/octeon_ep/cnxk_ep_rx_neon.c

diff --git a/doc/guides/rel_notes/release_24_03.rst b/doc/guides/rel_notes/release_24_03.rst
index c8fcaaad6d..7a83b545cc 100644
--- a/doc/guides/rel_notes/release_24_03.rst
+++ b/doc/guides/rel_notes/release_24_03.rst
@@ -63,6 +63,7 @@ New Features
     reordered it to avoid unnecessary mbuf write commits.
   * Added optimized SSE Rx routines.
   * Added optimized AVX2 Rx routines.
+  * Added optimized NEON Rx routines.
 
 * **Updated Marvell cnxk net driver.**
 
diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_neon.c b/drivers/net/octeon_ep/cnxk_ep_rx_neon.c
new file mode 100644
index 0000000000..8abd8711e1
--- /dev/null
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_neon.c
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell.
+ */
+
+#include "cnxk_ep_rx.h"
+
+static __rte_always_inline void
+cnxk_ep_process_pkts_vec_neon(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
+			      uint16_t new_pkts)
+{
+	const uint8x16_t mask0 = {0, 1, 0xff, 0xff, 0, 1, 0xff, 0xff,
+				  4, 5, 0xff, 0xff, 4, 5, 0xff, 0xff};
+	const uint8x16_t mask1 = {8,  9,  0xff, 0xff, 8,  9,  0xff, 0xff,
+				  12, 13, 0xff, 0xff, 12, 13, 0xff, 0xff};
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t pidx0, pidx1, pidx2, pidx3;
+	struct rte_mbuf *m0, *m1, *m2, *m3;
+	uint32_t read_idx = droq->read_idx;
+	uint16_t nb_desc = droq->nb_desc;
+	uint32_t idx0, idx1, idx2, idx3;
+	uint64x2_t s01, s23;
+	uint32x4_t bytes;
+	uint16_t pkts = 0;
+
+	idx0 = read_idx;
+	s01 = vdupq_n_u64(0);
+	bytes = vdupq_n_u32(0);
+	while (pkts < new_pkts) {
+
+		idx1 = otx_ep_incr_index(idx0, 1, nb_desc);
+		idx2 = otx_ep_incr_index(idx1, 1, nb_desc);
+		idx3 = otx_ep_incr_index(idx2, 1, nb_desc);
+
+		if (new_pkts - pkts > 4) {
+			pidx0 = otx_ep_incr_index(idx3, 1, nb_desc);
+			pidx1 = otx_ep_incr_index(pidx0, 1, nb_desc);
+			pidx2 = otx_ep_incr_index(pidx1, 1, nb_desc);
+			pidx3 = otx_ep_incr_index(pidx2, 1, nb_desc);
+
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx0], void *));
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx1], void *));
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx2], void *));
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx3], void *));
+		}
+
+		m0 = recv_buf_list[idx0];
+		m1 = recv_buf_list[idx1];
+		m2 = recv_buf_list[idx2];
+		m3 = recv_buf_list[idx3];
+
+		/* Load packet size big-endian. */
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 0);
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 1);
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 2);
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 3);
+		/* Convert to little-endian. */
+		s01 = vrev16q_u8(s01);
+
+		/* Vertical add, consolidate outside the loop. */
+		bytes = vaddq_u32(bytes, s01);
+		/* Segregate to packet length and data length. */
+		s23 = vqtbl1q_u8(s01, mask1);
+		s01 = vqtbl1q_u8(s01, mask0);
+
+		/* Store packet length and data length to mbuf. */
+		*(uint64_t *)&m0->pkt_len = vgetq_lane_u64(s01, 0);
+		*(uint64_t *)&m1->pkt_len = vgetq_lane_u64(s01, 1);
+		*(uint64_t *)&m2->pkt_len = vgetq_lane_u64(s23, 0);
+		*(uint64_t *)&m3->pkt_len = vgetq_lane_u64(s23, 1);
+
+		/* Reset rearm data. */
+		*(uint64_t *)&m0->rearm_data = droq->rearm_data;
+		*(uint64_t *)&m1->rearm_data = droq->rearm_data;
+		*(uint64_t *)&m2->rearm_data = droq->rearm_data;
+		*(uint64_t *)&m3->rearm_data = droq->rearm_data;
+
+		rx_pkts[pkts++] = m0;
+		rx_pkts[pkts++] = m1;
+		rx_pkts[pkts++] = m2;
+		rx_pkts[pkts++] = m3;
+		idx0 = otx_ep_incr_index(idx3, 1, nb_desc);
+	}
+	droq->read_idx = idx0;
+
+	droq->refill_count += new_pkts;
+	droq->pkts_pending -= new_pkts;
+	/* Stats */
+	droq->stats.pkts_received += new_pkts;
+#if defined(RTE_ARCH_32)
+	droq->stats.bytes_received += vgetq_lane_u32(bytes, 0);
+	droq->stats.bytes_received += vgetq_lane_u32(bytes, 1);
+	droq->stats.bytes_received += vgetq_lane_u32(bytes, 2);
+	droq->stats.bytes_received += vgetq_lane_u32(bytes, 3);
+#else
+	droq->stats.bytes_received += vaddvq_u32(bytes);
+#endif
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts, vpkts;
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
+	cnxk_ep_process_pkts_vec_neon(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cn9k_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts, vpkts;
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
+		cnxk_ep_rx_refill(droq);
+	} else {
+		/* SDP output goes into DROP state when the output doorbell count
+		 * goes below the drop count. When the doorbell count is written
+		 * with a value greater than the drop count, SDP output should
+		 * come out of DROP state. Due to a race condition this does not
+		 * happen. Writing the doorbell register with 0 again may make
+		 * SDP output come out of this state.
+		 */
+
+		rte_write32(0, droq->pkts_credit_reg);
+	}
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
+	cnxk_ep_process_pkts_vec_neon(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
+	return new_pkts;
+}
diff --git a/drivers/net/octeon_ep/meson.build b/drivers/net/octeon_ep/meson.build
index e8ae56018d..d5d40b23a1 100644
--- a/drivers/net/octeon_ep/meson.build
+++ b/drivers/net/octeon_ep/meson.build
@@ -29,7 +29,11 @@ if arch_subdir == 'x86'
     endif
 endif
 
-extra_flags = ['-Wno-strict-aliasing']
+if arch_subdir == 'arm'
+    sources += files('cnxk_ep_rx_neon.c')
+endif
+
+extra_flags = ['-Wno-strict-aliasing', '-flax-vector-conversions']
 foreach flag: extra_flags
     if cc.has_argument(flag)
         cflags += flag
diff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c
index 42a97ea110..8daa7d225c 100644
--- a/drivers/net/octeon_ep/otx_ep_ethdev.c
+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c
@@ -59,6 +59,8 @@ otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
 			eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_avx;
 #endif
+#elif defined(RTE_ARCH_ARM64)
+		eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_neon;
 #endif
 		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
 			eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_mseg;
@@ -71,8 +73,9 @@ otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
 			eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_avx;
 #endif
+#elif defined(RTE_ARCH_ARM64)
+		eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_neon;
 #endif
-
 		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
 			eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_mseg;
 	} else {
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
index 8f306bd94e..f5bc807dc0 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.h
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
@@ -60,12 +60,18 @@ cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budge
 uint16_t
 cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 
+uint16_t
+cnxk_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
+
 uint16_t
 cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 
 uint16_t
 cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 
+uint16_t
+cn9k_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
+
 uint16_t
 cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 #endif /* _OTX_EP_RXTX_H_ */
-- 
2.25.1
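
The RTE_ARCH_32 branch in the stats consolidation exists because
vaddvq_u32() (add across vector) is an A64-only intrinsic. An equivalent
helper that reduces with pairwise adds on aarch32, shown as a sketch with
ep_vhadd_u32 as a name introduced here:

#include <arm_neon.h>
#include <stdint.h>

/* Portable form of the final byte-counter reduction: single
 * add-across on AArch64, pairwise adds on aarch32 instead of the
 * per-lane extraction used in the patch.
 */
static inline uint32_t
ep_vhadd_u32(uint32x4_t v)
{
#if defined(__aarch64__)
	return vaddvq_u32(v);
#else
	uint32x2_t s = vadd_u32(vget_low_u32(v), vget_high_u32(v));

	s = vpadd_u32(s, s);
	return vget_lane_u32(s, 0);
#endif
}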


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v6 1/2] net/octeon_ep: improve Rx performance
  2024-02-02 15:06       ` [PATCH v5 1/2] net/octeon_ep: improve Rx performance pbhagavatula
  2024-02-02 15:06         ` [PATCH v5 2/2] net/octeon_ep: add Rx NEON routine pbhagavatula
@ 2024-02-02 15:09         ` pbhagavatula
  2024-02-02 15:09           ` [PATCH v6 2/2] net/octeon_ep: add Rx NEON routine pbhagavatula
  1 sibling, 1 reply; 16+ messages in thread
From: pbhagavatula @ 2024-02-02 15:09 UTC (permalink / raw)
  To: jerinj, Vamsi Attunuru, Bruce Richardson, Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use mempool API instead of pktmbuf alloc to avoid mbuf reset
as it will be done by rearm on receive.
Reorder refill to avoid unnecessary write commits on mbuf data.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 v2 Changes:
 - Fix compilation with distro gcc.
 v3 Changes:
 - Fix aarch32 compilation.
 v4 Changes:
 - Fix checkpatch.
 v5 Changes:
 - Update release notes.
 v6 Changes:
 - Fix checkpatch again.

 doc/guides/rel_notes/release_24_03.rst |  2 ++
 drivers/net/octeon_ep/cnxk_ep_rx.c     |  4 +--
 drivers/net/octeon_ep/cnxk_ep_rx.h     | 13 ++++++---
 drivers/net/octeon_ep/cnxk_ep_rx_avx.c | 20 +++++++-------
 drivers/net/octeon_ep/cnxk_ep_rx_sse.c | 38 ++++++++++++++------------
 drivers/net/octeon_ep/otx_ep_rxtx.h    |  2 +-
 6 files changed, 44 insertions(+), 35 deletions(-)

diff --git a/doc/guides/rel_notes/release_24_03.rst b/doc/guides/rel_notes/release_24_03.rst
index 282a3f9c8c..c8fcaaad6d 100644
--- a/doc/guides/rel_notes/release_24_03.rst
+++ b/doc/guides/rel_notes/release_24_03.rst
@@ -59,6 +59,8 @@ New Features

   * Optimize mbuf rearm sequence.
   * Updated Tx queue mbuf free thresholds from 128 to 256 for better performance.
+  * Updated the Rx queue mbuf refill routine to use the mempool API and
+    reordered it to avoid unnecessary mbuf write commits.
   * Added optimized SSE Rx routines.
   * Added optimized AVX2 Rx routines.

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.c b/drivers/net/octeon_ep/cnxk_ep_rx.c
index f3e4fb27d1..7465e0a017 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx.c
+++ b/drivers/net/octeon_ep/cnxk_ep_rx.c
@@ -76,12 +76,12 @@ cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint16_t new_pkts;

 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
-	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
-
 	/* Refill RX buffers */
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
 		cnxk_ep_rx_refill(droq);

+	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
+
 	return new_pkts;
 }

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.h b/drivers/net/octeon_ep/cnxk_ep_rx.h
index e71fc0de5c..61263e651e 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx.h
+++ b/drivers/net/octeon_ep/cnxk_ep_rx.h
@@ -21,13 +21,16 @@ cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
 	uint32_t i;
 	int rc;

-	rc = rte_pktmbuf_alloc_bulk(droq->mpool, &recv_buf_list[refill_idx], count);
+	rc = rte_mempool_get_bulk(droq->mpool, (void **)&recv_buf_list[refill_idx], count);
 	if (unlikely(rc)) {
 		droq->stats.rx_alloc_failure++;
 		return rc;
 	}

 	for (i = 0; i < count; i++) {
+		rte_prefetch_non_temporal(&desc_ring[(refill_idx + 1) & 3]);
+		if (i < count - 1)
+			rte_prefetch_non_temporal(recv_buf_list[refill_idx + 1]);
 		buf = recv_buf_list[refill_idx];
 		desc_ring[refill_idx].buffer_ptr = rte_mbuf_data_iova_default(buf);
 		refill_idx++;
@@ -42,9 +45,9 @@ cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
 static inline void
 cnxk_ep_rx_refill(struct otx_ep_droq *droq)
 {
-	uint32_t desc_refilled = 0, count;
-	uint32_t nb_desc = droq->nb_desc;
+	const uint32_t nb_desc = droq->nb_desc;
 	uint32_t refill_idx = droq->refill_idx;
+	uint32_t desc_refilled = 0, count;
 	int rc;

 	if (unlikely(droq->read_idx == refill_idx))
@@ -128,6 +131,8 @@ cnxk_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)
 	return RTE_MIN(nb_pkts, droq->pkts_pending);
 }

+#define cnxk_pktmbuf_mtod(m, t) ((t)(void *)((char *)(m)->buf_addr + RTE_PKTMBUF_HEADROOM))
+
 static __rte_always_inline void
 cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
 {
@@ -147,7 +152,7 @@ cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
 			      void *));

 		mbuf = recv_buf_list[read_idx];
-		info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+		info = cnxk_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
 		read_idx = otx_ep_incr_index(read_idx, 1, nb_desc);
 		pkt_len = rte_bswap16(info->length >> 48);
 		mbuf->pkt_len = pkt_len;
diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_avx.c b/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
index ae4615e6da..47eb1d2ef7 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
@@ -49,7 +49,7 @@ cnxk_ep_process_pkts_vec_avx(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
 		/* Load rearm data and packet length for shuffle. */
 		for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++)
 			data[i] = _mm256_set_epi64x(0,
-				rte_pktmbuf_mtod(m[i], struct otx_ep_droq_info *)->length >> 16,
+				cnxk_pktmbuf_mtod(m[i], struct otx_ep_droq_info *)->length >> 16,
 				0, rearm_data);

 		/* Shuffle data to its place and sum the packet length. */
@@ -81,15 +81,15 @@ cnxk_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
 	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
 	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
 	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);

-	/* Refill RX buffers */
-	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
-		cnxk_ep_rx_refill(droq);
-
 	return new_pkts;
 }

@@ -99,11 +99,6 @@ cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

-	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
-	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
-	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
-	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
-
 	/* Refill RX buffers */
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
 		cnxk_ep_rx_refill(droq);
@@ -119,5 +114,10 @@ cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 		rte_write32(0, droq->pkts_credit_reg);
 	}

+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
+	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
 	return new_pkts;
 }
diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_sse.c b/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
index 67c0c1c862..308c8b2288 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
@@ -18,13 +18,15 @@ static __rte_always_inline void
 cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
 {
 	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
-	uint32_t bytes_rsvd = 0, read_idx = droq->read_idx;
-	uint32_t idx0, idx1, idx2, idx3;
+	uint32_t read_idx = droq->read_idx;
 	struct rte_mbuf *m0, *m1, *m2, *m3;
 	uint16_t nb_desc = droq->nb_desc;
+	uint32_t idx0, idx1, idx2, idx3;
 	uint16_t pkts = 0;
+	__m128i bytes;

 	idx0 = read_idx;
+	bytes = _mm_setzero_si128();
 	while (pkts < new_pkts) {
 		const __m128i bswap_mask = _mm_set_epi8(0xFF, 0xFF, 12, 13, 0xFF, 0xFF, 8, 9, 0xFF,
 							0xFF, 4, 5, 0xFF, 0xFF, 0, 1);
@@ -42,14 +44,14 @@ cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
 		m3 = recv_buf_list[idx3];

 		/* Load packet size big-endian. */
-		s01 = _mm_set_epi32(rte_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
-				    rte_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
-				    rte_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
-				    rte_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48);
+		s01 = _mm_set_epi32(cnxk_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
+				    cnxk_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
+				    cnxk_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
+				    cnxk_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48);
 		/* Convert to little-endian. */
 		s01 = _mm_shuffle_epi8(s01, bswap_mask);
-		/* Horizontal add. */
-		bytes_rsvd += hadd(s01);
+		/* Vertical add, consolidate outside the loop. */
+		bytes = _mm_add_epi32(bytes, s01);
 		/* Segregate to packet length and data length. */
 		s23 = _mm_shuffle_epi32(s01, _MM_SHUFFLE(3, 3, 1, 1));
 		s01 = _mm_shuffle_epi8(s01, cpy_mask);
@@ -79,7 +81,7 @@ cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
 	droq->pkts_pending -= new_pkts;
 	/* Stats */
 	droq->stats.pkts_received += new_pkts;
-	droq->stats.bytes_received += bytes_rsvd;
+	droq->stats.bytes_received += hadd(bytes);
 }

 uint16_t __rte_noinline __rte_hot
@@ -88,15 +90,15 @@ cnxk_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
 	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
 	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
 	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);

-	/* Refill RX buffers */
-	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
-		cnxk_ep_rx_refill(droq);
-
 	return new_pkts;
 }

@@ -106,11 +108,6 @@ cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

-	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
-	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
-	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
-	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
-
 	/* Refill RX buffers */
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
 		cnxk_ep_rx_refill(droq);
@@ -126,5 +123,10 @@ cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 		rte_write32(0, droq->pkts_credit_reg);
 	}

+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
+	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
 	return new_pkts;
 }
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
index 0adcbc7814..8f306bd94e 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.h
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
@@ -21,7 +21,7 @@

 /* SDP_LENGTH_S specifies packet length and is of 8-byte size */
 #define OTX_EP_INFO_SIZE 8
-#define DROQ_REFILL_THRESHOLD 16
+#define DROQ_REFILL_THRESHOLD  64
 #define OTX2_SDP_REQUEST_ISM   (0x1ULL << 63)

 static inline uint32_t
--
2.25.1
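
The cnxk_pktmbuf_mtod() macro added here is not just a shorter
rte_pktmbuf_mtod(): because refill now hands out raw mempool objects,
m->data_off is stale until rearm_data is written, while the DMA address was
programmed at the fixed headroom offset. Restated as a function for clarity
(ep_info_ptr is a name introduced here; the reasoning is inferred from the
refill change):

#include <rte_mbuf.h>

/* The descriptor was programmed with buf_iova + RTE_PKTMBUF_HEADROOM,
 * so the info header must be read from that fixed offset rather than
 * via rte_pktmbuf_mtod(), which would add a possibly stale data_off.
 */
static inline void *
ep_info_ptr(const struct rte_mbuf *m)
{
	return (char *)m->buf_addr + RTE_PKTMBUF_HEADROOM;
}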


^ permalink raw reply	[flat|nested] 16+ messages in thread
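
For the scalar path's pkt_len = rte_bswap16(info->length >> 48), a model of
why shift-then-swap recovers the length, assuming a little-endian host
(ep_pkt_len is a name introduced here for illustration):

#include <stdint.h>
#include <rte_byteorder.h>

/* The 16-bit length sits big-endian in the last two bytes of the
 * 8-byte SDP info word, so a little-endian 64-bit load leaves it,
 * byte-swapped, in bits 63:48; shift it down and swap it back.
 */
static inline uint16_t
ep_pkt_len(uint64_t info_length)
{
	return rte_bswap16((uint16_t)(info_length >> 48));
}

The vector routines compute the same thing four packets at a time: the
`>> 48` per lane plays the shift, and the byte-shuffle (vrev16q_u8 on NEON,
bswap_mask on SSE) plays the rte_bswap16().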

* [PATCH v6 2/2] net/octeon_ep: add Rx NEON routine
  2024-02-02 15:09         ` [PATCH v6 1/2] net/octeon_ep: improve Rx performance pbhagavatula
@ 2024-02-02 15:09           ` pbhagavatula
  0 siblings, 0 replies; 16+ messages in thread
From: pbhagavatula @ 2024-02-02 15:09 UTC (permalink / raw)
  To: jerinj, Ruifeng Wang, Vamsi Attunuru; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add Rx ARM NEON SIMD routine.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 doc/guides/rel_notes/release_24_03.rst  |   1 +
 drivers/net/octeon_ep/cnxk_ep_rx_neon.c | 147 ++++++++++++++++++++++++
 drivers/net/octeon_ep/meson.build       |   6 +-
 drivers/net/octeon_ep/otx_ep_ethdev.c   |   5 +-
 drivers/net/octeon_ep/otx_ep_rxtx.h     |   6 +
 5 files changed, 163 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/octeon_ep/cnxk_ep_rx_neon.c

diff --git a/doc/guides/rel_notes/release_24_03.rst b/doc/guides/rel_notes/release_24_03.rst
index c8fcaaad6d..7a83b545cc 100644
--- a/doc/guides/rel_notes/release_24_03.rst
+++ b/doc/guides/rel_notes/release_24_03.rst
@@ -63,6 +63,7 @@ New Features
     to avoid mbuf write commits.
   * Added optimized SSE Rx routines.
   * Added optimized AVX2 Rx routines.
+  * Added optimized NEON Rx routines.
 
 * **Updated Marvell cnxk net driver.**
 
diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_neon.c b/drivers/net/octeon_ep/cnxk_ep_rx_neon.c
new file mode 100644
index 0000000000..4c46a7ea08
--- /dev/null
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_neon.c
@@ -0,0 +1,147 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell.
+ */
+
+#include "cnxk_ep_rx.h"
+
+static __rte_always_inline void
+cnxk_ep_process_pkts_vec_neon(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
+			      uint16_t new_pkts)
+{
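+	/* Shuffle tables for vqtbl1q_u8(): after the 16-bit byte swap below,
+	 * each packet length is copied into both the pkt_len dword and the
+	 * data_len word of its 64-bit lane, with the remaining bytes zeroed.
+	 */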
+	const uint8x16_t mask0 = {0, 1, 0xff, 0xff, 0, 1, 0xff, 0xff,
+				  4, 5, 0xff, 0xff, 4, 5, 0xff, 0xff};
+	const uint8x16_t mask1 = {8,  9,  0xff, 0xff, 8,  9,  0xff, 0xff,
+				  12, 13, 0xff, 0xff, 12, 13, 0xff, 0xff};
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t pidx0, pidx1, pidx2, pidx3;
+	struct rte_mbuf *m0, *m1, *m2, *m3;
+	uint32_t read_idx = droq->read_idx;
+	uint16_t nb_desc = droq->nb_desc;
+	uint32_t idx0, idx1, idx2, idx3;
+	uint64x2_t s01, s23;
+	uint32x4_t bytes;
+	uint16_t pkts = 0;
+
+	idx0 = read_idx;
+	s01 = vdupq_n_u64(0);
+	bytes = vdupq_n_u32(0);
+	while (pkts < new_pkts) {
+		idx1 = otx_ep_incr_index(idx0, 1, nb_desc);
+		idx2 = otx_ep_incr_index(idx1, 1, nb_desc);
+		idx3 = otx_ep_incr_index(idx2, 1, nb_desc);
+
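+		/* While this group of four mbufs is processed, prefetch the
+		 * packet data of the following four descriptors.
+		 */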
+		if (new_pkts - pkts > 4) {
+			pidx0 = otx_ep_incr_index(idx3, 1, nb_desc);
+			pidx1 = otx_ep_incr_index(pidx0, 1, nb_desc);
+			pidx2 = otx_ep_incr_index(pidx1, 1, nb_desc);
+			pidx3 = otx_ep_incr_index(pidx2, 1, nb_desc);
+
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx0], void *));
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx1], void *));
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx2], void *));
+			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx3], void *));
+		}
+
+		m0 = recv_buf_list[idx0];
+		m1 = recv_buf_list[idx1];
+		m2 = recv_buf_list[idx2];
+		m3 = recv_buf_list[idx3];
+
+		/* Load packet size big-endian. */
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 0);
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 1);
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 2);
+		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
+				     s01, 3);
+		/* Convert to little-endian. */
+		s01 = vrev16q_u8(s01);
+
+		/* Vertical add, consolidate outside the loop. */
+		bytes = vaddq_u32(bytes, s01);
+		/* Segregate to packet length and data length. */
+		s23 = vqtbl1q_u8(s01, mask1);
+		s01 = vqtbl1q_u8(s01, mask0);
+
+		/* Store packet length and data length to mbuf. */
+		*(uint64_t *)&m0->pkt_len = vgetq_lane_u64(s01, 0);
+		*(uint64_t *)&m1->pkt_len = vgetq_lane_u64(s01, 1);
+		*(uint64_t *)&m2->pkt_len = vgetq_lane_u64(s23, 0);
+		*(uint64_t *)&m3->pkt_len = vgetq_lane_u64(s23, 1);
+
+		/* Reset rearm data. */
+		*(uint64_t *)&m0->rearm_data = droq->rearm_data;
+		*(uint64_t *)&m1->rearm_data = droq->rearm_data;
+		*(uint64_t *)&m2->rearm_data = droq->rearm_data;
+		*(uint64_t *)&m3->rearm_data = droq->rearm_data;
+
+		rx_pkts[pkts++] = m0;
+		rx_pkts[pkts++] = m1;
+		rx_pkts[pkts++] = m2;
+		rx_pkts[pkts++] = m3;
+		idx0 = otx_ep_incr_index(idx3, 1, nb_desc);
+	}
+	droq->read_idx = idx0;
+
+	droq->refill_count += new_pkts;
+	droq->pkts_pending -= new_pkts;
+	/* Stats */
+	droq->stats.pkts_received += new_pkts;
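+	/* vaddvq_u32() (across-lane add) is AArch64-only; 32-bit Arm sums the
+	 * four lanes individually.
+	 */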
+#if defined(RTE_ARCH_32)
+	droq->stats.bytes_received += vgetq_lane_u32(bytes, 0);
+	droq->stats.bytes_received += vgetq_lane_u32(bytes, 1);
+	droq->stats.bytes_received += vgetq_lane_u32(bytes, 2);
+	droq->stats.bytes_received += vgetq_lane_u32(bytes, 3);
+#else
+	droq->stats.bytes_received += vaddvq_u32(bytes);
+#endif
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts, vpkts;
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
+	cnxk_ep_process_pkts_vec_neon(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cn9k_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts, vpkts;
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
+		cnxk_ep_rx_refill(droq);
+	} else {
+		/* SDP output goes into DROP state when the output doorbell count
+		 * falls below the drop count. When the doorbell count is written
+		 * with a value greater than the drop count, SDP output should
+		 * come out of DROP state. Due to a race condition this does not
+		 * always happen; writing 0 to the doorbell register again may
+		 * bring SDP output out of this state.
+		 */
+
+		rte_write32(0, droq->pkts_credit_reg);
+	}
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
+	cnxk_ep_process_pkts_vec_neon(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
+	return new_pkts;
+}
diff --git a/drivers/net/octeon_ep/meson.build b/drivers/net/octeon_ep/meson.build
index e8ae56018d..d5d40b23a1 100644
--- a/drivers/net/octeon_ep/meson.build
+++ b/drivers/net/octeon_ep/meson.build
@@ -29,7 +29,11 @@ if arch_subdir == 'x86'
     endif
 endif
 
-extra_flags = ['-Wno-strict-aliasing']
+if arch_subdir == 'arm'
+    sources += files('cnxk_ep_rx_neon.c')
+endif
+
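+# cnxk_ep_rx_neon.c mixes NEON vector types (uint8x16_t, uint32x4_t,
+# uint64x2_t) in its lane/table intrinsics, hence lax vector conversions.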
+extra_flags = ['-Wno-strict-aliasing', '-flax-vector-conversions']
 foreach flag: extra_flags
     if cc.has_argument(flag)
         cflags += flag
diff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c
index 42a97ea110..8daa7d225c 100644
--- a/drivers/net/octeon_ep/otx_ep_ethdev.c
+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c
@@ -59,6 +59,8 @@ otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
 			eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_avx;
 #endif
+#elif defined(RTE_ARCH_ARM64)
+		eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_neon;
 #endif
 		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
 			eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_mseg;
@@ -71,8 +73,9 @@ otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
 			eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_avx;
 #endif
+#elif defined(RTE_ARCH_ARM64)
+		eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_neon;
 #endif
-
 		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
 			eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_mseg;
 	} else {
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
index 8f306bd94e..f5bc807dc0 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.h
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
@@ -60,12 +60,18 @@ cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budge
 uint16_t
 cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 
+uint16_t
+cnxk_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
+
 uint16_t
 cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 
 uint16_t
 cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 
+uint16_t
+cn9k_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
+
 uint16_t
 cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 #endif /* _OTX_EP_RXTX_H_ */
-- 
2.25.1
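
The NEON routine follows the same shape as the SSE one: per-packet lengths
are accumulated vertically and reduced once after the loop (the unroll factor
reuses CNXK_EP_OQ_DESC_PER_LOOP_SSE, since both paths process four
descriptors per iteration). vaddvq_u32() exists only on AArch64, hence the
per-lane fallback on 32-bit Arm. A condensed sketch of the reduction, plus a
scalar equivalent of the big-endian length extraction (helper names are
illustrative; the length is assumed to fit in 16 bits):

    #include <arm_neon.h>
    #include <rte_byteorder.h>

    /* Across-lane reduction of the per-lane byte counters. */
    static inline uint64_t
    reduce_bytes(uint32x4_t bytes)
    {
    #if defined(__aarch64__)
        return vaddvq_u32(bytes);
    #else
        return (uint64_t)vgetq_lane_u32(bytes, 0) + vgetq_lane_u32(bytes, 1) +
               vgetq_lane_u32(bytes, 2) + vgetq_lane_u32(bytes, 3);
    #endif
    }

    /* The descriptor length field is big-endian; after a little-endian
     * 64-bit load the packet length sits byte-swapped in the top 16 bits.
     */
    static inline uint16_t
    pkt_len_of(uint64_t info_length)
    {
        return rte_bswap16((uint16_t)(info_length >> 48));
    }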

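The per-mbuf reset in the vector loop is a single 64-bit store of
droq->rearm_data. A minimal sketch of how such a template could be built once
at queue setup, assuming the standard rte_mbuf rearm layout (data_off,
refcnt, nb_segs, port) and a hypothetical port_id argument:

    #include <string.h>
    #include <rte_mbuf.h>

    /* Precompute the 8-byte rearm template; port_id is illustrative. */
    static uint64_t
    make_rearm_data(uint16_t port_id)
    {
        struct rte_mbuf mb;

        memset(&mb, 0, sizeof(mb));
        mb.data_off = RTE_PKTMBUF_HEADROOM;
        rte_mbuf_refcnt_set(&mb, 1);
        mb.nb_segs = 1;
        mb.port = port_id;

        /* The 8 bytes starting at rearm_data cover exactly these fields. */
        return *(uint64_t *)&mb.rearm_data;
    }

Each received mbuf is then rearmed by one 8-byte store in the hot loop rather
than a field-by-field reset.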


* Re: [PATCH v5 2/2] net/octeon_ep: add Rx NEON routine
  2024-02-02 15:06         ` [PATCH v5 2/2] net/octeon_ep: add Rx NEON routine pbhagavatula
@ 2024-02-05  6:16           ` Jerin Jacob
  0 siblings, 0 replies; 16+ messages in thread
From: Jerin Jacob @ 2024-02-05  6:16 UTC (permalink / raw)
  To: pbhagavatula; +Cc: jerinj, Ruifeng Wang, Vamsi Attunuru, dev

On Fri, Feb 2, 2024 at 8:54 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Add Rx ARM NEON SIMD routine.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>

Series applied to dpdk-next-net-mrvl/for-main. Thanks


end of thread, other threads:[~2024-02-05  6:17 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-21 16:43 [PATCH v3 1/2] net/octeon_ep: improve Rx performance pbhagavatula
2024-01-21 16:43 ` [PATCH v3 2/2] net/octeon_ep: add Rx NEON routine pbhagavatula
2024-02-01 16:38   ` Jerin Jacob
2024-02-01 22:23 ` [PATCH v2 1/2] net/octeon_ep: improve Rx performance pbhagavatula
2024-02-01 22:23   ` [PATCH v2 2/2] net/octeon_ep: add Rx NEON routine pbhagavatula
2024-02-02  8:11     ` Jerin Jacob
2024-02-02  8:43   ` [PATCH v3 1/2] net/octeon_ep: improve Rx performance pbhagavatula
2024-02-02  8:43     ` [PATCH v3 2/2] net/octeon_ep: add Rx NEON routine pbhagavatula
2024-02-02 11:09       ` Jerin Jacob
2024-02-02 11:31     ` [PATCH v4 1/2] net/octeon_ep: improve Rx performance pbhagavatula
2024-02-02 11:31       ` [PATCH v4 2/2] net/octeon_ep: add Rx NEON routine pbhagavatula
2024-02-02 15:06       ` [PATCH v5 1/2] net/octeon_ep: improve Rx performance pbhagavatula
2024-02-02 15:06         ` [PATCH v5 2/2] net/octeon_ep: add Rx NEON routine pbhagavatula
2024-02-05  6:16           ` Jerin Jacob
2024-02-02 15:09         ` [PATCH v6 1/2] net/octeon_ep: improve Rx performance pbhagavatula
2024-02-02 15:09           ` [PATCH v6 2/2] net/octeon_ep: add Rx NEON routine pbhagavatula
