* [PATCH v2 0/2] net/idpf: enable AVX2 for split queue Rx/Tx
[not found] <20250917052658.582872-1-shaiq.wani@intel.com/>
@ 2025-09-25 9:20 ` Shaiq Wani
2025-09-25 9:20 ` [PATCH v2 1/2] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
2025-09-25 9:20 ` [PATCH v2 2/2] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
2025-09-26 8:54 ` [PATCH v3 0/2] enable AVX2 for split queue Rx/Tx Shaiq Wani
1 sibling, 2 replies; 10+ messages in thread
From: Shaiq Wani @ 2025-09-25 9:20 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
Some CPUs don't support AVX512. Enable AVX2 on them to get better
per-core performance.
In the single queue model, the same descriptor queue is used by SW to
post descriptors to the device and by the device to report completed
descriptors back to SW, whereas the split queue model separates them
into different queues for parallel processing and improved performance.
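As a rough illustration of the difference between the two models (a
hypothetical sketch only, not the driver's actual data structures):

	#include <stdint.h>

	struct desc { uint64_t qw[2]; };	/* placeholder descriptor */

	/* Single queue model: SW posts buffer descriptors and the device
	 * writes completions back on the same ring.
	 */
	struct singleq_model {
		struct desc *ring;
	};

	/* Split queue model: posting and completion use separate rings,
	 * so the two directions can be processed in parallel.
	 */
	struct splitq_model {
		struct desc *bufq;	/* SW posts buffer descriptors here */
		struct desc *complq;	/* device reports completions here */
	};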
v2:
*Fixed CI build-related issues.
*Rebased on top of the idpf/cpfl Rx path selection simplification patch.
Shaiq Wani (2):
net/idpf: enable AVX2 for split queue Rx
net/idpf: enable AVX2 for split queue Tx
drivers/net/intel/idpf/idpf_common_device.h | 1 +
drivers/net/intel/idpf/idpf_common_rxtx.c | 7 +
drivers/net/intel/idpf/idpf_common_rxtx.h | 6 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 451 ++++++++++++++++++
drivers/net/intel/idpf/idpf_rxtx.c | 9 +
5 files changed, 474 insertions(+)
--
2.34.1
* [PATCH v2 1/2] net/idpf: enable AVX2 for split queue Rx
2025-09-25 9:20 ` [PATCH v2 0/2] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
@ 2025-09-25 9:20 ` Shaiq Wani
2025-09-25 16:38 ` Bruce Richardson
2025-09-25 9:20 ` [PATCH v2 2/2] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
1 sibling, 1 reply; 10+ messages in thread
From: Shaiq Wani @ 2025-09-25 9:20 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
Some CPUs don't support AVX512. Enable AVX2 on them to get better
per-core performance.
In the single queue model, the same descriptor queue is used by SW to
post descriptors to the device and by the device to report completed
descriptors back to SW, whereas the split queue model separates them
into different queues for parallel processing and improved performance.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
drivers/net/intel/idpf/idpf_common_device.h | 1 +
drivers/net/intel/idpf/idpf_common_rxtx.c | 7 +
drivers/net/intel/idpf/idpf_common_rxtx.h | 3 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 249 ++++++++++++++++++
4 files changed, 260 insertions(+)
diff --git a/drivers/net/intel/idpf/idpf_common_device.h b/drivers/net/intel/idpf/idpf_common_device.h
index 3b95d519c6..f9c60ba229 100644
--- a/drivers/net/intel/idpf/idpf_common_device.h
+++ b/drivers/net/intel/idpf/idpf_common_device.h
@@ -49,6 +49,7 @@ enum idpf_rx_func_type {
IDPF_RX_SINGLEQ,
IDPF_RX_SINGLEQ_SCATTERED,
IDPF_RX_SINGLEQ_AVX2,
+ IDPF_RX_SPLITQ_AVX2,
IDPF_RX_AVX512,
IDPF_RX_SINGLQ_AVX512,
IDPF_RX_MAX
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c
index a2b8c372d6..ecb12cfd0a 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.c
@@ -1656,6 +1656,13 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
.rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
.simd_width = RTE_VECT_SIMD_256,
.extra.single_queue = true}},
+ [IDPF_RX_SPLITQ_AVX2] = {
+ .pkt_burst = idpf_dp_splitq_recv_pkts_avx2,
+ .info = "Split AVX2 Vector",
+ .features = {
+ .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
+ .simd_width = RTE_VECT_SIMD_256,
+ }},
#ifdef CC_AVX512_SUPPORT
[IDPF_RX_AVX512] = {
.pkt_burst = idpf_dp_splitq_recv_pkts_avx512,
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 3bc3323af4..3a9af06c86 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -252,6 +252,9 @@ __rte_internal
uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index 21c8f79254..b24653f195 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -482,6 +482,255 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16
return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
}
+static __rte_always_inline void
+idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
+{
+ int i;
+ uint16_t rx_id;
+ volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
+ struct rte_mbuf **rxep = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
+
+ rxdp += rx_bufq->rxrearm_start;
+
+ /* Try to bulk allocate mbufs from mempool */
+ if (rte_mempool_get_bulk(rx_bufq->mp,
+ (void **)rxep,
+ IDPF_RXQ_REARM_THRESH) < 0) {
+ if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >= rx_bufq->nb_rx_desc) {
+ __m128i zero_dma = _mm_setzero_si128();
+
+ for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+ rxep[i] = &rx_bufq->fake_mbuf;
+ _mm_storeu_si128((__m128i *)(uintptr_t)&rxdp[i], zero_dma);
+ }
+ }
+ rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
+ IDPF_RXQ_REARM_THRESH,
+ rte_memory_order_relaxed);
+ return;
+ }
+
+ __m128i headroom = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, RTE_PKTMBUF_HEADROOM);
+
+ for (i = 0; i < IDPF_RXQ_REARM_THRESH; i += 2, rxep += 2, rxdp += 2) {
+ struct rte_mbuf *mb0 = rxep[0];
+ struct rte_mbuf *mb1 = rxep[1];
+
+ __m128i buf_addr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+ __m128i buf_addr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+ __m128i dma_addr0 = _mm_unpackhi_epi64(buf_addr0, buf_addr0);
+ __m128i dma_addr1 = _mm_unpackhi_epi64(buf_addr1, buf_addr1);
+
+ dma_addr0 = _mm_add_epi64(dma_addr0, headroom);
+ dma_addr1 = _mm_add_epi64(dma_addr1, headroom);
+
+ rxdp[0].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr0);
+ rxdp[1].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr1);
+ }
+
+ rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+ if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
+ rx_bufq->rxrearm_start = 0;
+
+ rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
+
+ rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
+ (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
+
+ IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
+}
+
+static __rte_always_inline void
+idpf_splitq_rearm_avx2(struct idpf_rx_queue *rx_bufq)
+{
+ int i;
+ uint16_t rx_id;
+ volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
+ struct rte_mempool_cache *cache =
+ rte_mempool_default_cache(rx_bufq->mp, rte_lcore_id());
+ struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
+
+ rxdp += rx_bufq->rxrearm_start;
+
+ if (unlikely(!cache)) {
+ idpf_splitq_rearm_common(rx_bufq);
+ return;
+ }
+
+ if (cache->len < IDPF_RXQ_REARM_THRESH) {
+ uint32_t req = IDPF_RXQ_REARM_THRESH + (cache->size - cache->len);
+ int ret = rte_mempool_ops_dequeue_bulk(rx_bufq->mp,
+ &cache->objs[cache->len], req);
+ if (ret == 0) {
+ cache->len += req;
+ } else {
+ if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
+ rx_bufq->nb_rx_desc) {
+ __m128i dma_addr0 = _mm_setzero_si128();
+ for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+ rxp[i] = &rx_bufq->fake_mbuf;
+ _mm_storeu_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]),
+ dma_addr0);
+ }
+ }
+ rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
+ IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
+ return;
+ }
+ }
+ __m128i headroom = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, RTE_PKTMBUF_HEADROOM);
+ const int step = 2;
+
+ for (i = 0; i < IDPF_RXQ_REARM_THRESH; i += step, rxp += step, rxdp += step) {
+ struct rte_mbuf *mb0 = (struct rte_mbuf *)cache->objs[--cache->len];
+ struct rte_mbuf *mb1 = (struct rte_mbuf *)cache->objs[--cache->len];
+ rxp[0] = mb0;
+ rxp[1] = mb1;
+
+ __m128i buf_addr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+ __m128i buf_addr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+ __m128i dma_addr0 = _mm_unpackhi_epi64(buf_addr0, buf_addr0);
+ __m128i dma_addr1 = _mm_unpackhi_epi64(buf_addr1, buf_addr1);
+
+ dma_addr0 = _mm_add_epi64(dma_addr0, headroom);
+ dma_addr1 = _mm_add_epi64(dma_addr1, headroom);
+
+ rxdp[0].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr0);
+ rxdp[1].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr1);
+ }
+
+ rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+ if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
+ rx_bufq->rxrearm_start = 0;
+
+ rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
+
+ rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
+ (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
+
+ IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
+}
+static __rte_always_inline uint16_t
+_idpf_splitq_recv_raw_pkts_vec_avx2(struct idpf_rx_queue *rxq,
+ struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+ const uint32_t *ptype_tbl = rxq->adapter->ptype_tbl;
+ struct rte_mbuf **sw_ring = &rxq->bufq2->sw_ring[rxq->rx_tail];
+ volatile union virtchnl2_rx_desc *rxdp =
+ (volatile union virtchnl2_rx_desc *)rxq->rx_ring + rxq->rx_tail;
+
+ rte_prefetch0(rxdp);
+ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, 4); /* 4 desc per AVX2 iteration */
+
+ if (rxq->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
+ idpf_splitq_rearm_avx2(rxq->bufq2);
+
+ uint64_t head_gen = rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+ if (((head_gen >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) != rxq->expected_gen_id)
+ return 0;
+
+ const __m128i gen_mask =
+ _mm_set1_epi64x(((uint64_t)rxq->expected_gen_id) << 46);
+
+ uint16_t received = 0;
+ for (uint16_t i = 0; i < nb_pkts; i += 4, rxdp += 4) {
+ /* Step 1: pull mbufs */
+ __m128i ptrs = _mm_loadu_si128((__m128i *)&sw_ring[i]);
+ _mm_storeu_si128((__m128i *)&rx_pkts[i], ptrs);
+
+ /* Step 2: load descriptors */
+ __m128i d0 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[0]));
+ rte_compiler_barrier();
+ __m128i d1 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[1]));
+ rte_compiler_barrier();
+ __m128i d2 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[2]));
+ rte_compiler_barrier();
+ __m128i d3 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[3]));
+
+ /* Step 3: shuffle out pkt_len, data_len, vlan, rss */
+ const __m256i shuf = _mm256_set_epi8(
+ /* descriptor 3 */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* descriptor 2 */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF
+ );
+ __m128i d01_lo = d0, d01_hi = d1;
+ __m128i d23_lo = d2, d23_hi = d3;
+
+ __m256i m23 = _mm256_shuffle_epi8(_mm256_set_m128i(d23_hi, d23_lo), shuf);
+ __m256i m01 = _mm256_shuffle_epi8(_mm256_set_m128i(d01_hi, d01_lo), shuf);
+
+ /* Step 4: extract ptypes */
+ const __m256i ptype_mask = _mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M);
+ __m256i pt23 = _mm256_and_si256(_mm256_set_m128i(d23_hi, d23_lo), ptype_mask);
+ __m256i pt01 = _mm256_and_si256(_mm256_set_m128i(d01_hi, d01_lo), ptype_mask);
+
+ uint16_t ptype2 = _mm256_extract_epi16(pt23, 1);
+ uint16_t ptype3 = _mm256_extract_epi16(pt23, 9);
+ uint16_t ptype0 = _mm256_extract_epi16(pt01, 1);
+ uint16_t ptype1 = _mm256_extract_epi16(pt01, 9);
+
+ m23 = _mm256_insert_epi32(m23, ptype_tbl[ptype3], 2);
+ m23 = _mm256_insert_epi32(m23, ptype_tbl[ptype2], 0);
+ m01 = _mm256_insert_epi32(m01, ptype_tbl[ptype1], 2);
+ m01 = _mm256_insert_epi32(m01, ptype_tbl[ptype0], 0);
+
+ /* Step 5: extract gen bits */
+ __m128i sts0 = _mm_srli_epi64(d0, 46);
+ __m128i sts1 = _mm_srli_epi64(d1, 46);
+ __m128i sts2 = _mm_srli_epi64(d2, 46);
+ __m128i sts3 = _mm_srli_epi64(d3, 46);
+
+ __m128i merged_lo = _mm_unpacklo_epi64(sts0, sts2);
+ __m128i merged_hi = _mm_unpacklo_epi64(sts1, sts3);
+ __m128i valid = _mm_and_si128(_mm_and_si128(merged_lo, merged_hi),
+ _mm_unpacklo_epi64(gen_mask, gen_mask));
+ __m128i cmp = _mm_cmpeq_epi64(valid, _mm_unpacklo_epi64(gen_mask, gen_mask));
+ int burst = _mm_movemask_pd(_mm_castsi128_pd(cmp));
+
+ /* Step 6: write rearm_data safely */
+ __m128i m01_lo = _mm256_castsi256_si128(m01);
+ __m128i m23_lo = _mm256_castsi256_si128(m23);
+
+ uint64_t tmp01[2], tmp23[2];
+ _mm_storeu_si128((__m128i *)tmp01, m01_lo);
+ _mm_storeu_si128((__m128i *)tmp23, m23_lo);
+ *(uint64_t *)&rx_pkts[i]->rearm_data = tmp01[0];
+ *(uint64_t *)&rx_pkts[i + 1]->rearm_data = tmp01[1];
+ *(uint64_t *)&rx_pkts[i + 2]->rearm_data = tmp23[0];
+ *(uint64_t *)&rx_pkts[i + 3]->rearm_data = tmp23[1];
+
+ received += burst;
+ if (burst != 4)
+ break;
+ }
+
+ rxq->rx_tail += received;
+ if (received & 1) {
+ rxq->rx_tail &= ~(uint16_t)1;
+ received--;
+ }
+ rxq->rx_tail &= (rxq->nb_rx_desc - 1);
+ rxq->expected_gen_id ^= ((rxq->rx_tail & rxq->nb_rx_desc) != 0);
+ rxq->bufq2->rxrearm_nb += received;
+
+ return received;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_recv_pkts_avx2)
+uint16_t
+idpf_dp_splitq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ return _idpf_splitq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
+}
+
+
static inline void
idpf_singleq_vtx1(volatile struct idpf_base_tx_desc *txdp,
struct rte_mbuf *pkt, uint64_t flags)
--
2.34.1
* [PATCH v2 2/2] net/idpf: enable AVX2 for split queue Tx
2025-09-25 9:20 ` [PATCH v2 0/2] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2025-09-25 9:20 ` [PATCH v2 1/2] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
@ 2025-09-25 9:20 ` Shaiq Wani
2025-09-25 16:47 ` Bruce Richardson
1 sibling, 1 reply; 10+ messages in thread
From: Shaiq Wani @ 2025-09-25 9:20 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
Some CPUs don't support AVX512. Enable AVX2 on them to get better
per-core performance.
In the single queue model, the same descriptor queue is used by SW to
post descriptors to the device and by the device to report completed
descriptors back to SW, whereas the split queue model separates them
into different queues for parallel processing and improved performance.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
drivers/net/intel/idpf/idpf_common_rxtx.h | 3 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 202 ++++++++++++++++++
drivers/net/intel/idpf/idpf_rxtx.c | 9 +
3 files changed, 214 insertions(+)
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 3a9af06c86..ef3199524a 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -262,6 +262,9 @@ uint16_t idpf_dp_singleq_recv_pkts_avx2(void *rx_queue,
struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue,
struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index b24653f195..7b28c6b32d 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -889,3 +889,205 @@ idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
return nb_tx;
}
+
+static __rte_always_inline void
+idpf_splitq_scan_cq_ring(struct ci_tx_queue *cq)
+{
+ struct idpf_splitq_tx_compl_desc *compl_ring;
+ struct ci_tx_queue *txq;
+ uint16_t genid, txq_qid, cq_qid, i;
+ uint8_t ctype;
+
+ cq_qid = cq->tx_tail;
+
+ for (i = 0; i < IDPD_TXQ_SCAN_CQ_THRESH; i++) {
+ if (cq_qid == cq->nb_tx_desc) {
+ cq_qid = 0;
+ cq->expected_gen_id ^= 1; /* toggle generation bit */
+ }
+
+ compl_ring = &cq->compl_ring[cq_qid];
+
+ genid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_GEN_M) >> IDPF_TXD_COMPLQ_GEN_S;
+
+ if (genid != cq->expected_gen_id)
+ break;
+
+ ctype = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_COMPL_TYPE_M) >> IDPF_TXD_COMPLQ_COMPL_TYPE_S;
+
+ txq_qid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_QID_M) >> IDPF_TXD_COMPLQ_QID_S;
+
+ txq = cq->txqs[txq_qid - cq->tx_start_qid];
+ if (ctype == IDPF_TXD_COMPLT_RS)
+ txq->rs_compl_count++;
+
+ cq_qid++;
+ }
+
+ cq->tx_tail = cq_qid;
+}
+
+static __rte_always_inline void
+idpf_splitq_vtx1_avx2(volatile struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf *pkt, uint64_t flags)
+{
+ uint64_t high_qw =
+ IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
+ ((uint64_t)flags) |
+ ((uint64_t)pkt->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m128i descriptor = _mm_set_epi64x(high_qw,
+ pkt->buf_iova + pkt->data_off);
+ _mm_storeu_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor);
+}
+
+
+static inline void
+idpf_splitq_vtx_avx2(volatile struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
+{
+ const uint64_t hi_qw_tmpl = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
+ ((uint64_t)flags);
+
+ /* align if needed */
+ if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++, pkt++, nb_pkts--;
+ }
+
+ for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
+ uint64_t hi_qw3 = hi_qw_tmpl |
+ ((uint64_t)pkt[3]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw2 = hi_qw_tmpl |
+ ((uint64_t)pkt[2]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw1 = hi_qw_tmpl |
+ ((uint64_t)pkt[1]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw0 = hi_qw_tmpl |
+ ((uint64_t)pkt[0]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m256i desc2_3 = _mm256_set_epi64x(hi_qw3,
+ pkt[3]->buf_iova + pkt[3]->data_off,
+ hi_qw2,
+ pkt[2]->buf_iova + pkt[2]->data_off);
+
+ __m256i desc0_1 = _mm256_set_epi64x(hi_qw1,
+ pkt[1]->buf_iova + pkt[1]->data_off,
+ hi_qw0,
+ pkt[0]->buf_iova + pkt[0]->data_off);
+
+ _mm256_storeu_si256(RTE_CAST_PTR(__m256i *, txdp + 2), desc2_3);
+ _mm256_storeu_si256(RTE_CAST_PTR(__m256i *, txdp), desc0_1);
+ }
+
+ while (nb_pkts--) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++;
+ pkt++;
+ }
+}
+
+static inline uint16_t
+idpf_splitq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ volatile struct idpf_flex_tx_sched_desc *txdp;
+ struct ci_tx_entry_vec *txep;
+ uint16_t n, nb_commit, tx_id;
+ uint64_t cmd_dtype = IDPF_TXD_FLEX_FLOW_CMD_EOP;
+
+ tx_id = txq->tx_tail;
+
+ /* restrict to max burst size */
+ nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+
+ /* make sure we have enough free space */
+ if (txq->nb_tx_free < txq->tx_free_thresh)
+ ci_tx_free_bufs_vec(txq, idpf_tx_desc_done, false);
+
+ nb_commit = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+ nb_pkts = nb_commit;
+ if (unlikely(nb_pkts == 0))
+ return 0;
+
+ txdp = &txq->desc_ring[tx_id];
+ txep = (void *)txq->sw_ring;
+ txep += tx_id;
+
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+ n = (uint16_t)(txq->nb_tx_desc - tx_id);
+ if (nb_commit >= n) {
+ ci_tx_backlog_entry_vec(txep, tx_pkts, n);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, n - 1, cmd_dtype);
+ tx_pkts += (n - 1);
+ txdp += (n - 1);
+
+ idpf_splitq_vtx1_avx2(txdp, *tx_pkts++, cmd_dtype);
+
+ nb_commit = (uint16_t)(nb_commit - n);
+
+ tx_id = 0;
+ txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+
+ txdp = &txq->desc_ring[tx_id];
+ txep = (void *)txq->sw_ring;
+ txep += tx_id;
+ }
+
+ ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, nb_commit, cmd_dtype);
+
+ tx_id = (uint16_t)(tx_id + nb_commit);
+ if (tx_id > txq->tx_next_rs)
+ txq->tx_next_rs =
+ (uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
+
+ txq->tx_tail = tx_id;
+
+ IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+
+ return nb_pkts;
+}
+
+static __rte_always_inline uint16_t
+idpf_splitq_xmit_pkts_vec_avx2_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ uint16_t nb_tx = 0;
+
+ while (nb_pkts) {
+ uint16_t ret, num;
+ idpf_splitq_scan_cq_ring(txq->complq);
+
+ if (txq->rs_compl_count > txq->tx_free_thresh) {
+ ci_tx_free_bufs_vec(txq, idpf_tx_desc_done, false);
+ txq->rs_compl_count -= txq->tx_rs_thresh;
+ }
+
+ num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+ ret = idpf_splitq_xmit_fixed_burst_vec_avx2(tx_queue,
+ &tx_pkts[nb_tx],
+ num);
+ nb_tx += ret;
+ nb_pkts -= ret;
+ if (ret < num)
+ break;
+ }
+
+ return nb_tx;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_xmit_pkts_avx2)
+uint16_t
+idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ return idpf_splitq_xmit_pkts_vec_avx2_cmn(tx_queue, tx_pkts, nb_pkts);
+}
diff --git a/drivers/net/intel/idpf/idpf_rxtx.c b/drivers/net/intel/idpf/idpf_rxtx.c
index 1c725065df..6950fabb49 100644
--- a/drivers/net/intel/idpf/idpf_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_rxtx.c
@@ -850,6 +850,15 @@ idpf_set_tx_function(struct rte_eth_dev *dev)
return;
}
#endif /* CC_AVX512_SUPPORT */
+ if (tx_simd_width == RTE_VECT_SIMD_256) {
+ PMD_DRV_LOG(NOTICE,
+ "Using Split AVX2 Vector Tx (port %d).",
+ dev->data->port_id);
+ dev->tx_pkt_burst = idpf_dp_splitq_xmit_pkts_avx2;
+ dev->tx_pkt_prepare = idpf_dp_prep_pkts;
+ return;
+ }
+
}
PMD_DRV_LOG(NOTICE,
"Using Split Scalar Tx (port %d).",
--
2.34.1
* Re: [PATCH v2 1/2] net/idpf: enable AVX2 for split queue Rx
2025-09-25 9:20 ` [PATCH v2 1/2] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
@ 2025-09-25 16:38 ` Bruce Richardson
0 siblings, 0 replies; 10+ messages in thread
From: Bruce Richardson @ 2025-09-25 16:38 UTC (permalink / raw)
To: Shaiq Wani; +Cc: dev, aman.deep.singh
On Thu, Sep 25, 2025 at 02:50:19PM +0530, Shaiq Wani wrote:
> Some CPUs don't support AVX512. Enable AVX2 on them to get better
> per-core performance.
>
> In the single queue model, the same descriptor queue is used by SW to
> post descriptors to the device and by the device to report completed
> descriptors back to SW, whereas the split queue model separates them
> into different queues for parallel processing and improved performance.
>
> Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
> ---
Hi,
a few small comments inline below.
Thanks,
/Bruce
> drivers/net/intel/idpf/idpf_common_device.h | 1 +
> drivers/net/intel/idpf/idpf_common_rxtx.c | 7 +
> drivers/net/intel/idpf/idpf_common_rxtx.h | 3 +
> .../net/intel/idpf/idpf_common_rxtx_avx2.c | 249 ++++++++++++++++++
> 4 files changed, 260 insertions(+)
>
> diff --git a/drivers/net/intel/idpf/idpf_common_device.h b/drivers/net/intel/idpf/idpf_common_device.h
> index 3b95d519c6..f9c60ba229 100644
> --- a/drivers/net/intel/idpf/idpf_common_device.h
> +++ b/drivers/net/intel/idpf/idpf_common_device.h
> @@ -49,6 +49,7 @@ enum idpf_rx_func_type {
> IDPF_RX_SINGLEQ,
> IDPF_RX_SINGLEQ_SCATTERED,
> IDPF_RX_SINGLEQ_AVX2,
> + IDPF_RX_SPLITQ_AVX2,
The scalar splitq receive is listed here just as IDPF_RX_DEFAULT, and the
AVX-512 splitq as IDPF_RX_AVX512, so following that scheme this should just
be IDPF_RX_AVX2. Alternatively, for consistency you could also rename those
others to IDPF_RX_SPLITQ and IDPF_RX_SPLITQ_AVX512. Either way, naming
consistency should be achievable, I think.
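For example, following the first option, the entry added here would simply
become (sketch of the resulting enum entries):

	IDPF_RX_SINGLEQ_AVX2,
	IDPF_RX_AVX2,		/* splitq AVX2, matching IDPF_RX_AVX512 */
	IDPF_RX_AVX512,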
> IDPF_RX_AVX512,
> IDPF_RX_SINGLQ_AVX512,
> IDPF_RX_MAX
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c
> index a2b8c372d6..ecb12cfd0a 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx.c
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx.c
> @@ -1656,6 +1656,13 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
> .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
> .simd_width = RTE_VECT_SIMD_256,
> .extra.single_queue = true}},
> + [IDPF_RX_SPLITQ_AVX2] = {
> + .pkt_burst = idpf_dp_splitq_recv_pkts_avx2,
> + .info = "Split AVX2 Vector",
> + .features = {
> + .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
> + .simd_width = RTE_VECT_SIMD_256,
> + }},
> #ifdef CC_AVX512_SUPPORT
> [IDPF_RX_AVX512] = {
> .pkt_burst = idpf_dp_splitq_recv_pkts_avx512,
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
> index 3bc3323af4..3a9af06c86 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx.h
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
> @@ -252,6 +252,9 @@ __rte_internal
> uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
> uint16_t nb_pkts);
> __rte_internal
> +uint16_t idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts,
> + uint16_t nb_pkts);
> +__rte_internal
> uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
> uint16_t nb_pkts);
> __rte_internal
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
> index 21c8f79254..b24653f195 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
> @@ -482,6 +482,255 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16
> return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
> }
>
> +static __rte_always_inline void
> +idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
> +{
> + int i;
> + uint16_t rx_id;
> + volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
> + struct rte_mbuf **rxep = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
> +
> + rxdp += rx_bufq->rxrearm_start;
> +
> + /* Try to bulk allocate mbufs from mempool */
> + if (rte_mempool_get_bulk(rx_bufq->mp,
> + (void **)rxep,
> + IDPF_RXQ_REARM_THRESH) < 0) {
Use rte_mbuf_raw_alloc_bulk() instead of rte_mempool_get_bulk(); it has
some extra sanity checks for debug builds and ensures that we don't bypass
too many library layers.
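i.e. the allocation check would become something like (sketch):

	/* Try to bulk allocate mbufs from mempool */
	if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp, rxep, IDPF_RXQ_REARM_THRESH) < 0) {
		/* keep the existing allocation-failure handling below */
	}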
> + if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >= rx_bufq->nb_rx_desc) {
> + __m128i zero_dma = _mm_setzero_si128();
> +
> + for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
> + rxep[i] = &rx_bufq->fake_mbuf;
> + _mm_storeu_si128((__m128i *)(uintptr_t)&rxdp[i], zero_dma);
> + }
> + }
> + rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
> + IDPF_RXQ_REARM_THRESH,
> + rte_memory_order_relaxed);
> + return;
> + }
> +
> + __m128i headroom = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, RTE_PKTMBUF_HEADROOM);
> +
> + for (i = 0; i < IDPF_RXQ_REARM_THRESH; i += 2, rxep += 2, rxdp += 2) {
> + struct rte_mbuf *mb0 = rxep[0];
> + struct rte_mbuf *mb1 = rxep[1];
> +
> + __m128i buf_addr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
> + __m128i buf_addr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
> +
> + __m128i dma_addr0 = _mm_unpackhi_epi64(buf_addr0, buf_addr0);
> + __m128i dma_addr1 = _mm_unpackhi_epi64(buf_addr1, buf_addr1);
> +
> + dma_addr0 = _mm_add_epi64(dma_addr0, headroom);
> + dma_addr1 = _mm_add_epi64(dma_addr1, headroom);
> +
> + rxdp[0].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr0);
> + rxdp[1].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr1);
> + }
> +
> + rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
> + if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
> + rx_bufq->rxrearm_start = 0;
> +
> + rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
> +
> + rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
> + (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
> +
> + IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
> +}
> +
> +static __rte_always_inline void
> +idpf_splitq_rearm_avx2(struct idpf_rx_queue *rx_bufq)
> +{
> + int i;
> + uint16_t rx_id;
> + volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
> + struct rte_mempool_cache *cache =
> + rte_mempool_default_cache(rx_bufq->mp, rte_lcore_id());
> + struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
> +
> + rxdp += rx_bufq->rxrearm_start;
> +
> + if (unlikely(!cache)) {
> + idpf_splitq_rearm_common(rx_bufq);
> + return;
> + }
> +
> + if (cache->len < IDPF_RXQ_REARM_THRESH) {
> + uint32_t req = IDPF_RXQ_REARM_THRESH + (cache->size - cache->len);
> + int ret = rte_mempool_ops_dequeue_bulk(rx_bufq->mp,
> + &cache->objs[cache->len], req);
> + if (ret == 0) {
> + cache->len += req;
> + } else {
> + if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
> + rx_bufq->nb_rx_desc) {
> + __m128i dma_addr0 = _mm_setzero_si128();
> + for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
> + rxp[i] = &rx_bufq->fake_mbuf;
> + _mm_storeu_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]),
> + dma_addr0);
> + }
> + }
> + rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
> + IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
> + return;
> + }
> + }
> + __m128i headroom = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, RTE_PKTMBUF_HEADROOM);
> + const int step = 2;
> +
> + for (i = 0; i < IDPF_RXQ_REARM_THRESH; i += step, rxp += step, rxdp += step) {
> + struct rte_mbuf *mb0 = (struct rte_mbuf *)cache->objs[--cache->len];
> + struct rte_mbuf *mb1 = (struct rte_mbuf *)cache->objs[--cache->len];
> + rxp[0] = mb0;
> + rxp[1] = mb1;
> +
> + __m128i buf_addr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
> + __m128i buf_addr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
> +
> + __m128i dma_addr0 = _mm_unpackhi_epi64(buf_addr0, buf_addr0);
> + __m128i dma_addr1 = _mm_unpackhi_epi64(buf_addr1, buf_addr1);
> +
> + dma_addr0 = _mm_add_epi64(dma_addr0, headroom);
> + dma_addr1 = _mm_add_epi64(dma_addr1, headroom);
> +
> + rxdp[0].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr0);
> + rxdp[1].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr1);
> + }
> +
> + rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
> + if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
> + rx_bufq->rxrearm_start = 0;
> +
> + rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
> +
> + rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
> + (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
> +
> + IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
> +}
> +static __rte_always_inline uint16_t
> +_idpf_splitq_recv_raw_pkts_vec_avx2(struct idpf_rx_queue *rxq,
> + struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
> +{
> + const uint32_t *ptype_tbl = rxq->adapter->ptype_tbl;
> + struct rte_mbuf **sw_ring = &rxq->bufq2->sw_ring[rxq->rx_tail];
> + volatile union virtchnl2_rx_desc *rxdp =
> + (volatile union virtchnl2_rx_desc *)rxq->rx_ring + rxq->rx_tail;
> +
> + rte_prefetch0(rxdp);
> + nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, 4); /* 4 desc per AVX2 iteration */
> +
> + if (rxq->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
> + idpf_splitq_rearm_avx2(rxq->bufq2);
> +
> + uint64_t head_gen = rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id;
> + if (((head_gen >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
> + VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) != rxq->expected_gen_id)
> + return 0;
> +
> + const __m128i gen_mask =
> + _mm_set1_epi64x(((uint64_t)rxq->expected_gen_id) << 46);
> +
> + uint16_t received = 0;
> + for (uint16_t i = 0; i < nb_pkts; i += 4, rxdp += 4) {
> + /* Step 1: pull mbufs */
> + __m128i ptrs = _mm_loadu_si128((__m128i *)&sw_ring[i]);
> + _mm_storeu_si128((__m128i *)&rx_pkts[i], ptrs);
> +
> + /* Step 2: load descriptors */
> + __m128i d0 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[0]));
> + rte_compiler_barrier();
> + __m128i d1 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[1]));
> + rte_compiler_barrier();
> + __m128i d2 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[2]));
> + rte_compiler_barrier();
> + __m128i d3 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[3]));
> +
> + /* Step 3: shuffle out pkt_len, data_len, vlan, rss */
> + const __m256i shuf = _mm256_set_epi8(
> + /* descriptor 3 */
> + 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
> + 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF,
> + /* descriptor 2 */
> + 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
> + 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF
> + );
> + __m128i d01_lo = d0, d01_hi = d1;
> + __m128i d23_lo = d2, d23_hi = d3;
> +
> + __m256i m23 = _mm256_shuffle_epi8(_mm256_set_m128i(d23_hi, d23_lo), shuf);
> + __m256i m01 = _mm256_shuffle_epi8(_mm256_set_m128i(d01_hi, d01_lo), shuf);
> +
> + /* Step 4: extract ptypes */
> + const __m256i ptype_mask = _mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M);
> + __m256i pt23 = _mm256_and_si256(_mm256_set_m128i(d23_hi, d23_lo), ptype_mask);
> + __m256i pt01 = _mm256_and_si256(_mm256_set_m128i(d01_hi, d01_lo), ptype_mask);
> +
> + uint16_t ptype2 = _mm256_extract_epi16(pt23, 1);
> + uint16_t ptype3 = _mm256_extract_epi16(pt23, 9);
> + uint16_t ptype0 = _mm256_extract_epi16(pt01, 1);
> + uint16_t ptype1 = _mm256_extract_epi16(pt01, 9);
> +
> + m23 = _mm256_insert_epi32(m23, ptype_tbl[ptype3], 2);
> + m23 = _mm256_insert_epi32(m23, ptype_tbl[ptype2], 0);
> + m01 = _mm256_insert_epi32(m01, ptype_tbl[ptype1], 2);
> + m01 = _mm256_insert_epi32(m01, ptype_tbl[ptype0], 0);
> +
> + /* Step 5: extract gen bits */
> + __m128i sts0 = _mm_srli_epi64(d0, 46);
> + __m128i sts1 = _mm_srli_epi64(d1, 46);
> + __m128i sts2 = _mm_srli_epi64(d2, 46);
> + __m128i sts3 = _mm_srli_epi64(d3, 46);
> +
> + __m128i merged_lo = _mm_unpacklo_epi64(sts0, sts2);
> + __m128i merged_hi = _mm_unpacklo_epi64(sts1, sts3);
> + __m128i valid = _mm_and_si128(_mm_and_si128(merged_lo, merged_hi),
> + _mm_unpacklo_epi64(gen_mask, gen_mask));
> + __m128i cmp = _mm_cmpeq_epi64(valid, _mm_unpacklo_epi64(gen_mask, gen_mask));
> + int burst = _mm_movemask_pd(_mm_castsi128_pd(cmp));
> +
> + /* Step 6: write rearm_data safely */
> + __m128i m01_lo = _mm256_castsi256_si128(m01);
> + __m128i m23_lo = _mm256_castsi256_si128(m23);
> +
> + uint64_t tmp01[2], tmp23[2];
> + _mm_storeu_si128((__m128i *)tmp01, m01_lo);
> + _mm_storeu_si128((__m128i *)tmp23, m23_lo);
> + *(uint64_t *)&rx_pkts[i]->rearm_data = tmp01[0];
> + *(uint64_t *)&rx_pkts[i + 1]->rearm_data = tmp01[1];
> + *(uint64_t *)&rx_pkts[i + 2]->rearm_data = tmp23[0];
> + *(uint64_t *)&rx_pkts[i + 3]->rearm_data = tmp23[1];
> +
> + received += burst;
> + if (burst != 4)
> + break;
> + }
> +
> + rxq->rx_tail += received;
> + if (received & 1) {
> + rxq->rx_tail &= ~(uint16_t)1;
> + received--;
> + }
> + rxq->rx_tail &= (rxq->nb_rx_desc - 1);
> + rxq->expected_gen_id ^= ((rxq->rx_tail & rxq->nb_rx_desc) != 0);
> + rxq->bufq2->rxrearm_nb += received;
> +
> + return received;
> +}
> +
> +RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_recv_pkts_avx2)
> +uint16_t
> +idpf_dp_splitq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
> + uint16_t nb_pkts)
> +{
> + return _idpf_splitq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
> +}
> +
Why the extra level of functions here? In other drivers this is to separate
out common code shared by the single-buffer and scattered-packet versions.
Are there plans to handle multi-mbuf packets here? If not, you might as well
collapse the two functions down to one.
> +
> static inline void
> idpf_singleq_vtx1(volatile struct idpf_base_tx_desc *txdp,
> struct rte_mbuf *pkt, uint64_t flags)
> --
> 2.34.1
>
* Re: [PATCH v2 2/2] net/idpf: enable AVX2 for split queue Tx
2025-09-25 9:20 ` [PATCH v2 2/2] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
@ 2025-09-25 16:47 ` Bruce Richardson
0 siblings, 0 replies; 10+ messages in thread
From: Bruce Richardson @ 2025-09-25 16:47 UTC (permalink / raw)
To: Shaiq Wani; +Cc: dev, aman.deep.singh
On Thu, Sep 25, 2025 at 02:50:20PM +0530, Shaiq Wani wrote:
> Some CPUs don't support AVX512. Enable AVX2 on them to get better
> per-core performance.
>
> In the single queue model, the same descriptor queue is used by SW to
> post descriptors to the device and by the device to report completed
> descriptors back to SW, whereas the split queue model separates them
> into different queues for parallel processing and improved performance.
>
> Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
> ---
Hi,
see comments inline below.
/Bruce
> drivers/net/intel/idpf/idpf_common_rxtx.h | 3 +
> .../net/intel/idpf/idpf_common_rxtx_avx2.c | 202 ++++++++++++++++++
> drivers/net/intel/idpf/idpf_rxtx.c | 9 +
> 3 files changed, 214 insertions(+)
>
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
> index 3a9af06c86..ef3199524a 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx.h
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
> @@ -262,6 +262,9 @@ uint16_t idpf_dp_singleq_recv_pkts_avx2(void *rx_queue,
> struct rte_mbuf **rx_pkts,
> uint16_t nb_pkts);
> __rte_internal
> +uint16_t idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
> + uint16_t nb_pkts);
> +__rte_internal
> uint16_t idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue,
> struct rte_mbuf **tx_pkts,
> uint16_t nb_pkts);
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
> index b24653f195..7b28c6b32d 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
> @@ -889,3 +889,205 @@ idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
>
> return nb_tx;
> }
> +
> +static __rte_always_inline void
> +idpf_splitq_scan_cq_ring(struct ci_tx_queue *cq)
> +{
> + struct idpf_splitq_tx_compl_desc *compl_ring;
> + struct ci_tx_queue *txq;
> + uint16_t genid, txq_qid, cq_qid, i;
> + uint8_t ctype;
> +
> + cq_qid = cq->tx_tail;
> +
> + for (i = 0; i < IDPD_TXQ_SCAN_CQ_THRESH; i++) {
> + if (cq_qid == cq->nb_tx_desc) {
> + cq_qid = 0;
> + cq->expected_gen_id ^= 1; /* toggle generation bit */
> + }
> +
> + compl_ring = &cq->compl_ring[cq_qid];
> +
> + genid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
> + IDPF_TXD_COMPLQ_GEN_M) >> IDPF_TXD_COMPLQ_GEN_S;
> +
> + if (genid != cq->expected_gen_id)
> + break;
> +
> + ctype = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
> + IDPF_TXD_COMPLQ_COMPL_TYPE_M) >> IDPF_TXD_COMPLQ_COMPL_TYPE_S;
> +
> + txq_qid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
> + IDPF_TXD_COMPLQ_QID_M) >> IDPF_TXD_COMPLQ_QID_S;
> +
> + txq = cq->txqs[txq_qid - cq->tx_start_qid];
> + if (ctype == IDPF_TXD_COMPLT_RS)
> + txq->rs_compl_count++;
> +
> + cq_qid++;
> + }
> +
> + cq->tx_tail = cq_qid;
> +}
> +
> +static __rte_always_inline void
> +idpf_splitq_vtx1_avx2(volatile struct idpf_flex_tx_sched_desc *txdp,
> + struct rte_mbuf *pkt, uint64_t flags)
> +{
> + uint64_t high_qw =
> + IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
> + ((uint64_t)flags) |
> + ((uint64_t)pkt->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
> +
> + __m128i descriptor = _mm_set_epi64x(high_qw,
> + pkt->buf_iova + pkt->data_off);
Indentation is a little off here. The high_qw line continuations should be
indented beyond the first tab-stop. For the descriptor definition, a triple
indent is a bit excessive; a double indent should be sufficient.
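e.g. something like:

	uint64_t high_qw =
			IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
			((uint64_t)flags) |
			((uint64_t)pkt->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);

	__m128i descriptor = _mm_set_epi64x(high_qw,
			pkt->buf_iova + pkt->data_off);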
> + _mm_storeu_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor);
> +}
> +
> +
> +static inline void
> +idpf_splitq_vtx_avx2(volatile struct idpf_flex_tx_sched_desc *txdp,
> + struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
> +{
> + const uint64_t hi_qw_tmpl = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
> + ((uint64_t)flags);
Again, indent more here.
> +
> + /* align if needed */
> + if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
> + idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
> + txdp++, pkt++, nb_pkts--;
> + }
> +
> + for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
> + uint64_t hi_qw3 = hi_qw_tmpl |
> + ((uint64_t)pkt[3]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
> + uint64_t hi_qw2 = hi_qw_tmpl |
> + ((uint64_t)pkt[2]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
> + uint64_t hi_qw1 = hi_qw_tmpl |
> + ((uint64_t)pkt[1]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
> + uint64_t hi_qw0 = hi_qw_tmpl |
> + ((uint64_t)pkt[0]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
> +
> + __m256i desc2_3 = _mm256_set_epi64x(hi_qw3,
> + pkt[3]->buf_iova + pkt[3]->data_off,
> + hi_qw2,
> + pkt[2]->buf_iova + pkt[2]->data_off);
> +
> + __m256i desc0_1 = _mm256_set_epi64x(hi_qw1,
> + pkt[1]->buf_iova + pkt[1]->data_off,
> + hi_qw0,
> + pkt[0]->buf_iova + pkt[0]->data_off);
> +
> + _mm256_storeu_si256(RTE_CAST_PTR(__m256i *, txdp + 2), desc2_3);
> + _mm256_storeu_si256(RTE_CAST_PTR(__m256i *, txdp), desc0_1);
> + }
Rather than casting away the volatile here, did you consider casting it
away earlier and passing in txdp as a non-volatile parameter to this
function?
> +
> + while (nb_pkts--) {
> + idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
> + txdp++;
> + pkt++;
> + }
> +}
> +
> +static inline uint16_t
> +idpf_splitq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
> + uint16_t nb_pkts)
> +{
> + struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
> + volatile struct idpf_flex_tx_sched_desc *txdp;
> + struct ci_tx_entry_vec *txep;
> + uint16_t n, nb_commit, tx_id;
> + uint64_t cmd_dtype = IDPF_TXD_FLEX_FLOW_CMD_EOP;
> +
> + tx_id = txq->tx_tail;
> +
> + /* restrict to max burst size */
> + nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
> +
> + /* make sure we have enough free space */
> + if (txq->nb_tx_free < txq->tx_free_thresh)
> + ci_tx_free_bufs_vec(txq, idpf_tx_desc_done, false);
> +
> + nb_commit = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
> + nb_pkts = nb_commit;
> + if (unlikely(nb_pkts == 0))
> + return 0;
> +
> + txdp = &txq->desc_ring[tx_id];
Suggestion: I would cast away the volatile here and save casting it away
later.
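e.g. declaring txdp without the volatile qualifier and doing the cast once
here (sketch, reusing the existing RTE_CAST_PTR macro):

	struct idpf_flex_tx_sched_desc *txdp =
			RTE_CAST_PTR(struct idpf_flex_tx_sched_desc *, &txq->desc_ring[tx_id]);

so that idpf_splitq_vtx_avx2()/idpf_splitq_vtx1_avx2() can take a plain
pointer and drop the casts around the stores.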
> + txep = (void *)txq->sw_ring;
> + txep += tx_id;
> +
> + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
> +
> + n = (uint16_t)(txq->nb_tx_desc - tx_id);
> + if (nb_commit >= n) {
> + ci_tx_backlog_entry_vec(txep, tx_pkts, n);
> +
> + idpf_splitq_vtx_avx2(txdp, tx_pkts, n - 1, cmd_dtype);
> + tx_pkts += (n - 1);
> + txdp += (n - 1);
> +
> + idpf_splitq_vtx1_avx2(txdp, *tx_pkts++, cmd_dtype);
> +
> + nb_commit = (uint16_t)(nb_commit - n);
> +
> + tx_id = 0;
> + txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
> +
> + txdp = &txq->desc_ring[tx_id];
> + txep = (void *)txq->sw_ring;
> + txep += tx_id;
> + }
> +
> + ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);
> +
> + idpf_splitq_vtx_avx2(txdp, tx_pkts, nb_commit, cmd_dtype);
> +
> + tx_id = (uint16_t)(tx_id + nb_commit);
> + if (tx_id > txq->tx_next_rs)
> + txq->tx_next_rs =
> + (uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
> +
> + txq->tx_tail = tx_id;
> +
> + IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
> +
> + return nb_pkts;
> +}
> +
> +static __rte_always_inline uint16_t
> +idpf_splitq_xmit_pkts_vec_avx2_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
> + uint16_t nb_pkts)
> +{
> + struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
> + uint16_t nb_tx = 0;
> +
> + while (nb_pkts) {
> + uint16_t ret, num;
> + idpf_splitq_scan_cq_ring(txq->complq);
> +
> + if (txq->rs_compl_count > txq->tx_free_thresh) {
> + ci_tx_free_bufs_vec(txq, idpf_tx_desc_done, false);
> + txq->rs_compl_count -= txq->tx_rs_thresh;
> + }
> +
> + num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
> + ret = idpf_splitq_xmit_fixed_burst_vec_avx2(tx_queue,
> + &tx_pkts[nb_tx],
> + num);
Reduce the over-indentation here. If you only indent by two extra, you can
put the last two parameters on one line.
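e.g.:

		ret = idpf_splitq_xmit_fixed_burst_vec_avx2(tx_queue,
				&tx_pkts[nb_tx], num);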
> + nb_tx += ret;
> + nb_pkts -= ret;
> + if (ret < num)
> + break;
> + }
> +
> + return nb_tx;
> +}
> +
> +RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_xmit_pkts_avx2)
> +uint16_t
> +idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
> + uint16_t nb_pkts)
> +{
> + return idpf_splitq_xmit_pkts_vec_avx2_cmn(tx_queue, tx_pkts, nb_pkts);
> +}
Do we need a separate wrapper function here? Are there future plans for
a different wrapper around the same common function?
> diff --git a/drivers/net/intel/idpf/idpf_rxtx.c b/drivers/net/intel/idpf/idpf_rxtx.c
> index 1c725065df..6950fabb49 100644
> --- a/drivers/net/intel/idpf/idpf_rxtx.c
> +++ b/drivers/net/intel/idpf/idpf_rxtx.c
> @@ -850,6 +850,15 @@ idpf_set_tx_function(struct rte_eth_dev *dev)
> return;
> }
> #endif /* CC_AVX512_SUPPORT */
> + if (tx_simd_width == RTE_VECT_SIMD_256) {
> + PMD_DRV_LOG(NOTICE,
> + "Using Split AVX2 Vector Tx (port %d).",
> + dev->data->port_id);
> + dev->tx_pkt_burst = idpf_dp_splitq_xmit_pkts_avx2;
> + dev->tx_pkt_prepare = idpf_dp_prep_pkts;
> + return;
> + }
> +
> }
> PMD_DRV_LOG(NOTICE,
> "Using Split Scalar Tx (port %d).",
> --
> 2.34.1
>
* [PATCH v3 0/2] enable AVX2 for split queue Rx/Tx
[not found] <20250917052658.582872-1-shaiq.wani@intel.com/>
2025-09-25 9:20 ` [PATCH v2 0/2] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
@ 2025-09-26 8:54 ` Shaiq Wani
2025-09-26 8:54 ` [PATCH v3 1/2] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
2025-09-26 8:54 ` [PATCH v3 2/2] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
1 sibling, 2 replies; 10+ messages in thread
From: Shaiq Wani @ 2025-09-26 8:54 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
Some CPUs don't support AVX512. Enable AVX2 on them to get better
per-core performance.
In the single queue model, the same descriptor queue is used by SW to
post descriptors to the device and by the device to report completed
descriptors back to SW, whereas the split queue model separates them
into different queues for parallel processing and improved performance.
v3:
*Fixed some indentation issues.
*Collapsed wrapper and core function into one.
*Fixed some pointer casting and naming inconsistency issues.
v2:
*Fixed CI build-related issues.
*Rebased on top of the idpf/cpfl Rx path selection simplification patch.
Shaiq Wani (2):
net/idpf: enable AVX2 for split queue Rx
net/idpf: enable AVX2 for split queue Tx
drivers/net/intel/idpf/idpf_common_device.h | 3 +-
drivers/net/intel/idpf/idpf_common_rxtx.c | 9 +-
drivers/net/intel/idpf/idpf_common_rxtx.h | 6 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 438 ++++++++++++++++++
drivers/net/intel/idpf/idpf_rxtx.c | 9 +
5 files changed, 463 insertions(+), 2 deletions(-)
--
2.34.1
* [PATCH v3 1/2] net/idpf: enable AVX2 for split queue Rx
2025-09-26 8:54 ` [PATCH v3 0/2] enable AVX2 for split queue Rx/Tx Shaiq Wani
@ 2025-09-26 8:54 ` Shaiq Wani
2025-09-26 11:40 ` Bruce Richardson
2025-09-26 13:09 ` Burakov, Anatoly
2025-09-26 8:54 ` [PATCH v3 2/2] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
1 sibling, 2 replies; 10+ messages in thread
From: Shaiq Wani @ 2025-09-26 8:54 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
Some CPUs don't support AVX512. Enable AVX2 on them to get better
per-core performance.
In the single queue model, the same descriptor queue is used by SW to
post descriptors to the device and by the device to report completed
descriptors back to SW, whereas the split queue model separates them
into different queues for parallel processing and improved performance.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
drivers/net/intel/idpf/idpf_common_device.h | 3 +-
drivers/net/intel/idpf/idpf_common_rxtx.c | 9 +-
drivers/net/intel/idpf/idpf_common_rxtx.h | 3 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 242 ++++++++++++++++++
4 files changed, 255 insertions(+), 2 deletions(-)
diff --git a/drivers/net/intel/idpf/idpf_common_device.h b/drivers/net/intel/idpf/idpf_common_device.h
index 3b95d519c6..982849dafd 100644
--- a/drivers/net/intel/idpf/idpf_common_device.h
+++ b/drivers/net/intel/idpf/idpf_common_device.h
@@ -49,8 +49,9 @@ enum idpf_rx_func_type {
IDPF_RX_SINGLEQ,
IDPF_RX_SINGLEQ_SCATTERED,
IDPF_RX_SINGLEQ_AVX2,
+ IDPF_RX_AVX2,
IDPF_RX_AVX512,
- IDPF_RX_SINGLQ_AVX512,
+ IDPF_RX_SINGLEQ_AVX512,
IDPF_RX_MAX
};
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c
index a2b8c372d6..57753180a2 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.c
@@ -1656,6 +1656,13 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
.rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
.simd_width = RTE_VECT_SIMD_256,
.extra.single_queue = true}},
+ [IDPF_RX_AVX2] = {
+ .pkt_burst = idpf_dp_splitq_recv_pkts_avx2,
+ .info = "Split AVX2 Vector",
+ .features = {
+ .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
+ .simd_width = RTE_VECT_SIMD_256,
+ }},
#ifdef CC_AVX512_SUPPORT
[IDPF_RX_AVX512] = {
.pkt_burst = idpf_dp_splitq_recv_pkts_avx512,
@@ -1663,7 +1670,7 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
.features = {
.rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
.simd_width = RTE_VECT_SIMD_512}},
- [IDPF_RX_SINGLQ_AVX512] = {
+ [IDPF_RX_SINGLEQ_AVX512] = {
.pkt_burst = idpf_dp_singleq_recv_pkts_avx512,
.info = "Single AVX512 Vector",
.features = {
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 3bc3323af4..3a9af06c86 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -252,6 +252,9 @@ __rte_internal
uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index 21c8f79254..b00f85ce78 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -482,6 +482,248 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16
return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
}
+static __rte_always_inline void
+idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
+{
+ int i;
+ uint16_t rx_id;
+ volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
+ struct rte_mbuf **rxep = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
+
+ rxdp += rx_bufq->rxrearm_start;
+
+ /* Try to bulk allocate mbufs from mempool */
+ if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
+ rxep,
+ IDPF_RXQ_REARM_THRESH) < 0) {
+ if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >= rx_bufq->nb_rx_desc) {
+ __m128i zero_dma = _mm_setzero_si128();
+
+ for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+ rxep[i] = &rx_bufq->fake_mbuf;
+ _mm_storeu_si128((__m128i *)(uintptr_t)&rxdp[i], zero_dma);
+ }
+ }
+ rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
+ IDPF_RXQ_REARM_THRESH,
+ rte_memory_order_relaxed);
+ return;
+ }
+
+ __m128i headroom = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, RTE_PKTMBUF_HEADROOM);
+
+ for (i = 0; i < IDPF_RXQ_REARM_THRESH; i += 2, rxep += 2, rxdp += 2) {
+ struct rte_mbuf *mb0 = rxep[0];
+ struct rte_mbuf *mb1 = rxep[1];
+
+ __m128i buf_addr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+ __m128i buf_addr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+ __m128i dma_addr0 = _mm_unpackhi_epi64(buf_addr0, buf_addr0);
+ __m128i dma_addr1 = _mm_unpackhi_epi64(buf_addr1, buf_addr1);
+
+ dma_addr0 = _mm_add_epi64(dma_addr0, headroom);
+ dma_addr1 = _mm_add_epi64(dma_addr1, headroom);
+
+ rxdp[0].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr0);
+ rxdp[1].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr1);
+ }
+
+ rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+ if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
+ rx_bufq->rxrearm_start = 0;
+
+ rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
+
+ rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
+ (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
+
+ IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
+}
+
+static __rte_always_inline void
+idpf_splitq_rearm_avx2(struct idpf_rx_queue *rx_bufq)
+{
+ int i;
+ uint16_t rx_id;
+ volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
+ struct rte_mempool_cache *cache =
+ rte_mempool_default_cache(rx_bufq->mp, rte_lcore_id());
+ struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
+
+ rxdp += rx_bufq->rxrearm_start;
+
+ if (unlikely(!cache)) {
+ idpf_splitq_rearm_common(rx_bufq);
+ return;
+ }
+
+ if (cache->len < IDPF_RXQ_REARM_THRESH) {
+ uint32_t req = IDPF_RXQ_REARM_THRESH + (cache->size - cache->len);
+ int ret = rte_mempool_ops_dequeue_bulk(rx_bufq->mp,
+ &cache->objs[cache->len], req);
+ if (ret == 0) {
+ cache->len += req;
+ } else {
+ if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
+ rx_bufq->nb_rx_desc) {
+ __m128i dma_addr0 = _mm_setzero_si128();
+ for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+ rxp[i] = &rx_bufq->fake_mbuf;
+ _mm_storeu_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]),
+ dma_addr0);
+ }
+ }
+ rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
+ IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
+ return;
+ }
+ }
+ __m128i headroom = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, RTE_PKTMBUF_HEADROOM);
+ const int step = 2;
+
+ for (i = 0; i < IDPF_RXQ_REARM_THRESH; i += step, rxp += step, rxdp += step) {
+ struct rte_mbuf *mb0 = (struct rte_mbuf *)cache->objs[--cache->len];
+ struct rte_mbuf *mb1 = (struct rte_mbuf *)cache->objs[--cache->len];
+ rxp[0] = mb0;
+ rxp[1] = mb1;
+
+ __m128i buf_addr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+ __m128i buf_addr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+ __m128i dma_addr0 = _mm_unpackhi_epi64(buf_addr0, buf_addr0);
+ __m128i dma_addr1 = _mm_unpackhi_epi64(buf_addr1, buf_addr1);
+
+ dma_addr0 = _mm_add_epi64(dma_addr0, headroom);
+ dma_addr1 = _mm_add_epi64(dma_addr1, headroom);
+
+ rxdp[0].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr0);
+ rxdp[1].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr1);
+ }
+
+ rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+ if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
+ rx_bufq->rxrearm_start = 0;
+
+ rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
+
+ rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
+ (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
+
+ IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
+}
+uint16_t
+idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+ struct idpf_rx_queue *queue = (struct idpf_rx_queue *)rxq;
+ const uint32_t *ptype_tbl = queue->adapter->ptype_tbl;
+ struct rte_mbuf **sw_ring = &queue->bufq2->sw_ring[queue->rx_tail];
+ volatile union virtchnl2_rx_desc *rxdp =
+ (volatile union virtchnl2_rx_desc *)queue->rx_ring + queue->rx_tail;
+
+ rte_prefetch0(rxdp);
+ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, 4); /* 4 desc per AVX2 iteration */
+
+ if (queue->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
+ idpf_splitq_rearm_avx2(queue->bufq2);
+
+ uint64_t head_gen = rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+ if (((head_gen >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) != queue->expected_gen_id)
+ return 0;
+
+ const __m128i gen_mask =
+ _mm_set1_epi64x(((uint64_t)queue->expected_gen_id) << 46);
+
+ uint16_t received = 0;
+ for (uint16_t i = 0; i < nb_pkts; i += 4, rxdp += 4) {
+ /* Step 1: pull mbufs */
+ __m128i ptrs = _mm_loadu_si128((__m128i *)&sw_ring[i]);
+ _mm_storeu_si128((__m128i *)&rx_pkts[i], ptrs);
+
+ /* Step 2: load descriptors */
+ __m128i d0 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[0]));
+ rte_compiler_barrier();
+ __m128i d1 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[1]));
+ rte_compiler_barrier();
+ __m128i d2 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[2]));
+ rte_compiler_barrier();
+ __m128i d3 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[3]));
+
+ /* Step 3: shuffle out pkt_len, data_len, vlan, rss */
+ const __m256i shuf = _mm256_set_epi8(
+ /* descriptor 3 */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* descriptor 2 */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF
+ );
+ __m128i d01_lo = d0, d01_hi = d1;
+ __m128i d23_lo = d2, d23_hi = d3;
+
+ __m256i m23 = _mm256_shuffle_epi8(_mm256_set_m128i(d23_hi, d23_lo), shuf);
+ __m256i m01 = _mm256_shuffle_epi8(_mm256_set_m128i(d01_hi, d01_lo), shuf);
+
+ /* Step 4: extract ptypes */
+ const __m256i ptype_mask = _mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M);
+ __m256i pt23 = _mm256_and_si256(_mm256_set_m128i(d23_hi, d23_lo), ptype_mask);
+ __m256i pt01 = _mm256_and_si256(_mm256_set_m128i(d01_hi, d01_lo), ptype_mask);
+
+ uint16_t ptype2 = _mm256_extract_epi16(pt23, 1);
+ uint16_t ptype3 = _mm256_extract_epi16(pt23, 9);
+ uint16_t ptype0 = _mm256_extract_epi16(pt01, 1);
+ uint16_t ptype1 = _mm256_extract_epi16(pt01, 9);
+
+ m23 = _mm256_insert_epi32(m23, ptype_tbl[ptype3], 2);
+ m23 = _mm256_insert_epi32(m23, ptype_tbl[ptype2], 0);
+ m01 = _mm256_insert_epi32(m01, ptype_tbl[ptype1], 2);
+ m01 = _mm256_insert_epi32(m01, ptype_tbl[ptype0], 0);
+
+ /* Step 5: extract gen bits */
+ __m128i sts0 = _mm_srli_epi64(d0, 46);
+ __m128i sts1 = _mm_srli_epi64(d1, 46);
+ __m128i sts2 = _mm_srli_epi64(d2, 46);
+ __m128i sts3 = _mm_srli_epi64(d3, 46);
+
+ __m128i merged_lo = _mm_unpacklo_epi64(sts0, sts2);
+ __m128i merged_hi = _mm_unpacklo_epi64(sts1, sts3);
+ __m128i valid = _mm_and_si128(_mm_and_si128(merged_lo, merged_hi),
+ _mm_unpacklo_epi64(gen_mask, gen_mask));
+ __m128i cmp = _mm_cmpeq_epi64(valid, _mm_unpacklo_epi64(gen_mask, gen_mask));
+ int burst = _mm_movemask_pd(_mm_castsi128_pd(cmp));
+
+ /* Step 6: write rearm_data safely */
+ __m128i m01_lo = _mm256_castsi256_si128(m01);
+ __m128i m23_lo = _mm256_castsi256_si128(m23);
+
+ uint64_t tmp01[2], tmp23[2];
+ _mm_storeu_si128((__m128i *)tmp01, m01_lo);
+ _mm_storeu_si128((__m128i *)tmp23, m23_lo);
+ *(uint64_t *)&rx_pkts[i]->rearm_data = tmp01[0];
+ *(uint64_t *)&rx_pkts[i + 1]->rearm_data = tmp01[1];
+ *(uint64_t *)&rx_pkts[i + 2]->rearm_data = tmp23[0];
+ *(uint64_t *)&rx_pkts[i + 3]->rearm_data = tmp23[1];
+
+ received += burst;
+ if (burst != 4)
+ break;
+ }
+
+ queue->rx_tail += received;
+ if (received & 1) {
+ queue->rx_tail &= ~(uint16_t)1;
+ received--;
+ }
+ queue->rx_tail &= (queue->nb_rx_desc - 1);
+ queue->expected_gen_id ^= ((queue->rx_tail & queue->nb_rx_desc) != 0);
+ queue->bufq2->rxrearm_nb += received;
+
+ return received;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_recv_pkts_avx2)
+
static inline void
idpf_singleq_vtx1(volatile struct idpf_base_tx_desc *txdp,
struct rte_mbuf *pkt, uint64_t flags)
--
2.34.1
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH v3 2/2] net/idpf: enable AVX2 for split queue Tx
2025-09-26 8:54 ` [PATCH v3 0/2] enable AVX2 for split queue Rx/Tx Shaiq Wani
2025-09-26 8:54 ` [PATCH v3 1/2] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
@ 2025-09-26 8:54 ` Shaiq Wani
1 sibling, 0 replies; 10+ messages in thread
From: Shaiq Wani @ 2025-09-26 8:54 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
Some CPUs do not support AVX512, so enable AVX2 on them to get
better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and by the device to report completed
descriptors back to SW, whereas the split queue model separates them into
different queues for parallel processing and improved performance.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
drivers/net/intel/idpf/idpf_common_rxtx.h | 3 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 195 ++++++++++++++++++
drivers/net/intel/idpf/idpf_rxtx.c | 9 +
3 files changed, 207 insertions(+)
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 3a9af06c86..ef3199524a 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -262,6 +262,9 @@ uint16_t idpf_dp_singleq_recv_pkts_avx2(void *rx_queue,
struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue,
struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index b00f85ce78..a91051c7b9 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -882,3 +882,198 @@ idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
return nb_tx;
}
+
+static __rte_always_inline void
+idpf_splitq_scan_cq_ring(struct ci_tx_queue *cq)
+{
+ struct idpf_splitq_tx_compl_desc *compl_ring;
+ struct ci_tx_queue *txq;
+ uint16_t genid, txq_qid, cq_qid, i;
+ uint8_t ctype;
+
+ cq_qid = cq->tx_tail;
+
+ for (i = 0; i < IDPD_TXQ_SCAN_CQ_THRESH; i++) {
+ if (cq_qid == cq->nb_tx_desc) {
+ cq_qid = 0;
+ cq->expected_gen_id ^= 1; /* toggle generation bit */
+ }
+
+ compl_ring = &cq->compl_ring[cq_qid];
+
+ genid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_GEN_M) >> IDPF_TXD_COMPLQ_GEN_S;
+
+ if (genid != cq->expected_gen_id)
+ break;
+
+ ctype = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_COMPL_TYPE_M) >> IDPF_TXD_COMPLQ_COMPL_TYPE_S;
+
+ txq_qid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_QID_M) >> IDPF_TXD_COMPLQ_QID_S;
+
+ txq = cq->txqs[txq_qid - cq->tx_start_qid];
+ if (ctype == IDPF_TXD_COMPLT_RS)
+ txq->rs_compl_count++;
+
+ cq_qid++;
+ }
+
+ cq->tx_tail = cq_qid;
+}
+
+static __rte_always_inline void
+idpf_splitq_vtx1_avx2(struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf *pkt, uint64_t flags)
+{
+ uint64_t high_qw =
+ IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
+ ((uint64_t)flags) |
+ ((uint64_t)pkt->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m128i descriptor = _mm_set_epi64x(high_qw,
+ pkt->buf_iova + pkt->data_off);
+ _mm_storeu_si128((__m128i *)txdp, descriptor);
+}
+
+
+static inline void
+idpf_splitq_vtx_avx2(struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
+{
+ const uint64_t hi_qw_tmpl = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
+ ((uint64_t)flags);
+
+ /* align if needed */
+ if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++, pkt++, nb_pkts--;
+ }
+
+ for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
+ uint64_t hi_qw3 = hi_qw_tmpl |
+ ((uint64_t)pkt[3]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw2 = hi_qw_tmpl |
+ ((uint64_t)pkt[2]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw1 = hi_qw_tmpl |
+ ((uint64_t)pkt[1]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw0 = hi_qw_tmpl |
+ ((uint64_t)pkt[0]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m256i desc2_3 = _mm256_set_epi64x(hi_qw3,
+ pkt[3]->buf_iova + pkt[3]->data_off,
+ hi_qw2,
+ pkt[2]->buf_iova + pkt[2]->data_off);
+
+ __m256i desc0_1 = _mm256_set_epi64x(hi_qw1,
+ pkt[1]->buf_iova + pkt[1]->data_off,
+ hi_qw0,
+ pkt[0]->buf_iova + pkt[0]->data_off);
+
+ _mm256_storeu_si256((__m256i *)(txdp + 2), desc2_3);
+ _mm256_storeu_si256((__m256i *)txdp, desc0_1);
+ }
+
+ while (nb_pkts--) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++;
+ pkt++;
+ }
+}
+
+static inline uint16_t
+idpf_splitq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ struct idpf_flex_tx_sched_desc *txdp;
+ struct ci_tx_entry_vec *txep;
+ uint16_t n, nb_commit, tx_id;
+ uint64_t cmd_dtype = IDPF_TXD_FLEX_FLOW_CMD_EOP;
+
+ tx_id = txq->tx_tail;
+
+ /* restrict to max burst size */
+ nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+
+ /* make sure we have enough free space */
+ if (txq->nb_tx_free < txq->tx_free_thresh)
+ ci_tx_free_bufs_vec(txq, idpf_tx_desc_done, false);
+
+ nb_commit = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+ nb_pkts = nb_commit;
+ if (unlikely(nb_pkts == 0))
+ return 0;
+
+ txdp = (struct idpf_flex_tx_sched_desc *)&txq->desc_ring[tx_id];
+ txep = (void *)txq->sw_ring;
+ txep += tx_id;
+
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+ n = (uint16_t)(txq->nb_tx_desc - tx_id);
+ if (nb_commit >= n) {
+ ci_tx_backlog_entry_vec(txep, tx_pkts, n);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, n - 1, cmd_dtype);
+ tx_pkts += (n - 1);
+ txdp += (n - 1);
+
+ idpf_splitq_vtx1_avx2(txdp, *tx_pkts++, cmd_dtype);
+
+ nb_commit = (uint16_t)(nb_commit - n);
+
+ tx_id = 0;
+ txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+
+ txdp = &txq->desc_ring[tx_id];
+ txep = (void *)txq->sw_ring;
+ txep += tx_id;
+ }
+
+ ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, nb_commit, cmd_dtype);
+
+ tx_id = (uint16_t)(tx_id + nb_commit);
+ if (tx_id > txq->tx_next_rs)
+ txq->tx_next_rs =
+ (uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
+
+ txq->tx_tail = tx_id;
+
+ IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+
+ return nb_pkts;
+}
+
+uint16_t
+idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ uint16_t nb_tx = 0;
+
+ while (nb_pkts) {
+ uint16_t ret, num;
+ idpf_splitq_scan_cq_ring(txq->complq);
+
+ if (txq->rs_compl_count > txq->tx_free_thresh) {
+ ci_tx_free_bufs_vec(txq, idpf_tx_desc_done, false);
+ txq->rs_compl_count -= txq->tx_rs_thresh;
+ }
+
+ num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+ ret = idpf_splitq_xmit_fixed_burst_vec_avx2(tx_queue,
+ &tx_pkts[nb_tx], num);
+ nb_tx += ret;
+ nb_pkts -= ret;
+ if (ret < num)
+ break;
+ }
+
+ return nb_tx;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_xmit_pkts_avx2)
diff --git a/drivers/net/intel/idpf/idpf_rxtx.c b/drivers/net/intel/idpf/idpf_rxtx.c
index 1c725065df..6950fabb49 100644
--- a/drivers/net/intel/idpf/idpf_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_rxtx.c
@@ -850,6 +850,15 @@ idpf_set_tx_function(struct rte_eth_dev *dev)
return;
}
#endif /* CC_AVX512_SUPPORT */
+ if (tx_simd_width == RTE_VECT_SIMD_256) {
+ PMD_DRV_LOG(NOTICE,
+ "Using Split AVX2 Vector Tx (port %d).",
+ dev->data->port_id);
+ dev->tx_pkt_burst = idpf_dp_splitq_xmit_pkts_avx2;
+ dev->tx_pkt_prepare = idpf_dp_prep_pkts;
+ return;
+ }
+
}
PMD_DRV_LOG(NOTICE,
"Using Split Scalar Tx (port %d).",
--
2.34.1
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v3 1/2] net/idpf: enable AVX2 for split queue Rx
2025-09-26 8:54 ` [PATCH v3 1/2] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
@ 2025-09-26 11:40 ` Bruce Richardson
2025-09-26 13:09 ` Burakov, Anatoly
1 sibling, 0 replies; 10+ messages in thread
From: Bruce Richardson @ 2025-09-26 11:40 UTC (permalink / raw)
To: Shaiq Wani; +Cc: dev, aman.deep.singh
On Fri, Sep 26, 2025 at 02:24:03PM +0530, Shaiq Wani wrote:
> In case some CPUs don't support AVX512. Enable AVX2 for them to
> get better per-core performance.
>
> In the single queue model, the same descriptor queue is used by SW
> to post descriptors to the device and used by device to report completed
> descriptors to SW. While as the split queue model separates them into
> different queues for parallel processing and improved performance.
>
> Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
Hi Shaiq,
more review comments inline below.
/Bruce
> ---
> drivers/net/intel/idpf/idpf_common_device.h | 3 +-
> drivers/net/intel/idpf/idpf_common_rxtx.c | 9 +-
> drivers/net/intel/idpf/idpf_common_rxtx.h | 3 +
> .../net/intel/idpf/idpf_common_rxtx_avx2.c | 242 ++++++++++++++++++
> 4 files changed, 255 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/intel/idpf/idpf_common_device.h b/drivers/net/intel/idpf/idpf_common_device.h
> index 3b95d519c6..982849dafd 100644
> --- a/drivers/net/intel/idpf/idpf_common_device.h
> +++ b/drivers/net/intel/idpf/idpf_common_device.h
> @@ -49,8 +49,9 @@ enum idpf_rx_func_type {
> IDPF_RX_SINGLEQ,
> IDPF_RX_SINGLEQ_SCATTERED,
> IDPF_RX_SINGLEQ_AVX2,
> + IDPF_RX_AVX2,
> IDPF_RX_AVX512,
> - IDPF_RX_SINGLQ_AVX512,
> + IDPF_RX_SINGLEQ_AVX512,
> IDPF_RX_MAX
> };
>
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c
> index a2b8c372d6..57753180a2 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx.c
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx.c
> @@ -1656,6 +1656,13 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
> .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
> .simd_width = RTE_VECT_SIMD_256,
> .extra.single_queue = true}},
> + [IDPF_RX_AVX2] = {
> + .pkt_burst = idpf_dp_splitq_recv_pkts_avx2,
> + .info = "Split AVX2 Vector",
> + .features = {
> + .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
> + .simd_width = RTE_VECT_SIMD_256,
> + }},
> #ifdef CC_AVX512_SUPPORT
> [IDPF_RX_AVX512] = {
> .pkt_burst = idpf_dp_splitq_recv_pkts_avx512,
> @@ -1663,7 +1670,7 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
> .features = {
> .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
> .simd_width = RTE_VECT_SIMD_512}},
> - [IDPF_RX_SINGLQ_AVX512] = {
> + [IDPF_RX_SINGLEQ_AVX512] = {
This renaming is good, but it should really be in a separate patch as it's not
part of the AVX2 changes. Can you put it in a new small patch 1 in this
set?
> .pkt_burst = idpf_dp_singleq_recv_pkts_avx512,
> .info = "Single AVX512 Vector",
> .features = {
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
> index 3bc3323af4..3a9af06c86 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx.h
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
> @@ -252,6 +252,9 @@ __rte_internal
> uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
> uint16_t nb_pkts);
> __rte_internal
> +uint16_t idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts,
> + uint16_t nb_pkts);
> +__rte_internal
> uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
> uint16_t nb_pkts);
> __rte_internal
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
> index 21c8f79254..b00f85ce78 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
> @@ -482,6 +482,248 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16
> return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
> }
>
> +static __rte_always_inline void
> +idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
> +{
> + int i;
> + uint16_t rx_id;
> + volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
> + struct rte_mbuf **rxep = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
> +
> + rxdp += rx_bufq->rxrearm_start;
> +
> + /* Try to bulk allocate mbufs from mempool */
> + if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
> + rxep,
> + IDPF_RXQ_REARM_THRESH) < 0) {
> + if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >= rx_bufq->nb_rx_desc) {
> + __m128i zero_dma = _mm_setzero_si128();
> +
> + for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
> + rxep[i] = &rx_bufq->fake_mbuf;
> + _mm_storeu_si128((__m128i *)(uintptr_t)&rxdp[i], zero_dma);
> + }
> + }
> + rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
> + IDPF_RXQ_REARM_THRESH,
> + rte_memory_order_relaxed);
> + return;
> + }
> +
> + __m128i headroom = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, RTE_PKTMBUF_HEADROOM);
> +
> + for (i = 0; i < IDPF_RXQ_REARM_THRESH; i += 2, rxep += 2, rxdp += 2) {
> + struct rte_mbuf *mb0 = rxep[0];
> + struct rte_mbuf *mb1 = rxep[1];
> +
> + __m128i buf_addr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
> + __m128i buf_addr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
> +
> + __m128i dma_addr0 = _mm_unpackhi_epi64(buf_addr0, buf_addr0);
> + __m128i dma_addr1 = _mm_unpackhi_epi64(buf_addr1, buf_addr1);
> +
> + dma_addr0 = _mm_add_epi64(dma_addr0, headroom);
> + dma_addr1 = _mm_add_epi64(dma_addr1, headroom);
> +
> + rxdp[0].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr0);
> + rxdp[1].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr1);
> + }
> +
> + rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
> + if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
> + rx_bufq->rxrearm_start = 0;
> +
> + rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
> +
> + rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
> + (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
> +
> + IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
> +}
Missed this on last review.
This code is almost, almost identical to the function with the exact same
name in idpf_common_rxtx_avx512.c - and the differences don't seem to be
due to avx2/avx512. Rather than duplicating code, put this in a common
location and use it from both avx2 and avx512 files.
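As a sketch of what that could look like (header name and placement below are
only suggestions, not something from this patch):

	/* shared by idpf_common_rxtx_avx2.c and idpf_common_rxtx_avx512.c,
	 * e.g. via a hypothetical idpf_common_rxtx_vec_common.h
	 */
	void idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq);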
> +
> +static __rte_always_inline void
> +idpf_splitq_rearm_avx2(struct idpf_rx_queue *rx_bufq)
> +{
> + int i;
> + uint16_t rx_id;
> + volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
> + struct rte_mempool_cache *cache =
> + rte_mempool_default_cache(rx_bufq->mp, rte_lcore_id());
> + struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
> +
> + rxdp += rx_bufq->rxrearm_start;
> +
> + if (unlikely(!cache)) {
> + idpf_splitq_rearm_common(rx_bufq);
> + return;
> + }
> +
> + if (cache->len < IDPF_RXQ_REARM_THRESH) {
> + uint32_t req = IDPF_RXQ_REARM_THRESH + (cache->size - cache->len);
> + int ret = rte_mempool_ops_dequeue_bulk(rx_bufq->mp,
> + &cache->objs[cache->len], req);
> + if (ret == 0) {
> + cache->len += req;
> + } else {
> + if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
> + rx_bufq->nb_rx_desc) {
> + __m128i dma_addr0 = _mm_setzero_si128();
> + for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
> + rxp[i] = &rx_bufq->fake_mbuf;
> + _mm_storeu_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]),
> + dma_addr0);
> + }
> + }
> + rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
> + IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
> + return;
> + }
> + }
> + __m128i headroom = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, RTE_PKTMBUF_HEADROOM);
> + const int step = 2;
> +
> + for (i = 0; i < IDPF_RXQ_REARM_THRESH; i += step, rxp += step, rxdp += step) {
> + struct rte_mbuf *mb0 = (struct rte_mbuf *)cache->objs[--cache->len];
> + struct rte_mbuf *mb1 = (struct rte_mbuf *)cache->objs[--cache->len];
> + rxp[0] = mb0;
> + rxp[1] = mb1;
> +
> + __m128i buf_addr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
> + __m128i buf_addr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
> +
> + __m128i dma_addr0 = _mm_unpackhi_epi64(buf_addr0, buf_addr0);
> + __m128i dma_addr1 = _mm_unpackhi_epi64(buf_addr1, buf_addr1);
> +
> + dma_addr0 = _mm_add_epi64(dma_addr0, headroom);
> + dma_addr1 = _mm_add_epi64(dma_addr1, headroom);
> +
> + rxdp[0].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr0);
> + rxdp[1].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr1);
> + }
> +
And this code is very much the same as the "common" function above; in fact
the main block looks copy-pasted. Please rework to cut down on the duplication.
How much perf benefit do we get from this avx2-specific function vs the more
generic "common" one above?
> + rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
> + if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
> + rx_bufq->rxrearm_start = 0;
> +
> + rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
> +
> + rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
> + (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
> +
> + IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
> +}
> +uint16_t
> +idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
> +{
> + struct idpf_rx_queue *queue = (struct idpf_rx_queue *)rxq;
> + const uint32_t *ptype_tbl = queue->adapter->ptype_tbl;
> + struct rte_mbuf **sw_ring = &queue->bufq2->sw_ring[queue->rx_tail];
> + volatile union virtchnl2_rx_desc *rxdp =
> + (volatile union virtchnl2_rx_desc *)queue->rx_ring + queue->rx_tail;
> +
> + rte_prefetch0(rxdp);
> + nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, 4); /* 4 desc per AVX2 iteration */
> +
> + if (queue->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
> + idpf_splitq_rearm_avx2(queue->bufq2);
> +
> + uint64_t head_gen = rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id;
> + if (((head_gen >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
> + VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) != queue->expected_gen_id)
> + return 0;
> +
> + const __m128i gen_mask =
> + _mm_set1_epi64x(((uint64_t)queue->expected_gen_id) << 46);
> +
> + uint16_t received = 0;
> + for (uint16_t i = 0; i < nb_pkts; i += 4, rxdp += 4) {
> + /* Step 1: pull mbufs */
> + __m128i ptrs = _mm_loadu_si128((__m128i *)&sw_ring[i]);
> + _mm_storeu_si128((__m128i *)&rx_pkts[i], ptrs);
> +
How does this work on 64-bit? An SSE load/store is 16 bytes, which is only
2 pointers on 64-bit (4 on 32-bit). Am I missing somewhere where you
load/store the other two pointers per iteration?
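If four descriptors per iteration really is the intent, a minimal sketch of
the 64-bit pointer copy (untested, just to illustrate) would be:

	/* four mbuf pointers are 32 bytes on 64-bit, so one 256-bit
	 * load/store per iteration covers them all
	 */
	__m256i ptrs = _mm256_loadu_si256((const __m256i *)&sw_ring[i]);
	_mm256_storeu_si256((__m256i *)&rx_pkts[i], ptrs);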
> + /* Step 2: load descriptors */
> + __m128i d0 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[0]));
> + rte_compiler_barrier();
> + __m128i d1 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[1]));
> + rte_compiler_barrier();
> + __m128i d2 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[2]));
> + rte_compiler_barrier();
> + __m128i d3 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[3]));
> +
> + /* Step 3: shuffle out pkt_len, data_len, vlan, rss */
> + const __m256i shuf = _mm256_set_epi8(
> + /* descriptor 3 */
> + 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
> + 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF,
> + /* descriptor 2 */
By descriptor 3 and descriptor 2 do you maybe mean descriptors 1 and 0?
> + 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
> + 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF
> + );
> + __m128i d01_lo = d0, d01_hi = d1;
> + __m128i d23_lo = d2, d23_hi = d3;
These variable assignments seem rather pointless.
> +
> + __m256i m23 = _mm256_shuffle_epi8(_mm256_set_m128i(d23_hi, d23_lo), shuf);
> + __m256i m01 = _mm256_shuffle_epi8(_mm256_set_m128i(d01_hi, d01_lo), shuf);
> +
> + /* Step 4: extract ptypes */
> + const __m256i ptype_mask = _mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M);
> + __m256i pt23 = _mm256_and_si256(_mm256_set_m128i(d23_hi, d23_lo), ptype_mask);
> + __m256i pt01 = _mm256_and_si256(_mm256_set_m128i(d01_hi, d01_lo), ptype_mask);
I imagine the compiler is smart enough to realise this and optimize it away,
but you are still merging the descriptor pairs twice here, once with the
shuffle and a second time when doing the masking. Rather than renaming the
variables as hi and lo 128-bit values, why not merge them into 256-bit
values there?
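i.e. roughly (untested sketch, reusing the names from the patch):

	__m256i d01 = _mm256_set_m128i(d1, d0);
	__m256i d23 = _mm256_set_m128i(d3, d2);

	__m256i m01 = _mm256_shuffle_epi8(d01, shuf);
	__m256i m23 = _mm256_shuffle_epi8(d23, shuf);

	__m256i pt01 = _mm256_and_si256(d01, ptype_mask);
	__m256i pt23 = _mm256_and_si256(d23, ptype_mask);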
> +
> + uint16_t ptype2 = _mm256_extract_epi16(pt23, 1);
> + uint16_t ptype3 = _mm256_extract_epi16(pt23, 9);
> + uint16_t ptype0 = _mm256_extract_epi16(pt01, 1);
> + uint16_t ptype1 = _mm256_extract_epi16(pt01, 9);
> +
> + m23 = _mm256_insert_epi32(m23, ptype_tbl[ptype3], 2);
> + m23 = _mm256_insert_epi32(m23, ptype_tbl[ptype2], 0);
> + m01 = _mm256_insert_epi32(m01, ptype_tbl[ptype1], 2);
> + m01 = _mm256_insert_epi32(m01, ptype_tbl[ptype0], 0);
> +
> + /* Step 5: extract gen bits */
> + __m128i sts0 = _mm_srli_epi64(d0, 46);
> + __m128i sts1 = _mm_srli_epi64(d1, 46);
> + __m128i sts2 = _mm_srli_epi64(d2, 46);
> + __m128i sts3 = _mm_srli_epi64(d3, 46);
> +
> + __m128i merged_lo = _mm_unpacklo_epi64(sts0, sts2);
> + __m128i merged_hi = _mm_unpacklo_epi64(sts1, sts3);
> + __m128i valid = _mm_and_si128(_mm_and_si128(merged_lo, merged_hi),
> + _mm_unpacklo_epi64(gen_mask, gen_mask));
> + __m128i cmp = _mm_cmpeq_epi64(valid, _mm_unpacklo_epi64(gen_mask, gen_mask));
> + int burst = _mm_movemask_pd(_mm_castsi128_pd(cmp));
> +
> + /* Step 6: write rearm_data safely */
> + __m128i m01_lo = _mm256_castsi256_si128(m01);
> + __m128i m23_lo = _mm256_castsi256_si128(m23);
> +
> + uint64_t tmp01[2], tmp23[2];
> + _mm_storeu_si128((__m128i *)tmp01, m01_lo);
> + _mm_storeu_si128((__m128i *)tmp23, m23_lo);
> + *(uint64_t *)&rx_pkts[i]->rearm_data = tmp01[0];
> + *(uint64_t *)&rx_pkts[i + 1]->rearm_data = tmp01[1];
> + *(uint64_t *)&rx_pkts[i + 2]->rearm_data = tmp23[0];
> + *(uint64_t *)&rx_pkts[i + 3]->rearm_data = tmp23[1];
Doing additional stores tends to be bad for performance. Extract the data
and do proper stores instead.
However, I only see 64 bits being written to each mbuf here, covering the
data_off, ref_cnt, nb_segs and port fields, which can all be set to
constant values read from the per-queue or per-port data. The "ice" driver
writes to the rearm data in the avx2 path because it's doing a 256-bit
store covering the rearm data, the flags and the descriptor metadata. I
think here you are writing the descriptor metadata to the rearm data
instead. Please check this.
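For illustration only - a sketch assuming the queue keeps a pre-built
mbuf_initializer the way the AVX512 split-queue path does:

	/* the rearm words are per-queue constants, so write them from a
	 * pre-built template rather than from descriptor bytes...
	 */
	*(uint64_t *)&rx_pkts[i]->rearm_data = queue->mbuf_initializer;
	*(uint64_t *)&rx_pkts[i + 1]->rearm_data = queue->mbuf_initializer;
	/* ...while the pkt_len/data_len/vlan/rss words shuffled out of the
	 * descriptors belong at rx_descriptor_fields1, not rearm_data
	 */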
> +
> + received += burst;
> + if (burst != 4)
> + break;
> + }
> +
> + queue->rx_tail += received;
> + if (received & 1) {
> + queue->rx_tail &= ~(uint16_t)1;
> + received--;
> + }
> + queue->rx_tail &= (queue->nb_rx_desc - 1);
> + queue->expected_gen_id ^= ((queue->rx_tail & queue->nb_rx_desc) != 0);
> + queue->bufq2->rxrearm_nb += received;
> +
> + return received;
> +}
> +
> +RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_recv_pkts_avx2)
> +
> static inline void
> idpf_singleq_vtx1(volatile struct idpf_base_tx_desc *txdp,
> struct rte_mbuf *pkt, uint64_t flags)
> --
> 2.34.1
>
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v3 1/2] net/idpf: enable AVX2 for split queue Rx
2025-09-26 8:54 ` [PATCH v3 1/2] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
2025-09-26 11:40 ` Bruce Richardson
@ 2025-09-26 13:09 ` Burakov, Anatoly
1 sibling, 0 replies; 10+ messages in thread
From: Burakov, Anatoly @ 2025-09-26 13:09 UTC (permalink / raw)
To: Shaiq Wani, dev, bruce.richardson, aman.deep.singh
On 9/26/2025 10:54 AM, Shaiq Wani wrote:
> In case some CPUs don't support AVX512. Enable AVX2 for them to
> get better per-core performance.
>
> In the single queue model, the same descriptor queue is used by SW
> to post descriptors to the device and used by device to report completed
> descriptors to SW. While as the split queue model separates them into
> different queues for parallel processing and improved performance.
>
> Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
> ---
Hi Shaiq,
Bruce has already provided some feedback, so I'm only going to touch on
things that haven't been covered yet.
<snip>
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c
> index a2b8c372d6..57753180a2 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx.c
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx.c
> @@ -1656,6 +1656,13 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
> .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
> .simd_width = RTE_VECT_SIMD_256,
> .extra.single_queue = true}},
> + [IDPF_RX_AVX2] = {
> + .pkt_burst = idpf_dp_splitq_recv_pkts_avx2,
> + .info = "Split AVX2 Vector",
> + .features = {
> + .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
> + .simd_width = RTE_VECT_SIMD_256,
> + }},
The indentation is different from surrounding code.
> #ifdef CC_AVX512_SUPPORT
> [IDPF_RX_AVX512] = {
> .pkt_burst = idpf_dp_splitq_recv_pkts_avx512,
> @@ -1663,7 +1670,7 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
> .features = {
> .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
> .simd_width = RTE_VECT_SIMD_512}},
> - [IDPF_RX_SINGLQ_AVX512] = {
> + [IDPF_RX_SINGLEQ_AVX512] = {
> .pkt_burst = idpf_dp_singleq_recv_pkts_avx512,
> .info = "Single AVX512 Vector",
> .features = {
<snip>
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
> index 3bc3323af4..3a9af06c86 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx.h
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
> @@ -252,6 +252,9 @@ __rte_internal
> uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
> uint16_t nb_pkts);
> __rte_internal
> +uint16_t idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts,
> + uint16_t nb_pkts);
> +__rte_internal
> uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
> uint16_t nb_pkts);
> __rte_internal
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
> index 21c8f79254..b00f85ce78 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
> @@ -482,6 +482,248 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16
> return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
> }
>
> +static __rte_always_inline void
> +idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
> +{
> + int i;
> + uint16_t rx_id;
> + volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
> + struct rte_mbuf **rxep = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
> +
> + rxdp += rx_bufq->rxrearm_start;
> +
> + /* Try to bulk allocate mbufs from mempool */
> + if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
> + rxep,
> + IDPF_RXQ_REARM_THRESH) < 0) {
> + if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >= rx_bufq->nb_rx_desc) {
> + __m128i zero_dma = _mm_setzero_si128();
> +
> + for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
> + rxep[i] = &rx_bufq->fake_mbuf;
> + _mm_storeu_si128((__m128i *)(uintptr_t)&rxdp[i], zero_dma);
> + }
> + }
> + rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
> + IDPF_RXQ_REARM_THRESH,
> + rte_memory_order_relaxed);
> + return;
> + }
> +
> + __m128i headroom = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, RTE_PKTMBUF_HEADROOM);
> +
> + for (i = 0; i < IDPF_RXQ_REARM_THRESH; i += 2, rxep += 2, rxdp += 2) {
> + struct rte_mbuf *mb0 = rxep[0];
> + struct rte_mbuf *mb1 = rxep[1];
> +
> + __m128i buf_addr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
> + __m128i buf_addr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
> +
> + __m128i dma_addr0 = _mm_unpackhi_epi64(buf_addr0, buf_addr0);
> + __m128i dma_addr1 = _mm_unpackhi_epi64(buf_addr1, buf_addr1);
> +
> + dma_addr0 = _mm_add_epi64(dma_addr0, headroom);
> + dma_addr1 = _mm_add_epi64(dma_addr1, headroom);
> +
> + rxdp[0].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr0);
> + rxdp[1].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr1);
> + }
> +
> + rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
> + if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
> + rx_bufq->rxrearm_start = 0;
> +
> + rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
> +
> + rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
> + (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
> +
> + IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
All of this code looks really familiar. Perhaps porting IDPF to use the
'common' infrastructure should have been a prerequisite for this work,
because I believe we have pretty much identical Rx rearm code there
already (see net/intel/common/rx_vec_x86.h - there are already scalar as
well as SSE/AVX2/AVX512 implementations of rearm that are 32- and
64-bit compatible). Perhaps the only problem might be that they use a
"common descriptor format" rather than the virtchnl2 format used by
IDPF, but as far as I can tell they're pretty much identical?
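If those helpers can be reused, the rearm side of the AVX2 path would
shrink to something like this (sketch; function and enum names as I
recall them from the other Intel drivers, and it assumes the bufq can
first be expressed as a ci_rx_queue):

	/* in place of a private idpf_splitq_rearm_avx2() copy */
	ci_rxq_rearm(rx_bufq, CI_RX_VEC_LEVEL_AVX2);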
I realize it's a lot more work than this patch, but I really don't think
adding things we know we will eventually deduplicate is a good practice.
--
Thanks,
Anatoly
^ permalink raw reply [flat|nested] 10+ messages in thread