DPDK patches and discussions
 help / color / mirror / Atom feed
* [PATCH 0/2] enable AVX2 for IDPF single queue
@ 2025-01-08 12:17 Shaiq Wani
  2025-01-08 12:17 ` [PATCH 1/2] common/idpf: enable AVX2 for single queue Rx Shaiq Wani
  2025-01-08 12:17 ` [PATCH 2/2] common/idpf: enable AVX2 for single queue Tx Shaiq Wani
  0 siblings, 2 replies; 6+ messages in thread
From: Shaiq Wani @ 2025-01-08 12:17 UTC (permalink / raw)
  To: dev, bruce.richardson, aman.deep.singh

Some CPUs do not support AVX512. Enable AVX2 on the IDPF single queue
Rx and Tx paths so that those CPUs still get better per-core performance.

Shaiq Wani (2):
  common/idpf: enable AVX2 for single queue Rx
  common/idpf: enable AVX2 for single queue Tx

 doc/guides/rel_notes/release_25_03.rst      |   3 +
 drivers/common/idpf/idpf_common_device.h    |   2 +
 drivers/common/idpf/idpf_common_rxtx.h      |   8 +
 drivers/common/idpf/idpf_common_rxtx_avx2.c | 815 ++++++++++++++++++++
 drivers/common/idpf/meson.build             |  15 +
 drivers/common/idpf/version.map             |   2 +
 drivers/net/idpf/idpf_rxtx.c                |  26 +
 7 files changed, 871 insertions(+)
 create mode 100644 drivers/common/idpf/idpf_common_rxtx_avx2.c

-- 
2.34.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 1/2] common/idpf: enable AVX2 for single queue Rx
  2025-01-08 12:17 [PATCH 0/2] enable AVX2 for IDPF single queue Shaiq Wani
@ 2025-01-08 12:17 ` Shaiq Wani
  2025-01-20 14:15   ` Bruce Richardson
  2025-01-08 12:17 ` [PATCH 2/2] common/idpf: enable AVX2 for single queue Tx Shaiq Wani
  1 sibling, 1 reply; 6+ messages in thread
From: Shaiq Wani @ 2025-01-08 12:17 UTC (permalink / raw)
  To: dev, bruce.richardson, aman.deep.singh

Some CPUs do not support AVX512. Enable an AVX2 single queue Rx path
so that those CPUs still get better per-core performance.

Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
 drivers/common/idpf/idpf_common_device.h    |   1 +
 drivers/common/idpf/idpf_common_rxtx.h      |   4 +
 drivers/common/idpf/idpf_common_rxtx_avx2.c | 590 ++++++++++++++++++++
 drivers/common/idpf/meson.build             |  15 +
 drivers/common/idpf/version.map             |   1 +
 drivers/net/idpf/idpf_rxtx.c                |  12 +
 6 files changed, 623 insertions(+)
 create mode 100644 drivers/common/idpf/idpf_common_rxtx_avx2.c

diff --git a/drivers/common/idpf/idpf_common_device.h b/drivers/common/idpf/idpf_common_device.h
index bfa927a5ff..734be1c88a 100644
--- a/drivers/common/idpf/idpf_common_device.h
+++ b/drivers/common/idpf/idpf_common_device.h
@@ -123,6 +123,7 @@ struct idpf_vport {
 
 	bool rx_vec_allowed;
 	bool tx_vec_allowed;
+	bool rx_use_avx2;
 	bool rx_use_avx512;
 	bool tx_use_avx512;
 
diff --git a/drivers/common/idpf/idpf_common_rxtx.h b/drivers/common/idpf/idpf_common_rxtx.h
index eeeeed12e2..f50cf5ef46 100644
--- a/drivers/common/idpf/idpf_common_rxtx.h
+++ b/drivers/common/idpf/idpf_common_rxtx.h
@@ -302,5 +302,9 @@ uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pk
 __rte_internal
 uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 			  uint16_t nb_pkts);
+__rte_internal
+uint16_t idpf_dp_singleq_recv_pkts_avx2(void *rx_queue,
+					struct rte_mbuf **rx_pkts,
+					uint16_t nb_pkts);
 
 #endif /* _IDPF_COMMON_RXTX_H_ */
diff --git a/drivers/common/idpf/idpf_common_rxtx_avx2.c b/drivers/common/idpf/idpf_common_rxtx_avx2.c
new file mode 100644
index 0000000000..a05b26c68a
--- /dev/null
+++ b/drivers/common/idpf/idpf_common_rxtx_avx2.c
@@ -0,0 +1,590 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <rte_vect.h>
+
+#include "idpf_common_rxtx.h"
+#include "idpf_common_device.h"
+
+#ifndef __INTEL_COMPILER
+#pragma GCC diagnostic ignored "-Wcast-qual"
+#endif
+
+static __rte_always_inline void
+idpf_singleq_rx_rearm(struct idpf_rx_queue *rxq)
+{
+	int i;
+	uint16_t rx_id;
+	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
+	struct rte_mbuf **rxep = &rxq->sw_ring[rxq->rxrearm_start];
+
+	rxdp += rxq->rxrearm_start;
+
+	/* Rearm: pull IDPF_RXQ_REARM_THRESH fresh mbufs into the software ring */
+	if (rte_mempool_get_bulk(rxq->mp,
+				 (void *)rxep,
+				 IDPF_RXQ_REARM_THRESH) < 0) {
+		if (rxq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
+		    rxq->nb_rx_desc) { /* pending rearm count is within one threshold of the ring size */
+			__m128i dma_addr0;
+
+			dma_addr0 = _mm_setzero_si128();
+			for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+				rxep[i] = &rxq->fake_mbuf; /* point the entry at the queue's placeholder mbuf */
+				_mm_store_si128((__m128i *)&rxdp[i].read,
+						dma_addr0); /* zero the descriptor */
+			}
+		}
+		rte_atomic_fetch_add_explicit(&rxq->rx_stats.mbuf_alloc_failed,
+				   IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
+
+		return; /* allocation failed: rxrearm_start, rxrearm_nb and the tail are left untouched */
+	}
+
+	struct rte_mbuf *mb0, *mb1;
+	__m128i dma_addr0, dma_addr1;
+	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
+			RTE_PKTMBUF_HEADROOM);
+	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
+	for (i = 0; i < IDPF_RXQ_REARM_THRESH; i += 2, rxep += 2) {
+		__m128i vaddr0, vaddr1;
+
+		mb0 = rxep[0];
+		mb1 = rxep[1];
+
+		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+				offsetof(struct rte_mbuf, buf_addr) + 8);
+		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+		/* duplicate buf_iova (hi 64bit) into both 64-bit halves */
+		dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+		dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
+
+		/* add the mbuf headroom to both copies of the IOVA */
+		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
+
+		/* write the resulting DMA addresses into the read descriptors */
+		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
+	}
+
+	rxq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
+		rxq->rxrearm_start = 0;
+
+	rxq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
+
+	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
+			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
+
+	/* Update the tail pointer on the NIC to the last rearmed descriptor */
+	IDPF_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+}
+
+static inline uint16_t
+_idpf_singleq_recv_raw_pkts_vec_avx2(struct idpf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
+				     uint16_t nb_pkts, uint8_t *split_packet)
+{
+#define IDPF_DESCS_PER_LOOP_AVX 8 /* descriptors processed per loop iteration */
+
+	const uint32_t *ptype_tbl = rxq->adapter->ptype_tbl;
+	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
+			0, rxq->mbuf_initializer);
+	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
+	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
+	const int avx_aligned = ((rxq->rx_tail & 1) == 0);
+
+	rxdp += rxq->rx_tail;
+
+	rte_prefetch0(rxdp);
+
+	/* nb_pkts has to be floor-aligned to IDPF_DESCS_PER_LOOP_AVX */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IDPF_DESCS_PER_LOOP_AVX);
+
+	/* See if we need to rearm the RX queue - gives the prefetch a bit
+	 * of time to act
+	 */
+	if (rxq->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
+		idpf_singleq_rx_rearm(rxq);
+
+	/* Before we start moving massive data around, check to see if
+	 * there is actually a packet available
+	 */
+	if (!(rxdp->flex_nic_wb.status_error0 &
+			rte_cpu_to_le_32(1 << VIRTCHNL2_RX_FLEX_DESC_STATUS0_DD_S)))
+		return 0;
+
+	/* 8 packets DD mask, LSB in each 32-bit value */
+	const __m256i dd_check = _mm256_set1_epi32(1);
+
+	/* 8 packets EOP mask, second-LSB in each 32-bit value */
+	const __m256i eop_check = _mm256_slli_epi32(dd_check,
+			VIRTCHNL2_RX_FLEX_DESC_STATUS0_EOF_S);
+
+	/* mask to shuffle from desc. to mbuf (2 descriptors) */
+	const __m256i shuf_msk =
+		_mm256_set_epi8
+			(/* first descriptor */
+			 0xFF, 0xFF,
+			 0xFF, 0xFF,	/* rss hash parsed separately */
+			 11, 10,	/* octet 10~11, 16 bits vlan_macip */
+			 5, 4,		/* octet 4~5, 16 bits data_len */
+			 0xFF, 0xFF,	/* skip hi 16 bits pkt_len, zero out */
+			 5, 4,		/* octet 4~5, 16 bits pkt_len */
+			 0xFF, 0xFF,	/* pkt_type set as unknown */
+			 0xFF, 0xFF,	/* pkt_type set as unknown */
+			 /* second descriptor */
+			 0xFF, 0xFF,
+			 0xFF, 0xFF,	/* rss hash parsed separately */
+			 11, 10,	/* octet 10~11, 16 bits vlan_macip */
+			 5, 4,		/* octet 4~5, 16 bits data_len */
+			 0xFF, 0xFF,	/* skip hi 16 bits pkt_len, zero out */
+			 5, 4,		/* octet 4~5, 16 bits pkt_len */
+			 0xFF, 0xFF,	/* pkt_type set as unknown */
+			 0xFF, 0xFF	/* pkt_type set as unknown */
+			);
+	/**
+	 * compile-time check that the shuffle layout above matches the mbuf.
+	 * NOTE: the first field (lowest address) is given last in set_epi
+	 * calls above.
+	 */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
+
+	/* Status/Error flag masks */
+	/**
+	 * mask everything except Checksum Reports, RSS indication
+	 * and VLAN indication.
+	 * bit7:4 for checksum errors (index into l3_l4_flags_shuf).
+	 * bit12 is for RSS indication.
+	 * bit13 is for VLAN indication.
+	 */
+	const __m256i flags_mask =
+		 _mm256_set1_epi32((0xF << 4) | (1 << 12) | (1 << 13));
+	/**
+	 * data to be shuffled by the result of the flags mask shifted by 4
+	 * bits.  This gives us the l3_l4 flags.
+	 */
+	const __m256i l3_l4_flags_shuf =
+		_mm256_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
+		 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		  RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD  |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD  |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		/**
+		 * second 128-bits
+		 * shift right 20 bits to use the low two bits to indicate
+		 * outer checksum status
+		 * shift right 1 bit to make sure it does not exceed 255
+		 */
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD  |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD  |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1);
+	const __m256i cksum_mask =
+		 _mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_MASK |
+				   RTE_MBUF_F_RX_L4_CKSUM_MASK |
+				   RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+				   RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK);
+	/**
+	 * data to be shuffled by result of flag mask, shifted down 12.
+	 * If RSS(bit12)/VLAN(bit13) are set,
+	 * shuffle moves appropriate flags in place.
+	 */
+	const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
+			0, 0, 0, 0,
+			0, 0, 0, 0,
+			RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
+			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
+			RTE_MBUF_F_RX_RSS_HASH, 0,
+			/* end of upper 128-bits */
+			0, 0, 0, 0,
+			0, 0, 0, 0,
+			0, 0, 0, 0,
+			RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
+			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
+			RTE_MBUF_F_RX_RSS_HASH, 0);
+
+	RTE_SET_USED(avx_aligned); /* for 32B descriptors we don't use this */
+
+	uint16_t i, received;
+
+	for (i = 0, received = 0; i < nb_pkts;
+	     i += IDPF_DESCS_PER_LOOP_AVX,
+	     rxdp += IDPF_DESCS_PER_LOOP_AVX) {
+		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
+		_mm256_storeu_si256((void *)&rx_pkts[i],
+				    _mm256_loadu_si256((void *)&sw_ring[i]));
+#ifdef RTE_ARCH_X86_64
+		_mm256_storeu_si256
+			((void *)&rx_pkts[i + 4],
+			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
+#endif
+
+		__m256i raw_desc0_1, raw_desc2_3, raw_desc4_5, raw_desc6_7;
+
+		const __m128i raw_desc7 =
+			_mm_load_si128((void *)(rxdp + 7));
+		rte_compiler_barrier(); /* keep the loads in descending descriptor order */
+		const __m128i raw_desc6 =
+			_mm_load_si128((void *)(rxdp + 6));
+		rte_compiler_barrier();
+		const __m128i raw_desc5 =
+			_mm_load_si128((void *)(rxdp + 5));
+		rte_compiler_barrier();
+		const __m128i raw_desc4 =
+			_mm_load_si128((void *)(rxdp + 4));
+		rte_compiler_barrier();
+		const __m128i raw_desc3 =
+			_mm_load_si128((void *)(rxdp + 3));
+		rte_compiler_barrier();
+		const __m128i raw_desc2 =
+			_mm_load_si128((void *)(rxdp + 2));
+		rte_compiler_barrier();
+		const __m128i raw_desc1 =
+			_mm_load_si128((void *)(rxdp + 1));
+		rte_compiler_barrier();
+		const __m128i raw_desc0 =
+			_mm_load_si128((void *)(rxdp + 0));
+
+		raw_desc6_7 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc6),
+				 raw_desc7, 1);
+		raw_desc4_5 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc4),
+				 raw_desc5, 1);
+		raw_desc2_3 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc2),
+				 raw_desc3, 1);
+		raw_desc0_1 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc0),
+				 raw_desc1, 1);
+
+		if (split_packet) {
+			int j;
+
+			for (j = 0; j < IDPF_DESCS_PER_LOOP_AVX; j++)
+				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
+		}
+
+		/**
+		 * convert descriptors 4-7 into mbufs, re-arrange fields.
+		 * Then write into the mbuf.
+		 */
+		__m256i mb6_7 = _mm256_shuffle_epi8(raw_desc6_7, shuf_msk);
+		__m256i mb4_5 = _mm256_shuffle_epi8(raw_desc4_5, shuf_msk);
+
+		/**
+		 * to get packet types, ptype is located in bit16-25
+		 * of each 128bits
+		 */
+		const __m256i ptype_mask =
+			_mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M);
+		const __m256i ptypes6_7 =
+			_mm256_and_si256(raw_desc6_7, ptype_mask);
+		const __m256i ptypes4_5 =
+			_mm256_and_si256(raw_desc4_5, ptype_mask);
+		const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 9);
+		const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 1);
+		const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 9);
+		const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 1);
+
+		mb6_7 = _mm256_insert_epi32(mb6_7, ptype_tbl[ptype7], 4);
+		mb6_7 = _mm256_insert_epi32(mb6_7, ptype_tbl[ptype6], 0);
+		mb4_5 = _mm256_insert_epi32(mb4_5, ptype_tbl[ptype5], 4);
+		mb4_5 = _mm256_insert_epi32(mb4_5, ptype_tbl[ptype4], 0);
+		/* merge the status bits into one register */
+		const __m256i status4_7 = _mm256_unpackhi_epi32(raw_desc6_7,
+				raw_desc4_5);
+
+		/**
+		 * convert descriptors 0-3 into mbufs, re-arrange fields.
+		 * Then write into the mbuf.
+		 */
+		__m256i mb2_3 = _mm256_shuffle_epi8(raw_desc2_3, shuf_msk);
+		__m256i mb0_1 = _mm256_shuffle_epi8(raw_desc0_1, shuf_msk);
+
+		/**
+		 * to get packet types, ptype is located in bit16-25
+		 * of each 128bits
+		 */
+		const __m256i ptypes2_3 =
+			_mm256_and_si256(raw_desc2_3, ptype_mask);
+		const __m256i ptypes0_1 =
+			_mm256_and_si256(raw_desc0_1, ptype_mask);
+		const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 9);
+		const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 1);
+		const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 9);
+		const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 1);
+
+		mb2_3 = _mm256_insert_epi32(mb2_3, ptype_tbl[ptype3], 4);
+		mb2_3 = _mm256_insert_epi32(mb2_3, ptype_tbl[ptype2], 0);
+		mb0_1 = _mm256_insert_epi32(mb0_1, ptype_tbl[ptype1], 4);
+		mb0_1 = _mm256_insert_epi32(mb0_1, ptype_tbl[ptype0], 0);
+		/* merge the status bits into one register */
+		const __m256i status0_3 = _mm256_unpackhi_epi32(raw_desc2_3,
+								raw_desc0_1);
+
+		/**
+		 * take the two sets of status bits and merge to one
+		 * After merge, the packets status flags are in the
+		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
+		 */
+		__m256i status0_7 = _mm256_unpacklo_epi64(status4_7,
+							  status0_3);
+
+		/* now do flag manipulation */
+
+		/* get only flag/error bits we want */
+		const __m256i flag_bits =
+			_mm256_and_si256(status0_7, flags_mask);
+		/**
+		 * l3_l4_error flags, shuffle, then shift to correct adjustment
+		 * of flags in flags_shuf, and finally mask out extra bits
+		 */
+		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
+				_mm256_srli_epi32(flag_bits, 4));
+		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
+
+		__m256i l4_outer_mask = _mm256_set1_epi32(0x6);
+		__m256i l4_outer_flags =
+				_mm256_and_si256(l3_l4_flags, l4_outer_mask);
+		l4_outer_flags = _mm256_slli_epi32(l4_outer_flags, 20);
+
+		__m256i l3_l4_mask = _mm256_set1_epi32(~0x6);
+		l3_l4_flags = _mm256_and_si256(l3_l4_flags, l3_l4_mask);
+		l3_l4_flags = _mm256_or_si256(l3_l4_flags, l4_outer_flags);
+		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
+		/* set rss and vlan flags */
+		const __m256i rss_vlan_flag_bits =
+			_mm256_srli_epi32(flag_bits, 12);
+		const __m256i rss_vlan_flags =
+			_mm256_shuffle_epi8(rss_vlan_flags_shuf,
+					    rss_vlan_flag_bits);
+
+		/* merge flags */
+		__m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
+				rss_vlan_flags);
+
+		/**
+		 * At this point, we have the 8 sets of flags, one in each
+		 * 32-bit value of mbuf_flags.
+		 * We want to extract these, and merge them with the mbuf init
+		 * data so we can do a single write to the mbuf to set the flags
+		 * and all the other initialization fields. Extracting the
+		 * appropriate flags means that we have to do a shift and blend
+		 * for each mbuf before we do the write. However, we can also
+		 * add in the previously computed rx_descriptor fields to
+		 * make a single 256-bit write per mbuf
+		 */
+		/* check the structure matches expectations */
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
+				 offsetof(struct rte_mbuf, rearm_data) + 8);
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
+				 RTE_ALIGN(offsetof(struct rte_mbuf,
+						    rearm_data),
+					   16));
+		/* build up data and do writes */
+		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
+			rearm6, rearm7;
+		rearm6 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(mbuf_flags, 8),
+					    0x04);
+		rearm4 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(mbuf_flags, 4),
+					    0x04);
+		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
+		rearm0 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_srli_si256(mbuf_flags, 4),
+					    0x04);
+		/* permute to add in the rx_descriptor e.g. rss fields */
+		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
+		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
+		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
+		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
+		/* write to mbuf */
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
+				    rearm6);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
+				    rearm4);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
+				    rearm2);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
+				    rearm0);
+
+		/* repeat for the odd mbufs */
+		const __m256i odd_flags =
+			_mm256_castsi128_si256
+				(_mm256_extracti128_si256(mbuf_flags, 1));
+		rearm7 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(odd_flags, 8),
+					    0x04);
+		rearm5 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(odd_flags, 4),
+					    0x04);
+		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
+		rearm1 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_srli_si256(odd_flags, 4),
+					    0x04);
+		/* since odd mbufs are already in hi 128-bits use blend */
+		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
+		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
+		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
+		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
+		/* again write to mbufs */
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
+				    rearm7);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
+				    rearm5);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
+				    rearm3);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
+				    rearm1);
+
+		/* extract and record EOP bit */
+		if (split_packet) {
+			const __m128i eop_mask =
+				_mm_set1_epi16(1 << VIRTCHNL2_RX_FLEX_DESC_STATUS0_EOF_S);
+			const __m256i eop_bits256 = _mm256_and_si256(status0_7,
+								     eop_check);
+			/* pack status bits into a single 128-bit register */
+			const __m128i eop_bits =
+				_mm_packus_epi32
+					(_mm256_castsi256_si128(eop_bits256),
+					 _mm256_extractf128_si256(eop_bits256,
+								  1));
+			/**
+			 * flip bits, and mask out the EOP bit, which is now
+			 * a split-packet bit i.e. !EOP, rather than EOP one.
+			 */
+			__m128i split_bits = _mm_andnot_si128(eop_bits,
+					eop_mask);
+			/**
+			 * eop bits are out of order, so we need to shuffle them
+			 * back into order again. In doing so, only use low 8
+			 * bits, which acts like another pack instruction
+			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
+			 * [Since we use epi8, the 16-bit positions are
+			 * multiplied by 2 in the eop_shuffle value.]
+			 */
+			__m128i eop_shuffle =
+				_mm_set_epi8(/* zero hi 64b */
+					     0xFF, 0xFF, 0xFF, 0xFF,
+					     0xFF, 0xFF, 0xFF, 0xFF,
+					     /* move values to lo 64b */
+					     8, 0, 10, 2,
+					     12, 4, 14, 6);
+			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
+			*(uint64_t *)split_packet =
+				_mm_cvtsi128_si64(split_bits);
+			split_packet += IDPF_DESCS_PER_LOOP_AVX;
+		}
+
+		/* count how many of these 8 descriptors have the DD bit set */
+		status0_7 = _mm256_and_si256(status0_7, dd_check);
+		status0_7 = _mm256_packs_epi32(status0_7,
+					       _mm256_setzero_si256());
+
+		uint64_t burst = rte_popcount64
+					(_mm_cvtsi128_si64
+						(_mm256_extracti128_si256
+							(status0_7, 1)));
+		burst += rte_popcount64
+				(_mm_cvtsi128_si64
+					(_mm256_castsi256_si128(status0_7)));
+		received += burst;
+		if (burst != IDPF_DESCS_PER_LOOP_AVX) /* not all 8 were done: stop here */
+			break;
+	}
+
+	/* update tail pointers */
+	rxq->rx_tail += received;
+	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
+	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep avx2 aligned */
+		rxq->rx_tail--;
+		received--;
+	}
+	rxq->rxrearm_nb += received;
+	return received;
+}
+
+/**
+ * Notice:
+ * - nb_pkts < IDPF_DESCS_PER_LOOP_AVX, just return no packet
+ */
+uint16_t
+idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
+			       uint16_t nb_pkts)
+{
+	return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);
+}
diff --git a/drivers/common/idpf/meson.build b/drivers/common/idpf/meson.build
index 46fd45c03b..4caa06a9b7 100644
--- a/drivers/common/idpf/meson.build
+++ b/drivers/common/idpf/meson.build
@@ -16,6 +16,21 @@ sources = files(
 )
 
 if arch_subdir == 'x86'
+    # compile AVX2 version if either:
+    # a. we have AVX2 supported in minimum instruction set baseline
+    # b. it's not minimum instruction set, but supported by compiler
+    if cc.get_define('__AVX2__', args: machine_args) != ''
+        cflags += ['-DCC_AVX2_SUPPORT']
+        sources += files('idpf_common_rxtx_avx2.c')
+    elif cc.has_argument('-mavx2')
+        cflags += ['-DCC_AVX2_SUPPORT']
+        idpf_avx2_lib = static_library('idpf_avx2_lib',
+                'idpf_common_rxtx_avx2.c',
+                dependencies: [static_rte_ethdev, static_rte_kvargs, static_rte_hash],
+                include_directories: includes,
+                c_args: [cflags, '-mavx2'])
+        objs += idpf_avx2_lib.extract_objects('idpf_common_rxtx_avx2.c')
+    endif
     if cc_has_avx512
         cflags += ['-DCC_AVX512_SUPPORT']
         avx512_args = cflags + cc_avx512_flags
diff --git a/drivers/common/idpf/version.map b/drivers/common/idpf/version.map
index 0729f6b912..4510aae6b3 100644
--- a/drivers/common/idpf/version.map
+++ b/drivers/common/idpf/version.map
@@ -14,6 +14,7 @@ INTERNAL {
 	idpf_dp_splitq_recv_pkts_avx512;
 	idpf_dp_splitq_xmit_pkts;
 	idpf_dp_splitq_xmit_pkts_avx512;
+	idpf_dp_singleq_recv_pkts_avx2;
 
 	idpf_qc_rx_thresh_check;
 	idpf_qc_rx_queue_release;
diff --git a/drivers/net/idpf/idpf_rxtx.c b/drivers/net/idpf/idpf_rxtx.c
index 858bbefe3b..80c6c325e8 100644
--- a/drivers/net/idpf/idpf_rxtx.c
+++ b/drivers/net/idpf/idpf_rxtx.c
@@ -776,6 +776,11 @@ idpf_set_rx_function(struct rte_eth_dev *dev)
 	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
 		vport->rx_vec_allowed = true;
 
+		if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+		     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+		    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+			vport->rx_use_avx2 = true;
+
 		if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
 #ifdef CC_AVX512_SUPPORT
 			if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
@@ -827,6 +832,13 @@ idpf_set_rx_function(struct rte_eth_dev *dev)
 				return;
 			}
 #endif /* CC_AVX512_SUPPORT */
+			if (vport->rx_use_avx2) {
+				PMD_DRV_LOG(NOTICE,
+					    "Using Single AVX2 Vector Rx (port %d).",
+					    dev->data->port_id);
+				dev->rx_pkt_burst = idpf_dp_singleq_recv_pkts_avx2;
+				return;
+			}
 		}
 
 		if (dev->data->scattered_rx) {
-- 
2.34.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 2/2] common/idpf: enable AVX2 for single queue Tx
  2025-01-08 12:17 [PATCH 0/2] enable AVX2 for IDPF single queue Shaiq Wani
  2025-01-08 12:17 ` [PATCH 1/2] common/idpf: enable AVX2 for single queue Rx Shaiq Wani
@ 2025-01-08 12:17 ` Shaiq Wani
  2025-01-20 14:23   ` Bruce Richardson
  1 sibling, 1 reply; 6+ messages in thread
From: Shaiq Wani @ 2025-01-08 12:17 UTC (permalink / raw)
  To: dev, bruce.richardson, aman.deep.singh

Some CPUs do not support AVX512. Enable an AVX2 single queue Tx path
so that those CPUs still get better per-core performance.

Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
 doc/guides/rel_notes/release_25_03.rst      |   3 +
 drivers/common/idpf/idpf_common_device.h    |   1 +
 drivers/common/idpf/idpf_common_rxtx.h      |   4 +
 drivers/common/idpf/idpf_common_rxtx_avx2.c | 225 ++++++++++++++++++++
 drivers/common/idpf/version.map             |   1 +
 drivers/net/idpf/idpf_rxtx.c                |  14 ++
 6 files changed, 248 insertions(+)

diff --git a/doc/guides/rel_notes/release_25_03.rst b/doc/guides/rel_notes/release_25_03.rst
index 426dfcd982..7ded85dac4 100644
--- a/doc/guides/rel_notes/release_25_03.rst
+++ b/doc/guides/rel_notes/release_25_03.rst
@@ -55,6 +55,9 @@ New Features
      Also, make sure to start the actual text at the margin.
      =======================================================
 
+* **Added AVX2 vector data path to the IDPF driver.**
+
+  Added support for AVX2 instructions in the IDPF single queue Rx and Tx paths.
 
 Removed Items
 -------------
diff --git a/drivers/common/idpf/idpf_common_device.h b/drivers/common/idpf/idpf_common_device.h
index 734be1c88a..5f3e4a4fcf 100644
--- a/drivers/common/idpf/idpf_common_device.h
+++ b/drivers/common/idpf/idpf_common_device.h
@@ -124,6 +124,7 @@ struct idpf_vport {
 	bool rx_vec_allowed;
 	bool tx_vec_allowed;
 	bool rx_use_avx2;
+	bool tx_use_avx2;
 	bool rx_use_avx512;
 	bool tx_use_avx512;
 
diff --git a/drivers/common/idpf/idpf_common_rxtx.h b/drivers/common/idpf/idpf_common_rxtx.h
index f50cf5ef46..e19e1878f3 100644
--- a/drivers/common/idpf/idpf_common_rxtx.h
+++ b/drivers/common/idpf/idpf_common_rxtx.h
@@ -306,5 +306,9 @@ __rte_internal
 uint16_t idpf_dp_singleq_recv_pkts_avx2(void *rx_queue,
 					struct rte_mbuf **rx_pkts,
 					uint16_t nb_pkts);
+__rte_internal
+uint16_t idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue,
+					struct rte_mbuf **tx_pkts,
+					uint16_t nb_pkts);
 
 #endif /* _IDPF_COMMON_RXTX_H_ */
diff --git a/drivers/common/idpf/idpf_common_rxtx_avx2.c b/drivers/common/idpf/idpf_common_rxtx_avx2.c
index a05b26c68a..a4bc8e2bef 100644
--- a/drivers/common/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/common/idpf/idpf_common_rxtx_avx2.c
@@ -588,3 +588,228 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 {
 	return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);
 }
+
+static __rte_always_inline void
+idpf_tx_backlog_entry(struct idpf_tx_entry *txep,
+		     struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	int i;
+
+	for (i = 0; i < (int)nb_pkts; ++i) /* fill nb_pkts consecutive entries */
+		txep[i].mbuf = tx_pkts[i]; /* save the mbuf pointer in entry i */
+}
+
+static __rte_always_inline int
+idpf_singleq_tx_free_bufs_vec(struct idpf_tx_queue *txq)
+{
+	struct idpf_tx_entry *txep;
+	uint32_t n;
+	uint32_t i;
+	int nb_free = 0;
+	struct rte_mbuf *m, *free[txq->rs_thresh]; /* batch of mbufs for one pool */
+
+	/* descriptor at next_dd must be marked done, otherwise free nothing */
+	if ((txq->tx_ring[txq->next_dd].qw1 &
+			rte_cpu_to_le_64(IDPF_TXD_QW1_DTYPE_M)) !=
+			rte_cpu_to_le_64(IDPF_TX_DESC_DTYPE_DESC_DONE))
+		return 0;
+
+	n = txq->rs_thresh;
+
+	 /* first buffer to free from S/W ring is at index
+	  * next_dd - (rs_thresh-1)
+	  */
+	txep = &txq->sw_ring[txq->next_dd - (n - 1)];
+	m = rte_pktmbuf_prefree_seg(txep[0].mbuf); /* NULL: must not go back to pool */
+	if (likely(m)) {
+		free[0] = m;
+		nb_free = 1;
+		for (i = 1; i < n; i++) {
+			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+			if (likely(m)) {
+				if (likely(m->pool == free[0]->pool)) {
+					free[nb_free++] = m; /* same pool: extend batch */
+				} else { /* pool changed: flush batch, start a new one */
+					rte_mempool_put_bulk(free[0]->pool,
+							     (void *)free,
+							     nb_free);
+					free[0] = m;
+					nb_free = 1;
+				}
+			}
+		}
+		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
+	} else { /* first mbuf not freeable: return the rest one at a time */
+		for (i = 1; i < n; i++) {
+			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+			if (m)
+				rte_mempool_put(m->pool, m);
+		}
+	}
+
+	/* buffers were freed, update counters */
+	txq->nb_free = (uint16_t)(txq->nb_free + txq->rs_thresh);
+	txq->next_dd = (uint16_t)(txq->next_dd + txq->rs_thresh);
+	if (txq->next_dd >= txq->nb_tx_desc) /* ran past the end of the ring */
+		txq->next_dd = (uint16_t)(txq->rs_thresh - 1);
+
+	return txq->rs_thresh; /* number of entries processed */
+}
+
+static inline void
+idpf_singleq_vtx1(volatile struct idpf_base_tx_desc *txdp,
+		  struct rte_mbuf *pkt, uint64_t flags)
+{
+	uint64_t high_qw =
+		(IDPF_TX_DESC_DTYPE_DATA | /* descriptor type */
+		 ((uint64_t)flags  << IDPF_TXD_QW1_CMD_S) | /* command flags */
+		 ((uint64_t)pkt->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S)); /* buffer size */
+
+	__m128i descriptor = _mm_set_epi64x(high_qw, /* upper 64 bits */
+				pkt->buf_iova + pkt->data_off); /* lower 64 bits: data address */
+	_mm_store_si128((__m128i *)txdp, descriptor); /* one aligned 16-byte store */
+}
+
+static inline void
+idpf_singleq_vtx(volatile struct idpf_base_tx_desc *txdp,
+		 struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
+{
+	const uint64_t hi_qw_tmpl = (IDPF_TX_DESC_DTYPE_DATA |
+			((uint64_t)flags  << IDPF_TXD_QW1_CMD_S)); /* per-packet size is OR-ed in below */
+
+	/* if not aligned on a 32-byte boundary, do one to align */
+	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
+		idpf_singleq_vtx1(txdp, *pkt, flags);
+		nb_pkts--, txdp++, pkt++;
+	}
+
+	/* do four at a time while possible, two descriptors per 256-bit store */
+	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
+		uint64_t hi_qw3 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[3]->data_len <<
+			 IDPF_TXD_QW1_TX_BUF_SZ_S);
+		uint64_t hi_qw2 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[2]->data_len <<
+			 IDPF_TXD_QW1_TX_BUF_SZ_S);
+		uint64_t hi_qw1 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[1]->data_len <<
+			 IDPF_TXD_QW1_TX_BUF_SZ_S);
+		uint64_t hi_qw0 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[0]->data_len <<
+			 IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+		__m256i desc2_3 =
+			_mm256_set_epi64x
+				(hi_qw3,
+				 pkt[3]->buf_iova + pkt[3]->data_off,
+				 hi_qw2,
+				 pkt[2]->buf_iova + pkt[2]->data_off);
+		__m256i desc0_1 =
+			_mm256_set_epi64x
+				(hi_qw1,
+				 pkt[1]->buf_iova + pkt[1]->data_off,
+				 hi_qw0,
+				 pkt[0]->buf_iova + pkt[0]->data_off);
+		_mm256_store_si256((void *)(txdp + 2), desc2_3); /* aligned 32-byte store */
+		_mm256_store_si256((void *)txdp, desc0_1);
+	}
+
+	/* do any last ones */
+	while (nb_pkts) {
+		idpf_singleq_vtx1(txdp, *pkt, flags);
+		txdp++, pkt++, nb_pkts--;
+	}
+}
+
+static inline uint16_t
+idpf_singleq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+				       uint16_t nb_pkts)
+{
+	struct idpf_tx_queue *txq = (struct idpf_tx_queue *)tx_queue;
+	volatile struct idpf_base_tx_desc *txdp;
+	struct idpf_tx_entry *txep;
+	uint16_t n, nb_commit, tx_id;
+	uint64_t flags = IDPF_TX_DESC_CMD_EOP;
+	uint64_t rs = IDPF_TX_DESC_CMD_RS | flags;
+
+	/* crossing the rs_thresh boundary is not allowed */
+	nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh);
+
+	if (txq->nb_free < txq->free_thresh) /* running low: try to reclaim a batch */
+		idpf_singleq_tx_free_bufs_vec(txq);
+
+	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts);
+	if (unlikely(nb_pkts == 0)) /* nothing can be queued */
+		return 0;
+
+	tx_id = txq->tx_tail;
+	txdp = &txq->tx_ring[tx_id];
+	txep = &txq->sw_ring[tx_id];
+
+	txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts);
+
+	n = (uint16_t)(txq->nb_tx_desc - tx_id); /* slots left before the ring end */
+	if (nb_commit >= n) { /* burst reaches the end of the ring */
+		idpf_tx_backlog_entry(txep, tx_pkts, n);
+
+		idpf_singleq_vtx(txdp, tx_pkts, n - 1, flags);
+		tx_pkts += (n - 1);
+		txdp += (n - 1);
+
+		idpf_singleq_vtx1(txdp, *tx_pkts++, rs); /* last ring slot carries RS */
+
+		nb_commit = (uint16_t)(nb_commit - n);
+
+		tx_id = 0;
+		txq->next_rs = (uint16_t)(txq->rs_thresh - 1);
+
+		/* wrapped: continue from the start of the ring */
+		txdp = &txq->tx_ring[tx_id];
+		txep = &txq->sw_ring[tx_id];
+	}
+
+	idpf_tx_backlog_entry(txep, tx_pkts, nb_commit);
+
+	idpf_singleq_vtx(txdp, tx_pkts, nb_commit, flags);
+
+	tx_id = (uint16_t)(tx_id + nb_commit);
+	if (tx_id > txq->next_rs) { /* passed next_rs: set RS on that descriptor */
+		txq->tx_ring[txq->next_rs].qw1 |=
+			rte_cpu_to_le_64(((uint64_t)IDPF_TX_DESC_CMD_RS) <<
+					 IDPF_TXD_QW1_CMD_S);
+		txq->next_rs =
+			(uint16_t)(txq->next_rs + txq->rs_thresh);
+	}
+
+	txq->tx_tail = tx_id;
+
+	IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail); /* publish the new tail index */
+
+	return nb_pkts;
+}
+
+uint16_t
+idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+			       uint16_t nb_pkts)
+{
+	uint16_t nb_tx = 0; /* packets accepted so far */
+	struct idpf_tx_queue *txq = (struct idpf_tx_queue *)tx_queue;
+
+	while (nb_pkts) {
+		uint16_t ret, num;
+
+		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh); /* at most rs_thresh per burst */
+		ret = idpf_singleq_xmit_fixed_burst_vec_avx2(tx_queue, &tx_pkts[nb_tx],
+						    num);
+		nb_tx += ret;
+		nb_pkts -= ret;
+		if (ret < num) /* burst came up short: stop */
+			break;
+	}
+
+	return nb_tx;
+}
diff --git a/drivers/common/idpf/version.map b/drivers/common/idpf/version.map
index 4510aae6b3..eadcb9a2cf 100644
--- a/drivers/common/idpf/version.map
+++ b/drivers/common/idpf/version.map
@@ -15,6 +15,7 @@ INTERNAL {
 	idpf_dp_splitq_xmit_pkts;
 	idpf_dp_splitq_xmit_pkts_avx512;
 	idpf_dp_singleq_recv_pkts_avx2;
+	idpf_dp_singleq_xmit_pkts_avx2;
 
 	idpf_qc_rx_thresh_check;
 	idpf_qc_rx_queue_release;
diff --git a/drivers/net/idpf/idpf_rxtx.c b/drivers/net/idpf/idpf_rxtx.c
index 80c6c325e8..579293b2e8 100644
--- a/drivers/net/idpf/idpf_rxtx.c
+++ b/drivers/net/idpf/idpf_rxtx.c
@@ -888,6 +888,12 @@ idpf_set_tx_function(struct rte_eth_dev *dev)
 	if (idpf_tx_vec_dev_check_default(dev) == IDPF_VECTOR_PATH &&
 	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
 		vport->tx_vec_allowed = true;
+
+		if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+		     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+		    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+			vport->tx_use_avx2 = true;
+
 		if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
 #ifdef CC_AVX512_SUPPORT
 		{
@@ -947,6 +953,14 @@ idpf_set_tx_function(struct rte_eth_dev *dev)
 				return;
 			}
 #endif /* CC_AVX512_SUPPORT */
+			if (vport->tx_use_avx2) {
+				PMD_DRV_LOG(NOTICE,
+					    "Using Single AVX2 Vector Tx (port %d).",
+					    dev->data->port_id);
+				dev->tx_pkt_burst = idpf_dp_singleq_xmit_pkts_avx2;
+				dev->tx_pkt_prepare = idpf_dp_prep_pkts;
+				return;
+			}
 		}
 		PMD_DRV_LOG(NOTICE,
 			    "Using Single Scalar Tx (port %d).",
-- 
2.34.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/2] common/idpf: enable AVX2 for single queue Rx
  2025-01-08 12:17 ` [PATCH 1/2] common/idpf: enable AVX2 for single queue Rx Shaiq Wani
@ 2025-01-20 14:15   ` Bruce Richardson
  0 siblings, 0 replies; 6+ messages in thread
From: Bruce Richardson @ 2025-01-20 14:15 UTC (permalink / raw)
  To: Shaiq Wani; +Cc: dev, aman.deep.singh

On Wed, Jan 08, 2025 at 05:47:56PM +0530, Shaiq Wani wrote:
> In case some CPUs don't support AVX512. Enable AVX2 for them to
> get better per-core performance.
> 
> Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>

Hi,

some review comments inline below.

/Bruce

> ---
>  drivers/common/idpf/idpf_common_device.h    |   1 +
>  drivers/common/idpf/idpf_common_rxtx.h      |   4 +
>  drivers/common/idpf/idpf_common_rxtx_avx2.c | 590 ++++++++++++++++++++
>  drivers/common/idpf/meson.build             |  15 +
>  drivers/common/idpf/version.map             |   1 +
>  drivers/net/idpf/idpf_rxtx.c                |  12 +
>  6 files changed, 623 insertions(+)
>  create mode 100644 drivers/common/idpf/idpf_common_rxtx_avx2.c
> 
> diff --git a/drivers/common/idpf/idpf_common_device.h b/drivers/common/idpf/idpf_common_device.h
> index bfa927a5ff..734be1c88a 100644
> --- a/drivers/common/idpf/idpf_common_device.h
> +++ b/drivers/common/idpf/idpf_common_device.h
> @@ -123,6 +123,7 @@ struct idpf_vport {
>  
>  	bool rx_vec_allowed;
>  	bool tx_vec_allowed;
> +	bool rx_use_avx2;
>  	bool rx_use_avx512;
>  	bool tx_use_avx512;
>  
> diff --git a/drivers/common/idpf/idpf_common_rxtx.h b/drivers/common/idpf/idpf_common_rxtx.h
> index eeeeed12e2..f50cf5ef46 100644
> --- a/drivers/common/idpf/idpf_common_rxtx.h
> +++ b/drivers/common/idpf/idpf_common_rxtx.h
> @@ -302,5 +302,9 @@ uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pk
>  __rte_internal
>  uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
>  			  uint16_t nb_pkts);
> +__rte_internal
> +uint16_t idpf_dp_singleq_recv_pkts_avx2(void *rx_queue,
> +					struct rte_mbuf **rx_pkts,
> +					uint16_t nb_pkts);
>  

I'm a little confused by the "singleq" part of the name here. Can you
explain (in the commit message, perhaps) what the "single" refers to?
Does the driver have the ability to poll multiple queues at once or
something?

>  #endif /* _IDPF_COMMON_RXTX_H_ */
> diff --git a/drivers/common/idpf/idpf_common_rxtx_avx2.c b/drivers/common/idpf/idpf_common_rxtx_avx2.c
> new file mode 100644
> index 0000000000..a05b26c68a
> --- /dev/null
> +++ b/drivers/common/idpf/idpf_common_rxtx_avx2.c
> @@ -0,0 +1,590 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2023 Intel Corporation
> + */
> +
> +#include <rte_vect.h>
> +
> +#include "idpf_common_rxtx.h"
> +#include "idpf_common_device.h"
> +
> +#ifndef __INTEL_COMPILER
> +#pragma GCC diagnostic ignored "-Wcast-qual"
> +#endif

There is work ongoing to stop using this warning removal [1]. This code may
need to be rebased on top of that if it's applied soon.

[1] https://patches.dpdk.org/project/dpdk/list/?series=34390

> +
> +static __rte_always_inline void
> +idpf_singleq_rx_rearm(struct idpf_rx_queue *rxq)
> +{
> +	int i;
> +	uint16_t rx_id;
> +	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
> +	struct rte_mbuf **rxep = &rxq->sw_ring[rxq->rxrearm_start];
> +
> +	rxdp += rxq->rxrearm_start;
> +
> +	/* Pull 'n' more MBUFs into the software ring */
> +	if (rte_mempool_get_bulk(rxq->mp,
> +				 (void *)rxep,
> +				 IDPF_RXQ_REARM_THRESH) < 0) {
> +		if (rxq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
> +		    rxq->nb_rx_desc) {
> +			__m128i dma_addr0;
> +
> +			dma_addr0 = _mm_setzero_si128();
> +			for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
> +				rxep[i] = &rxq->fake_mbuf;
> +				_mm_store_si128((__m128i *)&rxdp[i].read,
> +						dma_addr0);
> +			}
> +		}
> +		rte_atomic_fetch_add_explicit(&rxq->rx_stats.mbuf_alloc_failed,
> +				   IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
> +
> +		return;
> +	}
> +
> +	struct rte_mbuf *mb0, *mb1;
> +	__m128i dma_addr0, dma_addr1;
> +	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
> +			RTE_PKTMBUF_HEADROOM);
> +	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
> +	for (i = 0; i < IDPF_RXQ_REARM_THRESH; i += 2, rxep += 2) {
> +		__m128i vaddr0, vaddr1;
> +
> +		mb0 = rxep[0];
> +		mb1 = rxep[1];
> +
> +		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
> +		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
> +				offsetof(struct rte_mbuf, buf_addr) + 8);
> +		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
> +		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
> +
> +		/* convert pa to dma_addr hdr/data */
> +		dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
> +		dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
> +
> +		/* add headroom to pa values */
> +		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
> +		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
> +
> +		/* flush desc with pa dma_addr */
> +		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
> +		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
> +	}
> +
> +	rxq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
> +	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
> +		rxq->rxrearm_start = 0;
> +
> +	rxq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
> +
> +	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
> +			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
> +
> +	/* Update the tail pointer on the NIC */
> +	IDPF_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
> +}
> +
> +static inline uint16_t
> +_idpf_singleq_recv_raw_pkts_vec_avx2(struct idpf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
> +				     uint16_t nb_pkts, uint8_t *split_packet)
> +{
> +#define IDPF_DESCS_PER_LOOP_AVX 8
> +
> +	const uint32_t *ptype_tbl = rxq->adapter->ptype_tbl;
> +	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
> +			0, rxq->mbuf_initializer);
> +	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
> +	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
> +	const int avx_aligned = ((rxq->rx_tail & 1) == 0);
> +
> +	rxdp += rxq->rx_tail;
> +
> +	rte_prefetch0(rxdp);
> +
> +	/* nb_pkts has to be floor-aligned to IDPF_DESCS_PER_LOOP_AVX */
> +	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IDPF_DESCS_PER_LOOP_AVX);
> +
> +	/* See if we need to rearm the RX queue - gives the prefetch a bit
> +	 * of time to act
> +	 */
> +	if (rxq->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
> +		idpf_singleq_rx_rearm(rxq);
> +
> +	/* Before we start moving massive data around, check to see if
> +	 * there is actually a packet available
> +	 */
> +	if (!(rxdp->flex_nic_wb.status_error0 &
> +			rte_cpu_to_le_32(1 << VIRTCHNL2_RX_FLEX_DESC_STATUS0_DD_S)))
> +		return 0;
> +
> +	/* 8 packets DD mask, LSB in each 32-bit value */
> +	const __m256i dd_check = _mm256_set1_epi32(1);
> +
> +	/* 8 packets EOP mask, second-LSB in each 32-bit value */
> +	const __m256i eop_check = _mm256_slli_epi32(dd_check,
> +			VIRTCHNL2_RX_FLEX_DESC_STATUS0_EOF_S);
> +
> +	/* mask to shuffle from desc. to mbuf (2 descriptors)*/
> +	const __m256i shuf_msk =
> +		_mm256_set_epi8
> +			(/* first descriptor */
> +			 0xFF, 0xFF,
> +			 0xFF, 0xFF,	/* rss hash parsed separately */
> +			 11, 10,	/* octet 10~11, 16 bits vlan_macip */
> +			 5, 4,		/* octet 4~5, 16 bits data_len */
> +			 0xFF, 0xFF,	/* skip hi 16 bits pkt_len, zero out */
> +			 5, 4,		/* octet 4~5, 16 bits pkt_len */
> +			 0xFF, 0xFF,	/* pkt_type set as unknown */
> +			 0xFF, 0xFF,	/*pkt_type set as unknown */
> +			 /* second descriptor */
> +			 0xFF, 0xFF,
> +			 0xFF, 0xFF,	/* rss hash parsed separately */
> +			 11, 10,	/* octet 10~11, 16 bits vlan_macip */
> +			 5, 4,		/* octet 4~5, 16 bits data_len */
> +			 0xFF, 0xFF,	/* skip hi 16 bits pkt_len, zero out */
> +			 5, 4,		/* octet 4~5, 16 bits pkt_len */
> +			 0xFF, 0xFF,	/* pkt_type set as unknown */
> +			 0xFF, 0xFF	/*pkt_type set as unknown */
> +			);
> +	/**
> +	 * compile-time check the above crc and shuffle layout is correct.
> +	 * NOTE: the first field (lowest address) is given last in set_epi
> +	 * calls above.
> +	 */
> +	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
> +			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
> +	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
> +			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
> +	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
> +			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
> +	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
> +			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
> +
> +	/* Status/Error flag masks */
> +	/**
> +	 * mask everything except Checksum Reports, RSS indication
> +	 * and VLAN indication.
> +	 * bit6:4 for IP/L4 checksum errors.
> +	 * bit12 is for RSS indication.
> +	 * bit13 is for VLAN indication.
> +	 */
> +	const __m256i flags_mask =
> +		 _mm256_set1_epi32((0xF << 4) | (1 << 12) | (1 << 13));
> +	/**
> +	 * data to be shuffled by the result of the flags mask shifted by 4
> +	 * bits.  This gives use the l3_l4 flags.
> +	 */
> +	const __m256i l3_l4_flags_shuf =
> +		_mm256_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
> +		 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
> +		  RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
> +		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
> +		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
> +		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD  |
> +		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD  |
> +		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
> +		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
> +		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
> +		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
> +		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
> +		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
> +		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
> +		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
> +		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
> +		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
> +		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
> +		/**
> +		 * second 128-bits
> +		 * shift right 20 bits to use the low two bits to indicate
> +		 * outer checksum status
> +		 * shift right 1 bit to make sure it not exceed 255
> +		 */
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
> +		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
> +		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
> +		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
> +		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD  |
> +		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD  |
> +		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
> +		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
> +		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
> +		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
> +		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
> +		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
> +		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
> +		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
> +		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
> +		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
> +		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1);
> +	const __m256i cksum_mask =
> +		 _mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_MASK |
> +				   RTE_MBUF_F_RX_L4_CKSUM_MASK |
> +				   RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
> +				   RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK);
> +	/**
> +	 * data to be shuffled by result of flag mask, shifted down 12.
> +	 * If RSS(bit12)/VLAN(bit13) are set,
> +	 * shuffle moves appropriate flags in place.
> +	 */
> +	const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
> +			0, 0, 0, 0,
> +			0, 0, 0, 0,
> +			RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
> +			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
> +			RTE_MBUF_F_RX_RSS_HASH, 0,
> +			/* end up 128-bits */
> +			0, 0, 0, 0,
> +			0, 0, 0, 0,
> +			0, 0, 0, 0,
> +			RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
> +			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
> +			RTE_MBUF_F_RX_RSS_HASH, 0);
> +
> +	RTE_SET_USED(avx_aligned); /* for 32B descriptors we don't use this */
> +

Does the driver or HW support 16B descriptors? If not, just remove this
variable completely. Don't keep it just for consistency with other drivers.

> +	uint16_t i, received;
> +
> +	for (i = 0, received = 0; i < nb_pkts;
> +	     i += IDPF_DESCS_PER_LOOP_AVX,
> +	     rxdp += IDPF_DESCS_PER_LOOP_AVX) {
> +		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
> +		_mm256_storeu_si256((void *)&rx_pkts[i],
> +				    _mm256_loadu_si256((void *)&sw_ring[i]));
> +#ifdef RTE_ARCH_X86_64
> +		_mm256_storeu_si256
> +			((void *)&rx_pkts[i + 4],
> +			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
> +#endif
> +
> +		__m256i raw_desc0_1, raw_desc2_3, raw_desc4_5, raw_desc6_7;
> +
> +		const __m128i raw_desc7 =
> +			_mm_load_si128((void *)(rxdp + 7));
> +		rte_compiler_barrier();
> +		const __m128i raw_desc6 =
> +			_mm_load_si128((void *)(rxdp + 6));
> +		rte_compiler_barrier();
> +		const __m128i raw_desc5 =
> +			_mm_load_si128((void *)(rxdp + 5));
> +		rte_compiler_barrier();
> +		const __m128i raw_desc4 =
> +			_mm_load_si128((void *)(rxdp + 4));
> +		rte_compiler_barrier();
> +		const __m128i raw_desc3 =
> +			_mm_load_si128((void *)(rxdp + 3));
> +		rte_compiler_barrier();
> +		const __m128i raw_desc2 =
> +			_mm_load_si128((void *)(rxdp + 2));
> +		rte_compiler_barrier();
> +		const __m128i raw_desc1 =
> +			_mm_load_si128((void *)(rxdp + 1));
> +		rte_compiler_barrier();
> +		const __m128i raw_desc0 =
> +			_mm_load_si128((void *)(rxdp + 0));
> +

Here and in a number of other places, I think you can reduce the amount of
word-wrapping being done. Unlike when the first AVX2 code was being
written, we can now use up to 100 characters per line.

> +		raw_desc6_7 =
> +			_mm256_inserti128_si256
> +				(_mm256_castsi128_si256(raw_desc6),
> +				 raw_desc7, 1);
> +		raw_desc4_5 =
> +			_mm256_inserti128_si256
> +				(_mm256_castsi128_si256(raw_desc4),
> +				 raw_desc5, 1);
> +		raw_desc2_3 =
> +			_mm256_inserti128_si256
> +				(_mm256_castsi128_si256(raw_desc2),
> +				 raw_desc3, 1);
> +		raw_desc0_1 =
> +			_mm256_inserti128_si256
> +				(_mm256_castsi128_si256(raw_desc0),
> +				 raw_desc1, 1);
> +
> +		if (split_packet) {
> +			int j;
> +
> +			for (j = 0; j < IDPF_DESCS_PER_LOOP_AVX; j++)
> +				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
> +		}
> +

I don't see any use of buffer reassembly for multi-segment packets in the
driver code. If it's not planned to add this, then you can remove this
block. If it is planned to add it, then keep this, and you can base the
implementation on the common function being extracted out of other
drivers[2].

[2] https://patches.dpdk.org/project/dpdk/patch/20250120120016.1530274-3-bruce.richardson@intel.com/


> +		/**
> +		 * convert descriptors 4-7 into mbufs, re-arrange fields.
> +		 * Then write into the mbuf.
> +		 */
> +		__m256i mb6_7 = _mm256_shuffle_epi8(raw_desc6_7, shuf_msk);
> +		__m256i mb4_5 = _mm256_shuffle_epi8(raw_desc4_5, shuf_msk);
> +
> +		/**
> +		 * to get packet types, ptype is located in bit16-25
> +		 * of each 128bits
> +		 */
> +		const __m256i ptype_mask =
> +			_mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M);
> +		const __m256i ptypes6_7 =
> +			_mm256_and_si256(raw_desc6_7, ptype_mask);
> +		const __m256i ptypes4_5 =
> +			_mm256_and_si256(raw_desc4_5, ptype_mask);
> +		const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 9);
> +		const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 1);
> +		const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 9);
> +		const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 1);
> +
> +		mb6_7 = _mm256_insert_epi32(mb6_7, ptype_tbl[ptype7], 4);
> +		mb6_7 = _mm256_insert_epi32(mb6_7, ptype_tbl[ptype6], 0);
> +		mb4_5 = _mm256_insert_epi32(mb4_5, ptype_tbl[ptype5], 4);
> +		mb4_5 = _mm256_insert_epi32(mb4_5, ptype_tbl[ptype4], 0);
> +		/* merge the status bits into one register */
> +		const __m256i status4_7 = _mm256_unpackhi_epi32(raw_desc6_7,
> +				raw_desc4_5);
> +
> +		/**
> +		 * convert descriptors 0-3 into mbufs, re-arrange fields.
> +		 * Then write into the mbuf.
> +		 */
> +		__m256i mb2_3 = _mm256_shuffle_epi8(raw_desc2_3, shuf_msk);
> +		__m256i mb0_1 = _mm256_shuffle_epi8(raw_desc0_1, shuf_msk);
> +
> +		/**
> +		 * to get packet types, ptype is located in bit16-25
> +		 * of each 128bits
> +		 */
> +		const __m256i ptypes2_3 =
> +			_mm256_and_si256(raw_desc2_3, ptype_mask);
> +		const __m256i ptypes0_1 =
> +			_mm256_and_si256(raw_desc0_1, ptype_mask);
> +		const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 9);
> +		const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 1);
> +		const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 9);
> +		const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 1);
> +
> +		mb2_3 = _mm256_insert_epi32(mb2_3, ptype_tbl[ptype3], 4);
> +		mb2_3 = _mm256_insert_epi32(mb2_3, ptype_tbl[ptype2], 0);
> +		mb0_1 = _mm256_insert_epi32(mb0_1, ptype_tbl[ptype1], 4);
> +		mb0_1 = _mm256_insert_epi32(mb0_1, ptype_tbl[ptype0], 0);
> +		/* merge the status bits into one register */
> +		const __m256i status0_3 = _mm256_unpackhi_epi32(raw_desc2_3,
> +								raw_desc0_1);
> +
> +		/**
> +		 * take the two sets of status bits and merge to one
> +		 * After merge, the packets status flags are in the
> +		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
> +		 */
> +		__m256i status0_7 = _mm256_unpacklo_epi64(status4_7,
> +							  status0_3);
> +
> +		/* now do flag manipulation */
> +
> +		/* get only flag/error bits we want */
> +		const __m256i flag_bits =
> +			_mm256_and_si256(status0_7, flags_mask);
> +		/**
> +		 * l3_l4_error flags, shuffle, then shift to correct adjustment
> +		 * of flags in flags_shuf, and finally mask out extra bits
> +		 */
> +		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
> +				_mm256_srli_epi32(flag_bits, 4));
> +		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
> +
> +		__m256i l4_outer_mask = _mm256_set1_epi32(0x6);
> +		__m256i l4_outer_flags =
> +				_mm256_and_si256(l3_l4_flags, l4_outer_mask);
> +		l4_outer_flags = _mm256_slli_epi32(l4_outer_flags, 20);
> +
> +		__m256i l3_l4_mask = _mm256_set1_epi32(~0x6);
> +		l3_l4_flags = _mm256_and_si256(l3_l4_flags, l3_l4_mask);
> +		l3_l4_flags = _mm256_or_si256(l3_l4_flags, l4_outer_flags);
> +		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
> +		/* set rss and vlan flags */
> +		const __m256i rss_vlan_flag_bits =
> +			_mm256_srli_epi32(flag_bits, 12);
> +		const __m256i rss_vlan_flags =
> +			_mm256_shuffle_epi8(rss_vlan_flags_shuf,
> +					    rss_vlan_flag_bits);
> +
> +		/* merge flags */
> +		__m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
> +				rss_vlan_flags);
> +
> +		/**
> +		 * At this point, we have the 8 sets of flags in the low 16-bits
> +		 * of each 32-bit value in vlan0.
> +		 * We want to extract these, and merge them with the mbuf init
> +		 * data so we can do a single write to the mbuf to set the flags
> +		 * and all the other initialization fields. Extracting the
> +		 * appropriate flags means that we have to do a shift and blend
> +		 * for each mbuf before we do the write. However, we can also
> +		 * add in the previously computed rx_descriptor fields to
> +		 * make a single 256-bit write per mbuf
> +		 */
> +		/* check the structure matches expectations */
> +		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
> +				 offsetof(struct rte_mbuf, rearm_data) + 8);
> +		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
> +				 RTE_ALIGN(offsetof(struct rte_mbuf,
> +						    rearm_data),
> +					   16));
> +		/* build up data and do writes */
> +		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
> +			rearm6, rearm7;
> +		rearm6 = _mm256_blend_epi32(mbuf_init,
> +					    _mm256_slli_si256(mbuf_flags, 8),
> +					    0x04);
> +		rearm4 = _mm256_blend_epi32(mbuf_init,
> +					    _mm256_slli_si256(mbuf_flags, 4),
> +					    0x04);
> +		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
> +		rearm0 = _mm256_blend_epi32(mbuf_init,
> +					    _mm256_srli_si256(mbuf_flags, 4),
> +					    0x04);
> +		/* permute to add in the rx_descriptor e.g. rss fields */
> +		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
> +		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
> +		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
> +		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
> +		/* write to mbuf */
> +		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
> +				    rearm6);
> +		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
> +				    rearm4);
> +		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
> +				    rearm2);
> +		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
> +				    rearm0);
> +
> +		/* repeat for the odd mbufs */
> +		const __m256i odd_flags =
> +			_mm256_castsi128_si256
> +				(_mm256_extracti128_si256(mbuf_flags, 1));
> +		rearm7 = _mm256_blend_epi32(mbuf_init,
> +					    _mm256_slli_si256(odd_flags, 8),
> +					    0x04);
> +		rearm5 = _mm256_blend_epi32(mbuf_init,
> +					    _mm256_slli_si256(odd_flags, 4),
> +					    0x04);
> +		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
> +		rearm1 = _mm256_blend_epi32(mbuf_init,
> +					    _mm256_srli_si256(odd_flags, 4),
> +					    0x04);
> +		/* since odd mbufs are already in hi 128-bits use blend */
> +		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
> +		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
> +		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
> +		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
> +		/* again write to mbufs */
> +		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
> +				    rearm7);
> +		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
> +				    rearm5);
> +		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
> +				    rearm3);
> +		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
> +				    rearm1);
> +
> +		/* extract and record EOP bit */
> +		if (split_packet) {
> +			const __m128i eop_mask =
> +				_mm_set1_epi16(1 << VIRTCHNL2_RX_FLEX_DESC_STATUS0_EOF_S);
> +			const __m256i eop_bits256 = _mm256_and_si256(status0_7,
> +								     eop_check);
> +			/* pack status bits into a single 128-bit register */
> +			const __m128i eop_bits =
> +				_mm_packus_epi32
> +					(_mm256_castsi256_si128(eop_bits256),
> +					 _mm256_extractf128_si256(eop_bits256,
> +								  1));
> +			/**
> +			 * flip bits, and mask out the EOP bit, which is now
> +			 * a split-packet bit i.e. !EOP, rather than EOP one.
> +			 */
> +			__m128i split_bits = _mm_andnot_si128(eop_bits,
> +					eop_mask);
> +			/**
> +			 * eop bits are out of order, so we need to shuffle them
> +			 * back into order again. In doing so, only use low 8
> +			 * bits, which acts like another pack instruction
> +			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
> +			 * [Since we use epi8, the 16-bit positions are
> +			 * multiplied by 2 in the eop_shuffle value.]
> +			 */
> +			__m128i eop_shuffle =
> +				_mm_set_epi8(/* zero hi 64b */
> +					     0xFF, 0xFF, 0xFF, 0xFF,
> +					     0xFF, 0xFF, 0xFF, 0xFF,
> +					     /* move values to lo 64b */
> +					     8, 0, 10, 2,
> +					     12, 4, 14, 6);
> +			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
> +			*(uint64_t *)split_packet =
> +				_mm_cvtsi128_si64(split_bits);
> +			split_packet += IDPF_DESCS_PER_LOOP_AVX;
> +		}

As above, if there are no plans to support multi-buffer packet reassembly,
drop this.

> +
> +		/* perform dd_check */
> +		status0_7 = _mm256_and_si256(status0_7, dd_check);
> +		status0_7 = _mm256_packs_epi32(status0_7,
> +					       _mm256_setzero_si256());
> +
> +		uint64_t burst = rte_popcount64
> +					(_mm_cvtsi128_si64
> +						(_mm256_extracti128_si256
> +							(status0_7, 1)));
> +		burst += rte_popcount64
> +				(_mm_cvtsi128_si64
> +					(_mm256_castsi256_si128(status0_7)));
> +		received += burst;
> +		if (burst != IDPF_DESCS_PER_LOOP_AVX)
> +			break;
> +	}
> +
> +	/* update tail pointers */
> +	rxq->rx_tail += received;
> +	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
> +	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep avx2 aligned */
> +		rxq->rx_tail--;
> +		received--;
> +	}
> +	rxq->rxrearm_nb += received;
> +	return received;
> +}
> +
> +/**
> + * Notice:
> + * - nb_pkts < IDPF_DESCS_PER_LOOP, just return no packet
> + */
> +uint16_t
> +idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
> +			       uint16_t nb_pkts)
> +{
> +	return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);
> +}
> diff --git a/drivers/common/idpf/meson.build b/drivers/common/idpf/meson.build
> index 46fd45c03b..4caa06a9b7 100644
> --- a/drivers/common/idpf/meson.build
> +++ b/drivers/common/idpf/meson.build
> @@ -16,6 +16,21 @@ sources = files(
>  )
>  
>  if arch_subdir == 'x86'
> +    # compile AVX2 version if either:
> +    # a. we have AVX supported in minimum instruction set baseline
> +    # b. it's not minimum instruction set, but supported by compiler
> +    if cc.get_define('__AVX2__', args: machine_args) != ''
> +        cflags += ['-DCC_AVX2_SUPPORT']
> +        sources += files('idpf_common_rxtx_avx2.c')
> +    elif cc.has_argument('-mavx2')

This logic is out-of-date, since all supported compilers have AVX2 support.
Suggest reworking using drivers/net/ice/meson.build as a reference.

> +       cflags += ['-DCC_AVX2_SUPPORT']
> +        idpf_avx2_lib = static_library('idpf_avx2_lib',
> +                'idpf_common_rxtx_avx2.c',
> +               dependencies: [static_rte_ethdev, static_rte_kvargs, static_rte_hash],
> +                include_directories: includes,
> +                c_args: [cflags, '-mavx2'])
> +       objs += idpf_avx2_lib.extract_objects('idpf_common_rxtx_avx2.c')
> +    endif
>      if cc_has_avx512
>          cflags += ['-DCC_AVX512_SUPPORT']
>          avx512_args = cflags + cc_avx512_flags
> diff --git a/drivers/common/idpf/version.map b/drivers/common/idpf/version.map
> index 0729f6b912..4510aae6b3 100644
> --- a/drivers/common/idpf/version.map
> +++ b/drivers/common/idpf/version.map
> @@ -14,6 +14,7 @@ INTERNAL {
>  	idpf_dp_splitq_recv_pkts_avx512;
>  	idpf_dp_splitq_xmit_pkts;
>  	idpf_dp_splitq_xmit_pkts_avx512;
> +	idpf_dp_singleq_recv_pkts_avx2;
>  

This list should be alphabetical, so singleq should go before splitq.

>  	idpf_qc_rx_thresh_check;
>  	idpf_qc_rx_queue_release;
> diff --git a/drivers/net/idpf/idpf_rxtx.c b/drivers/net/idpf/idpf_rxtx.c
> index 858bbefe3b..80c6c325e8 100644
> --- a/drivers/net/idpf/idpf_rxtx.c
> +++ b/drivers/net/idpf/idpf_rxtx.c
> @@ -776,6 +776,11 @@ idpf_set_rx_function(struct rte_eth_dev *dev)
>  	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
>  		vport->rx_vec_allowed = true;
>  
> +		if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
> +		     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&

There are no CPUs that support AVX512 without supporting AVX2 - and if there
were, we probably couldn't use an AVX2 code path on them anyway. Therefore
only check the AVX2 flag and the bitwidth.

> +		    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
> +			vport->rx_use_avx2 = true;
> +
>  		if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
>  #ifdef CC_AVX512_SUPPORT
>  			if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
> @@ -827,6 +832,13 @@ idpf_set_rx_function(struct rte_eth_dev *dev)
>  				return;
>  			}
>  #endif /* CC_AVX512_SUPPORT */
> +			if (vport->rx_use_avx2) {
> +				PMD_DRV_LOG(NOTICE,
> +					    "Using Single AVX2 Vector Rx (port %d).",
> +					    dev->data->port_id);
> +				dev->rx_pkt_burst = idpf_dp_singleq_recv_pkts_avx2;
> +				return;
> +			}
>  		}
>  
>  		if (dev->data->scattered_rx) {
> -- 
> 2.34.1
> 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 2/2] common/idpf: enable AVX2 for single queue Tx
  2025-01-08 12:17 ` [PATCH 2/2] common/idpf: enable AVX2 for single queue Tx Shaiq Wani
@ 2025-01-20 14:23   ` Bruce Richardson
  0 siblings, 0 replies; 6+ messages in thread
From: Bruce Richardson @ 2025-01-20 14:23 UTC (permalink / raw)
  To: Shaiq Wani; +Cc: dev, aman.deep.singh

On Wed, Jan 08, 2025 at 05:47:57PM +0530, Shaiq Wani wrote:
> In case some CPUs don't support AVX512. Enable AVX2 for them to
> get better per-core performance.
> 
> Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
> ---

Hi,

some review comments inline below.

/Bruce

>  doc/guides/rel_notes/release_25_03.rst      |   3 +
>  drivers/common/idpf/idpf_common_device.h    |   1 +
>  drivers/common/idpf/idpf_common_rxtx.h      |   4 +
>  drivers/common/idpf/idpf_common_rxtx_avx2.c | 225 ++++++++++++++++++++
>  drivers/common/idpf/version.map             |   1 +
>  drivers/net/idpf/idpf_rxtx.c                |  14 ++
>  6 files changed, 248 insertions(+)
> 
> diff --git a/doc/guides/rel_notes/release_25_03.rst b/doc/guides/rel_notes/release_25_03.rst
> index 426dfcd982..7ded85dac4 100644
> --- a/doc/guides/rel_notes/release_25_03.rst
> +++ b/doc/guides/rel_notes/release_25_03.rst
> @@ -55,6 +55,9 @@ New Features
>       Also, make sure to start the actual text at the margin.
>       =======================================================
>  
> +   * **Added support of vector instructions on IDPF.**
> +
> +     Added support of AVX2 instructions in IDPF single queue RX and TX path.
>  

The driver already had vector instructions, so the title is a little
misleading. Clarify the title to be AVX2-specific. For the body, please
clarify singleq vs splitq: what the differences are, and when one might get
the benefit of the AVX2 code path.

>  Removed Items
>  -------------
> diff --git a/drivers/common/idpf/idpf_common_device.h b/drivers/common/idpf/idpf_common_device.h
> index 734be1c88a..5f3e4a4fcf 100644
> --- a/drivers/common/idpf/idpf_common_device.h
> +++ b/drivers/common/idpf/idpf_common_device.h
> @@ -124,6 +124,7 @@ struct idpf_vport {
>  	bool rx_vec_allowed;
>  	bool tx_vec_allowed;

Do we have vector paths other than the 2 AVX ones below? If not, why do we
need this flag?

>  	bool rx_use_avx2;
> +	bool tx_use_avx2;
>  	bool rx_use_avx512;
>  	bool tx_use_avx512;
>  
> diff --git a/drivers/common/idpf/idpf_common_rxtx.h b/drivers/common/idpf/idpf_common_rxtx.h
> index f50cf5ef46..e19e1878f3 100644
> --- a/drivers/common/idpf/idpf_common_rxtx.h
> +++ b/drivers/common/idpf/idpf_common_rxtx.h
> @@ -306,5 +306,9 @@ __rte_internal
>  uint16_t idpf_dp_singleq_recv_pkts_avx2(void *rx_queue,
>  					struct rte_mbuf **rx_pkts,
>  					uint16_t nb_pkts);
> +__rte_internal
> +uint16_t idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue,
> +					struct rte_mbuf **tx_pkts,
> +					uint16_t nb_pkts);
>  
>  #endif /* _IDPF_COMMON_RXTX_H_ */
> diff --git a/drivers/common/idpf/idpf_common_rxtx_avx2.c b/drivers/common/idpf/idpf_common_rxtx_avx2.c
> index a05b26c68a..a4bc8e2bef 100644
> --- a/drivers/common/idpf/idpf_common_rxtx_avx2.c
> +++ b/drivers/common/idpf/idpf_common_rxtx_avx2.c
> @@ -588,3 +588,228 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
>  {
>  	return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);
>  }
> +
> +static __rte_always_inline void
> +idpf_tx_backlog_entry(struct idpf_tx_entry *txep,
> +		     struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
> +{
> +	int i;
> +
> +	for (i = 0; i < (int)nb_pkts; ++i)
> +		txep[i].mbuf = tx_pkts[i];
> +}
> +
> +static __rte_always_inline int
> +idpf_singleq_tx_free_bufs_vec(struct idpf_tx_queue *txq)
> +{
> +	struct idpf_tx_entry *txep;
> +	uint32_t n;
> +	uint32_t i;
> +	int nb_free = 0;
> +	struct rte_mbuf *m, *free[txq->rs_thresh];
> +
> +	/* check DD bits on threshold descriptor */
> +	if ((txq->tx_ring[txq->next_dd].qw1 &
> +			rte_cpu_to_le_64(IDPF_TXD_QW1_DTYPE_M)) !=
> +			rte_cpu_to_le_64(IDPF_TX_DESC_DTYPE_DESC_DONE))
> +		return 0;
> +
> +	n = txq->rs_thresh;
> +
> +	 /* first buffer to free from S/W ring is at index
> +	  * next_dd - (rs_thresh-1)
> +	  */
> +	txep = &txq->sw_ring[txq->next_dd - (n - 1)];
> +	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
> +	if (likely(m)) {
> +		free[0] = m;
> +		nb_free = 1;
> +		for (i = 1; i < n; i++) {
> +			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
> +			if (likely(m)) {
> +				if (likely(m->pool == free[0]->pool)) {
> +					free[nb_free++] = m;
> +				} else {
> +					rte_mempool_put_bulk(free[0]->pool,
> +							     (void *)free,
> +							     nb_free);
> +					free[0] = m;
> +					nb_free = 1;
> +				}
> +			}
> +		}
> +		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
> +	} else {
> +		for (i = 1; i < n; i++) {
> +			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
> +			if (m)
> +				rte_mempool_put(m->pool, m);
> +		}
> +	}
> +
> +	/* buffers were freed, update counters */
> +	txq->nb_free = (uint16_t)(txq->nb_free + txq->rs_thresh);
> +	txq->next_dd = (uint16_t)(txq->next_dd + txq->rs_thresh);
> +	if (txq->next_dd >= txq->nb_tx_desc)
> +		txq->next_dd = (uint16_t)(txq->rs_thresh - 1);
> +
> +	return txq->rs_thresh;
> +}
> +

If/when patchset [1] is merged, this code should be reworked to use the
common functions.

[1] https://patches.dpdk.org/project/dpdk/list/?series=34398

> +static inline void
> +idpf_singleq_vtx1(volatile struct idpf_base_tx_desc *txdp,
> +		  struct rte_mbuf *pkt, uint64_t flags)
> +{
> +	uint64_t high_qw =
> +		(IDPF_TX_DESC_DTYPE_DATA |
> +		 ((uint64_t)flags  << IDPF_TXD_QW1_CMD_S) |
> +		 ((uint64_t)pkt->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S));
> +
> +	__m128i descriptor = _mm_set_epi64x(high_qw,
> +				pkt->buf_iova + pkt->data_off);
> +	_mm_store_si128((__m128i *)txdp, descriptor);
> +}
> +
> +static inline void
> +idpf_singleq_vtx(volatile struct idpf_base_tx_desc *txdp,
> +		 struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
> +{
> +	const uint64_t hi_qw_tmpl = (IDPF_TX_DESC_DTYPE_DATA |
> +			((uint64_t)flags  << IDPF_TXD_QW1_CMD_S));
> +
> +	/* if unaligned on 32-byte boundary, do one to align */
> +	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
> +		idpf_singleq_vtx1(txdp, *pkt, flags);
> +		nb_pkts--, txdp++, pkt++;
> +	}
> +
> +	/* do two at a time while possible, in bursts */
> +	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
> +		uint64_t hi_qw3 =
> +			hi_qw_tmpl |
> +			((uint64_t)pkt[3]->data_len <<
> +			 IDPF_TXD_QW1_TX_BUF_SZ_S);
> +		uint64_t hi_qw2 =
> +			hi_qw_tmpl |
> +			((uint64_t)pkt[2]->data_len <<
> +			 IDPF_TXD_QW1_TX_BUF_SZ_S);
> +		uint64_t hi_qw1 =
> +			hi_qw_tmpl |
> +			((uint64_t)pkt[1]->data_len <<
> +			 IDPF_TXD_QW1_TX_BUF_SZ_S);
> +		uint64_t hi_qw0 =
> +			hi_qw_tmpl |
> +			((uint64_t)pkt[0]->data_len <<
> +			 IDPF_TXD_QW1_TX_BUF_SZ_S);
> +
> +		__m256i desc2_3 =
> +			_mm256_set_epi64x
> +				(hi_qw3,
> +				 pkt[3]->buf_iova + pkt[3]->data_off,
> +				 hi_qw2,
> +				 pkt[2]->buf_iova + pkt[2]->data_off);
> +		__m256i desc0_1 =
> +			_mm256_set_epi64x
> +				(hi_qw1,
> +				 pkt[1]->buf_iova + pkt[1]->data_off,
> +				 hi_qw0,
> +				 pkt[0]->buf_iova + pkt[0]->data_off);
> +		_mm256_store_si256((void *)(txdp + 2), desc2_3);
> +		_mm256_store_si256((void *)txdp, desc0_1);
> +	}
> +
> +	/* do any last ones */
> +	while (nb_pkts) {
> +		idpf_singleq_vtx1(txdp, *pkt, flags);
> +		txdp++, pkt++, nb_pkts--;
> +	}
> +}
> +
> +static inline uint16_t
> +idpf_singleq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
> +				       uint16_t nb_pkts)
> +{
> +	struct idpf_tx_queue *txq = (struct idpf_tx_queue *)tx_queue;
> +	volatile struct idpf_base_tx_desc *txdp;
> +	struct idpf_tx_entry *txep;
> +	uint16_t n, nb_commit, tx_id;
> +	uint64_t flags = IDPF_TX_DESC_CMD_EOP;
> +	uint64_t rs = IDPF_TX_DESC_CMD_RS | flags;
> +
> +	/* cross rs_thresh boundary is not allowed */
> +	nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh);
> +
> +	if (txq->nb_free < txq->free_thresh)
> +		idpf_singleq_tx_free_bufs_vec(txq);
> +
> +	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts);
> +	if (unlikely(nb_pkts == 0))
> +		return 0;
> +
> +	tx_id = txq->tx_tail;
> +	txdp = &txq->tx_ring[tx_id];
> +	txep = &txq->sw_ring[tx_id];
> +
> +	txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts);
> +
> +	n = (uint16_t)(txq->nb_tx_desc - tx_id);
> +	if (nb_commit >= n) {
> +		idpf_tx_backlog_entry(txep, tx_pkts, n);
> +
> +		idpf_singleq_vtx(txdp, tx_pkts, n - 1, flags);
> +		tx_pkts += (n - 1);
> +		txdp += (n - 1);
> +
> +		idpf_singleq_vtx1(txdp, *tx_pkts++, rs);
> +
> +		nb_commit = (uint16_t)(nb_commit - n);
> +
> +		tx_id = 0;
> +		txq->next_rs = (uint16_t)(txq->rs_thresh - 1);
> +
> +		/* avoid reach the end of ring */
> +		txdp = &txq->tx_ring[tx_id];
> +		txep = &txq->sw_ring[tx_id];
> +	}
> +
> +	idpf_tx_backlog_entry(txep, tx_pkts, nb_commit);
> +
> +	idpf_singleq_vtx(txdp, tx_pkts, nb_commit, flags);
> +
> +	tx_id = (uint16_t)(tx_id + nb_commit);
> +	if (tx_id > txq->next_rs) {
> +		txq->tx_ring[txq->next_rs].qw1 |=
> +			rte_cpu_to_le_64(((uint64_t)IDPF_TX_DESC_CMD_RS) <<
> +					 IDPF_TXD_QW1_CMD_S);
> +		txq->next_rs =
> +			(uint16_t)(txq->next_rs + txq->rs_thresh);
> +	}
> +
> +	txq->tx_tail = tx_id;
> +
> +	IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
> +
> +	return nb_pkts;
> +}
> +
> +uint16_t
> +idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
> +			       uint16_t nb_pkts)
> +{
> +	uint16_t nb_tx = 0;
> +	struct idpf_tx_queue *txq = (struct idpf_tx_queue *)tx_queue;
> +
> +	while (nb_pkts) {
> +		uint16_t ret, num;
> +
> +		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
> +		ret = idpf_singleq_xmit_fixed_burst_vec_avx2(tx_queue, &tx_pkts[nb_tx],
> +						    num);
> +		nb_tx += ret;
> +		nb_pkts -= ret;
> +		if (ret < num)
> +			break;
> +	}
> +
> +	return nb_tx;
> +}
> diff --git a/drivers/common/idpf/version.map b/drivers/common/idpf/version.map
> index 4510aae6b3..eadcb9a2cf 100644
> --- a/drivers/common/idpf/version.map
> +++ b/drivers/common/idpf/version.map
> @@ -15,6 +15,7 @@ INTERNAL {
>  	idpf_dp_splitq_xmit_pkts;
>  	idpf_dp_splitq_xmit_pkts_avx512;
>  	idpf_dp_singleq_recv_pkts_avx2;
> +	idpf_dp_singleq_xmit_pkts_avx2;
>  
>  	idpf_qc_rx_thresh_check;
>  	idpf_qc_rx_queue_release;
> diff --git a/drivers/net/idpf/idpf_rxtx.c b/drivers/net/idpf/idpf_rxtx.c
> index 80c6c325e8..579293b2e8 100644
> --- a/drivers/net/idpf/idpf_rxtx.c
> +++ b/drivers/net/idpf/idpf_rxtx.c
> @@ -888,6 +888,12 @@ idpf_set_tx_function(struct rte_eth_dev *dev)
>  	if (idpf_tx_vec_dev_check_default(dev) == IDPF_VECTOR_PATH &&
>  	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
>  		vport->tx_vec_allowed = true;
> +
> +		if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
> +		     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&

As with the Rx path, only check the AVX2 flag here.

> +		    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
> +			vport->tx_use_avx2 = true;
> +
>  		if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
>  #ifdef CC_AVX512_SUPPORT
>  		{
> @@ -947,6 +953,14 @@ idpf_set_tx_function(struct rte_eth_dev *dev)
>  				return;
>  			}
>  #endif /* CC_AVX512_SUPPORT */
> +			if (vport->tx_use_avx2) {
> +				PMD_DRV_LOG(NOTICE,
> +					    "Using Single AVX2 Vector Tx (port %d).",
> +					    dev->data->port_id);
> +				dev->tx_pkt_burst = idpf_dp_singleq_xmit_pkts_avx2;
> +				dev->tx_pkt_prepare = idpf_dp_prep_pkts;
> +				return;
> +			}
>  		}
>  		PMD_DRV_LOG(NOTICE,
>  			    "Using Single Scalar Tx (port %d).",
> -- 
> 2.34.1
> 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 1/2] common/idpf: enable AVX2 for single queue Rx
  2023-12-07  6:35 [PATCH 0/2] enable AVX2 for IDPF single queue Wenzhuo Lu
@ 2023-12-07  6:35 ` Wenzhuo Lu
  0 siblings, 0 replies; 6+ messages in thread
From: Wenzhuo Lu @ 2023-12-07  6:35 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Some CPUs don't support AVX512. Enable AVX2 for them to
get better per-core performance.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 drivers/common/idpf/idpf_common_device.h    |   1 +
 drivers/common/idpf/idpf_common_rxtx.h      |   4 +
 drivers/common/idpf/idpf_common_rxtx_avx2.c | 609 ++++++++++++++++++++
 drivers/common/idpf/meson.build             |  16 +
 drivers/common/idpf/version.map             |   1 +
 drivers/net/idpf/idpf_rxtx.c                |  12 +
 6 files changed, 643 insertions(+)
 create mode 100644 drivers/common/idpf/idpf_common_rxtx_avx2.c

diff --git a/drivers/common/idpf/idpf_common_device.h b/drivers/common/idpf/idpf_common_device.h
index f767ea7cec..afe3d48798 100644
--- a/drivers/common/idpf/idpf_common_device.h
+++ b/drivers/common/idpf/idpf_common_device.h
@@ -114,6 +114,7 @@ struct idpf_vport {
 
 	bool rx_vec_allowed;
 	bool tx_vec_allowed;
+	bool rx_use_avx2;
 	bool rx_use_avx512;
 	bool tx_use_avx512;
 
diff --git a/drivers/common/idpf/idpf_common_rxtx.h b/drivers/common/idpf/idpf_common_rxtx.h
index b49b1ed737..4d64063718 100644
--- a/drivers/common/idpf/idpf_common_rxtx.h
+++ b/drivers/common/idpf/idpf_common_rxtx.h
@@ -302,5 +302,9 @@ uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pk
 __rte_internal
 uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 			  uint16_t nb_pkts);
+__rte_internal
+uint16_t idpf_dp_singleq_recv_pkts_avx2(void *rx_queue,
+					struct rte_mbuf **rx_pkts,
+					uint16_t nb_pkts);
 
 #endif /* _IDPF_COMMON_RXTX_H_ */
diff --git a/drivers/common/idpf/idpf_common_rxtx_avx2.c b/drivers/common/idpf/idpf_common_rxtx_avx2.c
new file mode 100644
index 0000000000..0403cf118f
--- /dev/null
+++ b/drivers/common/idpf/idpf_common_rxtx_avx2.c
@@ -0,0 +1,609 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <rte_vect.h>
+
+#include "idpf_common_rxtx.h"
+#include "idpf_common_device.h"
+
+#ifndef __INTEL_COMPILER
+#pragma GCC diagnostic ignored "-Wcast-qual"
+#endif
+
+static __rte_always_inline void
+idpf_singleq_rx_rearm(struct idpf_rx_queue *rxq)
+{
+	int i;
+	uint16_t rx_id;
+	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
+	struct rte_mbuf **rxep = &rxq->sw_ring[rxq->rxrearm_start];
+
+	rxdp += rxq->rxrearm_start;
+
+	/* Pull 'n' more MBUFs into the software ring */
+	if (rte_mempool_get_bulk(rxq->mp,
+				 (void *)rxep,
+				 IDPF_RXQ_REARM_THRESH) < 0) {
+		if (rxq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
+		    rxq->nb_rx_desc) {
+			__m128i dma_addr0;
+
+			dma_addr0 = _mm_setzero_si128();
+			for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+				rxep[i] = &rxq->fake_mbuf;
+				_mm_store_si128((__m128i *)&rxdp[i].read,
+						dma_addr0);
+			}
+		}
+		__atomic_fetch_add(&rxq->rx_stats.mbuf_alloc_failed,
+				   IDPF_RXQ_REARM_THRESH, __ATOMIC_RELAXED);
+
+		return;
+	}
+
+	struct rte_mbuf *mb0, *mb1;
+	__m128i dma_addr0, dma_addr1;
+	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
+			RTE_PKTMBUF_HEADROOM);
+	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
+	for (i = 0; i < IDPF_RXQ_REARM_THRESH; i += 2, rxep += 2) {
+		__m128i vaddr0, vaddr1;
+
+		mb0 = rxep[0];
+		mb1 = rxep[1];
+
+		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+				offsetof(struct rte_mbuf, buf_addr) + 8);
+		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+		/* convert pa to dma_addr hdr/data */
+		dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+		dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
+
+		/* add headroom to pa values */
+		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
+
+		/* flush desc with pa dma_addr */
+		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
+	}
+
+	rxq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
+		rxq->rxrearm_start = 0;
+
+	rxq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
+
+	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
+			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
+
+	/* Update the tail pointer on the NIC */
+	IDPF_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+}
+
+static inline __m256i
+idpf_flex_rxd_to_fdir_flags_vec_avx2(const __m256i fdir_id0_7)
+{
+#define FDID_MIS_MAGIC 0xFFFFFFFF
+	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1 << 2));
+	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1 << 13));
+	const __m256i pkt_fdir_bit = _mm256_set1_epi32(RTE_MBUF_F_RX_FDIR |
+			RTE_MBUF_F_RX_FDIR_ID);
+	/* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
+	const __m256i fdir_mis_mask = _mm256_set1_epi32(FDID_MIS_MAGIC);
+	__m256i fdir_mask = _mm256_cmpeq_epi32(fdir_id0_7,
+			fdir_mis_mask);
+	/* this XOR op results to bit-reverse the fdir_mask */
+	fdir_mask = _mm256_xor_si256(fdir_mask, fdir_mis_mask);
+	const __m256i fdir_flags = _mm256_and_si256(fdir_mask, pkt_fdir_bit);
+
+	return fdir_flags;
+}
+
+static inline uint16_t
+_idpf_singleq_recv_raw_pkts_vec_avx2(struct idpf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
+				     uint16_t nb_pkts, uint8_t *split_packet)
+{
+#define IDPF_DESCS_PER_LOOP_AVX 8
+
+	const uint32_t *ptype_tbl = rxq->adapter->ptype_tbl;
+	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
+			0, rxq->mbuf_initializer);
+	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
+	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
+	const int avx_aligned = ((rxq->rx_tail & 1) == 0);
+
+	rxdp += rxq->rx_tail;
+
+	rte_prefetch0(rxdp);
+
+	/* nb_pkts has to be floor-aligned to IDPF_DESCS_PER_LOOP_AVX */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IDPF_DESCS_PER_LOOP_AVX);
+
+	/* See if we need to rearm the RX queue - gives the prefetch a bit
+	 * of time to act
+	 */
+	if (rxq->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
+		idpf_singleq_rx_rearm(rxq);
+
+	/* Before we start moving massive data around, check to see if
+	 * there is actually a packet available
+	 */
+	if (!(rxdp->flex_nic_wb.status_error0 &
+			rte_cpu_to_le_32(1 << VIRTCHNL2_RX_FLEX_DESC_STATUS0_DD_S)))
+		return 0;
+
+	/* 8 packets DD mask, LSB in each 32-bit value */
+	const __m256i dd_check = _mm256_set1_epi32(1);
+
+	/* 8 packets EOP mask, second-LSB in each 32-bit value */
+	const __m256i eop_check = _mm256_slli_epi32(dd_check,
+			VIRTCHNL2_RX_FLEX_DESC_STATUS0_EOF_S);
+
+	/* mask to shuffle from desc. to mbuf (2 descriptors)*/
+	const __m256i shuf_msk =
+		_mm256_set_epi8
+			(/* first descriptor */
+			 0xFF, 0xFF,
+			 0xFF, 0xFF,	/* rss hash parsed separately */
+			 11, 10,	/* octet 10~11, 16 bits vlan_macip */
+			 5, 4,		/* octet 4~5, 16 bits data_len */
+			 0xFF, 0xFF,	/* skip hi 16 bits pkt_len, zero out */
+			 5, 4,		/* octet 4~5, 16 bits pkt_len */
+			 0xFF, 0xFF,	/* pkt_type set as unknown */
+			 0xFF, 0xFF,	/*pkt_type set as unknown */
+			 /* second descriptor */
+			 0xFF, 0xFF,
+			 0xFF, 0xFF,	/* rss hash parsed separately */
+			 11, 10,	/* octet 10~11, 16 bits vlan_macip */
+			 5, 4,		/* octet 4~5, 16 bits data_len */
+			 0xFF, 0xFF,	/* skip hi 16 bits pkt_len, zero out */
+			 5, 4,		/* octet 4~5, 16 bits pkt_len */
+			 0xFF, 0xFF,	/* pkt_type set as unknown */
+			 0xFF, 0xFF	/*pkt_type set as unknown */
+			);
+	/**
+	 * compile-time check the above crc and shuffle layout is correct.
+	 * NOTE: the first field (lowest address) is given last in set_epi
+	 * calls above.
+	 */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
+
+	/* Status/Error flag masks */
+	/**
+	 * mask everything except Checksum Reports, RSS indication
+	 * and VLAN indication.
+	 * bit6:4 for IP/L4 checksum errors.
+	 * bit12 is for RSS indication.
+	 * bit13 is for VLAN indication.
+	 */
+	const __m256i flags_mask =
+		 _mm256_set1_epi32((0xF << 4) | (1 << 12) | (1 << 13));
+	/**
+	 * data to be shuffled by the result of the flags mask shifted by 4
+	 * bits.  This gives us the l3_l4 flags.
+	 */
+	const __m256i l3_l4_flags_shuf =
+		_mm256_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
+		 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		  RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD  |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD  |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		/**
+		 * second 128-bits
+		 * shift right 20 bits to use the low two bits to indicate
+		 * outer checksum status
+		 * shift right 1 bit to make sure it not exceed 255
+		 */
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD  |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD  |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1);
+	const __m256i cksum_mask =
+		 _mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_MASK |
+				   RTE_MBUF_F_RX_L4_CKSUM_MASK |
+				   RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+				   RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK);
+	/**
+	 * data to be shuffled by result of flag mask, shifted down 12.
+	 * If RSS(bit12)/VLAN(bit13) are set,
+	 * shuffle moves appropriate flags in place.
+	 */
+	const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
+			0, 0, 0, 0,
+			0, 0, 0, 0,
+			RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
+			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
+			RTE_MBUF_F_RX_RSS_HASH, 0,
+			/* end up 128-bits */
+			0, 0, 0, 0,
+			0, 0, 0, 0,
+			0, 0, 0, 0,
+			RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
+			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
+			RTE_MBUF_F_RX_RSS_HASH, 0);
+
+	RTE_SET_USED(avx_aligned); /* for 32B descriptors we don't use this */
+
+	uint16_t i, received;
+
+	for (i = 0, received = 0; i < nb_pkts;
+	     i += IDPF_DESCS_PER_LOOP_AVX,
+	     rxdp += IDPF_DESCS_PER_LOOP_AVX) {
+		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
+		_mm256_storeu_si256((void *)&rx_pkts[i],
+				    _mm256_loadu_si256((void *)&sw_ring[i]));
+#ifdef RTE_ARCH_X86_64
+		_mm256_storeu_si256
+			((void *)&rx_pkts[i + 4],
+			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
+#endif
+
+		__m256i raw_desc0_1, raw_desc2_3, raw_desc4_5, raw_desc6_7;
+
+		const __m128i raw_desc7 =
+			_mm_load_si128((void *)(rxdp + 7));
+		rte_compiler_barrier();
+		const __m128i raw_desc6 =
+			_mm_load_si128((void *)(rxdp + 6));
+		rte_compiler_barrier();
+		const __m128i raw_desc5 =
+			_mm_load_si128((void *)(rxdp + 5));
+		rte_compiler_barrier();
+		const __m128i raw_desc4 =
+			_mm_load_si128((void *)(rxdp + 4));
+		rte_compiler_barrier();
+		const __m128i raw_desc3 =
+			_mm_load_si128((void *)(rxdp + 3));
+		rte_compiler_barrier();
+		const __m128i raw_desc2 =
+			_mm_load_si128((void *)(rxdp + 2));
+		rte_compiler_barrier();
+		const __m128i raw_desc1 =
+			_mm_load_si128((void *)(rxdp + 1));
+		rte_compiler_barrier();
+		const __m128i raw_desc0 =
+			_mm_load_si128((void *)(rxdp + 0));
+
+		raw_desc6_7 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc6),
+				 raw_desc7, 1);
+		raw_desc4_5 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc4),
+				 raw_desc5, 1);
+		raw_desc2_3 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc2),
+				 raw_desc3, 1);
+		raw_desc0_1 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc0),
+				 raw_desc1, 1);
+
+		if (split_packet) {
+			int j;
+
+			for (j = 0; j < IDPF_DESCS_PER_LOOP_AVX; j++)
+				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
+		}
+
+		/**
+		 * convert descriptors 4-7 into mbufs, re-arrange fields.
+		 * Then write into the mbuf.
+		 */
+		__m256i mb6_7 = _mm256_shuffle_epi8(raw_desc6_7, shuf_msk);
+		__m256i mb4_5 = _mm256_shuffle_epi8(raw_desc4_5, shuf_msk);
+
+		/**
+		 * to get packet types, ptype is located in bit16-25
+		 * of each 128bits
+		 */
+		const __m256i ptype_mask =
+			_mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M);
+		const __m256i ptypes6_7 =
+			_mm256_and_si256(raw_desc6_7, ptype_mask);
+		const __m256i ptypes4_5 =
+			_mm256_and_si256(raw_desc4_5, ptype_mask);
+		const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 9);
+		const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 1);
+		const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 9);
+		const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 1);
+
+		mb6_7 = _mm256_insert_epi32(mb6_7, ptype_tbl[ptype7], 4);
+		mb6_7 = _mm256_insert_epi32(mb6_7, ptype_tbl[ptype6], 0);
+		mb4_5 = _mm256_insert_epi32(mb4_5, ptype_tbl[ptype5], 4);
+		mb4_5 = _mm256_insert_epi32(mb4_5, ptype_tbl[ptype4], 0);
+		/* merge the status bits into one register */
+		const __m256i status4_7 = _mm256_unpackhi_epi32(raw_desc6_7,
+				raw_desc4_5);
+
+		/**
+		 * convert descriptors 0-3 into mbufs, re-arrange fields.
+		 * Then write into the mbuf.
+		 */
+		__m256i mb2_3 = _mm256_shuffle_epi8(raw_desc2_3, shuf_msk);
+		__m256i mb0_1 = _mm256_shuffle_epi8(raw_desc0_1, shuf_msk);
+
+		/**
+		 * to get packet types, ptype is located in bit16-25
+		 * of each 128bits
+		 */
+		const __m256i ptypes2_3 =
+			_mm256_and_si256(raw_desc2_3, ptype_mask);
+		const __m256i ptypes0_1 =
+			_mm256_and_si256(raw_desc0_1, ptype_mask);
+		const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 9);
+		const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 1);
+		const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 9);
+		const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 1);
+
+		mb2_3 = _mm256_insert_epi32(mb2_3, ptype_tbl[ptype3], 4);
+		mb2_3 = _mm256_insert_epi32(mb2_3, ptype_tbl[ptype2], 0);
+		mb0_1 = _mm256_insert_epi32(mb0_1, ptype_tbl[ptype1], 4);
+		mb0_1 = _mm256_insert_epi32(mb0_1, ptype_tbl[ptype0], 0);
+		/* merge the status bits into one register */
+		const __m256i status0_3 = _mm256_unpackhi_epi32(raw_desc2_3,
+								raw_desc0_1);
+
+		/**
+		 * take the two sets of status bits and merge to one
+		 * After merge, the packets status flags are in the
+		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
+		 */
+		__m256i status0_7 = _mm256_unpacklo_epi64(status4_7,
+							  status0_3);
+
+		/* now do flag manipulation */
+
+		/* get only flag/error bits we want */
+		const __m256i flag_bits =
+			_mm256_and_si256(status0_7, flags_mask);
+		/**
+		 * l3_l4_error flags, shuffle, then shift to correct adjustment
+		 * of flags in flags_shuf, and finally mask out extra bits
+		 */
+		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
+				_mm256_srli_epi32(flag_bits, 4));
+		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
+
+		__m256i l4_outer_mask = _mm256_set1_epi32(0x6);
+		__m256i l4_outer_flags =
+				_mm256_and_si256(l3_l4_flags, l4_outer_mask);
+		l4_outer_flags = _mm256_slli_epi32(l4_outer_flags, 20);
+
+		__m256i l3_l4_mask = _mm256_set1_epi32(~0x6);
+		l3_l4_flags = _mm256_and_si256(l3_l4_flags, l3_l4_mask);
+		l3_l4_flags = _mm256_or_si256(l3_l4_flags, l4_outer_flags);
+		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
+		/* set rss and vlan flags */
+		const __m256i rss_vlan_flag_bits =
+			_mm256_srli_epi32(flag_bits, 12);
+		const __m256i rss_vlan_flags =
+			_mm256_shuffle_epi8(rss_vlan_flags_shuf,
+					    rss_vlan_flag_bits);
+
+		/* merge flags */
+		__m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
+				rss_vlan_flags);
+
+		/**
+		 * At this point, we have the 8 sets of flags in the low 16-bits
+		 * of each 32-bit value in mbuf_flags.
+		 * We want to extract these, and merge them with the mbuf init
+		 * data so we can do a single write to the mbuf to set the flags
+		 * and all the other initialization fields. Extracting the
+		 * appropriate flags means that we have to do a shift and blend
+		 * for each mbuf before we do the write. However, we can also
+		 * add in the previously computed rx_descriptor fields to
+		 * make a single 256-bit write per mbuf
+		 */
+		/* check the structure matches expectations */
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
+				 offsetof(struct rte_mbuf, rearm_data) + 8);
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
+				 RTE_ALIGN(offsetof(struct rte_mbuf,
+						    rearm_data),
+					   16));
+		/* build up data and do writes */
+		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
+			rearm6, rearm7;
+		rearm6 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(mbuf_flags, 8),
+					    0x04);
+		rearm4 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(mbuf_flags, 4),
+					    0x04);
+		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
+		rearm0 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_srli_si256(mbuf_flags, 4),
+					    0x04);
+		/* permute to add in the rx_descriptor e.g. rss fields */
+		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
+		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
+		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
+		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
+		/* write to mbuf */
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
+				    rearm6);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
+				    rearm4);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
+				    rearm2);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
+				    rearm0);
+
+		/* repeat for the odd mbufs */
+		const __m256i odd_flags =
+			_mm256_castsi128_si256
+				(_mm256_extracti128_si256(mbuf_flags, 1));
+		rearm7 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(odd_flags, 8),
+					    0x04);
+		rearm5 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(odd_flags, 4),
+					    0x04);
+		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
+		rearm1 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_srli_si256(odd_flags, 4),
+					    0x04);
+		/* since odd mbufs are already in hi 128-bits use blend */
+		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
+		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
+		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
+		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
+		/* again write to mbufs */
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
+				    rearm7);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
+				    rearm5);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
+				    rearm3);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
+				    rearm1);
+
+		/* extract and record EOP bit */
+		if (split_packet) {
+			const __m128i eop_mask =
+				_mm_set1_epi16(1 << VIRTCHNL2_RX_FLEX_DESC_STATUS0_EOF_S);
+			const __m256i eop_bits256 = _mm256_and_si256(status0_7,
+								     eop_check);
+			/* pack status bits into a single 128-bit register */
+			const __m128i eop_bits =
+				_mm_packus_epi32
+					(_mm256_castsi256_si128(eop_bits256),
+					 _mm256_extractf128_si256(eop_bits256,
+								  1));
+			/**
+			 * flip bits, and mask out the EOP bit, which is now
+			 * a split-packet bit i.e. !EOP, rather than EOP one.
+			 */
+			__m128i split_bits = _mm_andnot_si128(eop_bits,
+					eop_mask);
+			/**
+			 * eop bits are out of order, so we need to shuffle them
+			 * back into order again. In doing so, only use low 8
+			 * bits, which acts like another pack instruction
+			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
+			 * [Since we use epi8, the 16-bit positions are
+			 * multiplied by 2 in the eop_shuffle value.]
+			 */
+			__m128i eop_shuffle =
+				_mm_set_epi8(/* zero hi 64b */
+					     0xFF, 0xFF, 0xFF, 0xFF,
+					     0xFF, 0xFF, 0xFF, 0xFF,
+					     /* move values to lo 64b */
+					     8, 0, 10, 2,
+					     12, 4, 14, 6);
+			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
+			*(uint64_t *)split_packet =
+				_mm_cvtsi128_si64(split_bits);
+			split_packet += IDPF_DESCS_PER_LOOP_AVX;
+		}
+
+		/* perform dd_check */
+		status0_7 = _mm256_and_si256(status0_7, dd_check);
+		status0_7 = _mm256_packs_epi32(status0_7,
+					       _mm256_setzero_si256());
+
+		uint64_t burst = __builtin_popcountll
+					(_mm_cvtsi128_si64
+						(_mm256_extracti128_si256
+							(status0_7, 1)));
+		burst += __builtin_popcountll
+				(_mm_cvtsi128_si64
+					(_mm256_castsi256_si128(status0_7)));
+		received += burst;
+		if (burst != IDPF_DESCS_PER_LOOP_AVX)
+			break;
+	}
+
+	/* update tail pointers */
+	rxq->rx_tail += received;
+	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
+	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep avx2 aligned */
+		rxq->rx_tail--;
+		received--;
+	}
+	rxq->rxrearm_nb += received;
+	return received;
+}
+
+/**
+ * AVX2 single-queue Rx burst; forwards to the raw vector routine with a NULL 4th arg.
+ * Notice: nb_pkts < IDPF_DESCS_PER_LOOP, just return no packet
+ */
+uint16_t
+idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
+			       uint16_t nb_pkts)
+{
+	return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);
+}
diff --git a/drivers/common/idpf/meson.build b/drivers/common/idpf/meson.build
index 80c8906f80..6ab1c8175d 100644
--- a/drivers/common/idpf/meson.build
+++ b/drivers/common/idpf/meson.build
@@ -16,6 +16,22 @@ sources = files(
 )
 
 if arch_subdir == 'x86'
+    # compile AVX2 version if either:
+    # a. we have AVX2 supported in minimum instruction set baseline
+    # b. it's not minimum instruction set, but supported by compiler
+    if cc.get_define('__AVX2__', args: machine_args) != ''
+        cflags += ['-DCC_AVX2_SUPPORT']
+        sources += files('idpf_common_rxtx_avx2.c')
+    elif cc.has_argument('-mavx2')
+        cflags += ['-DCC_AVX2_SUPPORT']
+        idpf_avx2_lib = static_library('idpf_avx2_lib',
+                'idpf_common_rxtx_avx2.c',
+                dependencies: [static_rte_ethdev, static_rte_kvargs, static_rte_hash],
+                include_directories: includes,
+                c_args: [cflags, '-mavx2'])
+        objs += idpf_avx2_lib.extract_objects('idpf_common_rxtx_avx2.c')
+    endif
+
     idpf_avx512_cpu_support = (
         cc.get_define('__AVX512F__', args: machine_args) != '' and
         cc.get_define('__AVX512BW__', args: machine_args) != '' and
diff --git a/drivers/common/idpf/version.map b/drivers/common/idpf/version.map
index 0729f6b912..4510aae6b3 100644
--- a/drivers/common/idpf/version.map
+++ b/drivers/common/idpf/version.map
@@ -14,6 +14,7 @@ INTERNAL {
 	idpf_dp_splitq_recv_pkts_avx512;
 	idpf_dp_splitq_xmit_pkts;
 	idpf_dp_splitq_xmit_pkts_avx512;
+	idpf_dp_singleq_recv_pkts_avx2;
 
 	idpf_qc_rx_thresh_check;
 	idpf_qc_rx_queue_release;
diff --git a/drivers/net/idpf/idpf_rxtx.c b/drivers/net/idpf/idpf_rxtx.c
index 64f2235580..b155c9ccd1 100644
--- a/drivers/net/idpf/idpf_rxtx.c
+++ b/drivers/net/idpf/idpf_rxtx.c
@@ -772,6 +772,11 @@ idpf_set_rx_function(struct rte_eth_dev *dev)
 	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
 		vport->rx_vec_allowed = true;
 
+		if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+		     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+		    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+			vport->rx_use_avx2 = true;
+
 		if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
 #ifdef CC_AVX512_SUPPORT
 			if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
@@ -823,6 +828,13 @@ idpf_set_rx_function(struct rte_eth_dev *dev)
 				return;
 			}
 #endif /* CC_AVX512_SUPPORT */
+			if (vport->rx_use_avx2) {
+				PMD_DRV_LOG(NOTICE,
+					    "Using Single AVX2 Vector Rx (port %d).",
+					    dev->data->port_id);
+				dev->rx_pkt_burst = idpf_dp_singleq_recv_pkts_avx2;
+				return;
+			}
 		}
 
 		if (dev->data->scattered_rx) {
-- 
2.25.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2025-01-20 14:23 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-01-08 12:17 [PATCH 0/2] enable AVX2 for IDPF single queue Shaiq Wani
2025-01-08 12:17 ` [PATCH 1/2] common/idpf: enable AVX2 for single queue Rx Shaiq Wani
2025-01-20 14:15   ` Bruce Richardson
2025-01-08 12:17 ` [PATCH 2/2] common/idpf: enable AVX2 for single queue Tx Shaiq Wani
2025-01-20 14:23   ` Bruce Richardson
  -- strict thread matches above, loose matches on Subject: below --
2023-12-07  6:35 [PATCH 0/2] enable AVX2 for IDPF single queue Wenzhuo Lu
2023-12-07  6:35 ` [PATCH 1/2] common/idpf: enable AVX2 for single queue Rx Wenzhuo Lu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).