From: Anatoly Burakov <anatoly.burakov@intel.com>
To: dev@dpdk.org, Bruce Richardson <bruce.richardson@intel.com>
Subject: [PATCH v1 11/13] net/intel: support wider x86 vectors for Rx rearm
Date: Tue,  6 May 2025 14:28:00 +0100
Message-ID: <79e02f0bf9ce4fbb4b94d27974077bc80e2fbe5d.1746538072.git.anatoly.burakov@intel.com>
In-Reply-To: <c92131e8fcce1901018450bdf97ae004253addf7.1746538072.git.anatoly.burakov@intel.com>

Currently, the 32-byte descriptor format is only supported by the SSE
code path. Add AVX2 and AVX512 implementations as well. Like the
existing code paths, the new implementations constant-propagate
everything at compile time (for example, with 32-byte descriptors an
AVX512 register covers two descriptors, so each loop iteration rearms
four), and thus should not affect performance of the existing code
paths. To improve readability and reduce the code duplication that
comes with supporting differently sized descriptors, the implementation
is also refactored; an illustrative sketch of the constant folding is
included in the notes below.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
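Notes:
    A sketch of the compile-time constant folding the rearm helpers rely
    on. This is a standalone illustration only, not driver code; all
    names below are hypothetical:

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	static inline __attribute__((always_inline)) uint8_t
	descs_per_iter(const size_t reg_width, const size_t desc_len)
	{
		/*
		 * Both arguments are literal constants at every call
		 * site, so the division folds to a constant and any
		 * branch taken on the result is resolved at compile
		 * time -- mirroring how desc_per_reg/desc_per_iter
		 * behave in the patch below.
		 */
		const uint8_t desc_per_reg = reg_width / desc_len;

		return desc_per_reg * 2;
	}

	int main(void)
	{
		/* 32-byte descriptors in a 32-byte (AVX2) register:
		 * 1 descriptor/register, 2 per loop iteration */
		printf("%u\n", (unsigned)descs_per_iter(32, 32));
		/* 16-byte descriptors in a 64-byte (AVX-512) register:
		 * 4 descriptors/register, 8 per loop iteration */
		printf("%u\n", (unsigned)descs_per_iter(64, 16));
		return 0;
	}
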
 drivers/net/intel/common/rx_vec_sse.h | 380 ++++++++++++++------------
 1 file changed, 205 insertions(+), 175 deletions(-)
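
Note on the 32-byte layout (per the comments in the diff): only the low
16 bytes of each descriptor carry the rearm payload (buffer address or
IOVA plus headroom); the rest of the descriptor is cleared. A register
spanning whole 32-byte descriptors must therefore interleave each
address with 16 bytes of zeroes, which is what the desc_per_iter == 2
(AVX2) and desc_per_iter == 4 (AVX512) branches accomplish by passing a
zeroed __m128i as every other argument to the _ci_rxq_rearm_desc_*()
helpers.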

diff --git a/drivers/net/intel/common/rx_vec_sse.h b/drivers/net/intel/common/rx_vec_sse.h
index 6fe0baf38b..0aeaac3dc9 100644
--- a/drivers/net/intel/common/rx_vec_sse.h
+++ b/drivers/net/intel/common/rx_vec_sse.h
@@ -48,223 +48,258 @@ _ci_rxq_rearm_get_bufs(struct ci_rx_queue *rxq, const size_t desc_len)
 	return 0;
 }
 
-/*
- * SSE code path can handle both 16-byte and 32-byte descriptors with one code
- * path, as we only ever write 16 bytes at a time.
- */
-static __rte_always_inline void
-_ci_rxq_rearm_sse(struct ci_rx_queue *rxq, const size_t desc_len)
+static __rte_always_inline __m128i
+_ci_rxq_rearm_desc_sse(const __m128i vaddr)
 {
 	const __m128i hdr_room = _mm_set1_epi64x(RTE_PKTMBUF_HEADROOM);
 	const __m128i zero = _mm_setzero_si128();
+	__m128i reg;
+
+	/* add headroom to address values */
+	reg = _mm_add_epi64(vaddr, hdr_room);
+
+#if RTE_IOVA_IN_MBUF
+	/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+			 offsetof(struct rte_mbuf, buf_addr) + 8);
+	/* move IOVA to Packet Buffer Address, erase Header Buffer Address */
+	reg = _mm_unpackhi_epi64(reg, zero);
+#else
+	/* erase Header Buffer Address */
+	reg = _mm_unpacklo_epi64(reg, zero);
+#endif
+	return reg;
+}
+
+static __rte_always_inline void
+_ci_rxq_rearm_sse(struct ci_rx_queue *rxq, const size_t desc_len)
+{
 	const uint16_t rearm_thresh = CI_VPMD_RX_REARM_THRESH;
 	struct ci_rx_entry *rxp = &rxq->sw_ring[rxq->rxrearm_start];
+	const uint8_t desc_per_reg = 1;
+	const uint8_t desc_per_iter = desc_per_reg * 2;
 	volatile void *rxdp;
 	int i;
 
 	rxdp = RTE_PTR_ADD(rxq->rx_ring, rxq->rxrearm_start * desc_len);
 
 	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
-	for (i = 0; i < rearm_thresh; i += 2, rxp += 2, rxdp = RTE_PTR_ADD(rxdp, 2 * desc_len)) {
+	for (i = 0; i < rearm_thresh;
+			i += desc_per_iter,
+			rxp += desc_per_iter,
+			rxdp = RTE_PTR_ADD(rxdp, desc_per_iter * desc_len)) {
 		volatile void *ptr0 = RTE_PTR_ADD(rxdp, 0);
-		volatile void *ptr1 = RTE_PTR_ADD(rxdp, desc_len);
-		__m128i vaddr0, vaddr1;
-		__m128i dma_addr0, dma_addr1;
-		struct rte_mbuf *mb0, *mb1;
+		volatile void *ptr1 = RTE_PTR_ADD(rxdp, desc_len * desc_per_reg);
+		const struct rte_mbuf *mb0 = rxp[0].mbuf;
+		const struct rte_mbuf *mb1 = rxp[1].mbuf;
 
-		mb0 = rxp[0].mbuf;
-		mb1 = rxp[1].mbuf;
+		const __m128i vaddr0 = _mm_loadu_si128((const __m128i *)&mb0->buf_addr);
+		const __m128i vaddr1 = _mm_loadu_si128((const __m128i *)&mb1->buf_addr);
 
-#if RTE_IOVA_IN_MBUF
-		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-				offsetof(struct rte_mbuf, buf_addr) + 8);
-#endif
-		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-
-		/* add headroom to address values */
-		vaddr0 = _mm_add_epi64(vaddr0, hdr_room);
-		vaddr1 = _mm_add_epi64(vaddr1, hdr_room);
-
-#if RTE_IOVA_IN_MBUF
-		/* move IOVA to Packet Buffer Address, erase Header Buffer Address */
-		dma_addr0 = _mm_unpackhi_epi64(vaddr0, zero);
-		dma_addr1 = _mm_unpackhi_epi64(vaddr1, zero);
-#else
-		/* erase Header Buffer Address */
-		dma_addr0 = _mm_unpacklo_epi64(vaddr0, zero);
-		dma_addr1 = _mm_unpacklo_epi64(vaddr1, zero);
-#endif
+		const __m128i reg0 = _ci_rxq_rearm_desc_sse(vaddr0);
+		const __m128i reg1 = _ci_rxq_rearm_desc_sse(vaddr1);
 
 		/* flush desc with pa dma_addr */
-		_mm_store_si128(RTE_CAST_PTR(__m128i *, ptr0), dma_addr0);
-		_mm_store_si128(RTE_CAST_PTR(__m128i *, ptr1), dma_addr1);
+		_mm_store_si128(RTE_CAST_PTR(__m128i *, ptr0), reg0);
+		_mm_store_si128(RTE_CAST_PTR(__m128i *, ptr1), reg1);
 	}
 }
 
 #ifdef __AVX2__
-/* AVX2 version for 16-byte descriptors, handles 4 buffers at a time */
-static __rte_always_inline void
-_ci_rxq_rearm_avx2(struct ci_rx_queue *rxq)
+static __rte_always_inline __m256i
+_ci_rxq_rearm_desc_avx2(const __m128i vaddr0, const __m128i vaddr1)
 {
-	struct ci_rx_entry *rxp = &rxq->sw_ring[rxq->rxrearm_start];
-	const uint16_t rearm_thresh = CI_VPMD_RX_REARM_THRESH;
-	const size_t desc_len = 16;
-	volatile void *rxdp;
 	const __m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);
 	const __m256i zero = _mm256_setzero_si256();
+	__m256i reg;
+
+	/* merge by casting 0 to 256-bit and inserting 1 into the high lanes */
+	reg =
+		_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
+					vaddr1, 1);
+
+	/* add headroom to address values */
+	reg = _mm256_add_epi64(reg, hdr_room);
+
+#if RTE_IOVA_IN_MBUF
+	/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+			offsetof(struct rte_mbuf, buf_addr) + 8);
+	/* extract IOVA addr into Packet Buffer Address, erase Header Buffer Address */
+	reg = _mm256_unpackhi_epi64(reg, zero);
+#else
+	/* erase Header Buffer Address */
+	reg = _mm256_unpacklo_epi64(reg, zero);
+#endif
+	return reg;
+}
+
+static __rte_always_inline void
+_ci_rxq_rearm_avx2(struct ci_rx_queue *rxq, const size_t desc_len)
+{
+	struct ci_rx_entry *rxp = &rxq->sw_ring[rxq->rxrearm_start];
+	const uint16_t rearm_thresh = CI_VPMD_RX_REARM_THRESH;
+	/* how many descriptors can fit into a register */
+	const uint8_t desc_per_reg = sizeof(__m256i) / desc_len;
+	/* how many descriptors can fit into one loop iteration */
+	const uint8_t desc_per_iter = desc_per_reg * 2;
+	volatile void *rxdp;
 	int i;
 
 	rxdp = RTE_PTR_ADD(rxq->rx_ring, rxq->rxrearm_start * desc_len);
 
-	/* Initialize the mbufs in vector, process 4 mbufs in one loop */
-	for (i = 0; i < rearm_thresh; i += 4, rxp += 4, rxdp = RTE_PTR_ADD(rxdp, 4 * desc_len)) {
+	/* Initialize the mbufs in vector, process 2 or 4 mbufs in one loop */
+	for (i = 0; i < rearm_thresh;
+			i += desc_per_iter,
+			rxp += desc_per_iter,
+			rxdp = RTE_PTR_ADD(rxdp, desc_per_iter * desc_len)) {
 		volatile void *ptr0 = RTE_PTR_ADD(rxdp, 0);
-		volatile void *ptr1 = RTE_PTR_ADD(rxdp, desc_len * 2);
-		__m128i vaddr0, vaddr1, vaddr2, vaddr3;
-		__m256i vaddr0_1, vaddr2_3;
-		__m256i dma_addr0_1, dma_addr2_3;
-		struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
+		volatile void *ptr1 = RTE_PTR_ADD(rxdp, desc_len * desc_per_reg);
+		__m256i reg0, reg1;
 
-		mb0 = rxp[0].mbuf;
-		mb1 = rxp[1].mbuf;
-		mb2 = rxp[2].mbuf;
-		mb3 = rxp[3].mbuf;
+		if (desc_per_iter == 2) {
+			/* 16 byte descriptor, 16 byte zero, times two */
+			const __m128i zero = _mm_setzero_si128();
+			const struct rte_mbuf *mb0 = rxp[0].mbuf;
+			const struct rte_mbuf *mb1 = rxp[1].mbuf;
 
-#if RTE_IOVA_IN_MBUF
-		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-				offsetof(struct rte_mbuf, buf_addr) + 8);
-#endif
-		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-		vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
-		vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
+			const __m128i vaddr0 = _mm_loadu_si128((const __m128i *)&mb0->buf_addr);
+			const __m128i vaddr1 = _mm_loadu_si128((const __m128i *)&mb1->buf_addr);
 
-		/**
-		 * merge 0 & 1, by casting 0 to 256-bit and inserting 1
-		 * into the high lanes. Similarly for 2 & 3
-		 */
-		vaddr0_1 =
-			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
-						vaddr1, 1);
-		vaddr2_3 =
-			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),
-						vaddr3, 1);
+			reg0 = _ci_rxq_rearm_desc_avx2(vaddr0, zero);
+			reg1 = _ci_rxq_rearm_desc_avx2(vaddr1, zero);
+		} else {
+			/* 16 byte descriptor times four */
+			const struct rte_mbuf *mb0 = rxp[0].mbuf;
+			const struct rte_mbuf *mb1 = rxp[1].mbuf;
+			const struct rte_mbuf *mb2 = rxp[2].mbuf;
+			const struct rte_mbuf *mb3 = rxp[3].mbuf;
 
-		/* add headroom to address values */
-		vaddr0_1 = _mm256_add_epi64(vaddr0_1, hdr_room);
-		vaddr0_1 = _mm256_add_epi64(vaddr0_1, hdr_room);
+			const __m128i vaddr0 = _mm_loadu_si128((const __m128i *)&mb0->buf_addr);
+			const __m128i vaddr1 = _mm_loadu_si128((const __m128i *)&mb1->buf_addr);
+			const __m128i vaddr2 = _mm_loadu_si128((const __m128i *)&mb2->buf_addr);
+			const __m128i vaddr3 = _mm_loadu_si128((const __m128i *)&mb3->buf_addr);
 
-#if RTE_IOVA_IN_MBUF
-		/* extract IOVA addr into Packet Buffer Address, erase Header Buffer Address */
-		dma_addr0_1 = _mm256_unpackhi_epi64(vaddr0_1, zero);
-		dma_addr2_3 = _mm256_unpackhi_epi64(vaddr2_3, zero);
-#else
-		/* erase Header Buffer Address */
-		dma_addr0_1 = _mm256_unpacklo_epi64(vaddr0_1, zero);
-		dma_addr2_3 = _mm256_unpacklo_epi64(vaddr2_3, zero);
-#endif
+			reg0 = _ci_rxq_rearm_desc_avx2(vaddr0, vaddr1);
+			reg1 = _ci_rxq_rearm_desc_avx2(vaddr2, vaddr3);
+		}
 
 		/* flush desc with pa dma_addr */
-		_mm256_store_si256(RTE_CAST_PTR(__m256i *, ptr0), dma_addr0_1);
-		_mm256_store_si256(RTE_CAST_PTR(__m256i *, ptr1), dma_addr2_3);
+		_mm256_store_si256(RTE_CAST_PTR(__m256i *, ptr0), reg0);
+		_mm256_store_si256(RTE_CAST_PTR(__m256i *, ptr1), reg1);
 	}
 }
 #endif /* __AVX2__ */
 
 #ifdef __AVX512VL__
-/* AVX512 version for 16-byte descriptors, handles 8 buffers at a time */
+static __rte_always_inline __m512i
+_ci_rxq_rearm_desc_avx512(const __m128i vaddr0, const __m128i vaddr1,
+		const __m128i vaddr2, const __m128i vaddr3)
+{
+	const __m512i zero = _mm512_setzero_si512();
+	const __m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
+	__m256i vaddr0_1, vaddr2_3;
+	__m512i reg;
+
+	/**
+	 * merge 0 & 1, by casting 0 to 256-bit and inserting 1 into the high
+	 * lanes. Similarly for 2 & 3.
+	 */
+	vaddr0_1 =
+		_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
+					vaddr1, 1);
+	vaddr2_3 =
+		_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),
+					vaddr3, 1);
+	/*
+	 * merge 0+1 & 2+3, by casting 0+1 to 512-bit and inserting 2+3 into the
+	 * high lanes.
+	 */
+	reg =
+		_mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1),
+					vaddr2_3, 1);
+
+	/* add headroom to address values */
+	reg = _mm512_add_epi64(reg, hdr_room);
+
+#if RTE_IOVA_IN_MBUF
+	/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+			offsetof(struct rte_mbuf, buf_addr) + 8);
+	/* extract IOVA addr into Packet Buffer Address, erase Header Buffer Address */
+	reg = _mm512_unpackhi_epi64(reg, zero);
+#else
+	/* erase Header Buffer Address */
+	reg = _mm512_unpacklo_epi64(reg, zero);
+#endif
+	return reg;
+}
+
 static __rte_always_inline void
-_ci_rxq_rearm_avx512(struct ci_rx_queue *rxq)
+_ci_rxq_rearm_avx512(struct ci_rx_queue *rxq, const size_t desc_len)
 {
 	struct ci_rx_entry *rxp = &rxq->sw_ring[rxq->rxrearm_start];
 	const uint16_t rearm_thresh = CI_VPMD_RX_REARM_THRESH;
-	const size_t desc_len = 16;
+	/* how many descriptors can fit into a register */
+	const uint8_t desc_per_reg = sizeof(__m512i) / desc_len;
+	/* how many descriptors can fit into one loop iteration */
+	const uint8_t desc_per_iter = desc_per_reg * 2;
 	volatile void *rxdp;
 	int i;
-	struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
-	struct rte_mbuf *mb4, *mb5, *mb6, *mb7;
-	__m512i dma_addr0_3, dma_addr4_7;
-	__m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
-	__m512i zero = _mm512_setzero_si512();
 
 	rxdp = RTE_PTR_ADD(rxq->rx_ring, rxq->rxrearm_start * desc_len);
 
-	/* Initialize the mbufs in vector, process 8 mbufs in one loop */
-	for (i = 0; i < rearm_thresh; i += 8, rxp += 8, rxdp = RTE_PTR_ADD(rxdp, 8 * desc_len)) {
+	/* Initialize the mbufs in vector, process 4 or 8 mbufs in one loop */
+	for (i = 0; i < rearm_thresh;
+			i += desc_per_iter,
+			rxp += desc_per_iter,
+			rxdp = RTE_PTR_ADD(rxdp, desc_per_iter * desc_len)) {
 		volatile void *ptr0 = RTE_PTR_ADD(rxdp, 0);
-		volatile void *ptr1 = RTE_PTR_ADD(rxdp, desc_len * 4);
-		__m128i vaddr0, vaddr1, vaddr2, vaddr3;
-		__m128i vaddr4, vaddr5, vaddr6, vaddr7;
-		__m256i vaddr0_1, vaddr2_3;
-		__m256i vaddr4_5, vaddr6_7;
-		__m512i vaddr0_3, vaddr4_7;
+		volatile void *ptr1 = RTE_PTR_ADD(rxdp, desc_len * desc_per_reg);
+		__m512i reg0, reg1;
 
-		mb0 = rxp[0].mbuf;
-		mb1 = rxp[1].mbuf;
-		mb2 = rxp[2].mbuf;
-		mb3 = rxp[3].mbuf;
-		mb4 = rxp[4].mbuf;
-		mb5 = rxp[5].mbuf;
-		mb6 = rxp[6].mbuf;
-		mb7 = rxp[7].mbuf;
+		if (desc_per_iter == 4) {
+			/* 16-byte descriptor, 16 byte zero, times four */
+			const __m128i zero = _mm_setzero_si128();
+			const struct rte_mbuf *mb0 = rxp[0].mbuf;
+			const struct rte_mbuf *mb1 = rxp[1].mbuf;
+			const struct rte_mbuf *mb2 = rxp[2].mbuf;
+			const struct rte_mbuf *mb3 = rxp[3].mbuf;
 
-#if RTE_IOVA_IN_MBUF
-		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-				offsetof(struct rte_mbuf, buf_addr) + 8);
-#endif
-		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-		vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
-		vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
-		vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr);
-		vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr);
-		vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr);
-		vaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr);
+			const __m128i vaddr0 = _mm_loadu_si128((const __m128i *)&mb0->buf_addr);
+			const __m128i vaddr1 = _mm_loadu_si128((const __m128i *)&mb1->buf_addr);
+			const __m128i vaddr2 = _mm_loadu_si128((const __m128i *)&mb2->buf_addr);
+			const __m128i vaddr3 = _mm_loadu_si128((const __m128i *)&mb3->buf_addr);
 
-		/**
-		 * merge 0 & 1, by casting 0 to 256-bit and inserting 1
-		 * into the high lanes. Similarly for 2 & 3, and so on.
-		 */
-		vaddr0_1 =
-			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
-						vaddr1, 1);
-		vaddr2_3 =
-			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),
-						vaddr3, 1);
-		vaddr4_5 =
-			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr4),
-						vaddr5, 1);
-		vaddr6_7 =
-			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr6),
-						vaddr7, 1);
-		vaddr0_3 =
-			_mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1),
-						vaddr2_3, 1);
-		vaddr4_7 =
-			_mm512_inserti64x4(_mm512_castsi256_si512(vaddr4_5),
-						vaddr6_7, 1);
+			reg0 = _ci_rxq_rearm_desc_avx512(vaddr0, zero, vaddr1, zero);
+			reg1 = _ci_rxq_rearm_desc_avx512(vaddr2, zero, vaddr3, zero);
+		} else {
+			/* 16-byte descriptor times eight */
+			const struct rte_mbuf *mb0 = rxp[0].mbuf;
+			const struct rte_mbuf *mb1 = rxp[1].mbuf;
+			const struct rte_mbuf *mb2 = rxp[2].mbuf;
+			const struct rte_mbuf *mb3 = rxp[3].mbuf;
+			const struct rte_mbuf *mb4 = rxp[4].mbuf;
+			const struct rte_mbuf *mb5 = rxp[5].mbuf;
+			const struct rte_mbuf *mb6 = rxp[6].mbuf;
+			const struct rte_mbuf *mb7 = rxp[7].mbuf;
 
-		/* add headroom to address values */
-		vaddr0_3 = _mm512_add_epi64(vaddr0_3, hdr_room);
-		dma_addr4_7 = _mm512_add_epi64(dma_addr4_7, hdr_room);
+			const __m128i vaddr0 = _mm_loadu_si128((const __m128i *)&mb0->buf_addr);
+			const __m128i vaddr1 = _mm_loadu_si128((const __m128i *)&mb1->buf_addr);
+			const __m128i vaddr2 = _mm_loadu_si128((const __m128i *)&mb2->buf_addr);
+			const __m128i vaddr3 = _mm_loadu_si128((const __m128i *)&mb3->buf_addr);
+			const __m128i vaddr4 = _mm_loadu_si128((const __m128i *)&mb4->buf_addr);
+			const __m128i vaddr5 = _mm_loadu_si128((const __m128i *)&mb5->buf_addr);
+			const __m128i vaddr6 = _mm_loadu_si128((const __m128i *)&mb6->buf_addr);
+			const __m128i vaddr7 = _mm_loadu_si128((const __m128i *)&mb7->buf_addr);
 
-#if RTE_IOVA_IN_MBUF
-		/* extract IOVA addr into Packet Buffer Address, erase Header Buffer Address */
-		dma_addr0_3 = _mm512_unpackhi_epi64(vaddr0_3, zero);
-		dma_addr4_7 = _mm512_unpackhi_epi64(vaddr4_7, zero);
-#else
-		/* erase Header Buffer Address */
-		dma_addr0_3 = _mm512_unpacklo_epi64(vaddr0_3, zero);
-		dma_addr4_7 = _mm512_unpacklo_epi64(vaddr4_7, zero);
-#endif
+			reg0 = _ci_rxq_rearm_desc_avx512(vaddr0, vaddr1, vaddr2, vaddr3);
+			reg1 = _ci_rxq_rearm_desc_avx512(vaddr4, vaddr5, vaddr6, vaddr7);
+		}
 
 		/* flush desc with pa dma_addr */
-		_mm512_store_si512(RTE_CAST_PTR(__m512i *, ptr0), dma_addr0_3);
-		_mm512_store_si512(RTE_CAST_PTR(__m512i *, ptr1), dma_addr4_7);
+		_mm512_store_si512(RTE_CAST_PTR(__m512i *, ptr0), reg0);
+		_mm512_store_si512(RTE_CAST_PTR(__m512i *, ptr1), reg1);
 	}
 }
 #endif /* __AVX512VL__ */
@@ -280,31 +315,26 @@ ci_rxq_rearm(struct ci_rx_queue *rxq, const size_t desc_len,
 	if (_ci_rxq_rearm_get_bufs(rxq, desc_len) < 0)
 		return;
 
-	if (desc_len == 16) {
-		switch (vec_level) {
-		case CI_RX_VEC_LEVEL_AVX512:
+	switch (vec_level) {
+	case CI_RX_VEC_LEVEL_AVX512:
 #ifdef __AVX512VL__
-			_ci_rxq_rearm_avx512(rxq);
-			break;
+		_ci_rxq_rearm_avx512(rxq, desc_len);
+		break;
 #else
-			/* fall back to AVX2 unless requested not to */
-			/* fall through */
+		/* fall back to AVX2 unless requested not to */
+		/* fall through */
 #endif
-		case CI_RX_VEC_LEVEL_AVX2:
+	case CI_RX_VEC_LEVEL_AVX2:
 #ifdef __AVX2__
-			_ci_rxq_rearm_avx2(rxq);
+			_ci_rxq_rearm_avx2(rxq, desc_len);
 			break;
 #else
 			/* fall back to SSE if AVX2 isn't supported */
 			/* fall through */
 #endif
-		case CI_RX_VEC_LEVEL_SSE:
-			_ci_rxq_rearm_sse(rxq, desc_len);
-			break;
-		}
-	} else {
-		/* for 32-byte descriptors only support SSE */
+	case CI_RX_VEC_LEVEL_SSE:
 		_ci_rxq_rearm_sse(rxq, desc_len);
+		break;
 	}
 
 	rxq->rxrearm_start += rearm_thresh;
-- 
2.47.1



Thread overview: 13+ messages
2025-05-06 13:27 [PATCH v1 01/13] net/ixgbe: remove unused field in Rx queue struct Anatoly Burakov
2025-05-06 13:27 ` [PATCH v1 02/13] net/iavf: make IPsec stats dynamically allocated Anatoly Burakov
2025-05-06 13:27 ` [PATCH v1 03/13] net/ixgbe: create common Rx queue structure Anatoly Burakov
2025-05-06 13:27 ` [PATCH v1 04/13] net/i40e: use the " Anatoly Burakov
2025-05-06 13:27 ` [PATCH v1 05/13] net/ice: " Anatoly Burakov
2025-05-06 13:27 ` [PATCH v1 06/13] net/iavf: " Anatoly Burakov
2025-05-06 13:27 ` [PATCH v1 07/13] net/intel: generalize vectorized Rx rearm Anatoly Burakov
2025-05-06 13:27 ` [PATCH v1 08/13] net/i40e: use common Rx rearm code Anatoly Burakov
2025-05-06 13:27 ` [PATCH v1 09/13] net/iavf: " Anatoly Burakov
2025-05-06 13:27 ` [PATCH v1 10/13] net/ixgbe: " Anatoly Burakov
2025-05-06 13:28 ` Anatoly Burakov [this message]
2025-05-06 13:28 ` [PATCH v1 12/13] net/intel: add common Rx mbuf recycle Anatoly Burakov
2025-05-06 13:28 ` [PATCH v1 13/13] net/intel: add common Tx " Anatoly Burakov
