From: Anatoly Burakov <anatoly.burakov@intel.com>
To: dev@dpdk.org, Bruce Richardson <bruce.richardson@intel.com>,
Ian Stokes <ian.stokes@intel.com>
Subject: [PATCH v1 08/13] net/i40e: use common Rx rearm code
Date: Tue, 6 May 2025 14:27:57 +0100 [thread overview]
Message-ID: <abc1333a7b612361e8e1ec77a5205af5dc85b53a.1746538072.git.anatoly.burakov@intel.com> (raw)
In-Reply-To: <c92131e8fcce1901018450bdf97ae004253addf7.1746538072.git.anatoly.burakov@intel.com>
The i40e driver has an implementation of vectorized mbuf rearm code that
is identical to the one in the common code, so just use that.
In addition, the i40e driver has an implementation of Rx queue rearm for the Neon
instruction set, so create a common header for Neon implementations too,
and use that in i40e Neon code.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
drivers/net/intel/common/rx_vec_neon.h | 131 +++++++++++
drivers/net/intel/i40e/i40e_rxtx.h | 2 +-
drivers/net/intel/i40e/i40e_rxtx_common_avx.h | 215 ------------------
drivers/net/intel/i40e/i40e_rxtx_vec_avx2.c | 5 +-
drivers/net/intel/i40e/i40e_rxtx_vec_avx512.c | 5 +-
drivers/net/intel/i40e/i40e_rxtx_vec_neon.c | 59 +----
drivers/net/intel/i40e/i40e_rxtx_vec_sse.c | 70 +-----
7 files changed, 144 insertions(+), 343 deletions(-)
create mode 100644 drivers/net/intel/common/rx_vec_neon.h
delete mode 100644 drivers/net/intel/i40e/i40e_rxtx_common_avx.h
diff --git a/drivers/net/intel/common/rx_vec_neon.h b/drivers/net/intel/common/rx_vec_neon.h
new file mode 100644
index 0000000000..35379ab563
--- /dev/null
+++ b/drivers/net/intel/common/rx_vec_neon.h
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2025 Intel Corporation
+ */
+
+#ifndef _COMMON_INTEL_RX_VEC_NEON_H_
+#define _COMMON_INTEL_RX_VEC_NEON_H_
+
+#include <stdint.h>
+
+#include <ethdev_driver.h>
+#include <rte_io.h>
+#include <rte_vect.h>
+
+#include "rx.h"
+
+static inline int
+_ci_rxq_rearm_get_bufs(struct ci_rx_queue *rxq, const size_t desc_len)
+{
+ struct ci_rx_entry *rxp = &rxq->sw_ring[rxq->rxrearm_start];
+ const uint16_t rearm_thresh = CI_VPMD_RX_REARM_THRESH;
+ volatile void *rxdp;
+ int i;
+
+ rxdp = RTE_PTR_ADD(rxq->rx_ring, rxq->rxrearm_start * desc_len);
+
+ if (rte_mempool_get_bulk(rxq->mp,
+ (void **)rxp,
+ rearm_thresh) < 0) {
+ if (rxq->rxrearm_nb + rearm_thresh >= rxq->nb_rx_desc) {
+ uint64x2_t zero = vdupq_n_u64(0);
+
+ for (i = 0; i < CI_VPMD_DESCS_PER_LOOP; i++) {
+ rxp[i].mbuf = &rxq->fake_mbuf;
+ const void *ptr = RTE_PTR_ADD(rxdp, i * desc_len);
+ vst1q_u64(RTE_CAST_PTR(uint64_t *, ptr), zero);
+ }
+ }
+ rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += rearm_thresh;
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * NEON code path can handle both 16-byte and 32-byte descriptors with one code
+ * path, as we only ever write 16 bytes at a time.
+ */
+static __rte_always_inline void
+_ci_rxq_rearm_neon(struct ci_rx_queue *rxq, const size_t desc_len)
+{
+ const uint64x2_t zero = vdupq_n_u64(0);
+ const uint16_t rearm_thresh = CI_VPMD_RX_REARM_THRESH;
+ struct ci_rx_entry *rxp = &rxq->sw_ring[rxq->rxrearm_start];
+ volatile void *rxdp;
+ int i;
+
+ const uint8x8_t mbuf_init = vld1_u8((uint8_t *)&rxq->mbuf_initializer);
+
+ rxdp = RTE_PTR_ADD(rxq->rx_ring, rxq->rxrearm_start * desc_len);
+
+ /* Initialize the mbufs in vector, process 2 mbufs in one loop */
+ for (i = 0; i < rearm_thresh; i += 2, rxp += 2, rxdp = RTE_PTR_ADD(rxdp, 2 * desc_len)) {
+ volatile void *ptr0 = RTE_PTR_ADD(rxdp, 0);
+ volatile void *ptr1 = RTE_PTR_ADD(rxdp, desc_len);
+ uint64_t addr0, addr1;
+ uint64x2_t dma_addr0, dma_addr1;
+ struct rte_mbuf *mb0, *mb1;
+
+ mb0 = rxp[0].mbuf;
+ mb1 = rxp[1].mbuf;
+
+#if RTE_IOVA_IN_MBUF
+ /*
+ * Flush mbuf with pkt template.
+ * Data to be rearmed is 6 bytes long.
+ */
+ vst1_u8((uint8_t *)&mb0->rearm_data, mbuf_init);
+ addr0 = mb0->buf_iova + RTE_PKTMBUF_HEADROOM;
+ dma_addr0 = vsetq_lane_u64(addr0, zero, 0);
+ /* flush desc with pa dma_addr */
+ vst1q_u64(RTE_CAST_PTR(volatile uint64_t *, ptr0), dma_addr0);
+
+ vst1_u8((uint8_t *)&mb1->rearm_data, mbuf_init);
+ addr1 = mb1->buf_iova + RTE_PKTMBUF_HEADROOM;
+ dma_addr1 = vsetq_lane_u64(addr1, zero, 0);
+ vst1q_u64(RTE_CAST_PTR(volatile uint64_t *, ptr1), dma_addr1);
+#else
+ /*
+ * Flush mbuf with pkt template.
+ * Data to be rearmed is 6 bytes long.
+ */
+ vst1_u8((uint8_t *)&mb0->rearm_data, mbuf_init);
+ addr0 = (uintptr_t)RTE_PTR_ADD(mb0->buf_addr, RTE_PKTMBUF_HEADROOM);
+ dma_addr0 = vsetq_lane_u64(addr0, zero, 0);
+ /* flush desc with pa dma_addr */
+ vst1q_u64(RTE_CAST_PTR(volatile uint64_t *, ptr0), dma_addr0);
+
+ vst1_u8((uint8_t *)&mb1->rearm_data, mbuf_init);
+ addr1 = (uintptr_t)RTE_PTR_ADD(mb1->buf_addr, RTE_PKTMBUF_HEADROOM);
+ dma_addr1 = vsetq_lane_u64(addr1, zero, 0);
+ vst1q_u64(RTE_CAST_PTR(volatile uint64_t *, ptr1), dma_addr1);
+#endif
+ }
+}
+
+static __rte_always_inline void
+ci_rxq_rearm(struct ci_rx_queue *rxq, const size_t desc_len)
+{
+ const uint16_t rearm_thresh = CI_VPMD_RX_REARM_THRESH;
+ uint16_t rx_id;
+
+ /* Pull 'n' more MBUFs into the software ring */
+ if (_ci_rxq_rearm_get_bufs(rxq, desc_len) < 0)
+ return;
+
+ _ci_rxq_rearm_neon(rxq, desc_len);
+
+ rxq->rxrearm_start += rearm_thresh;
+ if (rxq->rxrearm_start >= rxq->nb_rx_desc)
+ rxq->rxrearm_start = 0;
+
+ rxq->rxrearm_nb -= rearm_thresh;
+
+ rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
+ (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
+
+ /* Update the tail pointer on the NIC */
+ rte_write32_wc(rte_cpu_to_le_32(rx_id), rxq->qrx_tail);
+}
+
+#endif /* _COMMON_INTEL_RX_VEC_NEON_H_ */
diff --git a/drivers/net/intel/i40e/i40e_rxtx.h b/drivers/net/intel/i40e/i40e_rxtx.h
index 4b5a84d8ef..8a41db2df3 100644
--- a/drivers/net/intel/i40e/i40e_rxtx.h
+++ b/drivers/net/intel/i40e/i40e_rxtx.h
@@ -13,7 +13,7 @@
#define RTE_I40E_VPMD_RX_BURST 32
#define RTE_I40E_VPMD_TX_BURST 32
-#define RTE_I40E_RXQ_REARM_THRESH 32
+#define RTE_I40E_RXQ_REARM_THRESH CI_VPMD_RX_REARM_THRESH
#define RTE_I40E_MAX_RX_BURST RTE_I40E_RXQ_REARM_THRESH
#define RTE_I40E_TX_MAX_FREE_BUF_SZ 64
#define RTE_I40E_DESCS_PER_LOOP 4
diff --git a/drivers/net/intel/i40e/i40e_rxtx_common_avx.h b/drivers/net/intel/i40e/i40e_rxtx_common_avx.h
deleted file mode 100644
index fd9447014b..0000000000
--- a/drivers/net/intel/i40e/i40e_rxtx_common_avx.h
+++ /dev/null
@@ -1,215 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2015 Intel Corporation
- */
-
-#ifndef _I40E_RXTX_COMMON_AVX_H_
-#define _I40E_RXTX_COMMON_AVX_H_
-#include <stdint.h>
-#include <ethdev_driver.h>
-#include <rte_malloc.h>
-
-#include "i40e_ethdev.h"
-#include "i40e_rxtx.h"
-
-#ifdef __AVX2__
-static __rte_always_inline void
-i40e_rxq_rearm_common(struct ci_rx_queue *rxq, __rte_unused bool avx512)
-{
- int i;
- uint16_t rx_id;
- volatile union i40e_rx_desc *rxdp;
- struct ci_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
-
- rxdp = I40E_RX_RING_PTR(rxq, rxq->rxrearm_start);
-
- /* Pull 'n' more MBUFs into the software ring */
- if (rte_mempool_get_bulk(rxq->mp,
- (void *)rxep,
- RTE_I40E_RXQ_REARM_THRESH) < 0) {
- if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
- rxq->nb_rx_desc) {
- __m128i dma_addr0;
- dma_addr0 = _mm_setzero_si128();
- for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
- rxep[i].mbuf = &rxq->fake_mbuf;
- _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i].read),
- dma_addr0);
- }
- }
- rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
- RTE_I40E_RXQ_REARM_THRESH;
- return;
- }
-
-#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
- struct rte_mbuf *mb0, *mb1;
- __m128i dma_addr0, dma_addr1;
- __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
- RTE_PKTMBUF_HEADROOM);
- /* Initialize the mbufs in vector, process 2 mbufs in one loop */
- for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
- __m128i vaddr0, vaddr1;
-
- mb0 = rxep[0].mbuf;
- mb1 = rxep[1].mbuf;
-
- /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
- RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
- offsetof(struct rte_mbuf, buf_addr) + 8);
- vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
- vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-
- /* convert pa to dma_addr hdr/data */
- dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
- dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
-
- /* add headroom to pa values */
- dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
- dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
-
- /* flush desc with pa dma_addr */
- _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp++->read), dma_addr0);
- _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp++->read), dma_addr1);
- }
-#else
-#ifdef __AVX512VL__
- if (avx512) {
- struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
- struct rte_mbuf *mb4, *mb5, *mb6, *mb7;
- __m512i dma_addr0_3, dma_addr4_7;
- __m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
- /* Initialize the mbufs in vector, process 8 mbufs in one loop */
- for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH;
- i += 8, rxep += 8, rxdp += 8) {
- __m128i vaddr0, vaddr1, vaddr2, vaddr3;
- __m128i vaddr4, vaddr5, vaddr6, vaddr7;
- __m256i vaddr0_1, vaddr2_3;
- __m256i vaddr4_5, vaddr6_7;
- __m512i vaddr0_3, vaddr4_7;
-
- mb0 = rxep[0].mbuf;
- mb1 = rxep[1].mbuf;
- mb2 = rxep[2].mbuf;
- mb3 = rxep[3].mbuf;
- mb4 = rxep[4].mbuf;
- mb5 = rxep[5].mbuf;
- mb6 = rxep[6].mbuf;
- mb7 = rxep[7].mbuf;
-
- /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
- RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
- offsetof(struct rte_mbuf, buf_addr) + 8);
- vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
- vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
- vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
- vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
- vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr);
- vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr);
- vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr);
- vaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr);
-
- /**
- * merge 0 & 1, by casting 0 to 256-bit and inserting 1
- * into the high lanes. Similarly for 2 & 3, and so on.
- */
- vaddr0_1 =
- _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
- vaddr1, 1);
- vaddr2_3 =
- _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),
- vaddr3, 1);
- vaddr4_5 =
- _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr4),
- vaddr5, 1);
- vaddr6_7 =
- _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr6),
- vaddr7, 1);
- vaddr0_3 =
- _mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1),
- vaddr2_3, 1);
- vaddr4_7 =
- _mm512_inserti64x4(_mm512_castsi256_si512(vaddr4_5),
- vaddr6_7, 1);
-
- /* convert pa to dma_addr hdr/data */
- dma_addr0_3 = _mm512_unpackhi_epi64(vaddr0_3, vaddr0_3);
- dma_addr4_7 = _mm512_unpackhi_epi64(vaddr4_7, vaddr4_7);
-
- /* add headroom to pa values */
- dma_addr0_3 = _mm512_add_epi64(dma_addr0_3, hdr_room);
- dma_addr4_7 = _mm512_add_epi64(dma_addr4_7, hdr_room);
-
- /* flush desc with pa dma_addr */
- _mm512_store_si512(RTE_CAST_PTR(__m512i *,
- &rxdp->read), dma_addr0_3);
- _mm512_store_si512(RTE_CAST_PTR(__m512i *,
- &(rxdp + 4)->read), dma_addr4_7);
- }
- } else
-#endif /* __AVX512VL__*/
- {
- struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
- __m256i dma_addr0_1, dma_addr2_3;
- __m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);
- /* Initialize the mbufs in vector, process 4 mbufs in one loop */
- for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH;
- i += 4, rxep += 4, rxdp += 4) {
- __m128i vaddr0, vaddr1, vaddr2, vaddr3;
- __m256i vaddr0_1, vaddr2_3;
-
- mb0 = rxep[0].mbuf;
- mb1 = rxep[1].mbuf;
- mb2 = rxep[2].mbuf;
- mb3 = rxep[3].mbuf;
-
- /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
- RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
- offsetof(struct rte_mbuf, buf_addr) + 8);
- vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
- vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
- vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
- vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
-
- /**
- * merge 0 & 1, by casting 0 to 256-bit and inserting 1
- * into the high lanes. Similarly for 2 & 3
- */
- vaddr0_1 = _mm256_inserti128_si256
- (_mm256_castsi128_si256(vaddr0), vaddr1, 1);
- vaddr2_3 = _mm256_inserti128_si256
- (_mm256_castsi128_si256(vaddr2), vaddr3, 1);
-
- /* convert pa to dma_addr hdr/data */
- dma_addr0_1 = _mm256_unpackhi_epi64(vaddr0_1, vaddr0_1);
- dma_addr2_3 = _mm256_unpackhi_epi64(vaddr2_3, vaddr2_3);
-
- /* add headroom to pa values */
- dma_addr0_1 = _mm256_add_epi64(dma_addr0_1, hdr_room);
- dma_addr2_3 = _mm256_add_epi64(dma_addr2_3, hdr_room);
-
- /* flush desc with pa dma_addr */
- _mm256_store_si256(RTE_CAST_PTR(__m256i *,
- &rxdp->read), dma_addr0_1);
- _mm256_store_si256(RTE_CAST_PTR(__m256i *,
- &(rxdp + 2)->read), dma_addr2_3);
- }
- }
-
-#endif
-
- rxq->rxrearm_start += RTE_I40E_RXQ_REARM_THRESH;
- rx_id = rxq->rxrearm_start - 1;
-
- if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
- rxq->rxrearm_start = 0;
- rx_id = rxq->nb_rx_desc - 1;
- }
-
- rxq->rxrearm_nb -= RTE_I40E_RXQ_REARM_THRESH;
-
- /* Update the tail pointer on the NIC */
- I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
-}
-#endif /* __AVX2__*/
-
-#endif /*_I40E_RXTX_COMMON_AVX_H_*/
diff --git a/drivers/net/intel/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/intel/i40e/i40e_rxtx_vec_avx2.c
index 0f3f7430aa..260b7d700a 100644
--- a/drivers/net/intel/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/intel/i40e/i40e_rxtx_vec_avx2.c
@@ -11,14 +11,15 @@
#include "i40e_ethdev.h"
#include "i40e_rxtx.h"
#include "i40e_rxtx_vec_common.h"
-#include "i40e_rxtx_common_avx.h"
+
+#include "../common/rx_vec_sse.h"
#include <rte_vect.h>
static __rte_always_inline void
i40e_rxq_rearm(struct ci_rx_queue *rxq)
{
- i40e_rxq_rearm_common(rxq, false);
+ ci_rxq_rearm(rxq, sizeof(union i40e_rx_desc), CI_RX_VEC_LEVEL_AVX2);
}
#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
diff --git a/drivers/net/intel/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/intel/i40e/i40e_rxtx_vec_avx512.c
index f2292b45e8..be004e9f4f 100644
--- a/drivers/net/intel/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/intel/i40e/i40e_rxtx_vec_avx512.c
@@ -11,7 +11,8 @@
#include "i40e_ethdev.h"
#include "i40e_rxtx.h"
#include "i40e_rxtx_vec_common.h"
-#include "i40e_rxtx_common_avx.h"
+
+#include "../common/rx_vec_sse.h"
#include <rte_vect.h>
@@ -20,7 +21,7 @@
static __rte_always_inline void
i40e_rxq_rearm(struct ci_rx_queue *rxq)
{
- i40e_rxq_rearm_common(rxq, true);
+ ci_rxq_rearm(rxq, sizeof(union i40e_rx_desc), CI_RX_VEC_LEVEL_AVX512);
}
#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
diff --git a/drivers/net/intel/i40e/i40e_rxtx_vec_neon.c b/drivers/net/intel/i40e/i40e_rxtx_vec_neon.c
index 814aa666dc..6c21546471 100644
--- a/drivers/net/intel/i40e/i40e_rxtx_vec_neon.c
+++ b/drivers/net/intel/i40e/i40e_rxtx_vec_neon.c
@@ -16,65 +16,12 @@
#include "i40e_rxtx.h"
#include "i40e_rxtx_vec_common.h"
+#include "../common/rx_vec_neon.h"
+
static inline void
i40e_rxq_rearm(struct ci_rx_queue *rxq)
{
- int i;
- uint16_t rx_id;
- volatile union i40e_rx_desc *rxdp;
- struct ci_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
- struct rte_mbuf *mb0, *mb1;
- uint64x2_t dma_addr0, dma_addr1;
- uint64x2_t zero = vdupq_n_u64(0);
- uint64_t paddr;
-
- rxdp = I40E_RX_RING_PTR(rxq, rxq->rxrearm_start);
-
- /* Pull 'n' more MBUFs into the software ring */
- if (unlikely(rte_mempool_get_bulk(rxq->mp,
- (void *)rxep,
- RTE_I40E_RXQ_REARM_THRESH) < 0)) {
- if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
- rxq->nb_rx_desc) {
- for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
- rxep[i].mbuf = &rxq->fake_mbuf;
- vst1q_u64(RTE_CAST_PTR(uint64_t *, &rxdp[i].read), zero);
- }
- }
- rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
- RTE_I40E_RXQ_REARM_THRESH;
- return;
- }
-
- /* Initialize the mbufs in vector, process 2 mbufs in one loop */
- for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
- mb0 = rxep[0].mbuf;
- mb1 = rxep[1].mbuf;
-
- paddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM;
- dma_addr0 = vdupq_n_u64(paddr);
-
- /* flush desc with pa dma_addr */
- vst1q_u64(RTE_CAST_PTR(uint64_t *, &rxdp++->read), dma_addr0);
-
- paddr = mb1->buf_iova + RTE_PKTMBUF_HEADROOM;
- dma_addr1 = vdupq_n_u64(paddr);
- vst1q_u64(RTE_CAST_PTR(uint64_t *, &rxdp++->read), dma_addr1);
- }
-
- rxq->rxrearm_start += RTE_I40E_RXQ_REARM_THRESH;
- rx_id = rxq->rxrearm_start - 1;
-
- if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
- rxq->rxrearm_start = 0;
- rx_id = rxq->nb_rx_desc - 1;
- }
-
- rxq->rxrearm_nb -= RTE_I40E_RXQ_REARM_THRESH;
-
- rte_io_wmb();
- /* Update the tail pointer on the NIC */
- I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id);
+ ci_rxq_rearm(rxq, sizeof(union i40e_rx_desc));
}
#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
diff --git a/drivers/net/intel/i40e/i40e_rxtx_vec_sse.c b/drivers/net/intel/i40e/i40e_rxtx_vec_sse.c
index 74cd59e245..432177d499 100644
--- a/drivers/net/intel/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/intel/i40e/i40e_rxtx_vec_sse.c
@@ -12,78 +12,14 @@
#include "i40e_rxtx.h"
#include "i40e_rxtx_vec_common.h"
+#include "../common/rx_vec_sse.h"
+
#include <rte_vect.h>
static inline void
i40e_rxq_rearm(struct ci_rx_queue *rxq)
{
- int i;
- uint16_t rx_id;
- volatile union i40e_rx_desc *rxdp;
- struct ci_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
- struct rte_mbuf *mb0, *mb1;
- __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
- RTE_PKTMBUF_HEADROOM);
- __m128i dma_addr0, dma_addr1;
-
- rxdp = I40E_RX_RING_PTR(rxq, rxq->rxrearm_start);
-
- /* Pull 'n' more MBUFs into the software ring */
- if (rte_mempool_get_bulk(rxq->mp,
- (void *)rxep,
- RTE_I40E_RXQ_REARM_THRESH) < 0) {
- if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
- rxq->nb_rx_desc) {
- dma_addr0 = _mm_setzero_si128();
- for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
- rxep[i].mbuf = &rxq->fake_mbuf;
- _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i].read),
- dma_addr0);
- }
- }
- rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
- RTE_I40E_RXQ_REARM_THRESH;
- return;
- }
-
- /* Initialize the mbufs in vector, process 2 mbufs in one loop */
- for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
- __m128i vaddr0, vaddr1;
-
- mb0 = rxep[0].mbuf;
- mb1 = rxep[1].mbuf;
-
- /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
- RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
- offsetof(struct rte_mbuf, buf_addr) + 8);
- vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
- vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-
- /* convert pa to dma_addr hdr/data */
- dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
- dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
-
- /* add headroom to pa values */
- dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
- dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
-
- /* flush desc with pa dma_addr */
- _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp++->read), dma_addr0);
- _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp++->read), dma_addr1);
- }
-
- rxq->rxrearm_start += RTE_I40E_RXQ_REARM_THRESH;
- rx_id = rxq->rxrearm_start - 1;
-
- if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
- rxq->rxrearm_start = 0;
- rx_id = rxq->nb_rx_desc - 1;
- }
-
- rxq->rxrearm_nb -= RTE_I40E_RXQ_REARM_THRESH;
-
- /* Update the tail pointer on the NIC */
- I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
+ ci_rxq_rearm(rxq, sizeof(union i40e_rx_desc), CI_RX_VEC_LEVEL_SSE);
}
#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
--
2.47.1
next prev parent reply other threads:[~2025-05-06 13:29 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-05-06 13:27 [PATCH v1 01/13] net/ixgbe: remove unused field in Rx queue struct Anatoly Burakov
2025-05-06 13:27 ` [PATCH v1 02/13] net/iavf: make IPsec stats dynamically allocated Anatoly Burakov
2025-05-06 13:27 ` [PATCH v1 03/13] net/ixgbe: create common Rx queue structure Anatoly Burakov
2025-05-06 13:27 ` [PATCH v1 04/13] net/i40e: use the " Anatoly Burakov
2025-05-06 13:27 ` [PATCH v1 05/13] net/ice: " Anatoly Burakov
2025-05-06 13:27 ` [PATCH v1 06/13] net/iavf: " Anatoly Burakov
2025-05-06 13:27 ` [PATCH v1 07/13] net/intel: generalize vectorized Rx rearm Anatoly Burakov
2025-05-06 13:27 ` Anatoly Burakov [this message]
2025-05-06 13:27 ` [PATCH v1 09/13] net/iavf: use common Rx rearm code Anatoly Burakov
2025-05-06 13:27 ` [PATCH v1 10/13] net/ixgbe: " Anatoly Burakov
2025-05-06 13:28 ` [PATCH v1 11/13] net/intel: support wider x86 vectors for Rx rearm Anatoly Burakov
2025-05-06 13:28 ` [PATCH v1 12/13] net/intel: add common Rx mbuf recycle Anatoly Burakov
2025-05-06 13:28 ` [PATCH v1 13/13] net/intel: add common Tx " Anatoly Burakov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=abc1333a7b612361e8e1ec77a5205af5dc85b53a.1746538072.git.anatoly.burakov@intel.com \
--to=anatoly.burakov@intel.com \
--cc=bruce.richardson@intel.com \
--cc=dev@dpdk.org \
--cc=ian.stokes@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).