* [PATCH v2 1/4] net/nfp: export more interfaces of NFDk
2024-07-08 5:58 ` [PATCH v2 " Chaoyong He
@ 2024-07-08 5:58 ` Chaoyong He
2024-07-08 5:58 ` [PATCH v2 2/4] net/nfp: support AVX2 Tx function Chaoyong He
` (4 subsequent siblings)
5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-08 5:58 UTC (permalink / raw)
To: dev; +Cc: oss-drivers, Long Wu, Chaoyong He
From: Long Wu <long.wu@corigine.com>
NFP will support the NFDk vector Tx function, so move some
functions to the header file for use by the vector Tx function.
Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
drivers/net/nfp/nfdk/nfp_nfdk.h | 45 ++++++++++++++++++++++++++++++
drivers/net/nfp/nfdk/nfp_nfdk_dp.c | 43 +---------------------------
2 files changed, 46 insertions(+), 42 deletions(-)
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk.h b/drivers/net/nfp/nfdk/nfp_nfdk.h
index 2767fd51cd..89a98d13f3 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk.h
+++ b/drivers/net/nfp/nfdk/nfp_nfdk.h
@@ -6,7 +6,10 @@
#ifndef __NFP_NFDK_H__
#define __NFP_NFDK_H__
+#include <nfp_platform.h>
+
#include "../nfp_rxtx.h"
+#include "nfp_net_common.h"
#define NFDK_TX_DESC_PER_SIMPLE_PKT 2
@@ -161,6 +164,45 @@ nfp_net_nfdk_txq_full(struct nfp_net_txq *txq)
return (nfp_net_nfdk_free_tx_desc(txq) < txq->tx_free_thresh);
}
+static inline uint16_t
+nfp_net_nfdk_headlen_to_segs(uint16_t headlen)
+{
+ /* First descriptor fits less data, so adjust for that */
+ return DIV_ROUND_UP(headlen + NFDK_TX_MAX_DATA_PER_DESC - NFDK_TX_MAX_DATA_PER_HEAD,
+ NFDK_TX_MAX_DATA_PER_DESC);
+}
+
+/* Set TX CSUM offload flags in TX descriptor of nfdk */
+static inline uint64_t
+nfp_net_nfdk_tx_cksum(struct nfp_net_txq *txq,
+ struct rte_mbuf *mb,
+ uint64_t flags)
+{
+ uint64_t ol_flags;
+ struct nfp_net_hw *hw = txq->hw;
+
+ if ((hw->super.cap & NFP_NET_CFG_CTRL_TXCSUM) == 0)
+ return flags;
+
+ ol_flags = mb->ol_flags;
+
+ /* Set TCP csum offload if TSO enabled. */
+ if ((ol_flags & RTE_MBUF_F_TX_TCP_SEG) != 0)
+ flags |= NFDK_DESC_TX_L4_CSUM;
+
+ if ((ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) != 0)
+ flags |= NFDK_DESC_TX_ENCAP;
+
+ /* IPv6 does not need checksum */
+ if ((ol_flags & RTE_MBUF_F_TX_IP_CKSUM) != 0)
+ flags |= NFDK_DESC_TX_L3_CSUM;
+
+ if ((ol_flags & RTE_MBUF_F_TX_L4_MASK) != 0)
+ flags |= NFDK_DESC_TX_L4_CSUM;
+
+ return flags;
+}
+
uint32_t nfp_flower_nfdk_pkt_add_metadata(struct rte_mbuf *mbuf,
uint32_t port_id);
uint16_t nfp_net_nfdk_xmit_pkts_common(void *tx_queue,
@@ -177,5 +219,8 @@ int nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
const struct rte_eth_txconf *tx_conf);
int nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
struct rte_mbuf *pkt);
+int nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
+ struct nfp_net_txq *txq,
+ uint64_t *metadata);
#endif /* __NFP_NFDK_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
index 8b8c191b82..173aabf0b9 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
@@ -6,7 +6,6 @@
#include "nfp_nfdk.h"
#include <bus_pci_driver.h>
-#include <nfp_platform.h>
#include <rte_malloc.h>
#include "../flower/nfp_flower.h"
@@ -15,38 +14,6 @@
#define NFDK_TX_DESC_GATHER_MAX 17
-/* Set TX CSUM offload flags in TX descriptor of nfdk */
-static uint64_t
-nfp_net_nfdk_tx_cksum(struct nfp_net_txq *txq,
- struct rte_mbuf *mb,
- uint64_t flags)
-{
- uint64_t ol_flags;
- struct nfp_net_hw *hw = txq->hw;
-
- if ((hw->super.ctrl & NFP_NET_CFG_CTRL_TXCSUM) == 0)
- return flags;
-
- ol_flags = mb->ol_flags;
-
- /* Set L4 csum offload if TSO/UFO enabled. */
- if ((ol_flags & RTE_MBUF_F_TX_TCP_SEG) != 0 ||
- (ol_flags & RTE_MBUF_F_TX_UDP_SEG) != 0)
- flags |= NFDK_DESC_TX_L4_CSUM;
-
- if ((ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) != 0)
- flags |= NFDK_DESC_TX_ENCAP;
-
- /* IPv6 does not need checksum */
- if ((ol_flags & RTE_MBUF_F_TX_IP_CKSUM) != 0)
- flags |= NFDK_DESC_TX_L3_CSUM;
-
- if ((ol_flags & RTE_MBUF_F_TX_L4_MASK) != 0)
- flags |= NFDK_DESC_TX_L4_CSUM;
-
- return flags;
-}
-
/* Set TX descriptor for TSO of nfdk */
static uint64_t
nfp_net_nfdk_tx_tso(struct nfp_net_txq *txq,
@@ -100,14 +67,6 @@ nfp_flower_nfdk_pkt_add_metadata(struct rte_mbuf *mbuf,
return FLOWER_PKT_DATA_OFFSET;
}
-static inline uint16_t
-nfp_net_nfdk_headlen_to_segs(uint16_t headlen)
-{
- /* First descriptor fits less data, so adjust for that */
- return DIV_ROUND_UP(headlen + NFDK_TX_MAX_DATA_PER_DESC - NFDK_TX_MAX_DATA_PER_HEAD,
- NFDK_TX_MAX_DATA_PER_DESC);
-}
-
static inline void
nfp_net_nfdk_tx_close_block(struct nfp_net_txq *txq,
uint32_t nop_slots)
@@ -170,7 +129,7 @@ nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
return nop_slots;
}
-static int
+int
nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
struct nfp_net_txq *txq,
uint64_t *metadata)
--
2.39.1
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v2 2/4] net/nfp: support AVX2 Tx function
2024-07-08 5:58 ` [PATCH v2 " Chaoyong He
2024-07-08 5:58 ` [PATCH v2 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
@ 2024-07-08 5:58 ` Chaoyong He
2024-07-08 5:58 ` [PATCH v2 3/4] net/nfp: support AVX2 Rx function Chaoyong He
` (3 subsequent siblings)
5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-08 5:58 UTC (permalink / raw)
To: dev; +Cc: oss-drivers, Long Wu, Chaoyong He
From: Long Wu <long.wu@corigine.com>
Use AVX2 instructions to accelerate Tx performance. The
acceleration only works on X86 machines.
Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
drivers/net/nfp/meson.build | 15 +
drivers/net/nfp/nfdk/nfp_nfdk.h | 1 +
drivers/net/nfp/nfdk/nfp_nfdk_dp.c | 12 +
drivers/net/nfp/nfdk/nfp_nfdk_vec.h | 36 ++
drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c | 432 ++++++++++++++++++++
drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c | 14 +
drivers/net/nfp/nfp_ethdev.c | 3 +-
drivers/net/nfp/nfp_ethdev_vf.c | 3 +-
drivers/net/nfp/nfp_rxtx.h | 5 +-
drivers/net/nfp/nfp_rxtx_vec.h | 13 +
drivers/net/nfp/nfp_rxtx_vec_avx2.c | 21 +
drivers/net/nfp/nfp_rxtx_vec_stub.c | 16 +
12 files changed, 568 insertions(+), 3 deletions(-)
create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec.h
create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
create mode 100644 drivers/net/nfp/nfp_rxtx_vec.h
create mode 100644 drivers/net/nfp/nfp_rxtx_vec_avx2.c
create mode 100644 drivers/net/nfp/nfp_rxtx_vec_stub.c
diff --git a/drivers/net/nfp/meson.build b/drivers/net/nfp/meson.build
index b7e5beffb0..39bda04bc5 100644
--- a/drivers/net/nfp/meson.build
+++ b/drivers/net/nfp/meson.build
@@ -16,6 +16,7 @@ sources = files(
'flower/nfp_flower_service.c',
'nfd3/nfp_nfd3_dp.c',
'nfdk/nfp_nfdk_dp.c',
+ 'nfdk/nfp_nfdk_vec_stub.c',
'nfpcore/nfp_cppcore.c',
'nfpcore/nfp_crc.c',
'nfpcore/nfp_elf.c',
@@ -43,8 +44,22 @@ sources = files(
'nfp_net_flow.c',
'nfp_net_meta.c',
'nfp_rxtx.c',
+ 'nfp_rxtx_vec_stub.c',
'nfp_service.c',
'nfp_trace.c',
)
+if arch_subdir == 'x86'
+ avx2_sources = files(
+ 'nfdk/nfp_nfdk_vec_avx2_dp.c',
+ 'nfp_rxtx_vec_avx2.c',
+ )
+ nfp_avx2_lib = static_library('nfp_avx2_lib',
+ avx2_sources,
+ dependencies: [static_rte_ethdev, static_rte_bus_pci,
+ static_rte_common_nfp],
+ c_args: [cflags, '-mavx2'])
+ objs += nfp_avx2_lib.extract_all_objects(recursive: true)
+endif
+
deps += ['hash', 'security', 'common_nfp']
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk.h b/drivers/net/nfp/nfdk/nfp_nfdk.h
index 89a98d13f3..29d862f6f0 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk.h
+++ b/drivers/net/nfp/nfdk/nfp_nfdk.h
@@ -222,5 +222,6 @@ int nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
int nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
struct nfp_net_txq *txq,
uint64_t *metadata);
+void nfp_net_nfdk_xmit_pkts_set(struct rte_eth_dev *eth_dev);
#endif /* __NFP_NFDK_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
index 173aabf0b9..2cea5688b3 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
@@ -11,6 +11,8 @@
#include "../flower/nfp_flower.h"
#include "../nfp_logs.h"
#include "../nfp_net_meta.h"
+#include "../nfp_rxtx_vec.h"
+#include "nfp_nfdk_vec.h"
#define NFDK_TX_DESC_GATHER_MAX 17
@@ -511,6 +513,7 @@ nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
dev->data->tx_queues[queue_idx] = txq;
txq->hw = hw;
txq->hw_priv = dev->process_private;
+ txq->simple_always = true;
/*
* Telling the HW about the physical address of the TX ring and number
@@ -521,3 +524,12 @@ nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
return 0;
}
+
+void
+nfp_net_nfdk_xmit_pkts_set(struct rte_eth_dev *eth_dev)
+{
+ if (nfp_net_get_avx2_supported())
+ eth_dev->tx_pkt_burst = nfp_net_nfdk_vec_avx2_xmit_pkts;
+ else
+ eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+}
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec.h b/drivers/net/nfp/nfdk/nfp_nfdk_vec.h
new file mode 100644
index 0000000000..14319d6cf6
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_NFDK_VEC_H__
+#define __NFP_NFDK_VEC_H__
+
+#include <stdbool.h>
+
+#include <rte_mbuf_core.h>
+
+#include "../nfp_net_common.h"
+#include "nfp_nfdk.h"
+
+static inline bool
+nfp_net_nfdk_is_simple_packet(struct rte_mbuf *pkt,
+ struct nfp_net_hw *hw)
+{
+ if (pkt->data_len > NFDK_TX_MAX_DATA_PER_HEAD)
+ return false;
+
+ if ((hw->super.cap & NFP_NET_CFG_CTRL_LSO_ANY) == 0)
+ return true;
+
+ if ((pkt->ol_flags & RTE_MBUF_F_TX_TCP_SEG) == 0)
+ return true;
+
+ return false;
+}
+
+uint16_t nfp_net_nfdk_vec_avx2_xmit_pkts(void *tx_queue,
+ struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts);
+
+#endif /* __NFP_NFDK_VEC_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
new file mode 100644
index 0000000000..6d1359fdb1
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
@@ -0,0 +1,432 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <bus_pci_driver.h>
+#include <ethdev_driver.h>
+#include <rte_malloc.h>
+
+#include "../nfp_logs.h"
+#include "nfp_nfdk.h"
+#include "nfp_nfdk_vec.h"
+
+/*
+ * One simple packet needs 2 descriptors so if send 4 packets driver will use
+ * 8 descriptors at once.
+ */
+#define NFDK_SIMPLE_BURST_DES_NUM 8
+
+#define NFDK_SIMPLE_DES_TYPE (NFDK_DESC_TX_EOP | \
+ (NFDK_DESC_TX_TYPE_HEAD & (NFDK_DESC_TX_TYPE_SIMPLE << 12)))
+
+static inline int
+nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(struct rte_mbuf *pkt,
+ struct nfp_net_txq *txq,
+ uint64_t *des_addr,
+ uint64_t *des_meta,
+ bool repr_flag)
+{
+ int ret;
+ __m128i dma_addr;
+ __m128i dma_hi;
+ __m128i data_off;
+ __m128i dlen_type;
+ uint64_t metadata;
+
+ if (repr_flag) {
+ metadata = NFDK_DESC_TX_CHAIN_META;
+ } else {
+ ret = nfp_net_nfdk_set_meta_data(pkt, txq, &metadata);
+ if (unlikely(ret != 0))
+ return ret;
+ }
+
+ data_off = _mm_set_epi64x(0, pkt->data_off);
+ dma_addr = _mm_add_epi64(_mm_loadu_si128((__m128i *)&pkt->buf_addr), data_off);
+ dma_hi = _mm_srli_epi64(dma_addr, 32);
+
+ dlen_type = _mm_set_epi64x(0, (pkt->data_len - 1) | NFDK_SIMPLE_DES_TYPE);
+
+ *des_addr = _mm_extract_epi64(_mm_add_epi64(_mm_unpacklo_epi32(dma_hi, dma_addr),
+ _mm_slli_epi64(dlen_type, 16)), 0);
+
+ *des_meta = nfp_net_nfdk_tx_cksum(txq, pkt, metadata);
+
+ return 0;
+}
+
+static inline int
+nfp_net_nfdk_vec_avx2_xmit_simple_send1(struct nfp_net_txq *txq,
+ struct nfp_net_nfdk_tx_desc *txds,
+ struct rte_mbuf *pkt,
+ bool repr_flag)
+{
+ int ret;
+ __m128i des_data;
+ uint64_t des_addr;
+ uint64_t des_meta;
+
+ ret = nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(pkt, txq, &des_addr,
+ &des_meta, repr_flag);
+ if (unlikely(ret != 0))
+ return ret;
+
+ txq->wr_p = D_IDX(txq, txq->wr_p + NFDK_TX_DESC_PER_SIMPLE_PKT);
+ if ((txq->wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) != 0)
+ txq->data_pending += pkt->data_len;
+ else
+ txq->data_pending = 0;
+
+ des_data = _mm_set_epi64x(des_meta, des_addr);
+
+ _mm_store_si128((void *)txds, des_data);
+
+ return 0;
+}
+
+static inline int
+nfp_vec_avx2_nfdk_xmit_simple_send4(struct nfp_net_txq *txq,
+ struct nfp_net_nfdk_tx_desc *txds,
+ struct rte_mbuf **pkt,
+ bool repr_flag)
+{
+ int ret;
+ uint16_t i;
+ __m256i des_data0_1;
+ __m256i des_data2_3;
+ uint64_t des_addr[4];
+ uint64_t des_meta[4];
+
+ for (i = 0; i < 4; i++) {
+ ret = nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(pkt[i], txq,
+ &des_addr[i], &des_meta[i], repr_flag);
+ if (unlikely(ret != 0))
+ return ret;
+ }
+
+ for (i = 0; i < 4; i++) {
+ txq->wr_p = D_IDX(txq, txq->wr_p + NFDK_TX_DESC_PER_SIMPLE_PKT);
+ if ((txq->wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) != 0)
+ txq->data_pending += pkt[i]->data_len;
+ else
+ txq->data_pending = 0;
+ }
+
+ des_data0_1 = _mm256_set_epi64x(des_meta[1], des_addr[1], des_meta[0], des_addr[0]);
+ des_data2_3 = _mm256_set_epi64x(des_meta[3], des_addr[3], des_meta[2], des_addr[2]);
+
+ _mm256_store_si256((void *)txds, des_data0_1);
+ _mm256_store_si256((void *)(txds + 4), des_data2_3);
+
+ return 0;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_mbuf_store4(struct rte_mbuf **mbuf,
+ struct rte_mbuf **tx_pkts)
+{
+ __m256i mbuf_room0_1;
+ __m256i mbuf_room2_3;
+
+ mbuf_room0_1 = _mm256_set_epi64x(0, (uintptr_t)tx_pkts[1], 0,
+ (uintptr_t)tx_pkts[0]);
+ mbuf_room2_3 = _mm256_set_epi64x(0, (uintptr_t)tx_pkts[3], 0,
+ (uintptr_t)tx_pkts[2]);
+
+ _mm256_store_si256((void *)mbuf, mbuf_room0_1);
+ _mm256_store_si256((void *)(mbuf + 4), mbuf_room2_3);
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_simple_pkts(struct nfp_net_txq *txq,
+ struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts,
+ uint16_t simple_close,
+ bool repr_flag)
+{
+ int ret;
+ uint16_t npkts = 0;
+ uint16_t need_txds;
+ uint16_t free_descs;
+ struct rte_mbuf **lmbuf;
+ struct nfp_net_nfdk_tx_desc *ktxds;
+
+ PMD_TX_LOG(DEBUG, "Working for queue %hu at pos %u and %hu packets",
+ txq->qidx, txq->wr_p, nb_pkts);
+
+ need_txds = nb_pkts << 1;
+ if (nfp_net_nfdk_free_tx_desc(txq) < need_txds || nfp_net_nfdk_txq_full(txq))
+ nfp_net_tx_free_bufs(txq);
+
+ free_descs = nfp_net_nfdk_free_tx_desc(txq);
+ if (unlikely(free_descs < NFDK_TX_DESC_PER_SIMPLE_PKT)) {
+ if (unlikely(simple_close > 0))
+ goto xmit_end;
+
+ return 0;
+ }
+
+ PMD_TX_LOG(DEBUG, "Queue: %hu. Sending %hu packets", txq->qidx, nb_pkts);
+
+ /* Sending packets */
+ while (npkts < nb_pkts && free_descs >= NFDK_TX_DESC_PER_SIMPLE_PKT) {
+ ktxds = &txq->ktxds[txq->wr_p];
+ lmbuf = &txq->txbufs[txq->wr_p].mbuf;
+
+ /*
+ * If can not send burst, just send one.
+ * 1. Tx ring will come to the tail.
+ * 2. Do not need to send 4 packets.
+ * 3. If pointer address unaligned on 32-bit boundary.
+ * 4. If free descriptors are not enough.
+ */
+ if ((txq->tx_count - txq->wr_p) < NFDK_SIMPLE_BURST_DES_NUM ||
+ (nb_pkts - npkts) < 4 ||
+ ((uintptr_t)ktxds & 0x1F) != 0 ||
+ free_descs < NFDK_SIMPLE_BURST_DES_NUM) {
+ ret = nfp_net_nfdk_vec_avx2_xmit_simple_send1(txq,
+ ktxds, tx_pkts[npkts], repr_flag);
+ if (unlikely(ret != 0))
+ goto xmit_end;
+
+ rte_pktmbuf_free(*lmbuf);
+
+ _mm_storel_epi64((void *)lmbuf,
+ _mm_loadu_si128((void *)&tx_pkts[npkts]));
+ npkts++;
+ free_descs -= NFDK_TX_DESC_PER_SIMPLE_PKT;
+ continue;
+ }
+
+ ret = nfp_vec_avx2_nfdk_xmit_simple_send4(txq, ktxds,
+ &tx_pkts[npkts], repr_flag);
+ if (unlikely(ret != 0))
+ goto xmit_end;
+
+ rte_pktmbuf_free_bulk(lmbuf, NFDK_SIMPLE_BURST_DES_NUM);
+
+ nfp_net_nfdk_vec_avx2_xmit_mbuf_store4(lmbuf, &tx_pkts[npkts]);
+
+ npkts += 4;
+ free_descs -= NFDK_SIMPLE_BURST_DES_NUM;
+ }
+
+xmit_end:
+ /* Increment write pointers. Force memory write before we let HW know */
+ rte_wmb();
+ nfp_qcp_ptr_add(txq->qcp_q, NFP_QCP_WRITE_PTR, ((npkts << 1) + simple_close));
+
+ return npkts;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_simple_close_block(struct nfp_net_txq *txq,
+ uint16_t *simple_close)
+{
+ uint16_t i;
+ uint16_t wr_p;
+ uint16_t nop_slots;
+ __m128i zero_128 = _mm_setzero_si128();
+ __m256i zero_256 = _mm256_setzero_si256();
+
+ wr_p = txq->wr_p;
+ nop_slots = D_BLOCK_CPL(wr_p);
+
+ for (i = nop_slots; i >= 4; i -= 4, wr_p += 4) {
+ _mm256_store_si256((void *)&txq->ktxds[wr_p], zero_256);
+ rte_pktmbuf_free_bulk(&txq->txbufs[wr_p].mbuf, 4);
+ _mm256_store_si256((void *)&txq->txbufs[wr_p], zero_256);
+ }
+
+ for (; i >= 2; i -= 2, wr_p += 2) {
+ _mm_store_si128((void *)&txq->ktxds[wr_p], zero_128);
+ rte_pktmbuf_free_bulk(&txq->txbufs[wr_p].mbuf, 2);
+ _mm_store_si128((void *)&txq->txbufs[wr_p], zero_128);
+ }
+
+ for (; i >= 1; i--, wr_p++) {
+ _mm_storel_epi64((void *)&txq->ktxds[wr_p], zero_128);
+ rte_pktmbuf_free(txq->txbufs[wr_p].mbuf);
+ _mm_storel_epi64((void *)&txq->txbufs[wr_p], zero_128);
+ }
+
+ txq->data_pending = 0;
+ txq->wr_p = D_IDX(txq, txq->wr_p + nop_slots);
+
+ (*simple_close) += nop_slots;
+}
+
+static inline uint32_t
+nfp_net_nfdk_vec_avx2_xmit_simple_prepare(struct nfp_net_txq *txq,
+ uint16_t *simple_close)
+{
+ uint16_t wr_p;
+ __m128i zero_128 = _mm_setzero_si128();
+
+ wr_p = txq->wr_p;
+
+ _mm_storel_epi64((void *)&txq->ktxds[wr_p], zero_128);
+ rte_pktmbuf_free(txq->txbufs[wr_p].mbuf);
+ _mm_storel_epi64((void *)&txq->txbufs[wr_p], zero_128);
+
+ txq->wr_p = D_IDX(txq, wr_p + 1);
+ (*simple_close)++;
+
+ return txq->wr_p;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_simple_check(struct nfp_net_txq *txq,
+ struct rte_mbuf *pkt,
+ bool *simple_flag,
+ bool *pending_flag,
+ uint16_t *data_pending,
+ uint32_t *wr_p,
+ uint16_t *simple_close)
+{
+ uint32_t data_pending_temp;
+
+ /* Let the first descriptor index even before send simple packets */
+ if (!(*simple_flag)) {
+ if ((*wr_p & 0x1) == 0x1)
+ *wr_p = nfp_net_nfdk_vec_avx2_xmit_simple_prepare(txq, simple_close);
+
+ *simple_flag = true;
+ }
+
+ /* Simple packets only need one close block operation */
+ if (!(*pending_flag)) {
+ if ((*wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) == 0) {
+ *pending_flag = true;
+ return;
+ }
+
+ data_pending_temp = *data_pending + pkt->data_len;
+ if (data_pending_temp > NFDK_TX_MAX_DATA_PER_BLOCK) {
+ nfp_net_nfdk_vec_avx2_xmit_simple_close_block(txq, simple_close);
+ *pending_flag = true;
+ return;
+ }
+
+ *data_pending = data_pending_temp;
+
+ *wr_p += 2;
+ }
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_simple_count(struct nfp_net_txq *txq,
+ struct rte_mbuf **tx_pkts,
+ uint16_t head,
+ uint16_t nb_pkts,
+ uint16_t *simple_close)
+{
+ uint32_t wr_p;
+ uint16_t simple_idx;
+ struct rte_mbuf *pkt;
+ uint16_t data_pending;
+ bool simple_flag = false;
+ bool pending_flag = false;
+ uint16_t simple_count = 0;
+
+ *simple_close = 0;
+ wr_p = txq->wr_p;
+ data_pending = txq->data_pending;
+
+ for (simple_idx = head; simple_idx < nb_pkts; simple_idx++) {
+ pkt = tx_pkts[simple_idx];
+ if (!nfp_net_nfdk_is_simple_packet(pkt, txq->hw))
+ break;
+
+ simple_count++;
+ if (!txq->simple_always)
+ nfp_net_nfdk_vec_avx2_xmit_simple_check(txq, pkt, &simple_flag,
+ &pending_flag, &data_pending, &wr_p, simple_close);
+ }
+
+ return simple_count;
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_others_count(struct nfp_net_txq *txq,
+ struct rte_mbuf **tx_pkts,
+ uint16_t head,
+ uint16_t nb_pkts)
+{
+ uint16_t others_idx;
+ struct rte_mbuf *pkt;
+ uint16_t others_count = 0;
+
+ for (others_idx = head; others_idx < nb_pkts; others_idx++) {
+ pkt = tx_pkts[others_idx];
+ if (nfp_net_nfdk_is_simple_packet(pkt, txq->hw))
+ break;
+
+ others_count++;
+ }
+
+ return others_count;
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_common(void *tx_queue,
+ struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ uint16_t i;
+ uint16_t avail = 0;
+ uint16_t simple_close;
+ uint16_t simple_count;
+ uint16_t simple_avail;
+ uint16_t others_count;
+ uint16_t others_avail;
+ struct nfp_net_txq *txq = tx_queue;
+
+ for (i = 0; i < nb_pkts; i++) {
+ simple_count = nfp_net_nfdk_vec_avx2_xmit_simple_count(txq, tx_pkts, i,
+ nb_pkts, &simple_close);
+ if (simple_count > 0) {
+ if (!txq->simple_always)
+ txq->simple_always = true;
+
+ simple_avail = nfp_net_nfdk_vec_avx2_xmit_simple_pkts(txq,
+ tx_pkts + i, simple_count, simple_close,
+ false);
+
+ avail += simple_avail;
+ if (simple_avail != simple_count)
+ break;
+
+ i += simple_count;
+ }
+
+ if (i == nb_pkts)
+ break;
+
+ others_count = nfp_net_nfdk_vec_avx2_xmit_others_count(txq, tx_pkts,
+ i, nb_pkts);
+
+ if (txq->simple_always)
+ txq->simple_always = false;
+
+ others_avail = nfp_net_nfdk_xmit_pkts_common(tx_queue,
+ tx_pkts + i, others_count, false);
+
+ avail += others_avail;
+ if (others_avail != others_count)
+ break;
+
+ i += others_count;
+ }
+
+ return avail;
+}
+
+uint16_t
+nfp_net_nfdk_vec_avx2_xmit_pkts(void *tx_queue,
+ struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ return nfp_net_nfdk_vec_avx2_xmit_common(tx_queue, tx_pkts, nb_pkts);
+}
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c b/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
new file mode 100644
index 0000000000..146ec21d51
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include "nfp_nfdk_vec.h"
+
+uint16_t __rte_weak
+nfp_net_nfdk_vec_avx2_xmit_pkts(__rte_unused void *tx_queue,
+ __rte_unused struct rte_mbuf **tx_pkts,
+ __rte_unused uint16_t nb_pkts)
+{
+ return 0;
+}
diff --git a/drivers/net/nfp/nfp_ethdev.c b/drivers/net/nfp/nfp_ethdev.c
index 8c0cacd3fc..a7b40af712 100644
--- a/drivers/net/nfp/nfp_ethdev.c
+++ b/drivers/net/nfp/nfp_ethdev.c
@@ -28,6 +28,7 @@
#include "nfp_ipsec.h"
#include "nfp_logs.h"
#include "nfp_net_flow.h"
+#include "nfp_rxtx_vec.h"
/* 64-bit per app capabilities */
#define NFP_NET_APP_CAP_SP_INDIFF RTE_BIT64(0) /* Indifferent to port speed */
@@ -964,7 +965,7 @@ nfp_net_ethdev_ops_mount(struct nfp_net_hw *hw,
if (hw->ver.extend == NFP_NET_CFG_VERSION_DP_NFD3)
eth_dev->tx_pkt_burst = nfp_net_nfd3_xmit_pkts;
else
- eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+ nfp_net_nfdk_xmit_pkts_set(eth_dev);
eth_dev->dev_ops = &nfp_net_eth_dev_ops;
eth_dev->rx_queue_count = nfp_net_rx_queue_count;
diff --git a/drivers/net/nfp/nfp_ethdev_vf.c b/drivers/net/nfp/nfp_ethdev_vf.c
index e7c18fe90a..b955624ed6 100644
--- a/drivers/net/nfp/nfp_ethdev_vf.c
+++ b/drivers/net/nfp/nfp_ethdev_vf.c
@@ -14,6 +14,7 @@
#include "nfp_logs.h"
#include "nfp_net_common.h"
+#include "nfp_rxtx_vec.h"
#define NFP_VF_DRIVER_NAME net_nfp_vf
@@ -240,7 +241,7 @@ nfp_netvf_ethdev_ops_mount(struct nfp_net_hw *hw,
if (hw->ver.extend == NFP_NET_CFG_VERSION_DP_NFD3)
eth_dev->tx_pkt_burst = nfp_net_nfd3_xmit_pkts;
else
- eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+ nfp_net_nfdk_xmit_pkts_set(eth_dev);
eth_dev->dev_ops = &nfp_netvf_eth_dev_ops;
eth_dev->rx_queue_count = nfp_net_rx_queue_count;
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index 9806384a63..3ddf717da0 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -69,9 +69,12 @@ struct __rte_aligned(64) nfp_net_txq {
/** Used by NFDk only */
uint16_t data_pending;
+ /** Used by NFDk vector xmit only */
+ bool simple_always;
+
/**
* At this point 58 bytes have been used for all the fields in the
- * TX critical path. We have room for 6 bytes and still all placed
+ * TX critical path. We have room for 5 bytes and still all placed
* in a cache line.
*/
uint64_t dma;
diff --git a/drivers/net/nfp/nfp_rxtx_vec.h b/drivers/net/nfp/nfp_rxtx_vec.h
new file mode 100644
index 0000000000..c92660f963
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_RXTX_VEC_AVX2_H__
+#define __NFP_RXTX_VEC_AVX2_H__
+
+#include <stdbool.h>
+
+bool nfp_net_get_avx2_supported(void);
+
+#endif /* __NFP_RXTX_VEC_AVX2_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
new file mode 100644
index 0000000000..50638e74ab
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <stdbool.h>
+
+#include <rte_cpuflags.h>
+#include <rte_vect.h>
+
+#include "nfp_rxtx_vec.h"
+
+bool
+nfp_net_get_avx2_supported(void)
+{
+ if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 &&
+ rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
+ return true;
+
+ return false;
+}
diff --git a/drivers/net/nfp/nfp_rxtx_vec_stub.c b/drivers/net/nfp/nfp_rxtx_vec_stub.c
new file mode 100644
index 0000000000..1bc55b67e0
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec_stub.c
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <stdbool.h>
+
+#include <rte_common.h>
+
+#include "nfp_rxtx_vec.h"
+
+bool __rte_weak
+nfp_net_get_avx2_supported(void)
+{
+ return false;
+}
--
2.39.1
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v2 3/4] net/nfp: support AVX2 Rx function
2024-07-08 5:58 ` [PATCH v2 " Chaoyong He
2024-07-08 5:58 ` [PATCH v2 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
2024-07-08 5:58 ` [PATCH v2 2/4] net/nfp: support AVX2 Tx function Chaoyong He
@ 2024-07-08 5:58 ` Chaoyong He
2024-07-08 5:58 ` [PATCH v2 4/4] net/nfp: vector Rx function supports parsing ptype Chaoyong He
` (2 subsequent siblings)
5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-08 5:58 UTC (permalink / raw)
To: dev; +Cc: oss-drivers, Long Wu, Peng Zhang, Chaoyong He
From: Long Wu <long.wu@corigine.com>
Use AVX2 instructions to accelerate Rx performance. The
acceleration only works on X86 machines.
Signed-off-by: Peng Zhang <peng.zhang@corigine.com>
Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
drivers/net/nfp/nfp_ethdev.c | 2 +-
drivers/net/nfp/nfp_ethdev_vf.c | 2 +-
drivers/net/nfp/nfp_net_meta.c | 1 +
drivers/net/nfp/nfp_rxtx.c | 10 ++
drivers/net/nfp/nfp_rxtx.h | 1 +
drivers/net/nfp/nfp_rxtx_vec.h | 4 +
drivers/net/nfp/nfp_rxtx_vec_avx2.c | 252 ++++++++++++++++++++++++++++
drivers/net/nfp/nfp_rxtx_vec_stub.c | 9 +
8 files changed, 279 insertions(+), 2 deletions(-)
diff --git a/drivers/net/nfp/nfp_ethdev.c b/drivers/net/nfp/nfp_ethdev.c
index a7b40af712..bd35df2dc9 100644
--- a/drivers/net/nfp/nfp_ethdev.c
+++ b/drivers/net/nfp/nfp_ethdev.c
@@ -969,7 +969,7 @@ nfp_net_ethdev_ops_mount(struct nfp_net_hw *hw,
eth_dev->dev_ops = &nfp_net_eth_dev_ops;
eth_dev->rx_queue_count = nfp_net_rx_queue_count;
- eth_dev->rx_pkt_burst = &nfp_net_recv_pkts;
+ nfp_net_recv_pkts_set(eth_dev);
}
static int
diff --git a/drivers/net/nfp/nfp_ethdev_vf.c b/drivers/net/nfp/nfp_ethdev_vf.c
index b955624ed6..cdf5da3af7 100644
--- a/drivers/net/nfp/nfp_ethdev_vf.c
+++ b/drivers/net/nfp/nfp_ethdev_vf.c
@@ -245,7 +245,7 @@ nfp_netvf_ethdev_ops_mount(struct nfp_net_hw *hw,
eth_dev->dev_ops = &nfp_netvf_eth_dev_ops;
eth_dev->rx_queue_count = nfp_net_rx_queue_count;
- eth_dev->rx_pkt_burst = &nfp_net_recv_pkts;
+ nfp_net_recv_pkts_set(eth_dev);
}
static int
diff --git a/drivers/net/nfp/nfp_net_meta.c b/drivers/net/nfp/nfp_net_meta.c
index b31ef56f17..07c6758d33 100644
--- a/drivers/net/nfp/nfp_net_meta.c
+++ b/drivers/net/nfp/nfp_net_meta.c
@@ -80,6 +80,7 @@ nfp_net_meta_parse_single(uint8_t *meta_base,
rte_be32_t meta_header,
struct nfp_net_meta_parsed *meta)
{
+ meta->flags = 0;
meta->flags |= (1 << NFP_NET_META_HASH);
meta->hash_type = rte_be_to_cpu_32(meta_header);
meta->hash = rte_be_to_cpu_32(*(rte_be32_t *)(meta_base + 4));
diff --git a/drivers/net/nfp/nfp_rxtx.c b/drivers/net/nfp/nfp_rxtx.c
index 1db79ad1cd..4fc3374987 100644
--- a/drivers/net/nfp/nfp_rxtx.c
+++ b/drivers/net/nfp/nfp_rxtx.c
@@ -17,6 +17,7 @@
#include "nfp_ipsec.h"
#include "nfp_logs.h"
#include "nfp_net_meta.h"
+#include "nfp_rxtx_vec.h"
/*
* The bit format and map of nfp packet type for rxd.offload_info in Rx descriptor.
@@ -867,3 +868,12 @@ nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
info->conf.offloads = dev_info.tx_offload_capa &
dev->data->dev_conf.txmode.offloads;
}
+
+void
+nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev)
+{
+ if (nfp_net_get_avx2_supported())
+ eth_dev->rx_pkt_burst = nfp_net_vec_avx2_recv_pkts;
+ else
+ eth_dev->rx_pkt_burst = nfp_net_recv_pkts;
+}
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index 3ddf717da0..fff8371991 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -244,5 +244,6 @@ void nfp_net_rx_queue_info_get(struct rte_eth_dev *dev,
void nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
uint16_t queue_id,
struct rte_eth_txq_info *qinfo);
+void nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev);
#endif /* __NFP_RXTX_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec.h b/drivers/net/nfp/nfp_rxtx_vec.h
index c92660f963..8720662744 100644
--- a/drivers/net/nfp/nfp_rxtx_vec.h
+++ b/drivers/net/nfp/nfp_rxtx_vec.h
@@ -10,4 +10,8 @@
bool nfp_net_get_avx2_supported(void);
+uint16_t nfp_net_vec_avx2_recv_pkts(void *rx_queue,
+ struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
+
#endif /* __NFP_RXTX_VEC_AVX2_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
index 50638e74ab..7c18213624 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_avx2.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -5,9 +5,14 @@
#include <stdbool.h>
+#include <bus_pci_driver.h>
+#include <ethdev_driver.h>
#include <rte_cpuflags.h>
#include <rte_vect.h>
+#include "nfp_logs.h"
+#include "nfp_net_common.h"
+#include "nfp_net_meta.h"
#include "nfp_rxtx_vec.h"
bool
@@ -19,3 +24,250 @@ nfp_net_get_avx2_supported(void)
return false;
}
+
+static inline void
+nfp_vec_avx2_recv_set_des1(struct nfp_net_rxq *rxq,
+ struct nfp_net_rx_desc *rxds,
+ struct rte_mbuf *rxb)
+{
+ __m128i dma;
+ __m128i dma_hi;
+ __m128i vaddr0;
+ __m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);
+
+ dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb->buf_addr), hdr_room);
+ dma_hi = _mm_srli_epi64(dma, 32);
+ vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);
+
+ _mm_storel_epi64((void *)rxds, vaddr0);
+
+ rxq->rd_p = (rxq->rd_p + 1) & (rxq->rx_count - 1);
+}
+
+static inline void
+nfp_vec_avx2_recv_set_des4(struct nfp_net_rxq *rxq,
+ struct nfp_net_rx_desc *rxds,
+ struct rte_mbuf **rxb)
+{
+ __m128i dma;
+ __m128i dma_hi;
+ __m128i vaddr0;
+ __m128i vaddr1;
+ __m128i vaddr2;
+ __m128i vaddr3;
+ __m128i vaddr0_1;
+ __m128i vaddr2_3;
+ __m256i vaddr0_3;
+ __m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);
+
+ dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[0]->buf_addr), hdr_room);
+ dma_hi = _mm_srli_epi64(dma, 32);
+ vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);
+
+ dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[1]->buf_addr), hdr_room);
+ dma_hi = _mm_srli_epi64(dma, 32);
+ vaddr1 = _mm_unpacklo_epi32(dma_hi, dma);
+
+ dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[2]->buf_addr), hdr_room);
+ dma_hi = _mm_srli_epi64(dma, 32);
+ vaddr2 = _mm_unpacklo_epi32(dma_hi, dma);
+
+ dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[3]->buf_addr), hdr_room);
+ dma_hi = _mm_srli_epi64(dma, 32);
+ vaddr3 = _mm_unpacklo_epi32(dma_hi, dma);
+
+ vaddr0_1 = _mm_unpacklo_epi64(vaddr0, vaddr1);
+ vaddr2_3 = _mm_unpacklo_epi64(vaddr2, vaddr3);
+
+ vaddr0_3 = _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0_1),
+ vaddr2_3, 1);
+
+ _mm256_store_si256((void *)rxds, vaddr0_3);
+
+ rxq->rd_p = (rxq->rd_p + 4) & (rxq->rx_count - 1);
+}
+
+static inline void
+nfp_vec_avx2_recv_set_rxpkt1(struct nfp_net_rxq *rxq,
+ struct nfp_net_rx_desc *rxds,
+ struct rte_mbuf *rx_pkt)
+{
+ struct nfp_net_hw *hw = rxq->hw;
+ struct nfp_net_meta_parsed meta;
+
+ rx_pkt->data_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
+ /* Size of the whole packet. We just support 1 segment */
+ rx_pkt->pkt_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
+
+ /* Filling the received mbuf with packet info */
+ if (hw->rx_offset)
+ rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + hw->rx_offset;
+ else
+ rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + NFP_DESC_META_LEN(rxds);
+
+ rx_pkt->port = rxq->port_id;
+ rx_pkt->nb_segs = 1;
+ rx_pkt->next = NULL;
+
+ nfp_net_meta_parse(rxds, rxq, hw, rx_pkt, &meta);
+
+ /* Checking the checksum flag */
+ nfp_net_rx_cksum(rxq, rxds, rx_pkt);
+}
+
+static inline void
+nfp_vec_avx2_recv1(struct nfp_net_rxq *rxq,
+ struct nfp_net_rx_desc *rxds,
+ struct rte_mbuf *rxb,
+ struct rte_mbuf *rx_pkt)
+{
+ nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkt);
+
+ nfp_vec_avx2_recv_set_des1(rxq, rxds, rxb);
+}
+
+static inline void
+nfp_vec_avx2_recv4(struct nfp_net_rxq *rxq,
+ struct nfp_net_rx_desc *rxds,
+ struct rte_mbuf **rxb,
+ struct rte_mbuf **rx_pkts)
+{
+ nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkts[0]);
+ nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 1, rx_pkts[1]);
+ nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 2, rx_pkts[2]);
+ nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 3, rx_pkts[3]);
+
+ nfp_vec_avx2_recv_set_des4(rxq, rxds, rxb);
+}
+
+static inline bool
+nfp_vec_avx2_recv_check_packets4(struct nfp_net_rx_desc *rxds)
+{
+ __m256i data = _mm256_loadu_si256((void *)rxds);
+
+ if ((_mm256_extract_epi8(data, 3) & PCIE_DESC_RX_DD) == 0 ||
+ (_mm256_extract_epi8(data, 11) & PCIE_DESC_RX_DD) == 0 ||
+ (_mm256_extract_epi8(data, 19) & PCIE_DESC_RX_DD) == 0 ||
+ (_mm256_extract_epi8(data, 27) & PCIE_DESC_RX_DD) == 0)
+ return false;
+
+ return true;
+}
+
+uint16_t
+nfp_net_vec_avx2_recv_pkts(void *rx_queue,
+ struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ uint16_t avail;
+ uint16_t nb_hold;
+ bool burst_receive;
+ struct rte_mbuf **rxb;
+ struct nfp_net_rx_desc *rxds;
+ struct nfp_net_rxq *rxq = rx_queue;
+
+ if (unlikely(rxq == NULL)) {
+ PMD_RX_LOG(ERR, "RX Bad queue");
+ return 0;
+ }
+
+ avail = 0;
+ nb_hold = 0;
+ burst_receive = true;
+ while (avail < nb_pkts) {
+ rxds = &rxq->rxds[rxq->rd_p];
+ rxb = &rxq->rxbufs[rxq->rd_p].mbuf;
+
+ if ((_mm_extract_epi8(_mm_loadu_si128((void *)(rxds)), 3)
+ & PCIE_DESC_RX_DD) == 0)
+ goto recv_end;
+
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p].mbuf);
+
+ if ((rxq->rd_p & 0x3) == 0) {
+ rte_prefetch0(&rxq->rxds[rxq->rd_p]);
+ rte_prefetch0(&rxq->rxbufs[rxq->rd_p]);
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p + 1].mbuf);
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p + 2].mbuf);
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p + 3].mbuf);
+ }
+
+ if ((rxq->rd_p & 0x7) == 0) {
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p + 4].mbuf);
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p + 5].mbuf);
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p + 6].mbuf);
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p + 7].mbuf);
+ }
+
+ /*
+ * If can not receive burst, just receive one.
+ * 1. Rx ring will coming to the tail.
+ * 2. Do not need to receive 4 packets.
+ * 3. If pointer address unaligned on 32-bit boundary.
+ * 4. Rx ring does not have 4 packets or alloc 4 mbufs failed.
+ */
+ if ((rxq->rx_count - rxq->rd_p) < 4 ||
+ (nb_pkts - avail) < 4 ||
+ ((uintptr_t)rxds & 0x1F) != 0 ||
+ !burst_receive) {
+ _mm_storel_epi64((void *)&rx_pkts[avail],
+ _mm_loadu_si128((void *)rxb));
+
+ /* Allocate a new mbuf into the software ring. */
+ if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 1) < 0) {
+ PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u queue_id=%hu",
+ rxq->port_id, rxq->qidx);
+ nfp_net_mbuf_alloc_failed(rxq);
+ goto recv_end;
+ }
+
+ nfp_vec_avx2_recv1(rxq, rxds, *rxb, rx_pkts[avail]);
+
+ avail++;
+ nb_hold++;
+ continue;
+ }
+
+ burst_receive = nfp_vec_avx2_recv_check_packets4(rxds);
+ if (!burst_receive)
+ continue;
+
+ _mm256_storeu_si256((void *)&rx_pkts[avail],
+ _mm256_loadu_si256((void *)rxb));
+
+ /* Allocate 4 new mbufs into the software ring. */
+ if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 4) < 0) {
+ burst_receive = false;
+ continue;
+ }
+
+ nfp_vec_avx2_recv4(rxq, rxds, rxb, &rx_pkts[avail]);
+
+ avail += 4;
+ nb_hold += 4;
+ }
+
+recv_end:
+ if (nb_hold == 0)
+ return nb_hold;
+
+ PMD_RX_LOG(DEBUG, "RX port_id=%u queue_id=%u, %d packets received",
+ rxq->port_id, (unsigned int)rxq->qidx, nb_hold);
+
+ nb_hold += rxq->nb_rx_hold;
+
+ /*
+ * FL descriptors needs to be written before incrementing the
+ * FL queue WR pointer
+ */
+ rte_wmb();
+ if (nb_hold > rxq->rx_free_thresh) {
+ PMD_RX_LOG(DEBUG, "port=%hu queue=%hu nb_hold=%hu avail=%hu",
+ rxq->port_id, rxq->qidx, nb_hold, avail);
+ nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, nb_hold);
+ nb_hold = 0;
+ }
+ rxq->nb_rx_hold = nb_hold;
+
+ return avail;
+}
diff --git a/drivers/net/nfp/nfp_rxtx_vec_stub.c b/drivers/net/nfp/nfp_rxtx_vec_stub.c
index 1bc55b67e0..c480f61ef0 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_stub.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_stub.c
@@ -6,6 +6,7 @@
#include <stdbool.h>
#include <rte_common.h>
+#include <rte_mbuf_core.h>
#include "nfp_rxtx_vec.h"
@@ -14,3 +15,11 @@ nfp_net_get_avx2_supported(void)
{
return false;
}
+
+uint16_t __rte_weak
+nfp_net_vec_avx2_recv_pkts(__rte_unused void *rx_queue,
+ __rte_unused struct rte_mbuf **rx_pkts,
+ __rte_unused uint16_t nb_pkts)
+{
+ return 0;
+}
--
2.39.1
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v2 4/4] net/nfp: vector Rx function supports parsing ptype
2024-07-08 5:58 ` [PATCH v2 " Chaoyong He
` (2 preceding siblings ...)
2024-07-08 5:58 ` [PATCH v2 3/4] net/nfp: support AVX2 Rx function Chaoyong He
@ 2024-07-08 5:58 ` Chaoyong He
2024-07-08 11:45 ` [PATCH v2 0/4] support AVX2 instruction Rx/Tx function Ferruh Yigit
2024-07-09 7:29 ` [PATCH v3 " Chaoyong He
5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-08 5:58 UTC (permalink / raw)
To: dev; +Cc: oss-drivers, Long Wu, Chaoyong He
From: Long Wu <long.wu@corigine.com>
Vector AVX2 Rx function supports parsing the packet type and setting it in the mbuf.
Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
drivers/net/nfp/nfp_net_common.c | 2 +-
drivers/net/nfp/nfp_rxtx.c | 2 +-
drivers/net/nfp/nfp_rxtx.h | 3 +++
drivers/net/nfp/nfp_rxtx_vec_avx2.c | 2 ++
4 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/net/nfp/nfp_net_common.c b/drivers/net/nfp/nfp_net_common.c
index 08693d5fba..3d916cd147 100644
--- a/drivers/net/nfp/nfp_net_common.c
+++ b/drivers/net/nfp/nfp_net_common.c
@@ -1455,7 +1455,7 @@ nfp_net_supported_ptypes_get(struct rte_eth_dev *dev, size_t *no_of_elements)
RTE_PTYPE_INNER_L4_SCTP,
};
- if (dev->rx_pkt_burst != nfp_net_recv_pkts)
+ if (dev->rx_pkt_burst == NULL)
return NULL;
net_hw = dev->data->dev_private;
diff --git a/drivers/net/nfp/nfp_rxtx.c b/drivers/net/nfp/nfp_rxtx.c
index 4fc3374987..da41a0e663 100644
--- a/drivers/net/nfp/nfp_rxtx.c
+++ b/drivers/net/nfp/nfp_rxtx.c
@@ -350,7 +350,7 @@ nfp_net_set_ptype(const struct nfp_ptype_parsed *nfp_ptype,
* @param mb
* Mbuf to set the packet type.
*/
-static void
+void
nfp_net_parse_ptype(struct nfp_net_rxq *rxq,
struct nfp_net_rx_desc *rxds,
struct rte_mbuf *mb)
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index fff8371991..c717d97003 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -245,5 +245,8 @@ void nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
uint16_t queue_id,
struct rte_eth_txq_info *qinfo);
void nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev);
+void nfp_net_parse_ptype(struct nfp_net_rxq *rxq,
+ struct nfp_net_rx_desc *rxds,
+ struct rte_mbuf *mb);
#endif /* __NFP_RXTX_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
index 7c18213624..508ec7faa5 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_avx2.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -111,6 +111,8 @@ nfp_vec_avx2_recv_set_rxpkt1(struct nfp_net_rxq *rxq,
nfp_net_meta_parse(rxds, rxq, hw, rx_pkt, &meta);
+ nfp_net_parse_ptype(rxq, rxds, rx_pkt);
+
/* Checking the checksum flag */
nfp_net_rx_cksum(rxq, rxds, rx_pkt);
}
--
2.39.1
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH v2 0/4] support AVX2 instruction Rx/Tx function
2024-07-08 5:58 ` [PATCH v2 " Chaoyong He
` (3 preceding siblings ...)
2024-07-08 5:58 ` [PATCH v2 4/4] net/nfp: vector Rx function supports parsing ptype Chaoyong He
@ 2024-07-08 11:45 ` Ferruh Yigit
2024-07-09 1:13 ` Chaoyong He
2024-07-09 7:29 ` [PATCH v3 " Chaoyong He
5 siblings, 1 reply; 26+ messages in thread
From: Ferruh Yigit @ 2024-07-08 11:45 UTC (permalink / raw)
To: Chaoyong He, dev; +Cc: oss-drivers
On 7/8/2024 6:58 AM, Chaoyong He wrote:
> This patch series add the support of Rx/Tx function using the
> AVX2 instruction.
>
> ---
> v2: rebase to the latest main branch.
> ---
>
> Long Wu (4):
> net/nfp: export more interfaces of NFDk
> net/nfp: support AVX2 Tx function
> net/nfp: support AVX2 Rx function
> net/nfp: vector Rx function supports parsing ptype
>
I can see still fails to apply to main, I guess because of the trace
patchset, let me go one by one, first I will get trace one and later
will ask for a new version of this set again with a rebase.
^ permalink raw reply [flat|nested] 26+ messages in thread
* RE: [PATCH v2 0/4] support AVX2 instruction Rx/Tx function
2024-07-08 11:45 ` [PATCH v2 0/4] support AVX2 instruction Rx/Tx function Ferruh Yigit
@ 2024-07-09 1:13 ` Chaoyong He
0 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09 1:13 UTC (permalink / raw)
To: Ferruh Yigit, dev; +Cc: oss-drivers
> On 7/8/2024 6:58 AM, Chaoyong He wrote:
> > This patch series add the support of Rx/Tx function using the
> > AVX2 instruction.
> >
> > ---
> > v2: rebase to the latest main branch.
> > ---
> >
> > Long Wu (4):
> > net/nfp: export more interfaces of NFDk
> > net/nfp: support AVX2 Tx function
> > net/nfp: support AVX2 Rx function
> > net/nfp: vector Rx function supports parsing ptype
> >
>
> I can see still fails to apply to main, I guess because of the trace patchset, let
> me go one by one, first I will get trace one and later will ask for a new version
> of this set again with a rebase.
We have also found the 'static_rte_common_nfp' problem you mentioned in the previous version patch series in our local CI environment.
We will make sure it is solved before we send out a new version of the patch series.
We will make sure it be solved before we send out a new version patch series.
Thanks.
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v3 0/4] support AVX2 instruction Rx/Tx function
2024-07-08 5:58 ` [PATCH v2 " Chaoyong He
` (4 preceding siblings ...)
2024-07-08 11:45 ` [PATCH v2 0/4] support AVX2 instruction Rx/Tx function Ferruh Yigit
@ 2024-07-09 7:29 ` Chaoyong He
2024-07-09 7:29 ` [PATCH v3 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
` (4 more replies)
5 siblings, 5 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09 7:29 UTC (permalink / raw)
To: dev; +Cc: oss-drivers, Chaoyong He
This patch series adds support for Rx/Tx functions using
AVX2 instructions.
---
v3:
* Fix the 'meson.build' file to solve the compile problem.
v2:
* Rebase to the latest main branch.
---
Long Wu (4):
net/nfp: export more interfaces of NFDk
net/nfp: support AVX2 Tx function
net/nfp: support AVX2 Rx function
net/nfp: vector Rx function supports parsing ptype
drivers/net/nfp/meson.build | 20 +
drivers/net/nfp/nfdk/nfp_nfdk.h | 46 +++
drivers/net/nfp/nfdk/nfp_nfdk_dp.c | 55 +--
drivers/net/nfp/nfdk/nfp_nfdk_vec.h | 36 ++
drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c | 432 ++++++++++++++++++++
drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c | 14 +
drivers/net/nfp/nfp_ethdev.c | 5 +-
drivers/net/nfp/nfp_ethdev_vf.c | 5 +-
drivers/net/nfp/nfp_net_common.c | 2 +-
drivers/net/nfp/nfp_net_meta.c | 1 +
drivers/net/nfp/nfp_rxtx.c | 12 +-
drivers/net/nfp/nfp_rxtx.h | 9 +-
drivers/net/nfp/nfp_rxtx_vec.h | 17 +
drivers/net/nfp/nfp_rxtx_vec_avx2.c | 275 +++++++++++++
drivers/net/nfp/nfp_rxtx_vec_stub.c | 25 ++
15 files changed, 905 insertions(+), 49 deletions(-)
create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec.h
create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
create mode 100644 drivers/net/nfp/nfp_rxtx_vec.h
create mode 100644 drivers/net/nfp/nfp_rxtx_vec_avx2.c
create mode 100644 drivers/net/nfp/nfp_rxtx_vec_stub.c
--
2.39.1
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v3 1/4] net/nfp: export more interfaces of NFDk
2024-07-09 7:29 ` [PATCH v3 " Chaoyong He
@ 2024-07-09 7:29 ` Chaoyong He
2024-07-09 7:29 ` [PATCH v3 2/4] net/nfp: support AVX2 Tx function Chaoyong He
` (3 subsequent siblings)
4 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09 7:29 UTC (permalink / raw)
To: dev; +Cc: oss-drivers, Long Wu, Chaoyong He
From: Long Wu <long.wu@corigine.com>
NFP will support NFDk vector Tx function, so move some
functions to header file for use by vector Tx function.
Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
drivers/net/nfp/nfdk/nfp_nfdk.h | 45 ++++++++++++++++++++++++++++++
drivers/net/nfp/nfdk/nfp_nfdk_dp.c | 43 +---------------------------
2 files changed, 46 insertions(+), 42 deletions(-)
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk.h b/drivers/net/nfp/nfdk/nfp_nfdk.h
index 2767fd51cd..89a98d13f3 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk.h
+++ b/drivers/net/nfp/nfdk/nfp_nfdk.h
@@ -6,7 +6,10 @@
#ifndef __NFP_NFDK_H__
#define __NFP_NFDK_H__
+#include <nfp_platform.h>
+
#include "../nfp_rxtx.h"
+#include "nfp_net_common.h"
#define NFDK_TX_DESC_PER_SIMPLE_PKT 2
@@ -161,6 +164,45 @@ nfp_net_nfdk_txq_full(struct nfp_net_txq *txq)
return (nfp_net_nfdk_free_tx_desc(txq) < txq->tx_free_thresh);
}
+static inline uint16_t
+nfp_net_nfdk_headlen_to_segs(uint16_t headlen)
+{
+ /* First descriptor fits less data, so adjust for that */
+ return DIV_ROUND_UP(headlen + NFDK_TX_MAX_DATA_PER_DESC - NFDK_TX_MAX_DATA_PER_HEAD,
+ NFDK_TX_MAX_DATA_PER_DESC);
+}
+
+/* Set TX CSUM offload flags in TX descriptor of nfdk */
+static inline uint64_t
+nfp_net_nfdk_tx_cksum(struct nfp_net_txq *txq,
+ struct rte_mbuf *mb,
+ uint64_t flags)
+{
+ uint64_t ol_flags;
+ struct nfp_net_hw *hw = txq->hw;
+
+ if ((hw->super.cap & NFP_NET_CFG_CTRL_TXCSUM) == 0)
+ return flags;
+
+ ol_flags = mb->ol_flags;
+
+ /* Set TCP csum offload if TSO enabled. */
+ if ((ol_flags & RTE_MBUF_F_TX_TCP_SEG) != 0)
+ flags |= NFDK_DESC_TX_L4_CSUM;
+
+ if ((ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) != 0)
+ flags |= NFDK_DESC_TX_ENCAP;
+
+ /* IPv6 does not need checksum */
+ if ((ol_flags & RTE_MBUF_F_TX_IP_CKSUM) != 0)
+ flags |= NFDK_DESC_TX_L3_CSUM;
+
+ if ((ol_flags & RTE_MBUF_F_TX_L4_MASK) != 0)
+ flags |= NFDK_DESC_TX_L4_CSUM;
+
+ return flags;
+}
+
uint32_t nfp_flower_nfdk_pkt_add_metadata(struct rte_mbuf *mbuf,
uint32_t port_id);
uint16_t nfp_net_nfdk_xmit_pkts_common(void *tx_queue,
@@ -177,5 +219,8 @@ int nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
const struct rte_eth_txconf *tx_conf);
int nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
struct rte_mbuf *pkt);
+int nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
+ struct nfp_net_txq *txq,
+ uint64_t *metadata);
#endif /* __NFP_NFDK_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
index 8b8c191b82..173aabf0b9 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
@@ -6,7 +6,6 @@
#include "nfp_nfdk.h"
#include <bus_pci_driver.h>
-#include <nfp_platform.h>
#include <rte_malloc.h>
#include "../flower/nfp_flower.h"
@@ -15,38 +14,6 @@
#define NFDK_TX_DESC_GATHER_MAX 17
-/* Set TX CSUM offload flags in TX descriptor of nfdk */
-static uint64_t
-nfp_net_nfdk_tx_cksum(struct nfp_net_txq *txq,
- struct rte_mbuf *mb,
- uint64_t flags)
-{
- uint64_t ol_flags;
- struct nfp_net_hw *hw = txq->hw;
-
- if ((hw->super.ctrl & NFP_NET_CFG_CTRL_TXCSUM) == 0)
- return flags;
-
- ol_flags = mb->ol_flags;
-
- /* Set L4 csum offload if TSO/UFO enabled. */
- if ((ol_flags & RTE_MBUF_F_TX_TCP_SEG) != 0 ||
- (ol_flags & RTE_MBUF_F_TX_UDP_SEG) != 0)
- flags |= NFDK_DESC_TX_L4_CSUM;
-
- if ((ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) != 0)
- flags |= NFDK_DESC_TX_ENCAP;
-
- /* IPv6 does not need checksum */
- if ((ol_flags & RTE_MBUF_F_TX_IP_CKSUM) != 0)
- flags |= NFDK_DESC_TX_L3_CSUM;
-
- if ((ol_flags & RTE_MBUF_F_TX_L4_MASK) != 0)
- flags |= NFDK_DESC_TX_L4_CSUM;
-
- return flags;
-}
-
/* Set TX descriptor for TSO of nfdk */
static uint64_t
nfp_net_nfdk_tx_tso(struct nfp_net_txq *txq,
@@ -100,14 +67,6 @@ nfp_flower_nfdk_pkt_add_metadata(struct rte_mbuf *mbuf,
return FLOWER_PKT_DATA_OFFSET;
}
-static inline uint16_t
-nfp_net_nfdk_headlen_to_segs(uint16_t headlen)
-{
- /* First descriptor fits less data, so adjust for that */
- return DIV_ROUND_UP(headlen + NFDK_TX_MAX_DATA_PER_DESC - NFDK_TX_MAX_DATA_PER_HEAD,
- NFDK_TX_MAX_DATA_PER_DESC);
-}
-
static inline void
nfp_net_nfdk_tx_close_block(struct nfp_net_txq *txq,
uint32_t nop_slots)
@@ -170,7 +129,7 @@ nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
return nop_slots;
}
-static int
+int
nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
struct nfp_net_txq *txq,
uint64_t *metadata)
--
2.39.1
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v3 2/4] net/nfp: support AVX2 Tx function
2024-07-09 7:29 ` [PATCH v3 " Chaoyong He
2024-07-09 7:29 ` [PATCH v3 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
@ 2024-07-09 7:29 ` Chaoyong He
2024-07-09 7:29 ` [PATCH v3 3/4] net/nfp: support AVX2 Rx function Chaoyong He
` (2 subsequent siblings)
4 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09 7:29 UTC (permalink / raw)
To: dev; +Cc: oss-drivers, Long Wu, Chaoyong He
From: Long Wu <long.wu@corigine.com>
Use AVX2 instructions to accelerate Tx performance. The
acceleration only works on x86 machines.
Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
drivers/net/nfp/meson.build | 20 +
drivers/net/nfp/nfdk/nfp_nfdk.h | 1 +
drivers/net/nfp/nfdk/nfp_nfdk_dp.c | 12 +
drivers/net/nfp/nfdk/nfp_nfdk_vec.h | 36 ++
drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c | 432 ++++++++++++++++++++
drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c | 14 +
drivers/net/nfp/nfp_ethdev.c | 3 +-
drivers/net/nfp/nfp_ethdev_vf.c | 3 +-
drivers/net/nfp/nfp_rxtx.h | 5 +-
drivers/net/nfp/nfp_rxtx_vec.h | 13 +
drivers/net/nfp/nfp_rxtx_vec_avx2.c | 21 +
drivers/net/nfp/nfp_rxtx_vec_stub.c | 16 +
12 files changed, 573 insertions(+), 3 deletions(-)
create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec.h
create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
create mode 100644 drivers/net/nfp/nfp_rxtx_vec.h
create mode 100644 drivers/net/nfp/nfp_rxtx_vec_avx2.c
create mode 100644 drivers/net/nfp/nfp_rxtx_vec_stub.c
diff --git a/drivers/net/nfp/meson.build b/drivers/net/nfp/meson.build
index d805644ec5..463a482a32 100644
--- a/drivers/net/nfp/meson.build
+++ b/drivers/net/nfp/meson.build
@@ -16,6 +16,7 @@ sources = files(
'flower/nfp_flower_service.c',
'nfd3/nfp_nfd3_dp.c',
'nfdk/nfp_nfdk_dp.c',
+ 'nfdk/nfp_nfdk_vec_stub.c',
'nfpcore/nfp_cppcore.c',
'nfpcore/nfp_crc.c',
'nfpcore/nfp_elf.c',
@@ -43,7 +44,26 @@ sources = files(
'nfp_net_flow.c',
'nfp_net_meta.c',
'nfp_rxtx.c',
+ 'nfp_rxtx_vec_stub.c',
'nfp_service.c',
)
+if arch_subdir == 'x86'
+ includes += include_directories('../../common/nfp')
+
+ avx2_sources = files(
+ 'nfdk/nfp_nfdk_vec_avx2_dp.c',
+ 'nfp_rxtx_vec_avx2.c',
+ )
+
+ nfp_avx2_lib = static_library('nfp_avx2_lib',
+ avx2_sources,
+ dependencies: [static_rte_ethdev, static_rte_bus_pci],
+ include_directories: includes,
+ c_args: [cflags, '-mavx2']
+ )
+
+ objs += nfp_avx2_lib.extract_all_objects(recursive: true)
+endif
+
deps += ['hash', 'security', 'common_nfp']
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk.h b/drivers/net/nfp/nfdk/nfp_nfdk.h
index 89a98d13f3..29d862f6f0 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk.h
+++ b/drivers/net/nfp/nfdk/nfp_nfdk.h
@@ -222,5 +222,6 @@ int nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
int nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
struct nfp_net_txq *txq,
uint64_t *metadata);
+void nfp_net_nfdk_xmit_pkts_set(struct rte_eth_dev *eth_dev);
#endif /* __NFP_NFDK_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
index 173aabf0b9..2cea5688b3 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
@@ -11,6 +11,8 @@
#include "../flower/nfp_flower.h"
#include "../nfp_logs.h"
#include "../nfp_net_meta.h"
+#include "../nfp_rxtx_vec.h"
+#include "nfp_nfdk_vec.h"
#define NFDK_TX_DESC_GATHER_MAX 17
@@ -511,6 +513,7 @@ nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
dev->data->tx_queues[queue_idx] = txq;
txq->hw = hw;
txq->hw_priv = dev->process_private;
+ txq->simple_always = true;
/*
* Telling the HW about the physical address of the TX ring and number
@@ -521,3 +524,12 @@ nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
return 0;
}
+
+void
+nfp_net_nfdk_xmit_pkts_set(struct rte_eth_dev *eth_dev)
+{
+ if (nfp_net_get_avx2_supported())
+ eth_dev->tx_pkt_burst = nfp_net_nfdk_vec_avx2_xmit_pkts;
+ else
+ eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+}
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec.h b/drivers/net/nfp/nfdk/nfp_nfdk_vec.h
new file mode 100644
index 0000000000..14319d6cf6
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_NFDK_VEC_H__
+#define __NFP_NFDK_VEC_H__
+
+#include <stdbool.h>
+
+#include <rte_mbuf_core.h>
+
+#include "../nfp_net_common.h"
+#include "nfp_nfdk.h"
+
+static inline bool
+nfp_net_nfdk_is_simple_packet(struct rte_mbuf *pkt,
+ struct nfp_net_hw *hw)
+{
+ if (pkt->data_len > NFDK_TX_MAX_DATA_PER_HEAD)
+ return false;
+
+ if ((hw->super.cap & NFP_NET_CFG_CTRL_LSO_ANY) == 0)
+ return true;
+
+ if ((pkt->ol_flags & RTE_MBUF_F_TX_TCP_SEG) == 0)
+ return true;
+
+ return false;
+}
+
+uint16_t nfp_net_nfdk_vec_avx2_xmit_pkts(void *tx_queue,
+ struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts);
+
+#endif /* __NFP_NFDK_VEC_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
new file mode 100644
index 0000000000..6d1359fdb1
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
@@ -0,0 +1,432 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <bus_pci_driver.h>
+#include <ethdev_driver.h>
+#include <rte_malloc.h>
+
+#include "../nfp_logs.h"
+#include "nfp_nfdk.h"
+#include "nfp_nfdk_vec.h"
+
+/*
+ * One simple packet needs 2 descriptors so if send 4 packets driver will use
+ * 8 descriptors at once.
+ */
+#define NFDK_SIMPLE_BURST_DES_NUM 8
+
+#define NFDK_SIMPLE_DES_TYPE (NFDK_DESC_TX_EOP | \
+ (NFDK_DESC_TX_TYPE_HEAD & (NFDK_DESC_TX_TYPE_SIMPLE << 12)))
+
+static inline int
+nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(struct rte_mbuf *pkt,
+ struct nfp_net_txq *txq,
+ uint64_t *des_addr,
+ uint64_t *des_meta,
+ bool repr_flag)
+{
+ int ret;
+ __m128i dma_addr;
+ __m128i dma_hi;
+ __m128i data_off;
+ __m128i dlen_type;
+ uint64_t metadata;
+
+ if (repr_flag) {
+ metadata = NFDK_DESC_TX_CHAIN_META;
+ } else {
+ ret = nfp_net_nfdk_set_meta_data(pkt, txq, &metadata);
+ if (unlikely(ret != 0))
+ return ret;
+ }
+
+ data_off = _mm_set_epi64x(0, pkt->data_off);
+ dma_addr = _mm_add_epi64(_mm_loadu_si128((__m128i *)&pkt->buf_addr), data_off);
+ dma_hi = _mm_srli_epi64(dma_addr, 32);
+
+ dlen_type = _mm_set_epi64x(0, (pkt->data_len - 1) | NFDK_SIMPLE_DES_TYPE);
+
+ *des_addr = _mm_extract_epi64(_mm_add_epi64(_mm_unpacklo_epi32(dma_hi, dma_addr),
+ _mm_slli_epi64(dlen_type, 16)), 0);
+
+ *des_meta = nfp_net_nfdk_tx_cksum(txq, pkt, metadata);
+
+ return 0;
+}
+
+static inline int
+nfp_net_nfdk_vec_avx2_xmit_simple_send1(struct nfp_net_txq *txq,
+ struct nfp_net_nfdk_tx_desc *txds,
+ struct rte_mbuf *pkt,
+ bool repr_flag)
+{
+ int ret;
+ __m128i des_data;
+ uint64_t des_addr;
+ uint64_t des_meta;
+
+ ret = nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(pkt, txq, &des_addr,
+ &des_meta, repr_flag);
+ if (unlikely(ret != 0))
+ return ret;
+
+ txq->wr_p = D_IDX(txq, txq->wr_p + NFDK_TX_DESC_PER_SIMPLE_PKT);
+ if ((txq->wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) != 0)
+ txq->data_pending += pkt->data_len;
+ else
+ txq->data_pending = 0;
+
+ des_data = _mm_set_epi64x(des_meta, des_addr);
+
+ _mm_store_si128((void *)txds, des_data);
+
+ return 0;
+}
+
+static inline int
+nfp_vec_avx2_nfdk_xmit_simple_send4(struct nfp_net_txq *txq,
+ struct nfp_net_nfdk_tx_desc *txds,
+ struct rte_mbuf **pkt,
+ bool repr_flag)
+{
+ int ret;
+ uint16_t i;
+ __m256i des_data0_1;
+ __m256i des_data2_3;
+ uint64_t des_addr[4];
+ uint64_t des_meta[4];
+
+ for (i = 0; i < 4; i++) {
+ ret = nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(pkt[i], txq,
+ &des_addr[i], &des_meta[i], repr_flag);
+ if (unlikely(ret != 0))
+ return ret;
+ }
+
+ for (i = 0; i < 4; i++) {
+ txq->wr_p = D_IDX(txq, txq->wr_p + NFDK_TX_DESC_PER_SIMPLE_PKT);
+ if ((txq->wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) != 0)
+ txq->data_pending += pkt[i]->data_len;
+ else
+ txq->data_pending = 0;
+ }
+
+ des_data0_1 = _mm256_set_epi64x(des_meta[1], des_addr[1], des_meta[0], des_addr[0]);
+ des_data2_3 = _mm256_set_epi64x(des_meta[3], des_addr[3], des_meta[2], des_addr[2]);
+
+ _mm256_store_si256((void *)txds, des_data0_1);
+ _mm256_store_si256((void *)(txds + 4), des_data2_3);
+
+ return 0;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_mbuf_store4(struct rte_mbuf **mbuf,
+ struct rte_mbuf **tx_pkts)
+{
+ __m256i mbuf_room0_1;
+ __m256i mbuf_room2_3;
+
+ mbuf_room0_1 = _mm256_set_epi64x(0, (uintptr_t)tx_pkts[1], 0,
+ (uintptr_t)tx_pkts[0]);
+ mbuf_room2_3 = _mm256_set_epi64x(0, (uintptr_t)tx_pkts[3], 0,
+ (uintptr_t)tx_pkts[2]);
+
+ _mm256_store_si256((void *)mbuf, mbuf_room0_1);
+ _mm256_store_si256((void *)(mbuf + 4), mbuf_room2_3);
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_simple_pkts(struct nfp_net_txq *txq,
+ struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts,
+ uint16_t simple_close,
+ bool repr_flag)
+{
+ int ret;
+ uint16_t npkts = 0;
+ uint16_t need_txds;
+ uint16_t free_descs;
+ struct rte_mbuf **lmbuf;
+ struct nfp_net_nfdk_tx_desc *ktxds;
+
+ PMD_TX_LOG(DEBUG, "Working for queue %hu at pos %u and %hu packets",
+ txq->qidx, txq->wr_p, nb_pkts);
+
+ need_txds = nb_pkts << 1;
+ if (nfp_net_nfdk_free_tx_desc(txq) < need_txds || nfp_net_nfdk_txq_full(txq))
+ nfp_net_tx_free_bufs(txq);
+
+ free_descs = nfp_net_nfdk_free_tx_desc(txq);
+ if (unlikely(free_descs < NFDK_TX_DESC_PER_SIMPLE_PKT)) {
+ if (unlikely(simple_close > 0))
+ goto xmit_end;
+
+ return 0;
+ }
+
+ PMD_TX_LOG(DEBUG, "Queue: %hu. Sending %hu packets", txq->qidx, nb_pkts);
+
+ /* Sending packets */
+ while (npkts < nb_pkts && free_descs >= NFDK_TX_DESC_PER_SIMPLE_PKT) {
+ ktxds = &txq->ktxds[txq->wr_p];
+ lmbuf = &txq->txbufs[txq->wr_p].mbuf;
+
+ /*
+ * If can not send burst, just send one.
+ * 1. Tx ring will come to the tail.
+ * 2. Do not need to send 4 packets.
+ * 3. If pointer address unaligned on 32-bit boundary.
+ * 4. If free descriptors are not enough.
+ */
+ if ((txq->tx_count - txq->wr_p) < NFDK_SIMPLE_BURST_DES_NUM ||
+ (nb_pkts - npkts) < 4 ||
+ ((uintptr_t)ktxds & 0x1F) != 0 ||
+ free_descs < NFDK_SIMPLE_BURST_DES_NUM) {
+ ret = nfp_net_nfdk_vec_avx2_xmit_simple_send1(txq,
+ ktxds, tx_pkts[npkts], repr_flag);
+ if (unlikely(ret != 0))
+ goto xmit_end;
+
+ rte_pktmbuf_free(*lmbuf);
+
+ _mm_storel_epi64((void *)lmbuf,
+ _mm_loadu_si128((void *)&tx_pkts[npkts]));
+ npkts++;
+ free_descs -= NFDK_TX_DESC_PER_SIMPLE_PKT;
+ continue;
+ }
+
+ ret = nfp_vec_avx2_nfdk_xmit_simple_send4(txq, ktxds,
+ &tx_pkts[npkts], repr_flag);
+ if (unlikely(ret != 0))
+ goto xmit_end;
+
+ rte_pktmbuf_free_bulk(lmbuf, NFDK_SIMPLE_BURST_DES_NUM);
+
+ nfp_net_nfdk_vec_avx2_xmit_mbuf_store4(lmbuf, &tx_pkts[npkts]);
+
+ npkts += 4;
+ free_descs -= NFDK_SIMPLE_BURST_DES_NUM;
+ }
+
+xmit_end:
+ /* Increment write pointers. Force memory write before we let HW know */
+ rte_wmb();
+ nfp_qcp_ptr_add(txq->qcp_q, NFP_QCP_WRITE_PTR, ((npkts << 1) + simple_close));
+
+ return npkts;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_simple_close_block(struct nfp_net_txq *txq,
+ uint16_t *simple_close)
+{
+ uint16_t i;
+ uint16_t wr_p;
+ uint16_t nop_slots;
+ __m128i zero_128 = _mm_setzero_si128();
+ __m256i zero_256 = _mm256_setzero_si256();
+
+ wr_p = txq->wr_p;
+ nop_slots = D_BLOCK_CPL(wr_p);
+
+ for (i = nop_slots; i >= 4; i -= 4, wr_p += 4) {
+ _mm256_store_si256((void *)&txq->ktxds[wr_p], zero_256);
+ rte_pktmbuf_free_bulk(&txq->txbufs[wr_p].mbuf, 4);
+ _mm256_store_si256((void *)&txq->txbufs[wr_p], zero_256);
+ }
+
+ for (; i >= 2; i -= 2, wr_p += 2) {
+ _mm_store_si128((void *)&txq->ktxds[wr_p], zero_128);
+ rte_pktmbuf_free_bulk(&txq->txbufs[wr_p].mbuf, 2);
+ _mm_store_si128((void *)&txq->txbufs[wr_p], zero_128);
+ }
+
+ for (; i >= 1; i--, wr_p++) {
+ _mm_storel_epi64((void *)&txq->ktxds[wr_p], zero_128);
+ rte_pktmbuf_free(txq->txbufs[wr_p].mbuf);
+ _mm_storel_epi64((void *)&txq->txbufs[wr_p], zero_128);
+ }
+
+ txq->data_pending = 0;
+ txq->wr_p = D_IDX(txq, txq->wr_p + nop_slots);
+
+ (*simple_close) += nop_slots;
+}
+
+static inline uint32_t
+nfp_net_nfdk_vec_avx2_xmit_simple_prepare(struct nfp_net_txq *txq,
+ uint16_t *simple_close)
+{
+ uint16_t wr_p;
+ __m128i zero_128 = _mm_setzero_si128();
+
+ wr_p = txq->wr_p;
+
+ _mm_storel_epi64((void *)&txq->ktxds[wr_p], zero_128);
+ rte_pktmbuf_free(txq->txbufs[wr_p].mbuf);
+ _mm_storel_epi64((void *)&txq->txbufs[wr_p], zero_128);
+
+ txq->wr_p = D_IDX(txq, wr_p + 1);
+ (*simple_close)++;
+
+ return txq->wr_p;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_simple_check(struct nfp_net_txq *txq,
+ struct rte_mbuf *pkt,
+ bool *simple_flag,
+ bool *pending_flag,
+ uint16_t *data_pending,
+ uint32_t *wr_p,
+ uint16_t *simple_close)
+{
+ uint32_t data_pending_temp;
+
+ /* Let the first descriptor index even before send simple packets */
+ if (!(*simple_flag)) {
+ if ((*wr_p & 0x1) == 0x1)
+ *wr_p = nfp_net_nfdk_vec_avx2_xmit_simple_prepare(txq, simple_close);
+
+ *simple_flag = true;
+ }
+
+ /* Simple packets only need one close block operation */
+ if (!(*pending_flag)) {
+ if ((*wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) == 0) {
+ *pending_flag = true;
+ return;
+ }
+
+ data_pending_temp = *data_pending + pkt->data_len;
+ if (data_pending_temp > NFDK_TX_MAX_DATA_PER_BLOCK) {
+ nfp_net_nfdk_vec_avx2_xmit_simple_close_block(txq, simple_close);
+ *pending_flag = true;
+ return;
+ }
+
+ *data_pending = data_pending_temp;
+
+ *wr_p += 2;
+ }
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_simple_count(struct nfp_net_txq *txq,
+ struct rte_mbuf **tx_pkts,
+ uint16_t head,
+ uint16_t nb_pkts,
+ uint16_t *simple_close)
+{
+ uint32_t wr_p;
+ uint16_t simple_idx;
+ struct rte_mbuf *pkt;
+ uint16_t data_pending;
+ bool simple_flag = false;
+ bool pending_flag = false;
+ uint16_t simple_count = 0;
+
+ *simple_close = 0;
+ wr_p = txq->wr_p;
+ data_pending = txq->data_pending;
+
+ for (simple_idx = head; simple_idx < nb_pkts; simple_idx++) {
+ pkt = tx_pkts[simple_idx];
+ if (!nfp_net_nfdk_is_simple_packet(pkt, txq->hw))
+ break;
+
+ simple_count++;
+ if (!txq->simple_always)
+ nfp_net_nfdk_vec_avx2_xmit_simple_check(txq, pkt, &simple_flag,
+ &pending_flag, &data_pending, &wr_p, simple_close);
+ }
+
+ return simple_count;
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_others_count(struct nfp_net_txq *txq,
+ struct rte_mbuf **tx_pkts,
+ uint16_t head,
+ uint16_t nb_pkts)
+{
+ uint16_t others_idx;
+ struct rte_mbuf *pkt;
+ uint16_t others_count = 0;
+
+ for (others_idx = head; others_idx < nb_pkts; others_idx++) {
+ pkt = tx_pkts[others_idx];
+ if (nfp_net_nfdk_is_simple_packet(pkt, txq->hw))
+ break;
+
+ others_count++;
+ }
+
+ return others_count;
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_common(void *tx_queue,
+ struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ uint16_t i;
+ uint16_t avail = 0;
+ uint16_t simple_close;
+ uint16_t simple_count;
+ uint16_t simple_avail;
+ uint16_t others_count;
+ uint16_t others_avail;
+ struct nfp_net_txq *txq = tx_queue;
+
+ for (i = 0; i < nb_pkts; i++) {
+ simple_count = nfp_net_nfdk_vec_avx2_xmit_simple_count(txq, tx_pkts, i,
+ nb_pkts, &simple_close);
+ if (simple_count > 0) {
+ if (!txq->simple_always)
+ txq->simple_always = true;
+
+ simple_avail = nfp_net_nfdk_vec_avx2_xmit_simple_pkts(txq,
+ tx_pkts + i, simple_count, simple_close,
+ false);
+
+ avail += simple_avail;
+ if (simple_avail != simple_count)
+ break;
+
+ i += simple_count;
+ }
+
+ if (i == nb_pkts)
+ break;
+
+ others_count = nfp_net_nfdk_vec_avx2_xmit_others_count(txq, tx_pkts,
+ i, nb_pkts);
+
+ if (txq->simple_always)
+ txq->simple_always = false;
+
+ others_avail = nfp_net_nfdk_xmit_pkts_common(tx_queue,
+ tx_pkts + i, others_count, false);
+
+ avail += others_avail;
+ if (others_avail != others_count)
+ break;
+
+ i += others_count;
+ }
+
+ return avail;
+}
+
+uint16_t
+nfp_net_nfdk_vec_avx2_xmit_pkts(void *tx_queue,
+ struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ return nfp_net_nfdk_vec_avx2_xmit_common(tx_queue, tx_pkts, nb_pkts);
+}
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c b/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
new file mode 100644
index 0000000000..146ec21d51
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include "nfp_nfdk_vec.h"
+
+uint16_t __rte_weak
+nfp_net_nfdk_vec_avx2_xmit_pkts(__rte_unused void *tx_queue,
+ __rte_unused struct rte_mbuf **tx_pkts,
+ __rte_unused uint16_t nb_pkts)
+{
+ return 0;
+}
diff --git a/drivers/net/nfp/nfp_ethdev.c b/drivers/net/nfp/nfp_ethdev.c
index 8c0cacd3fc..a7b40af712 100644
--- a/drivers/net/nfp/nfp_ethdev.c
+++ b/drivers/net/nfp/nfp_ethdev.c
@@ -28,6 +28,7 @@
#include "nfp_ipsec.h"
#include "nfp_logs.h"
#include "nfp_net_flow.h"
+#include "nfp_rxtx_vec.h"
/* 64-bit per app capabilities */
#define NFP_NET_APP_CAP_SP_INDIFF RTE_BIT64(0) /* Indifferent to port speed */
@@ -964,7 +965,7 @@ nfp_net_ethdev_ops_mount(struct nfp_net_hw *hw,
if (hw->ver.extend == NFP_NET_CFG_VERSION_DP_NFD3)
eth_dev->tx_pkt_burst = nfp_net_nfd3_xmit_pkts;
else
- eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+ nfp_net_nfdk_xmit_pkts_set(eth_dev);
eth_dev->dev_ops = &nfp_net_eth_dev_ops;
eth_dev->rx_queue_count = nfp_net_rx_queue_count;
diff --git a/drivers/net/nfp/nfp_ethdev_vf.c b/drivers/net/nfp/nfp_ethdev_vf.c
index e7c18fe90a..b955624ed6 100644
--- a/drivers/net/nfp/nfp_ethdev_vf.c
+++ b/drivers/net/nfp/nfp_ethdev_vf.c
@@ -14,6 +14,7 @@
#include "nfp_logs.h"
#include "nfp_net_common.h"
+#include "nfp_rxtx_vec.h"
#define NFP_VF_DRIVER_NAME net_nfp_vf
@@ -240,7 +241,7 @@ nfp_netvf_ethdev_ops_mount(struct nfp_net_hw *hw,
if (hw->ver.extend == NFP_NET_CFG_VERSION_DP_NFD3)
eth_dev->tx_pkt_burst = nfp_net_nfd3_xmit_pkts;
else
- eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+ nfp_net_nfdk_xmit_pkts_set(eth_dev);
eth_dev->dev_ops = &nfp_netvf_eth_dev_ops;
eth_dev->rx_queue_count = nfp_net_rx_queue_count;
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index 9806384a63..3ddf717da0 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -69,9 +69,12 @@ struct __rte_aligned(64) nfp_net_txq {
/** Used by NFDk only */
uint16_t data_pending;
+ /** Used by NFDk vector xmit only */
+ bool simple_always;
+
/**
* At this point 58 bytes have been used for all the fields in the
- * TX critical path. We have room for 6 bytes and still all placed
+ * TX critical path. We have room for 5 bytes and still all placed
* in a cache line.
*/
uint64_t dma;
diff --git a/drivers/net/nfp/nfp_rxtx_vec.h b/drivers/net/nfp/nfp_rxtx_vec.h
new file mode 100644
index 0000000000..c92660f963
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_RXTX_VEC_AVX2_H__
+#define __NFP_RXTX_VEC_AVX2_H__
+
+#include <stdbool.h>
+
+bool nfp_net_get_avx2_supported(void);
+
+#endif /* __NFP_RXTX_VEC_AVX2_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
new file mode 100644
index 0000000000..50638e74ab
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <stdbool.h>
+
+#include <rte_cpuflags.h>
+#include <rte_vect.h>
+
+#include "nfp_rxtx_vec.h"
+
+bool
+nfp_net_get_avx2_supported(void)
+{
+ if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 &&
+ rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
+ return true;
+
+ return false;
+}
diff --git a/drivers/net/nfp/nfp_rxtx_vec_stub.c b/drivers/net/nfp/nfp_rxtx_vec_stub.c
new file mode 100644
index 0000000000..1bc55b67e0
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec_stub.c
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <stdbool.h>
+
+#include <rte_common.h>
+
+#include "nfp_rxtx_vec.h"
+
+bool __rte_weak
+nfp_net_get_avx2_supported(void)
+{
+ return false;
+}
--
2.39.1
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v3 3/4] net/nfp: support AVX2 Rx function
2024-07-09 7:29 ` [PATCH v3 " Chaoyong He
2024-07-09 7:29 ` [PATCH v3 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
2024-07-09 7:29 ` [PATCH v3 2/4] net/nfp: support AVX2 Tx function Chaoyong He
@ 2024-07-09 7:29 ` Chaoyong He
2024-07-09 7:29 ` [PATCH v3 4/4] net/nfp: vector Rx function supports parsing ptype Chaoyong He
2024-07-09 8:24 ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Chaoyong He
4 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09 7:29 UTC (permalink / raw)
To: dev; +Cc: oss-drivers, Long Wu, Peng Zhang, Chaoyong He
From: Long Wu <long.wu@corigine.com>
Use AVX2 instructions to accelerate Rx performance. The
acceleration only works on x86 machines.
Signed-off-by: Peng Zhang <peng.zhang@corigine.com>
Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
drivers/net/nfp/nfp_ethdev.c | 2 +-
drivers/net/nfp/nfp_ethdev_vf.c | 2 +-
drivers/net/nfp/nfp_net_meta.c | 1 +
drivers/net/nfp/nfp_rxtx.c | 10 ++
drivers/net/nfp/nfp_rxtx.h | 1 +
drivers/net/nfp/nfp_rxtx_vec.h | 4 +
drivers/net/nfp/nfp_rxtx_vec_avx2.c | 252 ++++++++++++++++++++++++++++
drivers/net/nfp/nfp_rxtx_vec_stub.c | 9 +
8 files changed, 279 insertions(+), 2 deletions(-)
diff --git a/drivers/net/nfp/nfp_ethdev.c b/drivers/net/nfp/nfp_ethdev.c
index a7b40af712..bd35df2dc9 100644
--- a/drivers/net/nfp/nfp_ethdev.c
+++ b/drivers/net/nfp/nfp_ethdev.c
@@ -969,7 +969,7 @@ nfp_net_ethdev_ops_mount(struct nfp_net_hw *hw,
eth_dev->dev_ops = &nfp_net_eth_dev_ops;
eth_dev->rx_queue_count = nfp_net_rx_queue_count;
- eth_dev->rx_pkt_burst = &nfp_net_recv_pkts;
+ nfp_net_recv_pkts_set(eth_dev);
}
static int
diff --git a/drivers/net/nfp/nfp_ethdev_vf.c b/drivers/net/nfp/nfp_ethdev_vf.c
index b955624ed6..cdf5da3af7 100644
--- a/drivers/net/nfp/nfp_ethdev_vf.c
+++ b/drivers/net/nfp/nfp_ethdev_vf.c
@@ -245,7 +245,7 @@ nfp_netvf_ethdev_ops_mount(struct nfp_net_hw *hw,
eth_dev->dev_ops = &nfp_netvf_eth_dev_ops;
eth_dev->rx_queue_count = nfp_net_rx_queue_count;
- eth_dev->rx_pkt_burst = &nfp_net_recv_pkts;
+ nfp_net_recv_pkts_set(eth_dev);
}
static int
diff --git a/drivers/net/nfp/nfp_net_meta.c b/drivers/net/nfp/nfp_net_meta.c
index b31ef56f17..07c6758d33 100644
--- a/drivers/net/nfp/nfp_net_meta.c
+++ b/drivers/net/nfp/nfp_net_meta.c
@@ -80,6 +80,7 @@ nfp_net_meta_parse_single(uint8_t *meta_base,
rte_be32_t meta_header,
struct nfp_net_meta_parsed *meta)
{
+ meta->flags = 0;
meta->flags |= (1 << NFP_NET_META_HASH);
meta->hash_type = rte_be_to_cpu_32(meta_header);
meta->hash = rte_be_to_cpu_32(*(rte_be32_t *)(meta_base + 4));
diff --git a/drivers/net/nfp/nfp_rxtx.c b/drivers/net/nfp/nfp_rxtx.c
index 1db79ad1cd..4fc3374987 100644
--- a/drivers/net/nfp/nfp_rxtx.c
+++ b/drivers/net/nfp/nfp_rxtx.c
@@ -17,6 +17,7 @@
#include "nfp_ipsec.h"
#include "nfp_logs.h"
#include "nfp_net_meta.h"
+#include "nfp_rxtx_vec.h"
/*
* The bit format and map of nfp packet type for rxd.offload_info in Rx descriptor.
@@ -867,3 +868,12 @@ nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
info->conf.offloads = dev_info.tx_offload_capa &
dev->data->dev_conf.txmode.offloads;
}
+
+void
+nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev)
+{
+ if (nfp_net_get_avx2_supported())
+ eth_dev->rx_pkt_burst = nfp_net_vec_avx2_recv_pkts;
+ else
+ eth_dev->rx_pkt_burst = nfp_net_recv_pkts;
+}
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index 3ddf717da0..fff8371991 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -244,5 +244,6 @@ void nfp_net_rx_queue_info_get(struct rte_eth_dev *dev,
void nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
uint16_t queue_id,
struct rte_eth_txq_info *qinfo);
+void nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev);
#endif /* __NFP_RXTX_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec.h b/drivers/net/nfp/nfp_rxtx_vec.h
index c92660f963..8720662744 100644
--- a/drivers/net/nfp/nfp_rxtx_vec.h
+++ b/drivers/net/nfp/nfp_rxtx_vec.h
@@ -10,4 +10,8 @@
bool nfp_net_get_avx2_supported(void);
+uint16_t nfp_net_vec_avx2_recv_pkts(void *rx_queue,
+ struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
+
#endif /* __NFP_RXTX_VEC_AVX2_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
index 50638e74ab..7c18213624 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_avx2.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -5,9 +5,14 @@
#include <stdbool.h>
+#include <bus_pci_driver.h>
+#include <ethdev_driver.h>
#include <rte_cpuflags.h>
#include <rte_vect.h>
+#include "nfp_logs.h"
+#include "nfp_net_common.h"
+#include "nfp_net_meta.h"
#include "nfp_rxtx_vec.h"
bool
@@ -19,3 +24,250 @@ nfp_net_get_avx2_supported(void)
return false;
}
+
+static inline void
+nfp_vec_avx2_recv_set_des1(struct nfp_net_rxq *rxq,
+ struct nfp_net_rx_desc *rxds,
+ struct rte_mbuf *rxb)
+{
+ __m128i dma;
+ __m128i dma_hi;
+ __m128i vaddr0;
+ __m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);
+
+ dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb->buf_addr), hdr_room);
+ dma_hi = _mm_srli_epi64(dma, 32);
+ vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);
+
+ _mm_storel_epi64((void *)rxds, vaddr0);
+
+ rxq->rd_p = (rxq->rd_p + 1) & (rxq->rx_count - 1);
+}
+
+static inline void
+nfp_vec_avx2_recv_set_des4(struct nfp_net_rxq *rxq,
+ struct nfp_net_rx_desc *rxds,
+ struct rte_mbuf **rxb)
+{
+ __m128i dma;
+ __m128i dma_hi;
+ __m128i vaddr0;
+ __m128i vaddr1;
+ __m128i vaddr2;
+ __m128i vaddr3;
+ __m128i vaddr0_1;
+ __m128i vaddr2_3;
+ __m256i vaddr0_3;
+ __m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);
+
+ dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[0]->buf_addr), hdr_room);
+ dma_hi = _mm_srli_epi64(dma, 32);
+ vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);
+
+ dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[1]->buf_addr), hdr_room);
+ dma_hi = _mm_srli_epi64(dma, 32);
+ vaddr1 = _mm_unpacklo_epi32(dma_hi, dma);
+
+ dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[2]->buf_addr), hdr_room);
+ dma_hi = _mm_srli_epi64(dma, 32);
+ vaddr2 = _mm_unpacklo_epi32(dma_hi, dma);
+
+ dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[3]->buf_addr), hdr_room);
+ dma_hi = _mm_srli_epi64(dma, 32);
+ vaddr3 = _mm_unpacklo_epi32(dma_hi, dma);
+
+ vaddr0_1 = _mm_unpacklo_epi64(vaddr0, vaddr1);
+ vaddr2_3 = _mm_unpacklo_epi64(vaddr2, vaddr3);
+
+ vaddr0_3 = _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0_1),
+ vaddr2_3, 1);
+
+ _mm256_store_si256((void *)rxds, vaddr0_3);
+
+ rxq->rd_p = (rxq->rd_p + 4) & (rxq->rx_count - 1);
+}
+
+static inline void
+nfp_vec_avx2_recv_set_rxpkt1(struct nfp_net_rxq *rxq,
+ struct nfp_net_rx_desc *rxds,
+ struct rte_mbuf *rx_pkt)
+{
+ struct nfp_net_hw *hw = rxq->hw;
+ struct nfp_net_meta_parsed meta;
+
+ rx_pkt->data_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
+ /* Size of the whole packet. We just support 1 segment */
+ rx_pkt->pkt_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
+
+ /* Filling the received mbuf with packet info */
+ if (hw->rx_offset)
+ rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + hw->rx_offset;
+ else
+ rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + NFP_DESC_META_LEN(rxds);
+
+ rx_pkt->port = rxq->port_id;
+ rx_pkt->nb_segs = 1;
+ rx_pkt->next = NULL;
+
+ nfp_net_meta_parse(rxds, rxq, hw, rx_pkt, &meta);
+
+ /* Checking the checksum flag */
+ nfp_net_rx_cksum(rxq, rxds, rx_pkt);
+}
+
+static inline void
+nfp_vec_avx2_recv1(struct nfp_net_rxq *rxq,
+ struct nfp_net_rx_desc *rxds,
+ struct rte_mbuf *rxb,
+ struct rte_mbuf *rx_pkt)
+{
+ nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkt);
+
+ nfp_vec_avx2_recv_set_des1(rxq, rxds, rxb);
+}
+
+static inline void
+nfp_vec_avx2_recv4(struct nfp_net_rxq *rxq,
+ struct nfp_net_rx_desc *rxds,
+ struct rte_mbuf **rxb,
+ struct rte_mbuf **rx_pkts)
+{
+ nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkts[0]);
+ nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 1, rx_pkts[1]);
+ nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 2, rx_pkts[2]);
+ nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 3, rx_pkts[3]);
+
+ nfp_vec_avx2_recv_set_des4(rxq, rxds, rxb);
+}
+
+static inline bool
+nfp_vec_avx2_recv_check_packets4(struct nfp_net_rx_desc *rxds)
+{
+ __m256i data = _mm256_loadu_si256((void *)rxds);
+
+ if ((_mm256_extract_epi8(data, 3) & PCIE_DESC_RX_DD) == 0 ||
+ (_mm256_extract_epi8(data, 11) & PCIE_DESC_RX_DD) == 0 ||
+ (_mm256_extract_epi8(data, 19) & PCIE_DESC_RX_DD) == 0 ||
+ (_mm256_extract_epi8(data, 27) & PCIE_DESC_RX_DD) == 0)
+ return false;
+
+ return true;
+}
+
+uint16_t
+nfp_net_vec_avx2_recv_pkts(void *rx_queue,
+ struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ uint16_t avail;
+ uint16_t nb_hold;
+ bool burst_receive;
+ struct rte_mbuf **rxb;
+ struct nfp_net_rx_desc *rxds;
+ struct nfp_net_rxq *rxq = rx_queue;
+
+ if (unlikely(rxq == NULL)) {
+ PMD_RX_LOG(ERR, "RX Bad queue");
+ return 0;
+ }
+
+ avail = 0;
+ nb_hold = 0;
+ burst_receive = true;
+ while (avail < nb_pkts) {
+ rxds = &rxq->rxds[rxq->rd_p];
+ rxb = &rxq->rxbufs[rxq->rd_p].mbuf;
+
+ if ((_mm_extract_epi8(_mm_loadu_si128((void *)(rxds)), 3)
+ & PCIE_DESC_RX_DD) == 0)
+ goto recv_end;
+
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p].mbuf);
+
+ if ((rxq->rd_p & 0x3) == 0) {
+ rte_prefetch0(&rxq->rxds[rxq->rd_p]);
+ rte_prefetch0(&rxq->rxbufs[rxq->rd_p]);
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p + 1].mbuf);
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p + 2].mbuf);
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p + 3].mbuf);
+ }
+
+ if ((rxq->rd_p & 0x7) == 0) {
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p + 4].mbuf);
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p + 5].mbuf);
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p + 6].mbuf);
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p + 7].mbuf);
+ }
+
+ /*
+ * If a burst cannot be received, just receive one packet.
+ * 1. The Rx ring is coming to the tail.
+ * 2. There is no need to receive 4 packets.
+ * 3. The descriptor address is not aligned on a 32-byte boundary.
+ * 4. The Rx ring does not have 4 packets, or allocating 4 mbufs failed.
+ */
+ if ((rxq->rx_count - rxq->rd_p) < 4 ||
+ (nb_pkts - avail) < 4 ||
+ ((uintptr_t)rxds & 0x1F) != 0 ||
+ !burst_receive) {
+ _mm_storel_epi64((void *)&rx_pkts[avail],
+ _mm_loadu_si128((void *)rxb));
+
+ /* Allocate a new mbuf into the software ring. */
+ if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 1) < 0) {
+ PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u queue_id=%hu",
+ rxq->port_id, rxq->qidx);
+ nfp_net_mbuf_alloc_failed(rxq);
+ goto recv_end;
+ }
+
+ nfp_vec_avx2_recv1(rxq, rxds, *rxb, rx_pkts[avail]);
+
+ avail++;
+ nb_hold++;
+ continue;
+ }
+
+ burst_receive = nfp_vec_avx2_recv_check_packets4(rxds);
+ if (!burst_receive)
+ continue;
+
+ _mm256_storeu_si256((void *)&rx_pkts[avail],
+ _mm256_loadu_si256((void *)rxb));
+
+ /* Allocate 4 new mbufs into the software ring. */
+ if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 4) < 0) {
+ burst_receive = false;
+ continue;
+ }
+
+ nfp_vec_avx2_recv4(rxq, rxds, rxb, &rx_pkts[avail]);
+
+ avail += 4;
+ nb_hold += 4;
+ }
+
+recv_end:
+ if (nb_hold == 0)
+ return nb_hold;
+
+ PMD_RX_LOG(DEBUG, "RX port_id=%u queue_id=%u, %d packets received",
+ rxq->port_id, (unsigned int)rxq->qidx, nb_hold);
+
+ nb_hold += rxq->nb_rx_hold;
+
+ /*
+ * FL descriptors need to be written before incrementing the
+ * FL queue WR pointer.
+ */
+ rte_wmb();
+ if (nb_hold > rxq->rx_free_thresh) {
+ PMD_RX_LOG(DEBUG, "port=%hu queue=%hu nb_hold=%hu avail=%hu",
+ rxq->port_id, rxq->qidx, nb_hold, avail);
+ nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, nb_hold);
+ nb_hold = 0;
+ }
+ rxq->nb_rx_hold = nb_hold;
+
+ return avail;
+}
diff --git a/drivers/net/nfp/nfp_rxtx_vec_stub.c b/drivers/net/nfp/nfp_rxtx_vec_stub.c
index 1bc55b67e0..c480f61ef0 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_stub.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_stub.c
@@ -6,6 +6,7 @@
#include <stdbool.h>
#include <rte_common.h>
+#include <rte_mbuf_core.h>
#include "nfp_rxtx_vec.h"
@@ -14,3 +15,11 @@ nfp_net_get_avx2_supported(void)
{
return false;
}
+
+uint16_t __rte_weak
+nfp_net_vec_avx2_recv_pkts(__rte_unused void *rx_queue,
+ __rte_unused struct rte_mbuf **rx_pkts,
+ __rte_unused uint16_t nb_pkts)
+{
+ return 0;
+}
--
2.39.1
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v3 4/4] net/nfp: vector Rx function supports parsing ptype
2024-07-09 7:29 ` [PATCH v3 " Chaoyong He
` (2 preceding siblings ...)
2024-07-09 7:29 ` [PATCH v3 3/4] net/nfp: support AVX2 Rx function Chaoyong He
@ 2024-07-09 7:29 ` Chaoyong He
2024-07-09 8:24 ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Chaoyong He
4 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09 7:29 UTC (permalink / raw)
To: dev; +Cc: oss-drivers, Long Wu, Chaoyong He
From: Long Wu <long.wu@corigine.com>
The vector AVX2 Rx function supports parsing the packet type and setting it in the mbuf.
Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
drivers/net/nfp/nfp_net_common.c | 2 +-
drivers/net/nfp/nfp_rxtx.c | 2 +-
drivers/net/nfp/nfp_rxtx.h | 3 +++
drivers/net/nfp/nfp_rxtx_vec_avx2.c | 2 ++
4 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/net/nfp/nfp_net_common.c b/drivers/net/nfp/nfp_net_common.c
index 08693d5fba..3d916cd147 100644
--- a/drivers/net/nfp/nfp_net_common.c
+++ b/drivers/net/nfp/nfp_net_common.c
@@ -1455,7 +1455,7 @@ nfp_net_supported_ptypes_get(struct rte_eth_dev *dev, size_t *no_of_elements)
RTE_PTYPE_INNER_L4_SCTP,
};
- if (dev->rx_pkt_burst != nfp_net_recv_pkts)
+ if (dev->rx_pkt_burst == NULL)
return NULL;
net_hw = dev->data->dev_private;
diff --git a/drivers/net/nfp/nfp_rxtx.c b/drivers/net/nfp/nfp_rxtx.c
index 4fc3374987..da41a0e663 100644
--- a/drivers/net/nfp/nfp_rxtx.c
+++ b/drivers/net/nfp/nfp_rxtx.c
@@ -350,7 +350,7 @@ nfp_net_set_ptype(const struct nfp_ptype_parsed *nfp_ptype,
* @param mb
* Mbuf to set the packet type.
*/
-static void
+void
nfp_net_parse_ptype(struct nfp_net_rxq *rxq,
struct nfp_net_rx_desc *rxds,
struct rte_mbuf *mb)
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index fff8371991..c717d97003 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -245,5 +245,8 @@ void nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
uint16_t queue_id,
struct rte_eth_txq_info *qinfo);
void nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev);
+void nfp_net_parse_ptype(struct nfp_net_rxq *rxq,
+ struct nfp_net_rx_desc *rxds,
+ struct rte_mbuf *mb);
#endif /* __NFP_RXTX_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
index 7c18213624..508ec7faa5 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_avx2.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -111,6 +111,8 @@ nfp_vec_avx2_recv_set_rxpkt1(struct nfp_net_rxq *rxq,
nfp_net_meta_parse(rxds, rxq, hw, rx_pkt, &meta);
+ nfp_net_parse_ptype(rxq, rxds, rx_pkt);
+
/* Checking the checksum flag */
nfp_net_rx_cksum(rxq, rxds, rx_pkt);
}
--
2.39.1
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v4 0/5] support AVX2 instruction Rx/Tx function
2024-07-09 7:29 ` [PATCH v3 " Chaoyong He
` (3 preceding siblings ...)
2024-07-09 7:29 ` [PATCH v3 4/4] net/nfp: vector Rx function supports parsing ptype Chaoyong He
@ 2024-07-09 8:24 ` Chaoyong He
2024-07-09 8:24 ` [PATCH v4 1/5] net/nfp: fix compile fail on 32-bit OS Chaoyong He
` (5 more replies)
4 siblings, 6 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09 8:24 UTC (permalink / raw)
To: dev; +Cc: oss-drivers, Chaoyong He
This patch series adds support for Rx/Tx functions using
AVX2 instructions.
---
v4:
* Add commit to solve the compile problem on 32-bit OS.
v3:
* Fix the 'meson.build' file to solve the compile problem.
v2:
* Rebase to the latest main branch.
---
Long Wu (5):
net/nfp: fix compile fail on 32-bit OS
net/nfp: export more interfaces of NFDk
net/nfp: support AVX2 Tx function
net/nfp: support AVX2 Rx function
net/nfp: vector Rx function supports parsing ptype
drivers/net/nfp/meson.build | 21 +
drivers/net/nfp/nfdk/nfp_nfdk.h | 46 +++
drivers/net/nfp/nfdk/nfp_nfdk_dp.c | 55 +--
drivers/net/nfp/nfdk/nfp_nfdk_vec.h | 36 ++
drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c | 432 ++++++++++++++++++++
drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c | 14 +
drivers/net/nfp/nfp_ethdev.c | 5 +-
drivers/net/nfp/nfp_ethdev_vf.c | 5 +-
drivers/net/nfp/nfp_net_common.c | 2 +-
drivers/net/nfp/nfp_net_meta.c | 1 +
drivers/net/nfp/nfp_rxtx.c | 12 +-
drivers/net/nfp/nfp_rxtx.h | 9 +-
drivers/net/nfp/nfp_rxtx_vec.h | 17 +
drivers/net/nfp/nfp_rxtx_vec_avx2.c | 275 +++++++++++++
drivers/net/nfp/nfp_rxtx_vec_stub.c | 25 ++
15 files changed, 906 insertions(+), 49 deletions(-)
create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec.h
create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
create mode 100644 drivers/net/nfp/nfp_rxtx_vec.h
create mode 100644 drivers/net/nfp/nfp_rxtx_vec_avx2.c
create mode 100644 drivers/net/nfp/nfp_rxtx_vec_stub.c
--
2.39.1
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v4 1/5] net/nfp: fix compile fail on 32-bit OS
2024-07-09 8:24 ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Chaoyong He
@ 2024-07-09 8:24 ` Chaoyong He
2024-07-09 8:24 ` [PATCH v4 2/5] net/nfp: export more interfaces of NFDk Chaoyong He
` (4 subsequent siblings)
5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09 8:24 UTC (permalink / raw)
To: dev; +Cc: oss-drivers, Long Wu, stable, Chaoyong He, Peng Zhang
From: Long Wu <long.wu@corigine.com>
The NFP PMD only supports compiling on 64-bit Linux OS, so add exit
logic for the other conditions.
Fixes: 8741a9074536 ("net/nfp: disable for 32-bit meson builds")
Cc: stable@dpdk.org
Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
Reviewed-by: Peng Zhang <peng.zhang@corigine.com>
---
drivers/net/nfp/meson.build | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/net/nfp/meson.build b/drivers/net/nfp/meson.build
index d805644ec5..7216c8dff9 100644
--- a/drivers/net/nfp/meson.build
+++ b/drivers/net/nfp/meson.build
@@ -4,6 +4,7 @@
if not is_linux or not dpdk_conf.get('RTE_ARCH_64')
build = false
reason = 'only supported on 64-bit Linux'
+ subdir_done()
endif
sources = files(
--
2.39.1
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v4 2/5] net/nfp: export more interfaces of NFDk
2024-07-09 8:24 ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Chaoyong He
2024-07-09 8:24 ` [PATCH v4 1/5] net/nfp: fix compile fail on 32-bit OS Chaoyong He
@ 2024-07-09 8:24 ` Chaoyong He
2024-07-09 8:24 ` [PATCH v4 3/5] net/nfp: support AVX2 Tx function Chaoyong He
` (3 subsequent siblings)
5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09 8:24 UTC (permalink / raw)
To: dev; +Cc: oss-drivers, Long Wu, Chaoyong He
From: Long Wu <long.wu@corigine.com>
NFP will support an NFDk vector Tx function, so move some
functions to the header file for use by the vector Tx function.
Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
drivers/net/nfp/nfdk/nfp_nfdk.h | 45 ++++++++++++++++++++++++++++++
drivers/net/nfp/nfdk/nfp_nfdk_dp.c | 43 +---------------------------
2 files changed, 46 insertions(+), 42 deletions(-)
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk.h b/drivers/net/nfp/nfdk/nfp_nfdk.h
index 2767fd51cd..89a98d13f3 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk.h
+++ b/drivers/net/nfp/nfdk/nfp_nfdk.h
@@ -6,7 +6,10 @@
#ifndef __NFP_NFDK_H__
#define __NFP_NFDK_H__
+#include <nfp_platform.h>
+
#include "../nfp_rxtx.h"
+#include "nfp_net_common.h"
#define NFDK_TX_DESC_PER_SIMPLE_PKT 2
@@ -161,6 +164,45 @@ nfp_net_nfdk_txq_full(struct nfp_net_txq *txq)
return (nfp_net_nfdk_free_tx_desc(txq) < txq->tx_free_thresh);
}
+static inline uint16_t
+nfp_net_nfdk_headlen_to_segs(uint16_t headlen)
+{
+ /* First descriptor fits less data, so adjust for that */
+ return DIV_ROUND_UP(headlen + NFDK_TX_MAX_DATA_PER_DESC - NFDK_TX_MAX_DATA_PER_HEAD,
+ NFDK_TX_MAX_DATA_PER_DESC);
+}
+
+/* Set TX CSUM offload flags in TX descriptor of nfdk */
+static inline uint64_t
+nfp_net_nfdk_tx_cksum(struct nfp_net_txq *txq,
+ struct rte_mbuf *mb,
+ uint64_t flags)
+{
+ uint64_t ol_flags;
+ struct nfp_net_hw *hw = txq->hw;
+
+ if ((hw->super.cap & NFP_NET_CFG_CTRL_TXCSUM) == 0)
+ return flags;
+
+ ol_flags = mb->ol_flags;
+
+ /* Set TCP csum offload if TSO enabled. */
+ if ((ol_flags & RTE_MBUF_F_TX_TCP_SEG) != 0)
+ flags |= NFDK_DESC_TX_L4_CSUM;
+
+ if ((ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) != 0)
+ flags |= NFDK_DESC_TX_ENCAP;
+
+ /* IPv6 does not need checksum */
+ if ((ol_flags & RTE_MBUF_F_TX_IP_CKSUM) != 0)
+ flags |= NFDK_DESC_TX_L3_CSUM;
+
+ if ((ol_flags & RTE_MBUF_F_TX_L4_MASK) != 0)
+ flags |= NFDK_DESC_TX_L4_CSUM;
+
+ return flags;
+}
+
uint32_t nfp_flower_nfdk_pkt_add_metadata(struct rte_mbuf *mbuf,
uint32_t port_id);
uint16_t nfp_net_nfdk_xmit_pkts_common(void *tx_queue,
@@ -177,5 +219,8 @@ int nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
const struct rte_eth_txconf *tx_conf);
int nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
struct rte_mbuf *pkt);
+int nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
+ struct nfp_net_txq *txq,
+ uint64_t *metadata);
#endif /* __NFP_NFDK_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
index 8b8c191b82..173aabf0b9 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
@@ -6,7 +6,6 @@
#include "nfp_nfdk.h"
#include <bus_pci_driver.h>
-#include <nfp_platform.h>
#include <rte_malloc.h>
#include "../flower/nfp_flower.h"
@@ -15,38 +14,6 @@
#define NFDK_TX_DESC_GATHER_MAX 17
-/* Set TX CSUM offload flags in TX descriptor of nfdk */
-static uint64_t
-nfp_net_nfdk_tx_cksum(struct nfp_net_txq *txq,
- struct rte_mbuf *mb,
- uint64_t flags)
-{
- uint64_t ol_flags;
- struct nfp_net_hw *hw = txq->hw;
-
- if ((hw->super.ctrl & NFP_NET_CFG_CTRL_TXCSUM) == 0)
- return flags;
-
- ol_flags = mb->ol_flags;
-
- /* Set L4 csum offload if TSO/UFO enabled. */
- if ((ol_flags & RTE_MBUF_F_TX_TCP_SEG) != 0 ||
- (ol_flags & RTE_MBUF_F_TX_UDP_SEG) != 0)
- flags |= NFDK_DESC_TX_L4_CSUM;
-
- if ((ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) != 0)
- flags |= NFDK_DESC_TX_ENCAP;
-
- /* IPv6 does not need checksum */
- if ((ol_flags & RTE_MBUF_F_TX_IP_CKSUM) != 0)
- flags |= NFDK_DESC_TX_L3_CSUM;
-
- if ((ol_flags & RTE_MBUF_F_TX_L4_MASK) != 0)
- flags |= NFDK_DESC_TX_L4_CSUM;
-
- return flags;
-}
-
/* Set TX descriptor for TSO of nfdk */
static uint64_t
nfp_net_nfdk_tx_tso(struct nfp_net_txq *txq,
@@ -100,14 +67,6 @@ nfp_flower_nfdk_pkt_add_metadata(struct rte_mbuf *mbuf,
return FLOWER_PKT_DATA_OFFSET;
}
-static inline uint16_t
-nfp_net_nfdk_headlen_to_segs(uint16_t headlen)
-{
- /* First descriptor fits less data, so adjust for that */
- return DIV_ROUND_UP(headlen + NFDK_TX_MAX_DATA_PER_DESC - NFDK_TX_MAX_DATA_PER_HEAD,
- NFDK_TX_MAX_DATA_PER_DESC);
-}
-
static inline void
nfp_net_nfdk_tx_close_block(struct nfp_net_txq *txq,
uint32_t nop_slots)
@@ -170,7 +129,7 @@ nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
return nop_slots;
}
-static int
+int
nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
struct nfp_net_txq *txq,
uint64_t *metadata)
--
2.39.1
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v4 3/5] net/nfp: support AVX2 Tx function
2024-07-09 8:24 ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Chaoyong He
2024-07-09 8:24 ` [PATCH v4 1/5] net/nfp: fix compile fail on 32-bit OS Chaoyong He
2024-07-09 8:24 ` [PATCH v4 2/5] net/nfp: export more interfaces of NFDk Chaoyong He
@ 2024-07-09 8:24 ` Chaoyong He
2024-07-09 8:24 ` [PATCH v4 4/5] net/nfp: support AVX2 Rx function Chaoyong He
` (2 subsequent siblings)
5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09 8:24 UTC (permalink / raw)
To: dev; +Cc: oss-drivers, Long Wu, Chaoyong He
From: Long Wu <long.wu@corigine.com>
Use AVX2 instructions to accelerate Tx performance. The
acceleration only works on x86 machines.
Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
drivers/net/nfp/meson.build | 20 +
drivers/net/nfp/nfdk/nfp_nfdk.h | 1 +
drivers/net/nfp/nfdk/nfp_nfdk_dp.c | 12 +
drivers/net/nfp/nfdk/nfp_nfdk_vec.h | 36 ++
drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c | 432 ++++++++++++++++++++
drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c | 14 +
drivers/net/nfp/nfp_ethdev.c | 3 +-
drivers/net/nfp/nfp_ethdev_vf.c | 3 +-
drivers/net/nfp/nfp_rxtx.h | 5 +-
drivers/net/nfp/nfp_rxtx_vec.h | 13 +
drivers/net/nfp/nfp_rxtx_vec_avx2.c | 21 +
drivers/net/nfp/nfp_rxtx_vec_stub.c | 16 +
12 files changed, 573 insertions(+), 3 deletions(-)
create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec.h
create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
create mode 100644 drivers/net/nfp/nfp_rxtx_vec.h
create mode 100644 drivers/net/nfp/nfp_rxtx_vec_avx2.c
create mode 100644 drivers/net/nfp/nfp_rxtx_vec_stub.c
diff --git a/drivers/net/nfp/meson.build b/drivers/net/nfp/meson.build
index 7216c8dff9..58a066c2e3 100644
--- a/drivers/net/nfp/meson.build
+++ b/drivers/net/nfp/meson.build
@@ -17,6 +17,7 @@ sources = files(
'flower/nfp_flower_service.c',
'nfd3/nfp_nfd3_dp.c',
'nfdk/nfp_nfdk_dp.c',
+ 'nfdk/nfp_nfdk_vec_stub.c',
'nfpcore/nfp_cppcore.c',
'nfpcore/nfp_crc.c',
'nfpcore/nfp_elf.c',
@@ -44,7 +45,26 @@ sources = files(
'nfp_net_flow.c',
'nfp_net_meta.c',
'nfp_rxtx.c',
+ 'nfp_rxtx_vec_stub.c',
'nfp_service.c',
)
+if arch_subdir == 'x86'
+ includes += include_directories('../../common/nfp')
+
+ avx2_sources = files(
+ 'nfdk/nfp_nfdk_vec_avx2_dp.c',
+ 'nfp_rxtx_vec_avx2.c',
+ )
+
+ nfp_avx2_lib = static_library('nfp_avx2_lib',
+ avx2_sources,
+ dependencies: [static_rte_ethdev, static_rte_bus_pci],
+ include_directories: includes,
+ c_args: [cflags, '-mavx2']
+ )
+
+ objs += nfp_avx2_lib.extract_all_objects(recursive: true)
+endif
+
deps += ['hash', 'security', 'common_nfp']
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk.h b/drivers/net/nfp/nfdk/nfp_nfdk.h
index 89a98d13f3..29d862f6f0 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk.h
+++ b/drivers/net/nfp/nfdk/nfp_nfdk.h
@@ -222,5 +222,6 @@ int nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
int nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
struct nfp_net_txq *txq,
uint64_t *metadata);
+void nfp_net_nfdk_xmit_pkts_set(struct rte_eth_dev *eth_dev);
#endif /* __NFP_NFDK_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
index 173aabf0b9..2cea5688b3 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
@@ -11,6 +11,8 @@
#include "../flower/nfp_flower.h"
#include "../nfp_logs.h"
#include "../nfp_net_meta.h"
+#include "../nfp_rxtx_vec.h"
+#include "nfp_nfdk_vec.h"
#define NFDK_TX_DESC_GATHER_MAX 17
@@ -511,6 +513,7 @@ nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
dev->data->tx_queues[queue_idx] = txq;
txq->hw = hw;
txq->hw_priv = dev->process_private;
+ txq->simple_always = true;
/*
* Telling the HW about the physical address of the TX ring and number
@@ -521,3 +524,12 @@ nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
return 0;
}
+
+void
+nfp_net_nfdk_xmit_pkts_set(struct rte_eth_dev *eth_dev)
+{
+ if (nfp_net_get_avx2_supported())
+ eth_dev->tx_pkt_burst = nfp_net_nfdk_vec_avx2_xmit_pkts;
+ else
+ eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+}
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec.h b/drivers/net/nfp/nfdk/nfp_nfdk_vec.h
new file mode 100644
index 0000000000..14319d6cf6
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_NFDK_VEC_H__
+#define __NFP_NFDK_VEC_H__
+
+#include <stdbool.h>
+
+#include <rte_mbuf_core.h>
+
+#include "../nfp_net_common.h"
+#include "nfp_nfdk.h"
+
+static inline bool
+nfp_net_nfdk_is_simple_packet(struct rte_mbuf *pkt,
+ struct nfp_net_hw *hw)
+{
+ if (pkt->data_len > NFDK_TX_MAX_DATA_PER_HEAD)
+ return false;
+
+ if ((hw->super.cap & NFP_NET_CFG_CTRL_LSO_ANY) == 0)
+ return true;
+
+ if ((pkt->ol_flags & RTE_MBUF_F_TX_TCP_SEG) == 0)
+ return true;
+
+ return false;
+}
+
+uint16_t nfp_net_nfdk_vec_avx2_xmit_pkts(void *tx_queue,
+ struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts);
+
+#endif /* __NFP_NFDK_VEC_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
new file mode 100644
index 0000000000..6d1359fdb1
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
@@ -0,0 +1,432 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <bus_pci_driver.h>
+#include <ethdev_driver.h>
+#include <rte_malloc.h>
+
+#include "../nfp_logs.h"
+#include "nfp_nfdk.h"
+#include "nfp_nfdk_vec.h"
+
+/*
+ * One simple packet needs 2 descriptors, so if the driver sends 4 packets
+ * it will use 8 descriptors at once.
+ */
+#define NFDK_SIMPLE_BURST_DES_NUM 8
+
+#define NFDK_SIMPLE_DES_TYPE (NFDK_DESC_TX_EOP | \
+ (NFDK_DESC_TX_TYPE_HEAD & (NFDK_DESC_TX_TYPE_SIMPLE << 12)))
+
+static inline int
+nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(struct rte_mbuf *pkt,
+ struct nfp_net_txq *txq,
+ uint64_t *des_addr,
+ uint64_t *des_meta,
+ bool repr_flag)
+{
+ int ret;
+ __m128i dma_addr;
+ __m128i dma_hi;
+ __m128i data_off;
+ __m128i dlen_type;
+ uint64_t metadata;
+
+ if (repr_flag) {
+ metadata = NFDK_DESC_TX_CHAIN_META;
+ } else {
+ ret = nfp_net_nfdk_set_meta_data(pkt, txq, &metadata);
+ if (unlikely(ret != 0))
+ return ret;
+ }
+
+ data_off = _mm_set_epi64x(0, pkt->data_off);
+ dma_addr = _mm_add_epi64(_mm_loadu_si128((__m128i *)&pkt->buf_addr), data_off);
+ dma_hi = _mm_srli_epi64(dma_addr, 32);
+
+ dlen_type = _mm_set_epi64x(0, (pkt->data_len - 1) | NFDK_SIMPLE_DES_TYPE);
+
+ *des_addr = _mm_extract_epi64(_mm_add_epi64(_mm_unpacklo_epi32(dma_hi, dma_addr),
+ _mm_slli_epi64(dlen_type, 16)), 0);
+
+ *des_meta = nfp_net_nfdk_tx_cksum(txq, pkt, metadata);
+
+ return 0;
+}
+
+static inline int
+nfp_net_nfdk_vec_avx2_xmit_simple_send1(struct nfp_net_txq *txq,
+ struct nfp_net_nfdk_tx_desc *txds,
+ struct rte_mbuf *pkt,
+ bool repr_flag)
+{
+ int ret;
+ __m128i des_data;
+ uint64_t des_addr;
+ uint64_t des_meta;
+
+ ret = nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(pkt, txq, &des_addr,
+ &des_meta, repr_flag);
+ if (unlikely(ret != 0))
+ return ret;
+
+ txq->wr_p = D_IDX(txq, txq->wr_p + NFDK_TX_DESC_PER_SIMPLE_PKT);
+ if ((txq->wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) != 0)
+ txq->data_pending += pkt->data_len;
+ else
+ txq->data_pending = 0;
+
+ des_data = _mm_set_epi64x(des_meta, des_addr);
+
+ _mm_store_si128((void *)txds, des_data);
+
+ return 0;
+}
+
+static inline int
+nfp_vec_avx2_nfdk_xmit_simple_send4(struct nfp_net_txq *txq,
+ struct nfp_net_nfdk_tx_desc *txds,
+ struct rte_mbuf **pkt,
+ bool repr_flag)
+{
+ int ret;
+ uint16_t i;
+ __m256i des_data0_1;
+ __m256i des_data2_3;
+ uint64_t des_addr[4];
+ uint64_t des_meta[4];
+
+ for (i = 0; i < 4; i++) {
+ ret = nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(pkt[i], txq,
+ &des_addr[i], &des_meta[i], repr_flag);
+ if (unlikely(ret != 0))
+ return ret;
+ }
+
+ for (i = 0; i < 4; i++) {
+ txq->wr_p = D_IDX(txq, txq->wr_p + NFDK_TX_DESC_PER_SIMPLE_PKT);
+ if ((txq->wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) != 0)
+ txq->data_pending += pkt[i]->data_len;
+ else
+ txq->data_pending = 0;
+ }
+
+ des_data0_1 = _mm256_set_epi64x(des_meta[1], des_addr[1], des_meta[0], des_addr[0]);
+ des_data2_3 = _mm256_set_epi64x(des_meta[3], des_addr[3], des_meta[2], des_addr[2]);
+
+ _mm256_store_si256((void *)txds, des_data0_1);
+ _mm256_store_si256((void *)(txds + 4), des_data2_3);
+
+ return 0;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_mbuf_store4(struct rte_mbuf **mbuf,
+ struct rte_mbuf **tx_pkts)
+{
+ __m256i mbuf_room0_1;
+ __m256i mbuf_room2_3;
+
+ mbuf_room0_1 = _mm256_set_epi64x(0, (uintptr_t)tx_pkts[1], 0,
+ (uintptr_t)tx_pkts[0]);
+ mbuf_room2_3 = _mm256_set_epi64x(0, (uintptr_t)tx_pkts[3], 0,
+ (uintptr_t)tx_pkts[2]);
+
+ _mm256_store_si256((void *)mbuf, mbuf_room0_1);
+ _mm256_store_si256((void *)(mbuf + 4), mbuf_room2_3);
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_simple_pkts(struct nfp_net_txq *txq,
+ struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts,
+ uint16_t simple_close,
+ bool repr_flag)
+{
+ int ret;
+ uint16_t npkts = 0;
+ uint16_t need_txds;
+ uint16_t free_descs;
+ struct rte_mbuf **lmbuf;
+ struct nfp_net_nfdk_tx_desc *ktxds;
+
+ PMD_TX_LOG(DEBUG, "Working for queue %hu at pos %u and %hu packets",
+ txq->qidx, txq->wr_p, nb_pkts);
+
+ need_txds = nb_pkts << 1;
+ if (nfp_net_nfdk_free_tx_desc(txq) < need_txds || nfp_net_nfdk_txq_full(txq))
+ nfp_net_tx_free_bufs(txq);
+
+ free_descs = nfp_net_nfdk_free_tx_desc(txq);
+ if (unlikely(free_descs < NFDK_TX_DESC_PER_SIMPLE_PKT)) {
+ if (unlikely(simple_close > 0))
+ goto xmit_end;
+
+ return 0;
+ }
+
+ PMD_TX_LOG(DEBUG, "Queue: %hu. Sending %hu packets", txq->qidx, nb_pkts);
+
+ /* Sending packets */
+ while (npkts < nb_pkts && free_descs >= NFDK_TX_DESC_PER_SIMPLE_PKT) {
+ ktxds = &txq->ktxds[txq->wr_p];
+ lmbuf = &txq->txbufs[txq->wr_p].mbuf;
+
+ /*
+ * If a burst cannot be sent, send just one packet when:
+ * 1. The Tx ring is about to wrap around to the tail.
+ * 2. Fewer than 4 packets remain to be sent.
+ * 3. The descriptor pointer is not aligned on a 32-byte boundary.
+ * 4. There are not enough free descriptors.
+ */
+ if ((txq->tx_count - txq->wr_p) < NFDK_SIMPLE_BURST_DES_NUM ||
+ (nb_pkts - npkts) < 4 ||
+ ((uintptr_t)ktxds & 0x1F) != 0 ||
+ free_descs < NFDK_SIMPLE_BURST_DES_NUM) {
+ ret = nfp_net_nfdk_vec_avx2_xmit_simple_send1(txq,
+ ktxds, tx_pkts[npkts], repr_flag);
+ if (unlikely(ret != 0))
+ goto xmit_end;
+
+ rte_pktmbuf_free(*lmbuf);
+
+ _mm_storel_epi64((void *)lmbuf,
+ _mm_loadu_si128((void *)&tx_pkts[npkts]));
+ npkts++;
+ free_descs -= NFDK_TX_DESC_PER_SIMPLE_PKT;
+ continue;
+ }
+
+ ret = nfp_vec_avx2_nfdk_xmit_simple_send4(txq, ktxds,
+ &tx_pkts[npkts], repr_flag);
+ if (unlikely(ret != 0))
+ goto xmit_end;
+
+ rte_pktmbuf_free_bulk(lmbuf, NFDK_SIMPLE_BURST_DES_NUM);
+
+ nfp_net_nfdk_vec_avx2_xmit_mbuf_store4(lmbuf, &tx_pkts[npkts]);
+
+ npkts += 4;
+ free_descs -= NFDK_SIMPLE_BURST_DES_NUM;
+ }
+
+xmit_end:
+ /* Increment write pointers. Force memory write before we let HW know */
+ rte_wmb();
+ nfp_qcp_ptr_add(txq->qcp_q, NFP_QCP_WRITE_PTR, ((npkts << 1) + simple_close));
+
+ return npkts;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_simple_close_block(struct nfp_net_txq *txq,
+ uint16_t *simple_close)
+{
+ uint16_t i;
+ uint16_t wr_p;
+ uint16_t nop_slots;
+ __m128i zero_128 = _mm_setzero_si128();
+ __m256i zero_256 = _mm256_setzero_si256();
+
+ wr_p = txq->wr_p;
+ nop_slots = D_BLOCK_CPL(wr_p);
+
+ for (i = nop_slots; i >= 4; i -= 4, wr_p += 4) {
+ _mm256_store_si256((void *)&txq->ktxds[wr_p], zero_256);
+ rte_pktmbuf_free_bulk(&txq->txbufs[wr_p].mbuf, 4);
+ _mm256_store_si256((void *)&txq->txbufs[wr_p], zero_256);
+ }
+
+ for (; i >= 2; i -= 2, wr_p += 2) {
+ _mm_store_si128((void *)&txq->ktxds[wr_p], zero_128);
+ rte_pktmbuf_free_bulk(&txq->txbufs[wr_p].mbuf, 2);
+ _mm_store_si128((void *)&txq->txbufs[wr_p], zero_128);
+ }
+
+ for (; i >= 1; i--, wr_p++) {
+ _mm_storel_epi64((void *)&txq->ktxds[wr_p], zero_128);
+ rte_pktmbuf_free(txq->txbufs[wr_p].mbuf);
+ _mm_storel_epi64((void *)&txq->txbufs[wr_p], zero_128);
+ }
+
+ txq->data_pending = 0;
+ txq->wr_p = D_IDX(txq, txq->wr_p + nop_slots);
+
+ (*simple_close) += nop_slots;
+}
+
+static inline uint32_t
+nfp_net_nfdk_vec_avx2_xmit_simple_prepare(struct nfp_net_txq *txq,
+ uint16_t *simple_close)
+{
+ uint16_t wr_p;
+ __m128i zero_128 = _mm_setzero_si128();
+
+ wr_p = txq->wr_p;
+
+ _mm_storel_epi64((void *)&txq->ktxds[wr_p], zero_128);
+ rte_pktmbuf_free(txq->txbufs[wr_p].mbuf);
+ _mm_storel_epi64((void *)&txq->txbufs[wr_p], zero_128);
+
+ txq->wr_p = D_IDX(txq, wr_p + 1);
+ (*simple_close)++;
+
+ return txq->wr_p;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_simple_check(struct nfp_net_txq *txq,
+ struct rte_mbuf *pkt,
+ bool *simple_flag,
+ bool *pending_flag,
+ uint16_t *data_pending,
+ uint32_t *wr_p,
+ uint16_t *simple_close)
+{
+ uint32_t data_pending_temp;
+
+ /* Make the first descriptor index even before sending simple packets */
+ if (!(*simple_flag)) {
+ if ((*wr_p & 0x1) == 0x1)
+ *wr_p = nfp_net_nfdk_vec_avx2_xmit_simple_prepare(txq, simple_close);
+
+ *simple_flag = true;
+ }
+
+ /* Simple packets only need one close block operation */
+ if (!(*pending_flag)) {
+ if ((*wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) == 0) {
+ *pending_flag = true;
+ return;
+ }
+
+ data_pending_temp = *data_pending + pkt->data_len;
+ if (data_pending_temp > NFDK_TX_MAX_DATA_PER_BLOCK) {
+ nfp_net_nfdk_vec_avx2_xmit_simple_close_block(txq, simple_close);
+ *pending_flag = true;
+ return;
+ }
+
+ *data_pending = data_pending_temp;
+
+ *wr_p += 2;
+ }
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_simple_count(struct nfp_net_txq *txq,
+ struct rte_mbuf **tx_pkts,
+ uint16_t head,
+ uint16_t nb_pkts,
+ uint16_t *simple_close)
+{
+ uint32_t wr_p;
+ uint16_t simple_idx;
+ struct rte_mbuf *pkt;
+ uint16_t data_pending;
+ bool simple_flag = false;
+ bool pending_flag = false;
+ uint16_t simple_count = 0;
+
+ *simple_close = 0;
+ wr_p = txq->wr_p;
+ data_pending = txq->data_pending;
+
+ for (simple_idx = head; simple_idx < nb_pkts; simple_idx++) {
+ pkt = tx_pkts[simple_idx];
+ if (!nfp_net_nfdk_is_simple_packet(pkt, txq->hw))
+ break;
+
+ simple_count++;
+ if (!txq->simple_always)
+ nfp_net_nfdk_vec_avx2_xmit_simple_check(txq, pkt, &simple_flag,
+ &pending_flag, &data_pending, &wr_p, simple_close);
+ }
+
+ return simple_count;
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_others_count(struct nfp_net_txq *txq,
+ struct rte_mbuf **tx_pkts,
+ uint16_t head,
+ uint16_t nb_pkts)
+{
+ uint16_t others_idx;
+ struct rte_mbuf *pkt;
+ uint16_t others_count = 0;
+
+ for (others_idx = head; others_idx < nb_pkts; others_idx++) {
+ pkt = tx_pkts[others_idx];
+ if (nfp_net_nfdk_is_simple_packet(pkt, txq->hw))
+ break;
+
+ others_count++;
+ }
+
+ return others_count;
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_common(void *tx_queue,
+ struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ uint16_t i;
+ uint16_t avail = 0;
+ uint16_t simple_close;
+ uint16_t simple_count;
+ uint16_t simple_avail;
+ uint16_t others_count;
+ uint16_t others_avail;
+ struct nfp_net_txq *txq = tx_queue;
+
+ for (i = 0; i < nb_pkts; i++) {
+ simple_count = nfp_net_nfdk_vec_avx2_xmit_simple_count(txq, tx_pkts, i,
+ nb_pkts, &simple_close);
+ if (simple_count > 0) {
+ if (!txq->simple_always)
+ txq->simple_always = true;
+
+ simple_avail = nfp_net_nfdk_vec_avx2_xmit_simple_pkts(txq,
+ tx_pkts + i, simple_count, simple_close,
+ false);
+
+ avail += simple_avail;
+ if (simple_avail != simple_count)
+ break;
+
+ i += simple_count;
+ }
+
+ if (i == nb_pkts)
+ break;
+
+ others_count = nfp_net_nfdk_vec_avx2_xmit_others_count(txq, tx_pkts,
+ i, nb_pkts);
+
+ if (txq->simple_always)
+ txq->simple_always = false;
+
+ others_avail = nfp_net_nfdk_xmit_pkts_common(tx_queue,
+ tx_pkts + i, others_count, false);
+
+ avail += others_avail;
+ if (others_avail != others_count)
+ break;
+
+ i += others_count;
+ }
+
+ return avail;
+}
+
+uint16_t
+nfp_net_nfdk_vec_avx2_xmit_pkts(void *tx_queue,
+ struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ return nfp_net_nfdk_vec_avx2_xmit_common(tx_queue, tx_pkts, nb_pkts);
+}
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c b/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
new file mode 100644
index 0000000000..146ec21d51
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include "nfp_nfdk_vec.h"
+
+uint16_t __rte_weak
+nfp_net_nfdk_vec_avx2_xmit_pkts(__rte_unused void *tx_queue,
+ __rte_unused struct rte_mbuf **tx_pkts,
+ __rte_unused uint16_t nb_pkts)
+{
+ return 0;
+}
diff --git a/drivers/net/nfp/nfp_ethdev.c b/drivers/net/nfp/nfp_ethdev.c
index 8c0cacd3fc..a7b40af712 100644
--- a/drivers/net/nfp/nfp_ethdev.c
+++ b/drivers/net/nfp/nfp_ethdev.c
@@ -28,6 +28,7 @@
#include "nfp_ipsec.h"
#include "nfp_logs.h"
#include "nfp_net_flow.h"
+#include "nfp_rxtx_vec.h"
/* 64-bit per app capabilities */
#define NFP_NET_APP_CAP_SP_INDIFF RTE_BIT64(0) /* Indifferent to port speed */
@@ -964,7 +965,7 @@ nfp_net_ethdev_ops_mount(struct nfp_net_hw *hw,
if (hw->ver.extend == NFP_NET_CFG_VERSION_DP_NFD3)
eth_dev->tx_pkt_burst = nfp_net_nfd3_xmit_pkts;
else
- eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+ nfp_net_nfdk_xmit_pkts_set(eth_dev);
eth_dev->dev_ops = &nfp_net_eth_dev_ops;
eth_dev->rx_queue_count = nfp_net_rx_queue_count;
diff --git a/drivers/net/nfp/nfp_ethdev_vf.c b/drivers/net/nfp/nfp_ethdev_vf.c
index e7c18fe90a..b955624ed6 100644
--- a/drivers/net/nfp/nfp_ethdev_vf.c
+++ b/drivers/net/nfp/nfp_ethdev_vf.c
@@ -14,6 +14,7 @@
#include "nfp_logs.h"
#include "nfp_net_common.h"
+#include "nfp_rxtx_vec.h"
#define NFP_VF_DRIVER_NAME net_nfp_vf
@@ -240,7 +241,7 @@ nfp_netvf_ethdev_ops_mount(struct nfp_net_hw *hw,
if (hw->ver.extend == NFP_NET_CFG_VERSION_DP_NFD3)
eth_dev->tx_pkt_burst = nfp_net_nfd3_xmit_pkts;
else
- eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+ nfp_net_nfdk_xmit_pkts_set(eth_dev);
eth_dev->dev_ops = &nfp_netvf_eth_dev_ops;
eth_dev->rx_queue_count = nfp_net_rx_queue_count;
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index 9806384a63..3ddf717da0 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -69,9 +69,12 @@ struct __rte_aligned(64) nfp_net_txq {
/** Used by NFDk only */
uint16_t data_pending;
+ /** Used by NFDk vector xmit only */
+ bool simple_always;
+
/**
* At this point 58 bytes have been used for all the fields in the
- * TX critical path. We have room for 6 bytes and still all placed
+ * TX critical path. We have room for 5 bytes and still all placed
* in a cache line.
*/
uint64_t dma;
diff --git a/drivers/net/nfp/nfp_rxtx_vec.h b/drivers/net/nfp/nfp_rxtx_vec.h
new file mode 100644
index 0000000000..c92660f963
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_RXTX_VEC_AVX2_H__
+#define __NFP_RXTX_VEC_AVX2_H__
+
+#include <stdbool.h>
+
+bool nfp_net_get_avx2_supported(void);
+
+#endif /* __NFP_RXTX_VEC_AVX2_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
new file mode 100644
index 0000000000..50638e74ab
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <stdbool.h>
+
+#include <rte_cpuflags.h>
+#include <rte_vect.h>
+
+#include "nfp_rxtx_vec.h"
+
+bool
+nfp_net_get_avx2_supported(void)
+{
+ if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 &&
+ rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
+ return true;
+
+ return false;
+}
diff --git a/drivers/net/nfp/nfp_rxtx_vec_stub.c b/drivers/net/nfp/nfp_rxtx_vec_stub.c
new file mode 100644
index 0000000000..1bc55b67e0
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec_stub.c
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <stdbool.h>
+
+#include <rte_common.h>
+
+#include "nfp_rxtx_vec.h"
+
+bool __rte_weak
+nfp_net_get_avx2_supported(void)
+{
+ return false;
+}
--
2.39.1
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v4 4/5] net/nfp: support AVX2 Rx function
2024-07-09 8:24 ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Chaoyong He
` (2 preceding siblings ...)
2024-07-09 8:24 ` [PATCH v4 3/5] net/nfp: support AVX2 Tx function Chaoyong He
@ 2024-07-09 8:24 ` Chaoyong He
2024-07-09 8:24 ` [PATCH v4 5/5] net/nfp: vector Rx function supports parsing ptype Chaoyong He
2024-07-09 13:06 ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Ferruh Yigit
5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09 8:24 UTC (permalink / raw)
To: dev; +Cc: oss-drivers, Long Wu, Peng Zhang, Chaoyong He
From: Long Wu <long.wu@corigine.com>
Use AVX2 instructions to accelerate Rx performance. The
acceleration only works on X86 machines.
Signed-off-by: Peng Zhang <peng.zhang@corigine.com>
Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
drivers/net/nfp/nfp_ethdev.c | 2 +-
drivers/net/nfp/nfp_ethdev_vf.c | 2 +-
drivers/net/nfp/nfp_net_meta.c | 1 +
drivers/net/nfp/nfp_rxtx.c | 10 ++
drivers/net/nfp/nfp_rxtx.h | 1 +
drivers/net/nfp/nfp_rxtx_vec.h | 4 +
drivers/net/nfp/nfp_rxtx_vec_avx2.c | 252 ++++++++++++++++++++++++++++
drivers/net/nfp/nfp_rxtx_vec_stub.c | 9 +
8 files changed, 279 insertions(+), 2 deletions(-)
diff --git a/drivers/net/nfp/nfp_ethdev.c b/drivers/net/nfp/nfp_ethdev.c
index a7b40af712..bd35df2dc9 100644
--- a/drivers/net/nfp/nfp_ethdev.c
+++ b/drivers/net/nfp/nfp_ethdev.c
@@ -969,7 +969,7 @@ nfp_net_ethdev_ops_mount(struct nfp_net_hw *hw,
eth_dev->dev_ops = &nfp_net_eth_dev_ops;
eth_dev->rx_queue_count = nfp_net_rx_queue_count;
- eth_dev->rx_pkt_burst = &nfp_net_recv_pkts;
+ nfp_net_recv_pkts_set(eth_dev);
}
static int
diff --git a/drivers/net/nfp/nfp_ethdev_vf.c b/drivers/net/nfp/nfp_ethdev_vf.c
index b955624ed6..cdf5da3af7 100644
--- a/drivers/net/nfp/nfp_ethdev_vf.c
+++ b/drivers/net/nfp/nfp_ethdev_vf.c
@@ -245,7 +245,7 @@ nfp_netvf_ethdev_ops_mount(struct nfp_net_hw *hw,
eth_dev->dev_ops = &nfp_netvf_eth_dev_ops;
eth_dev->rx_queue_count = nfp_net_rx_queue_count;
- eth_dev->rx_pkt_burst = &nfp_net_recv_pkts;
+ nfp_net_recv_pkts_set(eth_dev);
}
static int
diff --git a/drivers/net/nfp/nfp_net_meta.c b/drivers/net/nfp/nfp_net_meta.c
index b31ef56f17..07c6758d33 100644
--- a/drivers/net/nfp/nfp_net_meta.c
+++ b/drivers/net/nfp/nfp_net_meta.c
@@ -80,6 +80,7 @@ nfp_net_meta_parse_single(uint8_t *meta_base,
rte_be32_t meta_header,
struct nfp_net_meta_parsed *meta)
{
+ meta->flags = 0;
meta->flags |= (1 << NFP_NET_META_HASH);
meta->hash_type = rte_be_to_cpu_32(meta_header);
meta->hash = rte_be_to_cpu_32(*(rte_be32_t *)(meta_base + 4));
diff --git a/drivers/net/nfp/nfp_rxtx.c b/drivers/net/nfp/nfp_rxtx.c
index 1db79ad1cd..4fc3374987 100644
--- a/drivers/net/nfp/nfp_rxtx.c
+++ b/drivers/net/nfp/nfp_rxtx.c
@@ -17,6 +17,7 @@
#include "nfp_ipsec.h"
#include "nfp_logs.h"
#include "nfp_net_meta.h"
+#include "nfp_rxtx_vec.h"
/*
* The bit format and map of nfp packet type for rxd.offload_info in Rx descriptor.
@@ -867,3 +868,12 @@ nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
info->conf.offloads = dev_info.tx_offload_capa &
dev->data->dev_conf.txmode.offloads;
}
+
+void
+nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev)
+{
+ if (nfp_net_get_avx2_supported())
+ eth_dev->rx_pkt_burst = nfp_net_vec_avx2_recv_pkts;
+ else
+ eth_dev->rx_pkt_burst = nfp_net_recv_pkts;
+}
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index 3ddf717da0..fff8371991 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -244,5 +244,6 @@ void nfp_net_rx_queue_info_get(struct rte_eth_dev *dev,
void nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
uint16_t queue_id,
struct rte_eth_txq_info *qinfo);
+void nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev);
#endif /* __NFP_RXTX_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec.h b/drivers/net/nfp/nfp_rxtx_vec.h
index c92660f963..8720662744 100644
--- a/drivers/net/nfp/nfp_rxtx_vec.h
+++ b/drivers/net/nfp/nfp_rxtx_vec.h
@@ -10,4 +10,8 @@
bool nfp_net_get_avx2_supported(void);
+uint16_t nfp_net_vec_avx2_recv_pkts(void *rx_queue,
+ struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
+
#endif /* __NFP_RXTX_VEC_AVX2_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
index 50638e74ab..7c18213624 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_avx2.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -5,9 +5,14 @@
#include <stdbool.h>
+#include <bus_pci_driver.h>
+#include <ethdev_driver.h>
#include <rte_cpuflags.h>
#include <rte_vect.h>
+#include "nfp_logs.h"
+#include "nfp_net_common.h"
+#include "nfp_net_meta.h"
#include "nfp_rxtx_vec.h"
bool
@@ -19,3 +24,250 @@ nfp_net_get_avx2_supported(void)
return false;
}
+
+static inline void
+nfp_vec_avx2_recv_set_des1(struct nfp_net_rxq *rxq,
+ struct nfp_net_rx_desc *rxds,
+ struct rte_mbuf *rxb)
+{
+ __m128i dma;
+ __m128i dma_hi;
+ __m128i vaddr0;
+ __m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);
+
+ dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb->buf_addr), hdr_room);
+ dma_hi = _mm_srli_epi64(dma, 32);
+ vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);
+
+ _mm_storel_epi64((void *)rxds, vaddr0);
+
+ rxq->rd_p = (rxq->rd_p + 1) & (rxq->rx_count - 1);
+}
+
+static inline void
+nfp_vec_avx2_recv_set_des4(struct nfp_net_rxq *rxq,
+ struct nfp_net_rx_desc *rxds,
+ struct rte_mbuf **rxb)
+{
+ __m128i dma;
+ __m128i dma_hi;
+ __m128i vaddr0;
+ __m128i vaddr1;
+ __m128i vaddr2;
+ __m128i vaddr3;
+ __m128i vaddr0_1;
+ __m128i vaddr2_3;
+ __m256i vaddr0_3;
+ __m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);
+
+ dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[0]->buf_addr), hdr_room);
+ dma_hi = _mm_srli_epi64(dma, 32);
+ vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);
+
+ dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[1]->buf_addr), hdr_room);
+ dma_hi = _mm_srli_epi64(dma, 32);
+ vaddr1 = _mm_unpacklo_epi32(dma_hi, dma);
+
+ dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[2]->buf_addr), hdr_room);
+ dma_hi = _mm_srli_epi64(dma, 32);
+ vaddr2 = _mm_unpacklo_epi32(dma_hi, dma);
+
+ dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[3]->buf_addr), hdr_room);
+ dma_hi = _mm_srli_epi64(dma, 32);
+ vaddr3 = _mm_unpacklo_epi32(dma_hi, dma);
+
+ vaddr0_1 = _mm_unpacklo_epi64(vaddr0, vaddr1);
+ vaddr2_3 = _mm_unpacklo_epi64(vaddr2, vaddr3);
+
+ vaddr0_3 = _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0_1),
+ vaddr2_3, 1);
+
+ _mm256_store_si256((void *)rxds, vaddr0_3);
+
+ rxq->rd_p = (rxq->rd_p + 4) & (rxq->rx_count - 1);
+}
+
+static inline void
+nfp_vec_avx2_recv_set_rxpkt1(struct nfp_net_rxq *rxq,
+ struct nfp_net_rx_desc *rxds,
+ struct rte_mbuf *rx_pkt)
+{
+ struct nfp_net_hw *hw = rxq->hw;
+ struct nfp_net_meta_parsed meta;
+
+ rx_pkt->data_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
+ /* Size of the whole packet. We just support 1 segment */
+ rx_pkt->pkt_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
+
+ /* Filling the received mbuf with packet info */
+ if (hw->rx_offset)
+ rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + hw->rx_offset;
+ else
+ rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + NFP_DESC_META_LEN(rxds);
+
+ rx_pkt->port = rxq->port_id;
+ rx_pkt->nb_segs = 1;
+ rx_pkt->next = NULL;
+
+ nfp_net_meta_parse(rxds, rxq, hw, rx_pkt, &meta);
+
+ /* Checking the checksum flag */
+ nfp_net_rx_cksum(rxq, rxds, rx_pkt);
+}
+
+static inline void
+nfp_vec_avx2_recv1(struct nfp_net_rxq *rxq,
+ struct nfp_net_rx_desc *rxds,
+ struct rte_mbuf *rxb,
+ struct rte_mbuf *rx_pkt)
+{
+ nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkt);
+
+ nfp_vec_avx2_recv_set_des1(rxq, rxds, rxb);
+}
+
+static inline void
+nfp_vec_avx2_recv4(struct nfp_net_rxq *rxq,
+ struct nfp_net_rx_desc *rxds,
+ struct rte_mbuf **rxb,
+ struct rte_mbuf **rx_pkts)
+{
+ nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkts[0]);
+ nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 1, rx_pkts[1]);
+ nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 2, rx_pkts[2]);
+ nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 3, rx_pkts[3]);
+
+ nfp_vec_avx2_recv_set_des4(rxq, rxds, rxb);
+}
+
+static inline bool
+nfp_vec_avx2_recv_check_packets4(struct nfp_net_rx_desc *rxds)
+{
+ __m256i data = _mm256_loadu_si256((void *)rxds);
+
+ if ((_mm256_extract_epi8(data, 3) & PCIE_DESC_RX_DD) == 0 ||
+ (_mm256_extract_epi8(data, 11) & PCIE_DESC_RX_DD) == 0 ||
+ (_mm256_extract_epi8(data, 19) & PCIE_DESC_RX_DD) == 0 ||
+ (_mm256_extract_epi8(data, 27) & PCIE_DESC_RX_DD) == 0)
+ return false;
+
+ return true;
+}
+
+uint16_t
+nfp_net_vec_avx2_recv_pkts(void *rx_queue,
+ struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ uint16_t avail;
+ uint16_t nb_hold;
+ bool burst_receive;
+ struct rte_mbuf **rxb;
+ struct nfp_net_rx_desc *rxds;
+ struct nfp_net_rxq *rxq = rx_queue;
+
+ if (unlikely(rxq == NULL)) {
+ PMD_RX_LOG(ERR, "RX Bad queue");
+ return 0;
+ }
+
+ avail = 0;
+ nb_hold = 0;
+ burst_receive = true;
+ while (avail < nb_pkts) {
+ rxds = &rxq->rxds[rxq->rd_p];
+ rxb = &rxq->rxbufs[rxq->rd_p].mbuf;
+
+ if ((_mm_extract_epi8(_mm_loadu_si128((void *)(rxds)), 3)
+ & PCIE_DESC_RX_DD) == 0)
+ goto recv_end;
+
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p].mbuf);
+
+ if ((rxq->rd_p & 0x3) == 0) {
+ rte_prefetch0(&rxq->rxds[rxq->rd_p]);
+ rte_prefetch0(&rxq->rxbufs[rxq->rd_p]);
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p + 1].mbuf);
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p + 2].mbuf);
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p + 3].mbuf);
+ }
+
+ if ((rxq->rd_p & 0x7) == 0) {
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p + 4].mbuf);
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p + 5].mbuf);
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p + 6].mbuf);
+ rte_prefetch0(rxq->rxbufs[rxq->rd_p + 7].mbuf);
+ }
+
+ /*
+ * If a burst cannot be received, receive just one packet when:
+ * 1. The Rx ring is about to wrap around to the tail.
+ * 2. Fewer than 4 packets remain to be received.
+ * 3. The descriptor pointer is not aligned on a 32-byte boundary.
+ * 4. The Rx ring has fewer than 4 packets or allocating 4 mbufs failed.
+ */
+ if ((rxq->rx_count - rxq->rd_p) < 4 ||
+ (nb_pkts - avail) < 4 ||
+ ((uintptr_t)rxds & 0x1F) != 0 ||
+ !burst_receive) {
+ _mm_storel_epi64((void *)&rx_pkts[avail],
+ _mm_loadu_si128((void *)rxb));
+
+ /* Allocate a new mbuf into the software ring. */
+ if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 1) < 0) {
+ PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u queue_id=%hu",
+ rxq->port_id, rxq->qidx);
+ nfp_net_mbuf_alloc_failed(rxq);
+ goto recv_end;
+ }
+
+ nfp_vec_avx2_recv1(rxq, rxds, *rxb, rx_pkts[avail]);
+
+ avail++;
+ nb_hold++;
+ continue;
+ }
+
+ burst_receive = nfp_vec_avx2_recv_check_packets4(rxds);
+ if (!burst_receive)
+ continue;
+
+ _mm256_storeu_si256((void *)&rx_pkts[avail],
+ _mm256_loadu_si256((void *)rxb));
+
+ /* Allocate 4 new mbufs into the software ring. */
+ if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 4) < 0) {
+ burst_receive = false;
+ continue;
+ }
+
+ nfp_vec_avx2_recv4(rxq, rxds, rxb, &rx_pkts[avail]);
+
+ avail += 4;
+ nb_hold += 4;
+ }
+
+recv_end:
+ if (nb_hold == 0)
+ return nb_hold;
+
+ PMD_RX_LOG(DEBUG, "RX port_id=%u queue_id=%u, %d packets received",
+ rxq->port_id, (unsigned int)rxq->qidx, nb_hold);
+
+ nb_hold += rxq->nb_rx_hold;
+
+ /*
+ * FL descriptors need to be written before incrementing the
+ * FL queue WR pointer
+ */
+ rte_wmb();
+ if (nb_hold > rxq->rx_free_thresh) {
+ PMD_RX_LOG(DEBUG, "port=%hu queue=%hu nb_hold=%hu avail=%hu",
+ rxq->port_id, rxq->qidx, nb_hold, avail);
+ nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, nb_hold);
+ nb_hold = 0;
+ }
+ rxq->nb_rx_hold = nb_hold;
+
+ return avail;
+}
diff --git a/drivers/net/nfp/nfp_rxtx_vec_stub.c b/drivers/net/nfp/nfp_rxtx_vec_stub.c
index 1bc55b67e0..c480f61ef0 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_stub.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_stub.c
@@ -6,6 +6,7 @@
#include <stdbool.h>
#include <rte_common.h>
+#include <rte_mbuf_core.h>
#include "nfp_rxtx_vec.h"
@@ -14,3 +15,11 @@ nfp_net_get_avx2_supported(void)
{
return false;
}
+
+uint16_t __rte_weak
+nfp_net_vec_avx2_recv_pkts(__rte_unused void *rx_queue,
+ __rte_unused struct rte_mbuf **rx_pkts,
+ __rte_unused uint16_t nb_pkts)
+{
+ return 0;
+}
--
2.39.1
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v4 5/5] net/nfp: vector Rx function supports parsing ptype
2024-07-09 8:24 ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Chaoyong He
` (3 preceding siblings ...)
2024-07-09 8:24 ` [PATCH v4 4/5] net/nfp: support AVX2 Rx function Chaoyong He
@ 2024-07-09 8:24 ` Chaoyong He
2024-07-09 13:06 ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Ferruh Yigit
5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09 8:24 UTC (permalink / raw)
To: dev; +Cc: oss-drivers, Long Wu, Chaoyong He
From: Long Wu <long.wu@corigine.com>
Vector AVX2 Rx function supports parsing packet type and set it to mbuf.
Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
drivers/net/nfp/nfp_net_common.c | 2 +-
drivers/net/nfp/nfp_rxtx.c | 2 +-
drivers/net/nfp/nfp_rxtx.h | 3 +++
drivers/net/nfp/nfp_rxtx_vec_avx2.c | 2 ++
4 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/net/nfp/nfp_net_common.c b/drivers/net/nfp/nfp_net_common.c
index 08693d5fba..3d916cd147 100644
--- a/drivers/net/nfp/nfp_net_common.c
+++ b/drivers/net/nfp/nfp_net_common.c
@@ -1455,7 +1455,7 @@ nfp_net_supported_ptypes_get(struct rte_eth_dev *dev, size_t *no_of_elements)
RTE_PTYPE_INNER_L4_SCTP,
};
- if (dev->rx_pkt_burst != nfp_net_recv_pkts)
+ if (dev->rx_pkt_burst == NULL)
return NULL;
net_hw = dev->data->dev_private;
diff --git a/drivers/net/nfp/nfp_rxtx.c b/drivers/net/nfp/nfp_rxtx.c
index 4fc3374987..da41a0e663 100644
--- a/drivers/net/nfp/nfp_rxtx.c
+++ b/drivers/net/nfp/nfp_rxtx.c
@@ -350,7 +350,7 @@ nfp_net_set_ptype(const struct nfp_ptype_parsed *nfp_ptype,
* @param mb
* Mbuf to set the packet type.
*/
-static void
+void
nfp_net_parse_ptype(struct nfp_net_rxq *rxq,
struct nfp_net_rx_desc *rxds,
struct rte_mbuf *mb)
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index fff8371991..c717d97003 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -245,5 +245,8 @@ void nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
uint16_t queue_id,
struct rte_eth_txq_info *qinfo);
void nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev);
+void nfp_net_parse_ptype(struct nfp_net_rxq *rxq,
+ struct nfp_net_rx_desc *rxds,
+ struct rte_mbuf *mb);
#endif /* __NFP_RXTX_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
index 7c18213624..508ec7faa5 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_avx2.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -111,6 +111,8 @@ nfp_vec_avx2_recv_set_rxpkt1(struct nfp_net_rxq *rxq,
nfp_net_meta_parse(rxds, rxq, hw, rx_pkt, &meta);
+ nfp_net_parse_ptype(rxq, rxds, rx_pkt);
+
/* Checking the checksum flag */
nfp_net_rx_cksum(rxq, rxds, rx_pkt);
}
--
2.39.1
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH v4 0/5] support AVX2 instruction Rx/Tx function
2024-07-09 8:24 ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Chaoyong He
` (4 preceding siblings ...)
2024-07-09 8:24 ` [PATCH v4 5/5] net/nfp: vector Rx function supports parsing ptype Chaoyong He
@ 2024-07-09 13:06 ` Ferruh Yigit
5 siblings, 0 replies; 26+ messages in thread
From: Ferruh Yigit @ 2024-07-09 13:06 UTC (permalink / raw)
To: Chaoyong He, dev; +Cc: oss-drivers
On 7/9/2024 9:24 AM, Chaoyong He wrote:
> This patch series add the support of Rx/Tx function using the
> AVX2 instruction.
>
> ---
> v4:
> * Add commit to solve the compile problem on 32-bit OS.
> v3:
> * Fix the 'meson.build' file to solve the compile problem.
> v2:
> * Rebase to the latest main branch.
> ---
>
> Long Wu (5):
> net/nfp: fix compile fail on 32-bit OS
> net/nfp: export more interfaces of NFDk
> net/nfp: support AVX2 Tx function
> net/nfp: support AVX2 Rx function
> net/nfp: vector Rx function supports parsing ptype
>
Series applied to dpdk-next-net/main, thanks.
^ permalink raw reply [flat|nested] 26+ messages in thread