DPDK patches and discussions
 help / color / mirror / Atom feed
* [PATCH 0/4] support AVX2 instruction Rx/Tx function
@ 2024-06-19  2:59 Chaoyong He
  2024-06-19  2:59 ` [PATCH 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
                   ` (5 more replies)
  0 siblings, 6 replies; 26+ messages in thread
From: Chaoyong He @ 2024-06-19  2:59 UTC (permalink / raw)
  To: dev; +Cc: oss-drivers, Chaoyong He

This patch series adds support for Rx/Tx functions that use
AVX2 instructions.

Long Wu (4):
  net/nfp: export more interfaces of NFDk
  net/nfp: support AVX2 Tx function
  net/nfp: support AVX2 Rx function
  net/nfp: vector Rx function supports parsing ptype

 drivers/net/nfp/meson.build                 |  15 +
 drivers/net/nfp/nfdk/nfp_nfdk.h             |  46 +++
 drivers/net/nfp/nfdk/nfp_nfdk_dp.c          |  55 +--
 drivers/net/nfp/nfdk/nfp_nfdk_vec.h         |  36 ++
 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c | 432 ++++++++++++++++++++
 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c    |  14 +
 drivers/net/nfp/nfp_ethdev.c                |   5 +-
 drivers/net/nfp/nfp_ethdev_vf.c             |   5 +-
 drivers/net/nfp/nfp_net_common.c            |   2 +-
 drivers/net/nfp/nfp_net_meta.c              |   1 +
 drivers/net/nfp/nfp_rxtx.c                  |  12 +-
 drivers/net/nfp/nfp_rxtx.h                  |   9 +-
 drivers/net/nfp/nfp_rxtx_vec.h              |  17 +
 drivers/net/nfp/nfp_rxtx_vec_avx2.c         | 275 +++++++++++++
 drivers/net/nfp/nfp_rxtx_vec_stub.c         |  25 ++
 15 files changed, 900 insertions(+), 49 deletions(-)
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec.h
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec.h
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec_avx2.c
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec_stub.c

-- 
2.39.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 1/4] net/nfp: export more interfaces of NFDk
  2024-06-19  2:59 [PATCH 0/4] support AVX2 instruction Rx/Tx function Chaoyong He
@ 2024-06-19  2:59 ` Chaoyong He
  2024-06-19  2:59 ` [PATCH 2/4] net/nfp: support AVX2 Tx function Chaoyong He
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-06-19  2:59 UTC (permalink / raw)
  To: dev; +Cc: oss-drivers, Long Wu, Chaoyong He

From: Long Wu <long.wu@corigine.com>

NFP will support NFDk vector Tx function, so move some
functions to header file for use by vector Tx function.

Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
 drivers/net/nfp/nfdk/nfp_nfdk.h    | 45 ++++++++++++++++++++++++++++++
 drivers/net/nfp/nfdk/nfp_nfdk_dp.c | 43 +---------------------------
 2 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/drivers/net/nfp/nfdk/nfp_nfdk.h b/drivers/net/nfp/nfdk/nfp_nfdk.h
index 2767fd51cd..89a98d13f3 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk.h
+++ b/drivers/net/nfp/nfdk/nfp_nfdk.h
@@ -6,7 +6,10 @@
 #ifndef __NFP_NFDK_H__
 #define __NFP_NFDK_H__
 
+#include <nfp_platform.h>
+
 #include "../nfp_rxtx.h"
+#include "nfp_net_common.h"
 
 #define NFDK_TX_DESC_PER_SIMPLE_PKT     2
 
@@ -161,6 +164,45 @@ nfp_net_nfdk_txq_full(struct nfp_net_txq *txq)
 	return (nfp_net_nfdk_free_tx_desc(txq) < txq->tx_free_thresh);
 }
 
+static inline uint16_t
+nfp_net_nfdk_headlen_to_segs(uint16_t headlen)
+{
+	/* First descriptor fits less data, so adjust for that */
+	return DIV_ROUND_UP(headlen + NFDK_TX_MAX_DATA_PER_DESC - NFDK_TX_MAX_DATA_PER_HEAD,
+			NFDK_TX_MAX_DATA_PER_DESC);
+}
+
+/*
+ * Set TX CSUM offload flags in TX descriptor of nfdk (no-op if TXCSUM is off).
+ */
+static inline uint64_t
+nfp_net_nfdk_tx_cksum(struct nfp_net_txq *txq,
+		struct rte_mbuf *mb,
+		uint64_t flags)
+{
+	uint64_t ol_flags = mb->ol_flags;
+	struct nfp_net_hw *hw = txq->hw;
+
+	if ((hw->super.ctrl & NFP_NET_CFG_CTRL_TXCSUM) == 0)
+		return flags;
+
+	/* Set L4 csum offload if TSO/UFO enabled. */
+	if ((ol_flags & (RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_UDP_SEG)) != 0)
+		flags |= NFDK_DESC_TX_L4_CSUM;
+
+	if ((ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) != 0)
+		flags |= NFDK_DESC_TX_ENCAP;
+
+	/* IPv6 does not need checksum */
+	if ((ol_flags & RTE_MBUF_F_TX_IP_CKSUM) != 0)
+		flags |= NFDK_DESC_TX_L3_CSUM;
+
+	if ((ol_flags & RTE_MBUF_F_TX_L4_MASK) != 0)
+		flags |= NFDK_DESC_TX_L4_CSUM;
+
+	return flags;
+}
+
 uint32_t nfp_flower_nfdk_pkt_add_metadata(struct rte_mbuf *mbuf,
 		uint32_t port_id);
 uint16_t nfp_net_nfdk_xmit_pkts_common(void *tx_queue,
@@ -177,5 +219,8 @@ int nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
 		const struct rte_eth_txconf *tx_conf);
 int nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
 		struct rte_mbuf *pkt);
+int nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
+		struct nfp_net_txq *txq,
+		uint64_t *metadata);
 
 #endif /* __NFP_NFDK_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
index 3ba97e23a9..77021aa612 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
@@ -6,7 +6,6 @@
 #include "nfp_nfdk.h"
 
 #include <bus_pci_driver.h>
-#include <nfp_platform.h>
 #include <rte_malloc.h>
 
 #include "../flower/nfp_flower.h"
@@ -15,38 +14,6 @@
 
 #define NFDK_TX_DESC_GATHER_MAX         17
 
-/* Set TX CSUM offload flags in TX descriptor of nfdk */
-static uint64_t
-nfp_net_nfdk_tx_cksum(struct nfp_net_txq *txq,
-		struct rte_mbuf *mb,
-		uint64_t flags)
-{
-	uint64_t ol_flags;
-	struct nfp_net_hw *hw = txq->hw;
-
-	if ((hw->super.ctrl & NFP_NET_CFG_CTRL_TXCSUM) == 0)
-		return flags;
-
-	ol_flags = mb->ol_flags;
-
-	/* Set L4 csum offload if TSO/UFO enabled. */
-	if ((ol_flags & RTE_MBUF_F_TX_TCP_SEG) != 0 ||
-			(ol_flags & RTE_MBUF_F_TX_UDP_SEG) != 0)
-		flags |= NFDK_DESC_TX_L4_CSUM;
-
-	if ((ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) != 0)
-		flags |= NFDK_DESC_TX_ENCAP;
-
-	/* IPv6 does not need checksum */
-	if ((ol_flags & RTE_MBUF_F_TX_IP_CKSUM) != 0)
-		flags |= NFDK_DESC_TX_L3_CSUM;
-
-	if ((ol_flags & RTE_MBUF_F_TX_L4_MASK) != 0)
-		flags |= NFDK_DESC_TX_L4_CSUM;
-
-	return flags;
-}
-
 /* Set TX descriptor for TSO of nfdk */
 static uint64_t
 nfp_net_nfdk_tx_tso(struct nfp_net_txq *txq,
@@ -100,14 +67,6 @@ nfp_flower_nfdk_pkt_add_metadata(struct rte_mbuf *mbuf,
 	return FLOWER_PKT_DATA_OFFSET;
 }
 
-static inline uint16_t
-nfp_net_nfdk_headlen_to_segs(uint16_t headlen)
-{
-	/* First descriptor fits less data, so adjust for that */
-	return DIV_ROUND_UP(headlen + NFDK_TX_MAX_DATA_PER_DESC - NFDK_TX_MAX_DATA_PER_HEAD,
-			NFDK_TX_MAX_DATA_PER_DESC);
-}
-
 static inline void
 nfp_net_nfdk_tx_close_block(struct nfp_net_txq *txq,
 		uint32_t nop_slots)
@@ -170,7 +129,7 @@ nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
 	return nop_slots;
 }
 
-static int
+int
 nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
 		struct nfp_net_txq *txq,
 		uint64_t *metadata)
-- 
2.39.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 2/4] net/nfp: support AVX2 Tx function
  2024-06-19  2:59 [PATCH 0/4] support AVX2 instruction Rx/Tx function Chaoyong He
  2024-06-19  2:59 ` [PATCH 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
@ 2024-06-19  2:59 ` Chaoyong He
  2024-06-19  2:59 ` [PATCH 3/4] net/nfp: support AVX2 Rx function Chaoyong He
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-06-19  2:59 UTC (permalink / raw)
  To: dev; +Cc: oss-drivers, Long Wu, Chaoyong He

From: Long Wu <long.wu@corigine.com>

Use AVX2 instructions to accelerate Tx performance. The
acceleration only works on x86 machines.

Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
 drivers/net/nfp/meson.build                 |  15 +
 drivers/net/nfp/nfdk/nfp_nfdk.h             |   1 +
 drivers/net/nfp/nfdk/nfp_nfdk_dp.c          |  12 +
 drivers/net/nfp/nfdk/nfp_nfdk_vec.h         |  36 ++
 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c | 432 ++++++++++++++++++++
 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c    |  14 +
 drivers/net/nfp/nfp_ethdev.c                |   3 +-
 drivers/net/nfp/nfp_ethdev_vf.c             |   3 +-
 drivers/net/nfp/nfp_rxtx.h                  |   5 +-
 drivers/net/nfp/nfp_rxtx_vec.h              |  13 +
 drivers/net/nfp/nfp_rxtx_vec_avx2.c         |  21 +
 drivers/net/nfp/nfp_rxtx_vec_stub.c         |  16 +
 12 files changed, 568 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec.h
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec.h
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec_avx2.c
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec_stub.c

diff --git a/drivers/net/nfp/meson.build b/drivers/net/nfp/meson.build
index d805644ec5..eb54df5348 100644
--- a/drivers/net/nfp/meson.build
+++ b/drivers/net/nfp/meson.build
@@ -16,6 +16,7 @@ sources = files(
         'flower/nfp_flower_service.c',
         'nfd3/nfp_nfd3_dp.c',
         'nfdk/nfp_nfdk_dp.c',
+        'nfdk/nfp_nfdk_vec_stub.c',
         'nfpcore/nfp_cppcore.c',
         'nfpcore/nfp_crc.c',
         'nfpcore/nfp_elf.c',
@@ -43,7 +44,21 @@ sources = files(
         'nfp_net_flow.c',
         'nfp_net_meta.c',
         'nfp_rxtx.c',
+        'nfp_rxtx_vec_stub.c',
         'nfp_service.c',
 )
 
+if arch_subdir == 'x86'
+        avx2_sources = files(
+                'nfdk/nfp_nfdk_vec_avx2_dp.c',
+                'nfp_rxtx_vec_avx2.c',
+        )
+        nfp_avx2_lib = static_library('nfp_avx2_lib',
+                avx2_sources,
+                dependencies: [static_rte_ethdev, static_rte_bus_pci,
+                        static_rte_common_nfp],
+                c_args: [cflags, '-mavx2'])
+        objs += nfp_avx2_lib.extract_all_objects(recursive: true)
+endif
+
 deps += ['hash', 'security', 'common_nfp']
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk.h b/drivers/net/nfp/nfdk/nfp_nfdk.h
index 89a98d13f3..29d862f6f0 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk.h
+++ b/drivers/net/nfp/nfdk/nfp_nfdk.h
@@ -222,5 +222,6 @@ int nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
 int nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
 		struct nfp_net_txq *txq,
 		uint64_t *metadata);
+void nfp_net_nfdk_xmit_pkts_set(struct rte_eth_dev *eth_dev);
 
 #endif /* __NFP_NFDK_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
index 77021aa612..e6847ccb33 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
@@ -11,6 +11,8 @@
 #include "../flower/nfp_flower.h"
 #include "../nfp_logs.h"
 #include "../nfp_net_meta.h"
+#include "../nfp_rxtx_vec.h"
+#include "nfp_nfdk_vec.h"
 
 #define NFDK_TX_DESC_GATHER_MAX         17
 
@@ -517,6 +519,7 @@ nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
 	dev->data->tx_queues[queue_idx] = txq;
 	txq->hw = hw;
 	txq->hw_priv = dev->process_private;
+	txq->simple_always = true;
 
 	/*
 	 * Telling the HW about the physical address of the TX ring and number
@@ -527,3 +530,12 @@ nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
 
 	return 0;
 }
+
+void
+nfp_net_nfdk_xmit_pkts_set(struct rte_eth_dev *eth_dev)
+{
+	if (nfp_net_get_avx2_supported())
+		eth_dev->tx_pkt_burst = nfp_net_nfdk_vec_avx2_xmit_pkts;
+	else
+		eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+}
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec.h b/drivers/net/nfp/nfdk/nfp_nfdk_vec.h
new file mode 100644
index 0000000000..14319d6cf6
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_NFDK_VEC_H__
+#define __NFP_NFDK_VEC_H__
+
+#include <stdbool.h>
+
+#include <rte_mbuf_core.h>
+
+#include "../nfp_net_common.h"
+#include "nfp_nfdk.h"
+
+/* Tell whether 'pkt' may take the vector Tx path (small head, no TSO/UFO). */
+static inline bool
+nfp_net_nfdk_is_simple_packet(struct rte_mbuf *pkt,
+		struct nfp_net_hw *hw)
+{
+	if (pkt->data_len > NFDK_TX_MAX_DATA_PER_HEAD)
+		return false;
+
+	if ((hw->super.cap & NFP_NET_CFG_CTRL_LSO_ANY) == 0)
+		return true;
+
+	/* TSO/UFO requests are not simple: the vector path sets no segmentation info. */
+	return (pkt->ol_flags &
+			(RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_UDP_SEG)) == 0;
+}
+
+uint16_t nfp_net_nfdk_vec_avx2_xmit_pkts(void *tx_queue,
+		struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts);
+
+#endif /* __NFP_NFDK_VEC_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
new file mode 100644
index 0000000000..6d1359fdb1
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
@@ -0,0 +1,432 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <bus_pci_driver.h>
+#include <ethdev_driver.h>
+#include <rte_malloc.h>
+
+#include "../nfp_logs.h"
+#include "nfp_nfdk.h"
+#include "nfp_nfdk_vec.h"
+
+/*
+ * One simple packet needs 2 descriptors so if send 4 packets driver will use
+ * 8 descriptors at once.
+ */
+#define NFDK_SIMPLE_BURST_DES_NUM 8
+
+#define NFDK_SIMPLE_DES_TYPE (NFDK_DESC_TX_EOP | \
+		(NFDK_DESC_TX_TYPE_HEAD & (NFDK_DESC_TX_TYPE_SIMPLE << 12)))
+
+static inline int
+nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(struct rte_mbuf *pkt,
+		struct nfp_net_txq *txq,
+		uint64_t *des_addr,
+		uint64_t *des_meta,
+		bool repr_flag)
+{
+	int ret;
+	__m128i dma_addr;
+	__m128i dma_hi;
+	__m128i data_off;
+	__m128i dlen_type;
+	uint64_t metadata;
+
+	if (repr_flag) {
+		metadata = NFDK_DESC_TX_CHAIN_META;
+	} else {
+		ret = nfp_net_nfdk_set_meta_data(pkt, txq, &metadata);
+		if (unlikely(ret != 0))
+			return ret;
+	}
+
+	data_off = _mm_set_epi64x(0, pkt->data_off);
+	dma_addr = _mm_add_epi64(_mm_loadu_si128((__m128i *)&pkt->buf_addr), data_off);
+	dma_hi = _mm_srli_epi64(dma_addr, 32);
+
+	dlen_type = _mm_set_epi64x(0, (pkt->data_len - 1) | NFDK_SIMPLE_DES_TYPE);
+
+	*des_addr = _mm_extract_epi64(_mm_add_epi64(_mm_unpacklo_epi32(dma_hi, dma_addr),
+			_mm_slli_epi64(dlen_type, 16)), 0);
+
+	*des_meta = nfp_net_nfdk_tx_cksum(txq, pkt, metadata);
+
+	return 0;
+}
+
+static inline int
+nfp_net_nfdk_vec_avx2_xmit_simple_send1(struct nfp_net_txq *txq,
+		struct nfp_net_nfdk_tx_desc *txds,
+		struct rte_mbuf *pkt,
+		bool repr_flag)
+{
+	int ret;
+	__m128i des_data;
+	uint64_t des_addr;
+	uint64_t des_meta;
+
+	ret = nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(pkt, txq, &des_addr,
+			&des_meta, repr_flag);
+	if (unlikely(ret != 0))
+		return ret;
+
+	txq->wr_p = D_IDX(txq, txq->wr_p + NFDK_TX_DESC_PER_SIMPLE_PKT);
+	if ((txq->wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) != 0)
+		txq->data_pending += pkt->data_len;
+	else
+		txq->data_pending = 0;
+
+	des_data = _mm_set_epi64x(des_meta, des_addr);
+
+	_mm_store_si128((void *)txds, des_data);
+
+	return 0;
+}
+
+static inline int
+nfp_vec_avx2_nfdk_xmit_simple_send4(struct nfp_net_txq *txq,
+		struct nfp_net_nfdk_tx_desc *txds,
+		struct rte_mbuf **pkt,
+		bool repr_flag)
+{
+	int ret;
+	uint16_t i;
+	__m256i des_data0_1;
+	__m256i des_data2_3;
+	uint64_t des_addr[4];
+	uint64_t des_meta[4];
+
+	for (i = 0; i < 4; i++) {
+		ret = nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(pkt[i], txq,
+				&des_addr[i], &des_meta[i], repr_flag);
+		if (unlikely(ret != 0))
+			return ret;
+	}
+
+	for (i = 0; i < 4; i++) {
+		txq->wr_p = D_IDX(txq, txq->wr_p + NFDK_TX_DESC_PER_SIMPLE_PKT);
+		if ((txq->wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) != 0)
+			txq->data_pending += pkt[i]->data_len;
+		else
+			txq->data_pending = 0;
+	}
+
+	des_data0_1 = _mm256_set_epi64x(des_meta[1], des_addr[1], des_meta[0], des_addr[0]);
+	des_data2_3 = _mm256_set_epi64x(des_meta[3], des_addr[3], des_meta[2], des_addr[2]);
+
+	_mm256_store_si256((void *)txds, des_data0_1);
+	_mm256_store_si256((void *)(txds + 4), des_data2_3);
+
+	return 0;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_mbuf_store4(struct rte_mbuf **mbuf,
+		struct rte_mbuf **tx_pkts)
+{
+	__m256i mbuf_room0_1;
+	__m256i mbuf_room2_3;
+
+	mbuf_room0_1 = _mm256_set_epi64x(0, (uintptr_t)tx_pkts[1], 0,
+			(uintptr_t)tx_pkts[0]);
+	mbuf_room2_3 = _mm256_set_epi64x(0, (uintptr_t)tx_pkts[3], 0,
+			(uintptr_t)tx_pkts[2]);
+
+	_mm256_store_si256((void *)mbuf, mbuf_room0_1);
+	_mm256_store_si256((void *)(mbuf + 4), mbuf_room2_3);
+}
+
+/* Queue up to nb_pkts simple packets (two descriptors each); returns the number queued. */
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_simple_pkts(struct nfp_net_txq *txq,
+		struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts,
+		uint16_t simple_close,
+		bool repr_flag)
+{
+	int ret;
+	uint16_t npkts = 0;
+	uint16_t need_txds;
+	uint16_t free_descs;
+	struct rte_mbuf **lmbuf;
+	struct nfp_net_nfdk_tx_desc *ktxds;
+
+	PMD_TX_LOG(DEBUG, "Working for queue %hu at pos %u and %hu packets",
+			txq->qidx, txq->wr_p, nb_pkts);
+
+	need_txds = nb_pkts << 1;
+	if (nfp_net_nfdk_free_tx_desc(txq) < need_txds || nfp_net_nfdk_txq_full(txq))
+		nfp_net_tx_free_bufs(txq);
+
+	free_descs = nfp_net_nfdk_free_tx_desc(txq);
+	if (unlikely(free_descs < NFDK_TX_DESC_PER_SIMPLE_PKT)) {
+		if (unlikely(simple_close > 0))
+			goto xmit_end;
+
+		return 0;
+	}
+
+	PMD_TX_LOG(DEBUG, "Queue: %hu. Sending %hu packets", txq->qidx, nb_pkts);
+
+	/* Sending packets */
+	while (npkts < nb_pkts && free_descs >= NFDK_TX_DESC_PER_SIMPLE_PKT) {
+		ktxds = &txq->ktxds[txq->wr_p];
+		lmbuf = &txq->txbufs[txq->wr_p].mbuf;
+
+		/*
+		 * If can not send burst, just send one.
+		 * 1. Tx ring will come to the tail.
+		 * 2. Do not need to send 4 packets.
+		 * 3. If pointer address unaligned on 32-byte boundary.
+		 * 4. If free descriptors are not enough.
+		 */
+		if ((txq->tx_count - txq->wr_p) < NFDK_SIMPLE_BURST_DES_NUM ||
+				(nb_pkts - npkts) < 4 ||
+				((uintptr_t)ktxds & 0x1F) != 0 ||
+				free_descs < NFDK_SIMPLE_BURST_DES_NUM) {
+			ret = nfp_net_nfdk_vec_avx2_xmit_simple_send1(txq,
+					ktxds, tx_pkts[npkts], repr_flag);
+			if (unlikely(ret != 0))
+				goto xmit_end;
+
+			rte_pktmbuf_free(*lmbuf);
+			/* Load one pointer only: a 128-bit load would read past tx_pkts[]. */
+			_mm_storel_epi64((void *)lmbuf,
+					_mm_loadl_epi64((void *)&tx_pkts[npkts]));
+			npkts++;
+			free_descs -= NFDK_TX_DESC_PER_SIMPLE_PKT;
+			continue;
+		}
+
+		ret = nfp_vec_avx2_nfdk_xmit_simple_send4(txq, ktxds,
+				&tx_pkts[npkts], repr_flag);
+		if (unlikely(ret != 0))
+			goto xmit_end;
+
+		rte_pktmbuf_free_bulk(lmbuf, NFDK_SIMPLE_BURST_DES_NUM);
+		nfp_net_nfdk_vec_avx2_xmit_mbuf_store4(lmbuf, &tx_pkts[npkts]);
+
+		npkts += 4;
+		free_descs -= NFDK_SIMPLE_BURST_DES_NUM;
+	}
+
+xmit_end:
+	/* Increment write pointers. Force memory write before we let HW know */
+	rte_wmb();
+	nfp_qcp_ptr_add(txq->qcp_q, NFP_QCP_WRITE_PTR, ((npkts << 1) + simple_close));
+
+	return npkts;
+}
+
+/* Zero the next D_BLOCK_CPL(wr_p) descriptors, free and clear their mbuf slots. */
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_simple_close_block(struct nfp_net_txq *txq,
+		uint16_t *simple_close)
+{
+	uint16_t i;
+	uint16_t wr_p;
+	uint16_t nop_slots;
+	__m128i zero_128 = _mm_setzero_si128();
+	__m256i zero_256 = _mm256_setzero_si256();
+
+	wr_p = txq->wr_p;
+	nop_slots = D_BLOCK_CPL(wr_p);
+	/* The caller only makes wr_p even, so use stores with no alignment demand. */
+	for (i = nop_slots; i >= 4; i -= 4, wr_p += 4) {
+		_mm256_storeu_si256((void *)&txq->ktxds[wr_p], zero_256);
+		rte_pktmbuf_free_bulk(&txq->txbufs[wr_p].mbuf, 4);
+		_mm256_storeu_si256((void *)&txq->txbufs[wr_p], zero_256);
+	}
+
+	for (; i >= 2; i -= 2, wr_p += 2) {
+		_mm_storeu_si128((void *)&txq->ktxds[wr_p], zero_128);
+		rte_pktmbuf_free_bulk(&txq->txbufs[wr_p].mbuf, 2);
+		_mm_storeu_si128((void *)&txq->txbufs[wr_p], zero_128);
+	}
+
+	for (; i >= 1; i--, wr_p++) {
+		_mm_storel_epi64((void *)&txq->ktxds[wr_p], zero_128);
+		rte_pktmbuf_free(txq->txbufs[wr_p].mbuf);
+		_mm_storel_epi64((void *)&txq->txbufs[wr_p], zero_128);
+	}
+
+	txq->data_pending = 0;
+	txq->wr_p = D_IDX(txq, txq->wr_p + nop_slots);
+	(*simple_close) += nop_slots;
+}
+
+static inline uint32_t
+nfp_net_nfdk_vec_avx2_xmit_simple_prepare(struct nfp_net_txq *txq,
+		uint16_t *simple_close)
+{
+	uint16_t wr_p;
+	__m128i zero_128 = _mm_setzero_si128();
+
+	wr_p = txq->wr_p;
+
+	_mm_storel_epi64((void *)&txq->ktxds[wr_p], zero_128);
+	rte_pktmbuf_free(txq->txbufs[wr_p].mbuf);
+	_mm_storel_epi64((void *)&txq->txbufs[wr_p], zero_128);
+
+	txq->wr_p = D_IDX(txq, wr_p + 1);
+	(*simple_close)++;
+
+	return txq->wr_p;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_simple_check(struct nfp_net_txq *txq,
+		struct rte_mbuf *pkt,
+		bool *simple_flag,
+		bool *pending_flag,
+		uint16_t *data_pending,
+		uint32_t *wr_p,
+		uint16_t *simple_close)
+{
+	uint32_t data_pending_temp;
+
+	/* Let the first descriptor index even before send simple packets */
+	if (!(*simple_flag)) {
+		if ((*wr_p & 0x1) == 0x1)
+			*wr_p = nfp_net_nfdk_vec_avx2_xmit_simple_prepare(txq, simple_close);
+
+		*simple_flag = true;
+	}
+
+	/* Simple packets only need one close block operation */
+	if (!(*pending_flag)) {
+		if ((*wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) == 0) {
+			*pending_flag = true;
+			return;
+		}
+
+		data_pending_temp = *data_pending + pkt->data_len;
+		if (data_pending_temp > NFDK_TX_MAX_DATA_PER_BLOCK) {
+			nfp_net_nfdk_vec_avx2_xmit_simple_close_block(txq, simple_close);
+			*pending_flag = true;
+			return;
+		}
+
+		*data_pending = data_pending_temp;
+
+		*wr_p += 2;
+	}
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_simple_count(struct nfp_net_txq *txq,
+		struct rte_mbuf **tx_pkts,
+		uint16_t head,
+		uint16_t nb_pkts,
+		uint16_t *simple_close)
+{
+	uint32_t wr_p;
+	uint16_t simple_idx;
+	struct rte_mbuf *pkt;
+	uint16_t data_pending;
+	bool simple_flag = false;
+	bool pending_flag = false;
+	uint16_t simple_count = 0;
+
+	*simple_close = 0;
+	wr_p = txq->wr_p;
+	data_pending = txq->data_pending;
+
+	for (simple_idx = head; simple_idx < nb_pkts; simple_idx++) {
+		pkt = tx_pkts[simple_idx];
+		if (!nfp_net_nfdk_is_simple_packet(pkt, txq->hw))
+			break;
+
+		simple_count++;
+		if (!txq->simple_always)
+			nfp_net_nfdk_vec_avx2_xmit_simple_check(txq, pkt, &simple_flag,
+					&pending_flag, &data_pending, &wr_p, simple_close);
+	}
+
+	return simple_count;
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_others_count(struct nfp_net_txq *txq,
+		struct rte_mbuf **tx_pkts,
+		uint16_t head,
+		uint16_t nb_pkts)
+{
+	uint16_t others_idx;
+	struct rte_mbuf *pkt;
+	uint16_t others_count = 0;
+
+	for (others_idx = head; others_idx < nb_pkts; others_idx++) {
+		pkt = tx_pkts[others_idx];
+		if (nfp_net_nfdk_is_simple_packet(pkt, txq->hw))
+			break;
+
+		others_count++;
+	}
+
+	return others_count;
+}
+
+/* Send a burst: runs of simple packets use the vector path, others the common path. */
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_common(void *tx_queue,
+		struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts)
+{
+	uint16_t i = 0;
+	uint16_t avail = 0;
+	uint16_t simple_close;
+	uint16_t simple_count;
+	uint16_t simple_avail;
+	uint16_t others_count;
+	uint16_t others_avail;
+	struct nfp_net_txq *txq = tx_queue;
+
+	/* 'i' advances only by the run lengths below, so no packet is skipped. */
+	while (i < nb_pkts) {
+		simple_count = nfp_net_nfdk_vec_avx2_xmit_simple_count(txq, tx_pkts, i,
+				nb_pkts, &simple_close);
+		if (simple_count > 0) {
+			if (!txq->simple_always)
+				txq->simple_always = true;
+
+			simple_avail = nfp_net_nfdk_vec_avx2_xmit_simple_pkts(txq,
+					tx_pkts + i, simple_count, simple_close,
+					false);
+			avail += simple_avail;
+			if (simple_avail != simple_count)
+				break;
+
+			i += simple_count;
+		}
+
+		if (i == nb_pkts)
+			break;
+
+		others_count = nfp_net_nfdk_vec_avx2_xmit_others_count(txq, tx_pkts,
+				i, nb_pkts);
+		if (txq->simple_always)
+			txq->simple_always = false;
+
+		others_avail = nfp_net_nfdk_xmit_pkts_common(tx_queue,
+				tx_pkts + i, others_count, false);
+
+		avail += others_avail;
+		if (others_avail != others_count)
+			break;
+
+		i += others_count;
+	}
+
+	return avail;
+}
+
+uint16_t
+nfp_net_nfdk_vec_avx2_xmit_pkts(void *tx_queue,
+		struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts)
+{
+	return nfp_net_nfdk_vec_avx2_xmit_common(tx_queue, tx_pkts, nb_pkts);
+}
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c b/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
new file mode 100644
index 0000000000..146ec21d51
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include "nfp_nfdk_vec.h"
+
+uint16_t __rte_weak
+nfp_net_nfdk_vec_avx2_xmit_pkts(__rte_unused void *tx_queue,
+		__rte_unused struct rte_mbuf **tx_pkts,
+		__rte_unused uint16_t nb_pkts)
+{
+	return 0;
+}
diff --git a/drivers/net/nfp/nfp_ethdev.c b/drivers/net/nfp/nfp_ethdev.c
index 004c725ef8..acf9a73690 100644
--- a/drivers/net/nfp/nfp_ethdev.c
+++ b/drivers/net/nfp/nfp_ethdev.c
@@ -27,6 +27,7 @@
 #include "nfp_ipsec.h"
 #include "nfp_logs.h"
 #include "nfp_net_flow.h"
+#include "nfp_rxtx_vec.h"
 
 /* 64-bit per app capabilities */
 #define NFP_NET_APP_CAP_SP_INDIFF       RTE_BIT64(0) /* Indifferent to port speed */
@@ -887,7 +888,7 @@ nfp_net_ethdev_ops_mount(struct nfp_net_hw *hw,
 	if (hw->ver.extend == NFP_NET_CFG_VERSION_DP_NFD3)
 		eth_dev->tx_pkt_burst = nfp_net_nfd3_xmit_pkts;
 	else
-		eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+		nfp_net_nfdk_xmit_pkts_set(eth_dev);
 
 	eth_dev->dev_ops = &nfp_net_eth_dev_ops;
 	eth_dev->rx_queue_count = nfp_net_rx_queue_count;
diff --git a/drivers/net/nfp/nfp_ethdev_vf.c b/drivers/net/nfp/nfp_ethdev_vf.c
index a422bcd057..63ea0a5d17 100644
--- a/drivers/net/nfp/nfp_ethdev_vf.c
+++ b/drivers/net/nfp/nfp_ethdev_vf.c
@@ -14,6 +14,7 @@
 
 #include "nfp_logs.h"
 #include "nfp_net_common.h"
+#include "nfp_rxtx_vec.h"
 
 #define NFP_VF_DRIVER_NAME net_nfp_vf
 
@@ -240,7 +241,7 @@ nfp_netvf_ethdev_ops_mount(struct nfp_net_hw *hw,
 	if (hw->ver.extend == NFP_NET_CFG_VERSION_DP_NFD3)
 		eth_dev->tx_pkt_burst = nfp_net_nfd3_xmit_pkts;
 	else
-		eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+		nfp_net_nfdk_xmit_pkts_set(eth_dev);
 
 	eth_dev->dev_ops = &nfp_netvf_eth_dev_ops;
 	eth_dev->rx_queue_count = nfp_net_rx_queue_count;
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index 9806384a63..3ddf717da0 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -69,9 +69,12 @@ struct __rte_aligned(64) nfp_net_txq {
 	/** Used by NFDk only */
 	uint16_t data_pending;
 
+	/** Used by NFDk vector xmit only */
+	bool simple_always;
+
 	/**
 	 * At this point 58 bytes have been used for all the fields in the
-	 * TX critical path. We have room for 6 bytes and still all placed
+	 * TX critical path. We have room for 5 bytes and still all placed
 	 * in a cache line.
 	 */
 	uint64_t dma;
diff --git a/drivers/net/nfp/nfp_rxtx_vec.h b/drivers/net/nfp/nfp_rxtx_vec.h
new file mode 100644
index 0000000000..c92660f963
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_RXTX_VEC_AVX2_H__
+#define __NFP_RXTX_VEC_AVX2_H__
+
+#include <stdbool.h>
+
+bool nfp_net_get_avx2_supported(void);
+
+#endif /* __NFP_RXTX_VEC_AVX2_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
new file mode 100644
index 0000000000..50638e74ab
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <stdbool.h>
+
+#include <rte_cpuflags.h>
+#include <rte_vect.h>
+
+#include "nfp_rxtx_vec.h"
+
+bool
+nfp_net_get_avx2_supported(void)
+{
+	if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 &&
+			rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
+		return true;
+
+	return false;
+}
diff --git a/drivers/net/nfp/nfp_rxtx_vec_stub.c b/drivers/net/nfp/nfp_rxtx_vec_stub.c
new file mode 100644
index 0000000000..1bc55b67e0
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec_stub.c
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <stdbool.h>
+
+#include <rte_common.h>
+
+#include "nfp_rxtx_vec.h"
+
+bool __rte_weak
+nfp_net_get_avx2_supported(void)
+{
+	return false;
+}
-- 
2.39.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 3/4] net/nfp: support AVX2 Rx function
  2024-06-19  2:59 [PATCH 0/4] support AVX2 instruction Rx/Tx function Chaoyong He
  2024-06-19  2:59 ` [PATCH 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
  2024-06-19  2:59 ` [PATCH 2/4] net/nfp: support AVX2 Tx function Chaoyong He
@ 2024-06-19  2:59 ` Chaoyong He
  2024-06-19  2:59 ` [PATCH 4/4] net/nfp: vector Rx function supports parsing ptype Chaoyong He
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-06-19  2:59 UTC (permalink / raw)
  To: dev; +Cc: oss-drivers, Long Wu, Peng Zhang, Chaoyong He

From: Long Wu <long.wu@corigine.com>

Use AVX2 instructions to accelerate Rx performance. The
acceleration only works on x86 machines.

Signed-off-by: Peng Zhang <peng.zhang@corigine.com>
Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
 drivers/net/nfp/nfp_ethdev.c        |   2 +-
 drivers/net/nfp/nfp_ethdev_vf.c     |   2 +-
 drivers/net/nfp/nfp_net_meta.c      |   1 +
 drivers/net/nfp/nfp_rxtx.c          |  10 ++
 drivers/net/nfp/nfp_rxtx.h          |   1 +
 drivers/net/nfp/nfp_rxtx_vec.h      |   4 +
 drivers/net/nfp/nfp_rxtx_vec_avx2.c | 252 ++++++++++++++++++++++++++++
 drivers/net/nfp/nfp_rxtx_vec_stub.c |   9 +
 8 files changed, 279 insertions(+), 2 deletions(-)

diff --git a/drivers/net/nfp/nfp_ethdev.c b/drivers/net/nfp/nfp_ethdev.c
index acf9a73690..71c4f35c56 100644
--- a/drivers/net/nfp/nfp_ethdev.c
+++ b/drivers/net/nfp/nfp_ethdev.c
@@ -892,7 +892,7 @@ nfp_net_ethdev_ops_mount(struct nfp_net_hw *hw,
 
 	eth_dev->dev_ops = &nfp_net_eth_dev_ops;
 	eth_dev->rx_queue_count = nfp_net_rx_queue_count;
-	eth_dev->rx_pkt_burst = &nfp_net_recv_pkts;
+	nfp_net_recv_pkts_set(eth_dev);
 }
 
 static int
diff --git a/drivers/net/nfp/nfp_ethdev_vf.c b/drivers/net/nfp/nfp_ethdev_vf.c
index 63ea0a5d17..a5c600c87b 100644
--- a/drivers/net/nfp/nfp_ethdev_vf.c
+++ b/drivers/net/nfp/nfp_ethdev_vf.c
@@ -245,7 +245,7 @@ nfp_netvf_ethdev_ops_mount(struct nfp_net_hw *hw,
 
 	eth_dev->dev_ops = &nfp_netvf_eth_dev_ops;
 	eth_dev->rx_queue_count = nfp_net_rx_queue_count;
-	eth_dev->rx_pkt_burst = &nfp_net_recv_pkts;
+	nfp_net_recv_pkts_set(eth_dev);
 }
 
 static int
diff --git a/drivers/net/nfp/nfp_net_meta.c b/drivers/net/nfp/nfp_net_meta.c
index b31ef56f17..07c6758d33 100644
--- a/drivers/net/nfp/nfp_net_meta.c
+++ b/drivers/net/nfp/nfp_net_meta.c
@@ -80,6 +80,7 @@ nfp_net_meta_parse_single(uint8_t *meta_base,
 		rte_be32_t meta_header,
 		struct nfp_net_meta_parsed *meta)
 {
+	meta->flags = 0;
 	meta->flags |= (1 << NFP_NET_META_HASH);
 	meta->hash_type = rte_be_to_cpu_32(meta_header);
 	meta->hash = rte_be_to_cpu_32(*(rte_be32_t *)(meta_base + 4));
diff --git a/drivers/net/nfp/nfp_rxtx.c b/drivers/net/nfp/nfp_rxtx.c
index 1db79ad1cd..4fc3374987 100644
--- a/drivers/net/nfp/nfp_rxtx.c
+++ b/drivers/net/nfp/nfp_rxtx.c
@@ -17,6 +17,7 @@
 #include "nfp_ipsec.h"
 #include "nfp_logs.h"
 #include "nfp_net_meta.h"
+#include "nfp_rxtx_vec.h"
 
 /*
  * The bit format and map of nfp packet type for rxd.offload_info in Rx descriptor.
@@ -867,3 +868,12 @@ nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
 	info->conf.offloads = dev_info.tx_offload_capa &
 			dev->data->dev_conf.txmode.offloads;
 }
+
+void
+nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev)
+{
+	if (nfp_net_get_avx2_supported())
+		eth_dev->rx_pkt_burst = nfp_net_vec_avx2_recv_pkts;
+	else
+		eth_dev->rx_pkt_burst = nfp_net_recv_pkts;
+}
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index 3ddf717da0..fff8371991 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -244,5 +244,6 @@ void nfp_net_rx_queue_info_get(struct rte_eth_dev *dev,
 void nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
 		uint16_t queue_id,
 		struct rte_eth_txq_info *qinfo);
+void nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev);
 
 #endif /* __NFP_RXTX_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec.h b/drivers/net/nfp/nfp_rxtx_vec.h
index c92660f963..8720662744 100644
--- a/drivers/net/nfp/nfp_rxtx_vec.h
+++ b/drivers/net/nfp/nfp_rxtx_vec.h
@@ -10,4 +10,8 @@
 
 bool nfp_net_get_avx2_supported(void);
 
+uint16_t nfp_net_vec_avx2_recv_pkts(void *rx_queue,
+		struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts);
+
 #endif /* __NFP_RXTX_VEC_AVX2_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
index 50638e74ab..7c18213624 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_avx2.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -5,9 +5,14 @@
 
 #include <stdbool.h>
 
+#include <bus_pci_driver.h>
+#include <ethdev_driver.h>
 #include <rte_cpuflags.h>
 #include <rte_vect.h>
 
+#include "nfp_logs.h"
+#include "nfp_net_common.h"
+#include "nfp_net_meta.h"
 #include "nfp_rxtx_vec.h"
 
 bool
@@ -19,3 +24,250 @@ nfp_net_get_avx2_supported(void)
 
 	return false;
 }
+
+static inline void
+nfp_vec_avx2_recv_set_des1(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf *rxb)
+{
+	__m128i dma;
+	__m128i dma_hi;
+	__m128i vaddr0;
+	__m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	_mm_storel_epi64((void *)rxds, vaddr0);
+
+	rxq->rd_p = (rxq->rd_p + 1) & (rxq->rx_count - 1);
+}
+
+static inline void
+nfp_vec_avx2_recv_set_des4(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf **rxb)
+{
+	__m128i dma;
+	__m128i dma_hi;
+	__m128i vaddr0;
+	__m128i vaddr1;
+	__m128i vaddr2;
+	__m128i vaddr3;
+	__m128i vaddr0_1;
+	__m128i vaddr2_3;
+	__m256i vaddr0_3;
+	__m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[0]->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[1]->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr1 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[2]->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr2 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[3]->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr3 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	vaddr0_1 = _mm_unpacklo_epi64(vaddr0, vaddr1);
+	vaddr2_3 = _mm_unpacklo_epi64(vaddr2, vaddr3);
+
+	vaddr0_3 = _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0_1),
+			vaddr2_3, 1);
+
+	_mm256_store_si256((void *)rxds, vaddr0_3);
+
+	rxq->rd_p = (rxq->rd_p + 4) & (rxq->rx_count - 1);
+}
+
+static inline void
+nfp_vec_avx2_recv_set_rxpkt1(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf *rx_pkt)
+{
+	struct nfp_net_hw *hw = rxq->hw;
+	struct nfp_net_meta_parsed meta;
+
+	rx_pkt->data_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
+	/* Size of the whole packet. We just support 1 segment */
+	rx_pkt->pkt_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
+
+	/* Filling the received mbuf with packet info */
+	if (hw->rx_offset)
+		rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + hw->rx_offset;
+	else
+		rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + NFP_DESC_META_LEN(rxds);
+
+	rx_pkt->port = rxq->port_id;
+	rx_pkt->nb_segs = 1;
+	rx_pkt->next = NULL;
+
+	nfp_net_meta_parse(rxds, rxq, hw, rx_pkt, &meta);
+
+	/* Checking the checksum flag */
+	nfp_net_rx_cksum(rxq, rxds, rx_pkt);
+}
+
+static inline void
+nfp_vec_avx2_recv1(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf *rxb,
+		struct rte_mbuf *rx_pkt)
+{
+	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkt);
+
+	nfp_vec_avx2_recv_set_des1(rxq, rxds, rxb);
+}
+
+static inline void
+nfp_vec_avx2_recv4(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf **rxb,
+		struct rte_mbuf **rx_pkts)
+{
+	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkts[0]);
+	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 1, rx_pkts[1]);
+	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 2, rx_pkts[2]);
+	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 3, rx_pkts[3]);
+
+	nfp_vec_avx2_recv_set_des4(rxq, rxds, rxb);
+}
+
+static inline bool
+nfp_vec_avx2_recv_check_packets4(struct nfp_net_rx_desc *rxds)
+{
+	__m256i data = _mm256_loadu_si256((void *)rxds);
+
+	if ((_mm256_extract_epi8(data, 3) & PCIE_DESC_RX_DD) == 0 ||
+			(_mm256_extract_epi8(data, 11) & PCIE_DESC_RX_DD) == 0 ||
+			(_mm256_extract_epi8(data, 19) & PCIE_DESC_RX_DD) == 0 ||
+			(_mm256_extract_epi8(data, 27) & PCIE_DESC_RX_DD) == 0)
+		return false;
+
+	return true;
+}
+
+uint16_t
+nfp_net_vec_avx2_recv_pkts(void *rx_queue,
+		struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts)
+{
+	uint16_t avail;
+	uint16_t nb_hold;
+	bool burst_receive;
+	struct rte_mbuf **rxb;
+	struct nfp_net_rx_desc *rxds;
+	struct nfp_net_rxq *rxq = rx_queue;
+
+	if (unlikely(rxq == NULL)) {
+		PMD_RX_LOG(ERR, "RX Bad queue");
+		return 0;
+	}
+
+	avail = 0;
+	nb_hold = 0;
+	burst_receive = true;
+	while (avail < nb_pkts) {
+		rxds = &rxq->rxds[rxq->rd_p];
+		rxb = &rxq->rxbufs[rxq->rd_p].mbuf;
+
+		if ((_mm_extract_epi8(_mm_loadu_si128((void *)(rxds)), 3)
+				& PCIE_DESC_RX_DD) == 0)
+			goto recv_end;
+
+		rte_prefetch0(rxq->rxbufs[rxq->rd_p].mbuf);
+
+		if ((rxq->rd_p & 0x3) == 0) {
+			rte_prefetch0(&rxq->rxds[rxq->rd_p]);
+			rte_prefetch0(&rxq->rxbufs[rxq->rd_p]);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 1].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 2].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 3].mbuf);
+		}
+
+		if ((rxq->rd_p & 0x7) == 0) {
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 4].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 5].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 6].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 7].mbuf);
+		}
+
+		/*
+		 * If a burst cannot be received, just receive one packet:
+		 * 1. The Rx ring is about to reach its tail.
+		 * 2. Fewer than 4 packets still need to be received.
+		 * 3. The descriptor address is not aligned on a 32-byte boundary.
+		 * 4. The Rx ring does not have 4 packets or allocating 4 mbufs failed.
+		 */
+		if ((rxq->rx_count - rxq->rd_p) < 4 ||
+				(nb_pkts - avail) < 4 ||
+				((uintptr_t)rxds & 0x1F) != 0 ||
+				!burst_receive) {
+			_mm_storel_epi64((void *)&rx_pkts[avail],
+					_mm_loadu_si128((void *)rxb));
+
+			/* Allocate a new mbuf into the software ring. */
+			if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 1) < 0) {
+				PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u queue_id=%hu",
+						rxq->port_id, rxq->qidx);
+				nfp_net_mbuf_alloc_failed(rxq);
+				goto recv_end;
+			}
+
+			nfp_vec_avx2_recv1(rxq, rxds, *rxb, rx_pkts[avail]);
+
+			avail++;
+			nb_hold++;
+			continue;
+		}
+
+		burst_receive = nfp_vec_avx2_recv_check_packets4(rxds);
+		if (!burst_receive)
+			continue;
+
+		_mm256_storeu_si256((void *)&rx_pkts[avail],
+				_mm256_loadu_si256((void *)rxb));
+
+		/* Allocate 4 new mbufs into the software ring. */
+		if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 4) < 0) {
+			burst_receive = false;
+			continue;
+		}
+
+		nfp_vec_avx2_recv4(rxq, rxds, rxb, &rx_pkts[avail]);
+
+		avail += 4;
+		nb_hold += 4;
+	}
+
+recv_end:
+	if (nb_hold == 0)
+		return nb_hold;
+
+	PMD_RX_LOG(DEBUG, "RX port_id=%u queue_id=%u, %d packets received",
+			rxq->port_id, (unsigned int)rxq->qidx, nb_hold);
+
+	nb_hold += rxq->nb_rx_hold;
+
+	/*
+	 * FL descriptors need to be written before incrementing the
+	 * FL queue WR pointer.
+	 */
+	rte_wmb();
+	if (nb_hold > rxq->rx_free_thresh) {
+		PMD_RX_LOG(DEBUG, "port=%hu queue=%hu nb_hold=%hu avail=%hu",
+				rxq->port_id, rxq->qidx, nb_hold, avail);
+		nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, nb_hold);
+		nb_hold = 0;
+	}
+	rxq->nb_rx_hold = nb_hold;
+
+	return avail;
+}
diff --git a/drivers/net/nfp/nfp_rxtx_vec_stub.c b/drivers/net/nfp/nfp_rxtx_vec_stub.c
index 1bc55b67e0..c480f61ef0 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_stub.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_stub.c
@@ -6,6 +6,7 @@
 #include <stdbool.h>
 
 #include <rte_common.h>
+#include <rte_mbuf_core.h>
 
 #include "nfp_rxtx_vec.h"
 
@@ -14,3 +15,11 @@ nfp_net_get_avx2_supported(void)
 {
 	return false;
 }
+
+uint16_t __rte_weak
+nfp_net_vec_avx2_recv_pkts(__rte_unused void *rx_queue,
+		__rte_unused struct rte_mbuf **rx_pkts,
+		__rte_unused uint16_t nb_pkts)
+{
+	return 0;
+}
-- 
2.39.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 4/4] net/nfp: vector Rx function supports parsing ptype
  2024-06-19  2:59 [PATCH 0/4] support AVX2 instruction Rx/Tx function Chaoyong He
                   ` (2 preceding siblings ...)
  2024-06-19  2:59 ` [PATCH 3/4] net/nfp: support AVX2 Rx function Chaoyong He
@ 2024-06-19  2:59 ` Chaoyong He
  2024-07-06 18:51 ` [PATCH 0/4] support AVX2 instruction Rx/Tx function Ferruh Yigit
  2024-07-08  5:58 ` [PATCH v2 " Chaoyong He
  5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-06-19  2:59 UTC (permalink / raw)
  To: dev; +Cc: oss-drivers, Long Wu, Chaoyong He

From: Long Wu <long.wu@corigine.com>

The vector AVX2 Rx function now parses the packet type and sets it in the mbuf.

Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
 drivers/net/nfp/nfp_net_common.c    | 2 +-
 drivers/net/nfp/nfp_rxtx.c          | 2 +-
 drivers/net/nfp/nfp_rxtx.h          | 3 +++
 drivers/net/nfp/nfp_rxtx_vec_avx2.c | 2 ++
 4 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/net/nfp/nfp_net_common.c b/drivers/net/nfp/nfp_net_common.c
index 260920ecff..107306a4e3 100644
--- a/drivers/net/nfp/nfp_net_common.c
+++ b/drivers/net/nfp/nfp_net_common.c
@@ -1409,7 +1409,7 @@ nfp_net_supported_ptypes_get(struct rte_eth_dev *dev, size_t *no_of_elements)
 		RTE_PTYPE_INNER_L4_SCTP,
 	};
 
-	if (dev->rx_pkt_burst != nfp_net_recv_pkts)
+	if (dev->rx_pkt_burst == NULL)
 		return NULL;
 
 	net_hw = dev->data->dev_private;
diff --git a/drivers/net/nfp/nfp_rxtx.c b/drivers/net/nfp/nfp_rxtx.c
index 4fc3374987..da41a0e663 100644
--- a/drivers/net/nfp/nfp_rxtx.c
+++ b/drivers/net/nfp/nfp_rxtx.c
@@ -350,7 +350,7 @@ nfp_net_set_ptype(const struct nfp_ptype_parsed *nfp_ptype,
  * @param mb
  *   Mbuf to set the packet type.
  */
-static void
+void
 nfp_net_parse_ptype(struct nfp_net_rxq *rxq,
 		struct nfp_net_rx_desc *rxds,
 		struct rte_mbuf *mb)
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index fff8371991..c717d97003 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -245,5 +245,8 @@ void nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
 		uint16_t queue_id,
 		struct rte_eth_txq_info *qinfo);
 void nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev);
+void nfp_net_parse_ptype(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf *mb);
 
 #endif /* __NFP_RXTX_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
index 7c18213624..508ec7faa5 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_avx2.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -111,6 +111,8 @@ nfp_vec_avx2_recv_set_rxpkt1(struct nfp_net_rxq *rxq,
 
 	nfp_net_meta_parse(rxds, rxq, hw, rx_pkt, &meta);
 
+	nfp_net_parse_ptype(rxq, rxds, rx_pkt);
+
 	/* Checking the checksum flag */
 	nfp_net_rx_cksum(rxq, rxds, rx_pkt);
 }
-- 
2.39.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/4] support AVX2 instruction Rx/Tx function
  2024-06-19  2:59 [PATCH 0/4] support AVX2 instruction Rx/Tx function Chaoyong He
                   ` (3 preceding siblings ...)
  2024-06-19  2:59 ` [PATCH 4/4] net/nfp: vector Rx function supports parsing ptype Chaoyong He
@ 2024-07-06 18:51 ` Ferruh Yigit
  2024-07-08  5:52   ` Chaoyong He
  2024-07-08  5:58 ` [PATCH v2 " Chaoyong He
  5 siblings, 1 reply; 26+ messages in thread
From: Ferruh Yigit @ 2024-07-06 18:51 UTC (permalink / raw)
  To: Chaoyong He, dev; +Cc: oss-drivers

On 6/19/2024 3:59 AM, Chaoyong He wrote:
> This patch series add the support of Rx/Tx function using the
> AVX2 instruction.
> 
> Long Wu (4):
>   net/nfp: export more interfaces of NFDk
>   net/nfp: support AVX2 Tx function
>   net/nfp: support AVX2 Rx function
>   net/nfp: vector Rx function supports parsing ptype
>

I got some build errors [1], and I can see CI did not run because of an
apply error.
Can you please try with local GitHub Actions and send a new version?

[1]
- ubuntu-22.04-gcc-static-i386:
Run-time dependency netcope-common found: NO (tried pkgconfig)

drivers/net/nfp/meson.build:56:8: ERROR: Unknown variable
"static_rte_common_nfp".

- ubuntu-22.04-gcc-static-mingw:
drivers/net/nfp/meson.build:56:8: ERROR: Unknown variable
"static_rte_common_nfp".

A full log can be found at
/home/runner/work/dpdk/dpdk/build/meson-logs/meson-log.txt

^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [PATCH 0/4] support AVX2 instruction Rx/Tx function
  2024-07-06 18:51 ` [PATCH 0/4] support AVX2 instruction Rx/Tx function Ferruh Yigit
@ 2024-07-08  5:52   ` Chaoyong He
  0 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-08  5:52 UTC (permalink / raw)
  To: Ferruh Yigit, dev; +Cc: oss-drivers

> On 6/19/2024 3:59 AM, Chaoyong He wrote:
> > This patch series add the support of Rx/Tx function using the
> > AVX2 instruction.
> >
> > Long Wu (4):
> >   net/nfp: export more interfaces of NFDk
> >   net/nfp: support AVX2 Tx function
> >   net/nfp: support AVX2 Rx function
> >   net/nfp: vector Rx function supports parsing ptype
> >
> 
> I got some build errors [1], and I can see CI did not run because of apply error.
> Can you please try with local github actions and send a new version?
> 
> [1]
> - ubuntu-22.04-gcc-static-i386:
> Run-time dependency netcope-common found: NO (tried pkgconfig)
> 
> drivers/net/nfp/meson.build:56:8: ERROR: Unknown variable
> "static_rte_common_nfp".
> 
> - ubuntu-22.04-gcc-static-mingw:
> drivers/net/nfp/meson.build:56:8: ERROR: Unknown variable
> "static_rte_common_nfp".
> 
> A full log can be found at
> /home/runner/work/dpdk/dpdk/build/meson-logs/meson-log.txt

It seems this might be because the dependency 'drivers/common/nfp' was not compiled?
Anyway, I will send a new version of the patch series to trigger the CI.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v2 0/4] support AVX2 instruction Rx/Tx function
  2024-06-19  2:59 [PATCH 0/4] support AVX2 instruction Rx/Tx function Chaoyong He
                   ` (4 preceding siblings ...)
  2024-07-06 18:51 ` [PATCH 0/4] support AVX2 instruction Rx/Tx function Ferruh Yigit
@ 2024-07-08  5:58 ` Chaoyong He
  2024-07-08  5:58   ` [PATCH v2 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
                     ` (5 more replies)
  5 siblings, 6 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-08  5:58 UTC (permalink / raw)
  To: dev; +Cc: oss-drivers, Chaoyong He

This patch series adds support for Rx/Tx functions using
AVX2 instructions.

---
v2: rebase to the latest main branch.
---

Long Wu (4):
  net/nfp: export more interfaces of NFDk
  net/nfp: support AVX2 Tx function
  net/nfp: support AVX2 Rx function
  net/nfp: vector Rx function supports parsing ptype

 drivers/net/nfp/meson.build                 |  15 +
 drivers/net/nfp/nfdk/nfp_nfdk.h             |  46 +++
 drivers/net/nfp/nfdk/nfp_nfdk_dp.c          |  55 +--
 drivers/net/nfp/nfdk/nfp_nfdk_vec.h         |  36 ++
 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c | 432 ++++++++++++++++++++
 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c    |  14 +
 drivers/net/nfp/nfp_ethdev.c                |   5 +-
 drivers/net/nfp/nfp_ethdev_vf.c             |   5 +-
 drivers/net/nfp/nfp_net_common.c            |   2 +-
 drivers/net/nfp/nfp_net_meta.c              |   1 +
 drivers/net/nfp/nfp_rxtx.c                  |  12 +-
 drivers/net/nfp/nfp_rxtx.h                  |   9 +-
 drivers/net/nfp/nfp_rxtx_vec.h              |  17 +
 drivers/net/nfp/nfp_rxtx_vec_avx2.c         | 275 +++++++++++++
 drivers/net/nfp/nfp_rxtx_vec_stub.c         |  25 ++
 15 files changed, 900 insertions(+), 49 deletions(-)
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec.h
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec.h
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec_avx2.c
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec_stub.c

-- 
2.39.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v2 1/4] net/nfp: export more interfaces of NFDk
  2024-07-08  5:58 ` [PATCH v2 " Chaoyong He
@ 2024-07-08  5:58   ` Chaoyong He
  2024-07-08  5:58   ` [PATCH v2 2/4] net/nfp: support AVX2 Tx function Chaoyong He
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-08  5:58 UTC (permalink / raw)
  To: dev; +Cc: oss-drivers, Long Wu, Chaoyong He

From: Long Wu <long.wu@corigine.com>

NFP will support an NFDk vector Tx function, so move some
functions to the header file for use by the vector Tx function.

Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
 drivers/net/nfp/nfdk/nfp_nfdk.h    | 45 ++++++++++++++++++++++++++++++
 drivers/net/nfp/nfdk/nfp_nfdk_dp.c | 43 +---------------------------
 2 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/drivers/net/nfp/nfdk/nfp_nfdk.h b/drivers/net/nfp/nfdk/nfp_nfdk.h
index 2767fd51cd..89a98d13f3 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk.h
+++ b/drivers/net/nfp/nfdk/nfp_nfdk.h
@@ -6,7 +6,10 @@
 #ifndef __NFP_NFDK_H__
 #define __NFP_NFDK_H__
 
+#include <nfp_platform.h>
+
 #include "../nfp_rxtx.h"
+#include "nfp_net_common.h"
 
 #define NFDK_TX_DESC_PER_SIMPLE_PKT     2
 
@@ -161,6 +164,45 @@ nfp_net_nfdk_txq_full(struct nfp_net_txq *txq)
 	return (nfp_net_nfdk_free_tx_desc(txq) < txq->tx_free_thresh);
 }
 
+static inline uint16_t
+nfp_net_nfdk_headlen_to_segs(uint16_t headlen)
+{
+	/* First descriptor fits less data, so adjust for that */
+	return DIV_ROUND_UP(headlen + NFDK_TX_MAX_DATA_PER_DESC - NFDK_TX_MAX_DATA_PER_HEAD,
+			NFDK_TX_MAX_DATA_PER_DESC);
+}
+
+/* Set TX CSUM offload flags in TX descriptor of nfdk */
+static inline uint64_t
+nfp_net_nfdk_tx_cksum(struct nfp_net_txq *txq,
+		struct rte_mbuf *mb,
+		uint64_t flags)
+{
+	uint64_t ol_flags;
+	struct nfp_net_hw *hw = txq->hw;
+
+	if ((hw->super.cap & NFP_NET_CFG_CTRL_TXCSUM) == 0)
+		return flags;
+
+	ol_flags = mb->ol_flags;
+
+	/* Set TCP csum offload if TSO enabled. */
+	if ((ol_flags & RTE_MBUF_F_TX_TCP_SEG) != 0)
+		flags |= NFDK_DESC_TX_L4_CSUM;
+
+	if ((ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) != 0)
+		flags |= NFDK_DESC_TX_ENCAP;
+
+	/* IPv6 does not need checksum */
+	if ((ol_flags & RTE_MBUF_F_TX_IP_CKSUM) != 0)
+		flags |= NFDK_DESC_TX_L3_CSUM;
+
+	if ((ol_flags & RTE_MBUF_F_TX_L4_MASK) != 0)
+		flags |= NFDK_DESC_TX_L4_CSUM;
+
+	return flags;
+}
+
 uint32_t nfp_flower_nfdk_pkt_add_metadata(struct rte_mbuf *mbuf,
 		uint32_t port_id);
 uint16_t nfp_net_nfdk_xmit_pkts_common(void *tx_queue,
@@ -177,5 +219,8 @@ int nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
 		const struct rte_eth_txconf *tx_conf);
 int nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
 		struct rte_mbuf *pkt);
+int nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
+		struct nfp_net_txq *txq,
+		uint64_t *metadata);
 
 #endif /* __NFP_NFDK_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
index 8b8c191b82..173aabf0b9 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
@@ -6,7 +6,6 @@
 #include "nfp_nfdk.h"
 
 #include <bus_pci_driver.h>
-#include <nfp_platform.h>
 #include <rte_malloc.h>
 
 #include "../flower/nfp_flower.h"
@@ -15,38 +14,6 @@
 
 #define NFDK_TX_DESC_GATHER_MAX         17
 
-/* Set TX CSUM offload flags in TX descriptor of nfdk */
-static uint64_t
-nfp_net_nfdk_tx_cksum(struct nfp_net_txq *txq,
-		struct rte_mbuf *mb,
-		uint64_t flags)
-{
-	uint64_t ol_flags;
-	struct nfp_net_hw *hw = txq->hw;
-
-	if ((hw->super.ctrl & NFP_NET_CFG_CTRL_TXCSUM) == 0)
-		return flags;
-
-	ol_flags = mb->ol_flags;
-
-	/* Set L4 csum offload if TSO/UFO enabled. */
-	if ((ol_flags & RTE_MBUF_F_TX_TCP_SEG) != 0 ||
-			(ol_flags & RTE_MBUF_F_TX_UDP_SEG) != 0)
-		flags |= NFDK_DESC_TX_L4_CSUM;
-
-	if ((ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) != 0)
-		flags |= NFDK_DESC_TX_ENCAP;
-
-	/* IPv6 does not need checksum */
-	if ((ol_flags & RTE_MBUF_F_TX_IP_CKSUM) != 0)
-		flags |= NFDK_DESC_TX_L3_CSUM;
-
-	if ((ol_flags & RTE_MBUF_F_TX_L4_MASK) != 0)
-		flags |= NFDK_DESC_TX_L4_CSUM;
-
-	return flags;
-}
-
 /* Set TX descriptor for TSO of nfdk */
 static uint64_t
 nfp_net_nfdk_tx_tso(struct nfp_net_txq *txq,
@@ -100,14 +67,6 @@ nfp_flower_nfdk_pkt_add_metadata(struct rte_mbuf *mbuf,
 	return FLOWER_PKT_DATA_OFFSET;
 }
 
-static inline uint16_t
-nfp_net_nfdk_headlen_to_segs(uint16_t headlen)
-{
-	/* First descriptor fits less data, so adjust for that */
-	return DIV_ROUND_UP(headlen + NFDK_TX_MAX_DATA_PER_DESC - NFDK_TX_MAX_DATA_PER_HEAD,
-			NFDK_TX_MAX_DATA_PER_DESC);
-}
-
 static inline void
 nfp_net_nfdk_tx_close_block(struct nfp_net_txq *txq,
 		uint32_t nop_slots)
@@ -170,7 +129,7 @@ nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
 	return nop_slots;
 }
 
-static int
+int
 nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
 		struct nfp_net_txq *txq,
 		uint64_t *metadata)
-- 
2.39.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v2 2/4] net/nfp: support AVX2 Tx function
  2024-07-08  5:58 ` [PATCH v2 " Chaoyong He
  2024-07-08  5:58   ` [PATCH v2 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
@ 2024-07-08  5:58   ` Chaoyong He
  2024-07-08  5:58   ` [PATCH v2 3/4] net/nfp: support AVX2 Rx function Chaoyong He
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-08  5:58 UTC (permalink / raw)
  To: dev; +Cc: oss-drivers, Long Wu, Chaoyong He

From: Long Wu <long.wu@corigine.com>

Use AVX2 instructions to accelerate Tx performance. The
acceleration only works on x86 machines.

Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
 drivers/net/nfp/meson.build                 |  15 +
 drivers/net/nfp/nfdk/nfp_nfdk.h             |   1 +
 drivers/net/nfp/nfdk/nfp_nfdk_dp.c          |  12 +
 drivers/net/nfp/nfdk/nfp_nfdk_vec.h         |  36 ++
 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c | 432 ++++++++++++++++++++
 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c    |  14 +
 drivers/net/nfp/nfp_ethdev.c                |   3 +-
 drivers/net/nfp/nfp_ethdev_vf.c             |   3 +-
 drivers/net/nfp/nfp_rxtx.h                  |   5 +-
 drivers/net/nfp/nfp_rxtx_vec.h              |  13 +
 drivers/net/nfp/nfp_rxtx_vec_avx2.c         |  21 +
 drivers/net/nfp/nfp_rxtx_vec_stub.c         |  16 +
 12 files changed, 568 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec.h
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec.h
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec_avx2.c
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec_stub.c

diff --git a/drivers/net/nfp/meson.build b/drivers/net/nfp/meson.build
index b7e5beffb0..39bda04bc5 100644
--- a/drivers/net/nfp/meson.build
+++ b/drivers/net/nfp/meson.build
@@ -16,6 +16,7 @@ sources = files(
         'flower/nfp_flower_service.c',
         'nfd3/nfp_nfd3_dp.c',
         'nfdk/nfp_nfdk_dp.c',
+        'nfdk/nfp_nfdk_vec_stub.c',
         'nfpcore/nfp_cppcore.c',
         'nfpcore/nfp_crc.c',
         'nfpcore/nfp_elf.c',
@@ -43,8 +44,22 @@ sources = files(
         'nfp_net_flow.c',
         'nfp_net_meta.c',
         'nfp_rxtx.c',
+        'nfp_rxtx_vec_stub.c',
         'nfp_service.c',
         'nfp_trace.c',
 )
 
+if arch_subdir == 'x86'
+        avx2_sources = files(
+                'nfdk/nfp_nfdk_vec_avx2_dp.c',
+                'nfp_rxtx_vec_avx2.c',
+        )
+        nfp_avx2_lib = static_library('nfp_avx2_lib',
+                avx2_sources,
+                dependencies: [static_rte_ethdev, static_rte_bus_pci,
+                        static_rte_common_nfp],
+                c_args: [cflags, '-mavx2'])
+        objs += nfp_avx2_lib.extract_all_objects(recursive: true)
+endif
+
 deps += ['hash', 'security', 'common_nfp']
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk.h b/drivers/net/nfp/nfdk/nfp_nfdk.h
index 89a98d13f3..29d862f6f0 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk.h
+++ b/drivers/net/nfp/nfdk/nfp_nfdk.h
@@ -222,5 +222,6 @@ int nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
 int nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
 		struct nfp_net_txq *txq,
 		uint64_t *metadata);
+void nfp_net_nfdk_xmit_pkts_set(struct rte_eth_dev *eth_dev);
 
 #endif /* __NFP_NFDK_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
index 173aabf0b9..2cea5688b3 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
@@ -11,6 +11,8 @@
 #include "../flower/nfp_flower.h"
 #include "../nfp_logs.h"
 #include "../nfp_net_meta.h"
+#include "../nfp_rxtx_vec.h"
+#include "nfp_nfdk_vec.h"
 
 #define NFDK_TX_DESC_GATHER_MAX         17
 
@@ -511,6 +513,7 @@ nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
 	dev->data->tx_queues[queue_idx] = txq;
 	txq->hw = hw;
 	txq->hw_priv = dev->process_private;
+	txq->simple_always = true;
 
 	/*
 	 * Telling the HW about the physical address of the TX ring and number
@@ -521,3 +524,12 @@ nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
 
 	return 0;
 }
+
+void
+nfp_net_nfdk_xmit_pkts_set(struct rte_eth_dev *eth_dev)
+{
+	if (nfp_net_get_avx2_supported())
+		eth_dev->tx_pkt_burst = nfp_net_nfdk_vec_avx2_xmit_pkts;
+	else
+		eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+}
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec.h b/drivers/net/nfp/nfdk/nfp_nfdk_vec.h
new file mode 100644
index 0000000000..14319d6cf6
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_NFDK_VEC_H__
+#define __NFP_NFDK_VEC_H__
+
+#include <stdbool.h>
+
+#include <rte_mbuf_core.h>
+
+#include "../nfp_net_common.h"
+#include "nfp_nfdk.h"
+
+static inline bool
+nfp_net_nfdk_is_simple_packet(struct rte_mbuf *pkt,
+		struct nfp_net_hw *hw)
+{
+	if (pkt->data_len > NFDK_TX_MAX_DATA_PER_HEAD)
+		return false;
+
+	if ((hw->super.cap & NFP_NET_CFG_CTRL_LSO_ANY) == 0)
+		return true;
+
+	if ((pkt->ol_flags & RTE_MBUF_F_TX_TCP_SEG) == 0)
+		return true;
+
+	return false;
+}
+
+uint16_t nfp_net_nfdk_vec_avx2_xmit_pkts(void *tx_queue,
+		struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts);
+
+#endif /* __NFP_NFDK_VEC_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
new file mode 100644
index 0000000000..6d1359fdb1
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
@@ -0,0 +1,432 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <bus_pci_driver.h>
+#include <ethdev_driver.h>
+#include <rte_malloc.h>
+
+#include "../nfp_logs.h"
+#include "nfp_nfdk.h"
+#include "nfp_nfdk_vec.h"
+
+/*
+ * Each simple packet needs 2 descriptors, so a burst of 4 simple packets
+ * consumes 8 descriptors at once.
+ */
+#define NFDK_SIMPLE_BURST_DES_NUM 8
+
+#define NFDK_SIMPLE_DES_TYPE (NFDK_DESC_TX_EOP | \
+		(NFDK_DESC_TX_TYPE_HEAD & (NFDK_DESC_TX_TYPE_SIMPLE << 12)))
+
+static inline int
+nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(struct rte_mbuf *pkt,
+		struct nfp_net_txq *txq,
+		uint64_t *des_addr,
+		uint64_t *des_meta,
+		bool repr_flag)
+{
+	int ret;
+	__m128i dma_addr;
+	__m128i dma_hi;
+	__m128i data_off;
+	__m128i dlen_type;
+	uint64_t metadata;
+
+	if (repr_flag) {
+		metadata = NFDK_DESC_TX_CHAIN_META;
+	} else {
+		ret = nfp_net_nfdk_set_meta_data(pkt, txq, &metadata);
+		if (unlikely(ret != 0))
+			return ret;
+	}
+
+	data_off = _mm_set_epi64x(0, pkt->data_off);
+	dma_addr = _mm_add_epi64(_mm_loadu_si128((__m128i *)&pkt->buf_addr), data_off);
+	dma_hi = _mm_srli_epi64(dma_addr, 32);
+
+	dlen_type = _mm_set_epi64x(0, (pkt->data_len - 1) | NFDK_SIMPLE_DES_TYPE);
+
+	*des_addr = _mm_extract_epi64(_mm_add_epi64(_mm_unpacklo_epi32(dma_hi, dma_addr),
+			_mm_slli_epi64(dlen_type, 16)), 0);
+
+	*des_meta = nfp_net_nfdk_tx_cksum(txq, pkt, metadata);
+
+	return 0;
+}
+
+static inline int
+nfp_net_nfdk_vec_avx2_xmit_simple_send1(struct nfp_net_txq *txq,
+		struct nfp_net_nfdk_tx_desc *txds,
+		struct rte_mbuf *pkt,
+		bool repr_flag)
+{
+	int ret;
+	__m128i des_data;
+	uint64_t des_addr;
+	uint64_t des_meta;
+
+	ret = nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(pkt, txq, &des_addr,
+			&des_meta, repr_flag);
+	if (unlikely(ret != 0))
+		return ret;
+
+	txq->wr_p = D_IDX(txq, txq->wr_p + NFDK_TX_DESC_PER_SIMPLE_PKT);
+	if ((txq->wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) != 0)
+		txq->data_pending += pkt->data_len;
+	else
+		txq->data_pending = 0;
+
+	des_data = _mm_set_epi64x(des_meta, des_addr);
+
+	_mm_store_si128((void *)txds, des_data);
+
+	return 0;
+}
+
+static inline int
+nfp_vec_avx2_nfdk_xmit_simple_send4(struct nfp_net_txq *txq,
+		struct nfp_net_nfdk_tx_desc *txds,
+		struct rte_mbuf **pkt,
+		bool repr_flag)
+{
+	int ret;
+	uint16_t i;
+	__m256i des_data0_1;
+	__m256i des_data2_3;
+	uint64_t des_addr[4];
+	uint64_t des_meta[4];
+
+	for (i = 0; i < 4; i++) {
+		ret = nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(pkt[i], txq,
+				&des_addr[i], &des_meta[i], repr_flag);
+		if (unlikely(ret != 0))
+			return ret;
+	}
+
+	for (i = 0; i < 4; i++) {
+		txq->wr_p = D_IDX(txq, txq->wr_p + NFDK_TX_DESC_PER_SIMPLE_PKT);
+		if ((txq->wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) != 0)
+			txq->data_pending += pkt[i]->data_len;
+		else
+			txq->data_pending = 0;
+	}
+
+	des_data0_1 = _mm256_set_epi64x(des_meta[1], des_addr[1], des_meta[0], des_addr[0]);
+	des_data2_3 = _mm256_set_epi64x(des_meta[3], des_addr[3], des_meta[2], des_addr[2]);
+
+	_mm256_store_si256((void *)txds, des_data0_1);
+	_mm256_store_si256((void *)(txds + 4), des_data2_3);
+
+	return 0;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_mbuf_store4(struct rte_mbuf **mbuf,
+		struct rte_mbuf **tx_pkts)
+{
+	__m256i mbuf_room0_1;
+	__m256i mbuf_room2_3;
+
+	mbuf_room0_1 = _mm256_set_epi64x(0, (uintptr_t)tx_pkts[1], 0,
+			(uintptr_t)tx_pkts[0]);
+	mbuf_room2_3 = _mm256_set_epi64x(0, (uintptr_t)tx_pkts[3], 0,
+			(uintptr_t)tx_pkts[2]);
+
+	_mm256_store_si256((void *)mbuf, mbuf_room0_1);
+	_mm256_store_si256((void *)(mbuf + 4), mbuf_room2_3);
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_simple_pkts(struct nfp_net_txq *txq,
+		struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts,
+		uint16_t simple_close,
+		bool repr_flag)
+{
+	int ret;
+	uint16_t npkts = 0;
+	uint16_t need_txds;
+	uint16_t free_descs;
+	struct rte_mbuf **lmbuf;
+	struct nfp_net_nfdk_tx_desc *ktxds;
+
+	PMD_TX_LOG(DEBUG, "Working for queue %hu at pos %u and %hu packets",
+			txq->qidx, txq->wr_p, nb_pkts);
+
+	need_txds = nb_pkts << 1;
+	if (nfp_net_nfdk_free_tx_desc(txq) < need_txds || nfp_net_nfdk_txq_full(txq))
+		nfp_net_tx_free_bufs(txq);
+
+	free_descs = nfp_net_nfdk_free_tx_desc(txq);
+	if (unlikely(free_descs < NFDK_TX_DESC_PER_SIMPLE_PKT)) {
+		if (unlikely(simple_close > 0))
+			goto xmit_end;
+
+		return 0;
+	}
+
+	PMD_TX_LOG(DEBUG, "Queue: %hu. Sending %hu packets", txq->qidx, nb_pkts);
+
+	/* Send packets: four per iteration when possible, otherwise one */
+	while (npkts < nb_pkts && free_descs >= NFDK_TX_DESC_PER_SIMPLE_PKT) {
+		ktxds = &txq->ktxds[txq->wr_p];
+		lmbuf = &txq->txbufs[txq->wr_p].mbuf;
+
+		/*
+		 * Fall back to sending a single packet when a burst is not possible:
+		 * 1. Fewer than 8 descriptors remain before the end of the Tx ring.
+		 * 2. Fewer than 4 packets remain to be sent.
+		 * 3. The descriptor address is not aligned on a 32-byte boundary.
+		 * 4. Fewer than 8 free descriptors are available.
+		 */
+		if ((txq->tx_count - txq->wr_p) < NFDK_SIMPLE_BURST_DES_NUM ||
+				(nb_pkts - npkts) < 4 ||
+				((uintptr_t)ktxds & 0x1F) != 0 ||
+				free_descs < NFDK_SIMPLE_BURST_DES_NUM) {
+			ret = nfp_net_nfdk_vec_avx2_xmit_simple_send1(txq,
+					ktxds, tx_pkts[npkts], repr_flag);
+			if (unlikely(ret != 0))
+				goto xmit_end;
+
+			rte_pktmbuf_free(*lmbuf);
+			/* Load 8 bytes only: a 16-byte load could read past tx_pkts[] */
+			_mm_storel_epi64((void *)lmbuf,
+					_mm_loadl_epi64((void *)&tx_pkts[npkts]));
+			npkts++;
+			free_descs -= NFDK_TX_DESC_PER_SIMPLE_PKT;
+			continue;
+		}
+
+		ret = nfp_vec_avx2_nfdk_xmit_simple_send4(txq, ktxds,
+				&tx_pkts[npkts], repr_flag);
+		if (unlikely(ret != 0))
+			goto xmit_end;
+
+		rte_pktmbuf_free_bulk(lmbuf, NFDK_SIMPLE_BURST_DES_NUM);
+
+		nfp_net_nfdk_vec_avx2_xmit_mbuf_store4(lmbuf, &tx_pkts[npkts]);
+
+		npkts += 4;
+		free_descs -= NFDK_SIMPLE_BURST_DES_NUM;
+	}
+
+xmit_end:
+	/* Increment write pointers. Force memory write before we let HW know */
+	rte_wmb();
+	nfp_qcp_ptr_add(txq->qcp_q, NFP_QCP_WRITE_PTR, ((npkts << 1) + simple_close));
+
+	return npkts;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_simple_close_block(struct nfp_net_txq *txq,
+		uint16_t *simple_close)
+{
+	uint16_t i;
+	uint16_t wr_p;
+	uint16_t nop_slots;
+	__m128i zero_128 = _mm_setzero_si128();
+	__m256i zero_256 = _mm256_setzero_si256();
+	/* Zero 'nop_slots' descriptors starting at wr_p and free the mbufs in those slots */
+	wr_p = txq->wr_p;
+	nop_slots = D_BLOCK_CPL(wr_p);
+	/* wr_p need not be a multiple of 4 here, so the 32-byte stores must be unaligned */
+	for (i = nop_slots; i >= 4; i -= 4, wr_p += 4) {
+		_mm256_storeu_si256((void *)&txq->ktxds[wr_p], zero_256);
+		rte_pktmbuf_free_bulk(&txq->txbufs[wr_p].mbuf, 4);
+		_mm256_storeu_si256((void *)&txq->txbufs[wr_p], zero_256);
+	}
+
+	for (; i >= 2; i -= 2, wr_p += 2) {
+		_mm_store_si128((void *)&txq->ktxds[wr_p], zero_128);
+		rte_pktmbuf_free_bulk(&txq->txbufs[wr_p].mbuf, 2);
+		_mm_store_si128((void *)&txq->txbufs[wr_p], zero_128);
+	}
+
+	for (; i >= 1; i--, wr_p++) {
+		_mm_storel_epi64((void *)&txq->ktxds[wr_p], zero_128);
+		rte_pktmbuf_free(txq->txbufs[wr_p].mbuf);
+		_mm_storel_epi64((void *)&txq->txbufs[wr_p], zero_128);
+	}
+
+	txq->data_pending = 0;
+	txq->wr_p = D_IDX(txq, txq->wr_p + nop_slots);
+
+	(*simple_close) += nop_slots;
+}
+
+static inline uint32_t
+nfp_net_nfdk_vec_avx2_xmit_simple_prepare(struct nfp_net_txq *txq,
+		uint16_t *simple_close)
+{
+	uint16_t wr_p;
+	__m128i zero_128 = _mm_setzero_si128();
+
+	wr_p = txq->wr_p;
+
+	_mm_storel_epi64((void *)&txq->ktxds[wr_p], zero_128);
+	rte_pktmbuf_free(txq->txbufs[wr_p].mbuf);
+	_mm_storel_epi64((void *)&txq->txbufs[wr_p], zero_128);
+
+	txq->wr_p = D_IDX(txq, wr_p + 1);
+	(*simple_close)++;
+
+	return txq->wr_p;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_simple_check(struct nfp_net_txq *txq,
+		struct rte_mbuf *pkt,
+		bool *simple_flag,
+		bool *pending_flag,
+		uint16_t *data_pending,
+		uint32_t *wr_p,
+		uint16_t *simple_close)
+{
+	uint32_t data_pending_temp;
+
+	/* Let the first descriptor index even before send simple packets */
+	if (!(*simple_flag)) {
+		if ((*wr_p & 0x1) == 0x1)
+			*wr_p = nfp_net_nfdk_vec_avx2_xmit_simple_prepare(txq, simple_close);
+
+		*simple_flag = true;
+	}
+
+	/* Simple packets only need one close block operation */
+	if (!(*pending_flag)) {
+		if ((*wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) == 0) {
+			*pending_flag = true;
+			return;
+		}
+
+		data_pending_temp = *data_pending + pkt->data_len;
+		if (data_pending_temp > NFDK_TX_MAX_DATA_PER_BLOCK) {
+			nfp_net_nfdk_vec_avx2_xmit_simple_close_block(txq, simple_close);
+			*pending_flag = true;
+			return;
+		}
+
+		*data_pending = data_pending_temp;
+
+		*wr_p += 2;
+	}
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_simple_count(struct nfp_net_txq *txq,
+		struct rte_mbuf **tx_pkts,
+		uint16_t head,
+		uint16_t nb_pkts,
+		uint16_t *simple_close)
+{
+	uint32_t wr_p;
+	uint16_t simple_idx;
+	struct rte_mbuf *pkt;
+	uint16_t data_pending;
+	bool simple_flag = false;
+	bool pending_flag = false;
+	uint16_t simple_count = 0;
+
+	*simple_close = 0;
+	wr_p = txq->wr_p;
+	data_pending = txq->data_pending;
+
+	for (simple_idx = head; simple_idx < nb_pkts; simple_idx++) {
+		pkt = tx_pkts[simple_idx];
+		if (!nfp_net_nfdk_is_simple_packet(pkt, txq->hw))
+			break;
+
+		simple_count++;
+		if (!txq->simple_always)
+			nfp_net_nfdk_vec_avx2_xmit_simple_check(txq, pkt, &simple_flag,
+					&pending_flag, &data_pending, &wr_p, simple_close);
+	}
+
+	return simple_count;
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_others_count(struct nfp_net_txq *txq,
+		struct rte_mbuf **tx_pkts,
+		uint16_t head,
+		uint16_t nb_pkts)
+{
+	uint16_t others_idx;
+	struct rte_mbuf *pkt;
+	uint16_t others_count = 0;
+
+	for (others_idx = head; others_idx < nb_pkts; others_idx++) {
+		pkt = tx_pkts[others_idx];
+		if (nfp_net_nfdk_is_simple_packet(pkt, txq->hw))
+			break;
+
+		others_count++;
+	}
+
+	return others_count;
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_common(void *tx_queue,
+		struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts)
+{
+	uint16_t i;
+	uint16_t avail = 0;
+	uint16_t simple_close;
+	uint16_t simple_count;
+	uint16_t simple_avail;
+	uint16_t others_count;
+	uint16_t others_avail;
+	struct nfp_net_txq *txq = tx_queue;
+	/* 'i' advances only in the body, by the packets sent; a header i++ would skip one */
+	for (i = 0; i < nb_pkts; ) {
+		simple_count = nfp_net_nfdk_vec_avx2_xmit_simple_count(txq, tx_pkts, i,
+				nb_pkts, &simple_close);
+		if (simple_count > 0) {
+			if (!txq->simple_always)
+				txq->simple_always = true;
+
+			simple_avail = nfp_net_nfdk_vec_avx2_xmit_simple_pkts(txq,
+					tx_pkts + i, simple_count, simple_close,
+					false);
+
+			avail += simple_avail;
+			if (simple_avail != simple_count)
+				break;
+
+			i += simple_count;
+		}
+
+		if (i == nb_pkts)
+			break;
+
+		others_count = nfp_net_nfdk_vec_avx2_xmit_others_count(txq, tx_pkts,
+				i, nb_pkts);
+
+		if (txq->simple_always)
+			txq->simple_always = false;
+
+		others_avail = nfp_net_nfdk_xmit_pkts_common(tx_queue,
+				tx_pkts + i, others_count, false);
+
+		avail += others_avail;
+		if (others_avail != others_count)
+			break;
+
+		i += others_count;
+	}
+
+	return avail;
+}
+
+uint16_t
+nfp_net_nfdk_vec_avx2_xmit_pkts(void *tx_queue,
+		struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts)
+{
+	return nfp_net_nfdk_vec_avx2_xmit_common(tx_queue, tx_pkts, nb_pkts);
+}
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c b/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
new file mode 100644
index 0000000000..146ec21d51
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include "nfp_nfdk_vec.h"
+
+uint16_t __rte_weak
+nfp_net_nfdk_vec_avx2_xmit_pkts(__rte_unused void *tx_queue,
+		__rte_unused struct rte_mbuf **tx_pkts,
+		__rte_unused uint16_t nb_pkts)
+{
+	return 0;
+}
diff --git a/drivers/net/nfp/nfp_ethdev.c b/drivers/net/nfp/nfp_ethdev.c
index 8c0cacd3fc..a7b40af712 100644
--- a/drivers/net/nfp/nfp_ethdev.c
+++ b/drivers/net/nfp/nfp_ethdev.c
@@ -28,6 +28,7 @@
 #include "nfp_ipsec.h"
 #include "nfp_logs.h"
 #include "nfp_net_flow.h"
+#include "nfp_rxtx_vec.h"
 
 /* 64-bit per app capabilities */
 #define NFP_NET_APP_CAP_SP_INDIFF       RTE_BIT64(0) /* Indifferent to port speed */
@@ -964,7 +965,7 @@ nfp_net_ethdev_ops_mount(struct nfp_net_hw *hw,
 	if (hw->ver.extend == NFP_NET_CFG_VERSION_DP_NFD3)
 		eth_dev->tx_pkt_burst = nfp_net_nfd3_xmit_pkts;
 	else
-		eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+		nfp_net_nfdk_xmit_pkts_set(eth_dev);
 
 	eth_dev->dev_ops = &nfp_net_eth_dev_ops;
 	eth_dev->rx_queue_count = nfp_net_rx_queue_count;
diff --git a/drivers/net/nfp/nfp_ethdev_vf.c b/drivers/net/nfp/nfp_ethdev_vf.c
index e7c18fe90a..b955624ed6 100644
--- a/drivers/net/nfp/nfp_ethdev_vf.c
+++ b/drivers/net/nfp/nfp_ethdev_vf.c
@@ -14,6 +14,7 @@
 
 #include "nfp_logs.h"
 #include "nfp_net_common.h"
+#include "nfp_rxtx_vec.h"
 
 #define NFP_VF_DRIVER_NAME net_nfp_vf
 
@@ -240,7 +241,7 @@ nfp_netvf_ethdev_ops_mount(struct nfp_net_hw *hw,
 	if (hw->ver.extend == NFP_NET_CFG_VERSION_DP_NFD3)
 		eth_dev->tx_pkt_burst = nfp_net_nfd3_xmit_pkts;
 	else
-		eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+		nfp_net_nfdk_xmit_pkts_set(eth_dev);
 
 	eth_dev->dev_ops = &nfp_netvf_eth_dev_ops;
 	eth_dev->rx_queue_count = nfp_net_rx_queue_count;
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index 9806384a63..3ddf717da0 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -69,9 +69,12 @@ struct __rte_aligned(64) nfp_net_txq {
 	/** Used by NFDk only */
 	uint16_t data_pending;
 
+	/** Used by NFDk vector xmit only */
+	bool simple_always;
+
 	/**
 	 * At this point 58 bytes have been used for all the fields in the
-	 * TX critical path. We have room for 6 bytes and still all placed
+	 * TX critical path. We have room for 5 bytes and still all placed
 	 * in a cache line.
 	 */
 	uint64_t dma;
diff --git a/drivers/net/nfp/nfp_rxtx_vec.h b/drivers/net/nfp/nfp_rxtx_vec.h
new file mode 100644
index 0000000000..c92660f963
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_RXTX_VEC_AVX2_H__
+#define __NFP_RXTX_VEC_AVX2_H__
+
+#include <stdbool.h>
+
+bool nfp_net_get_avx2_supported(void);
+
+#endif /* __NFP_RXTX_VEC_AVX2_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
new file mode 100644
index 0000000000..50638e74ab
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <stdbool.h>
+
+#include <rte_cpuflags.h>
+#include <rte_vect.h>
+
+#include "nfp_rxtx_vec.h"
+
+bool
+nfp_net_get_avx2_supported(void)
+{
+	if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 &&
+			rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
+		return true;
+
+	return false;
+}
diff --git a/drivers/net/nfp/nfp_rxtx_vec_stub.c b/drivers/net/nfp/nfp_rxtx_vec_stub.c
new file mode 100644
index 0000000000..1bc55b67e0
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec_stub.c
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <stdbool.h>
+
+#include <rte_common.h>
+
+#include "nfp_rxtx_vec.h"
+
+bool __rte_weak
+nfp_net_get_avx2_supported(void)
+{
+	return false;
+}
-- 
2.39.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v2 3/4] net/nfp: support AVX2 Rx function
  2024-07-08  5:58 ` [PATCH v2 " Chaoyong He
  2024-07-08  5:58   ` [PATCH v2 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
  2024-07-08  5:58   ` [PATCH v2 2/4] net/nfp: support AVX2 Tx function Chaoyong He
@ 2024-07-08  5:58   ` Chaoyong He
  2024-07-08  5:58   ` [PATCH v2 4/4] net/nfp: vector Rx function supports parsing ptype Chaoyong He
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-08  5:58 UTC (permalink / raw)
  To: dev; +Cc: oss-drivers, Long Wu, Peng Zhang, Chaoyong He

From: Long Wu <long.wu@corigine.com>

Use AVX2 instructions to accelerate Rx performance. The
acceleration is only available on x86 machines.

Signed-off-by: Peng Zhang <peng.zhang@corigine.com>
Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
 drivers/net/nfp/nfp_ethdev.c        |   2 +-
 drivers/net/nfp/nfp_ethdev_vf.c     |   2 +-
 drivers/net/nfp/nfp_net_meta.c      |   1 +
 drivers/net/nfp/nfp_rxtx.c          |  10 ++
 drivers/net/nfp/nfp_rxtx.h          |   1 +
 drivers/net/nfp/nfp_rxtx_vec.h      |   4 +
 drivers/net/nfp/nfp_rxtx_vec_avx2.c | 252 ++++++++++++++++++++++++++++
 drivers/net/nfp/nfp_rxtx_vec_stub.c |   9 +
 8 files changed, 279 insertions(+), 2 deletions(-)

diff --git a/drivers/net/nfp/nfp_ethdev.c b/drivers/net/nfp/nfp_ethdev.c
index a7b40af712..bd35df2dc9 100644
--- a/drivers/net/nfp/nfp_ethdev.c
+++ b/drivers/net/nfp/nfp_ethdev.c
@@ -969,7 +969,7 @@ nfp_net_ethdev_ops_mount(struct nfp_net_hw *hw,
 
 	eth_dev->dev_ops = &nfp_net_eth_dev_ops;
 	eth_dev->rx_queue_count = nfp_net_rx_queue_count;
-	eth_dev->rx_pkt_burst = &nfp_net_recv_pkts;
+	nfp_net_recv_pkts_set(eth_dev);
 }
 
 static int
diff --git a/drivers/net/nfp/nfp_ethdev_vf.c b/drivers/net/nfp/nfp_ethdev_vf.c
index b955624ed6..cdf5da3af7 100644
--- a/drivers/net/nfp/nfp_ethdev_vf.c
+++ b/drivers/net/nfp/nfp_ethdev_vf.c
@@ -245,7 +245,7 @@ nfp_netvf_ethdev_ops_mount(struct nfp_net_hw *hw,
 
 	eth_dev->dev_ops = &nfp_netvf_eth_dev_ops;
 	eth_dev->rx_queue_count = nfp_net_rx_queue_count;
-	eth_dev->rx_pkt_burst = &nfp_net_recv_pkts;
+	nfp_net_recv_pkts_set(eth_dev);
 }
 
 static int
diff --git a/drivers/net/nfp/nfp_net_meta.c b/drivers/net/nfp/nfp_net_meta.c
index b31ef56f17..07c6758d33 100644
--- a/drivers/net/nfp/nfp_net_meta.c
+++ b/drivers/net/nfp/nfp_net_meta.c
@@ -80,6 +80,7 @@ nfp_net_meta_parse_single(uint8_t *meta_base,
 		rte_be32_t meta_header,
 		struct nfp_net_meta_parsed *meta)
 {
+	meta->flags = 0;
 	meta->flags |= (1 << NFP_NET_META_HASH);
 	meta->hash_type = rte_be_to_cpu_32(meta_header);
 	meta->hash = rte_be_to_cpu_32(*(rte_be32_t *)(meta_base + 4));
diff --git a/drivers/net/nfp/nfp_rxtx.c b/drivers/net/nfp/nfp_rxtx.c
index 1db79ad1cd..4fc3374987 100644
--- a/drivers/net/nfp/nfp_rxtx.c
+++ b/drivers/net/nfp/nfp_rxtx.c
@@ -17,6 +17,7 @@
 #include "nfp_ipsec.h"
 #include "nfp_logs.h"
 #include "nfp_net_meta.h"
+#include "nfp_rxtx_vec.h"
 
 /*
  * The bit format and map of nfp packet type for rxd.offload_info in Rx descriptor.
@@ -867,3 +868,12 @@ nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
 	info->conf.offloads = dev_info.tx_offload_capa &
 			dev->data->dev_conf.txmode.offloads;
 }
+
+void
+nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev)
+{
+	if (nfp_net_get_avx2_supported())
+		eth_dev->rx_pkt_burst = nfp_net_vec_avx2_recv_pkts;
+	else
+		eth_dev->rx_pkt_burst = nfp_net_recv_pkts;
+}
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index 3ddf717da0..fff8371991 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -244,5 +244,6 @@ void nfp_net_rx_queue_info_get(struct rte_eth_dev *dev,
 void nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
 		uint16_t queue_id,
 		struct rte_eth_txq_info *qinfo);
+void nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev);
 
 #endif /* __NFP_RXTX_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec.h b/drivers/net/nfp/nfp_rxtx_vec.h
index c92660f963..8720662744 100644
--- a/drivers/net/nfp/nfp_rxtx_vec.h
+++ b/drivers/net/nfp/nfp_rxtx_vec.h
@@ -10,4 +10,8 @@
 
 bool nfp_net_get_avx2_supported(void);
 
+uint16_t nfp_net_vec_avx2_recv_pkts(void *rx_queue,
+		struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts);
+
 #endif /* __NFP_RXTX_VEC_AVX2_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
index 50638e74ab..7c18213624 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_avx2.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -5,9 +5,14 @@
 
 #include <stdbool.h>
 
+#include <bus_pci_driver.h>
+#include <ethdev_driver.h>
 #include <rte_cpuflags.h>
 #include <rte_vect.h>
 
+#include "nfp_logs.h"
+#include "nfp_net_common.h"
+#include "nfp_net_meta.h"
 #include "nfp_rxtx_vec.h"
 
 bool
@@ -19,3 +24,250 @@ nfp_net_get_avx2_supported(void)
 
 	return false;
 }
+
+static inline void
+nfp_vec_avx2_recv_set_des1(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf *rxb)
+{
+	__m128i dma;
+	__m128i dma_hi;
+	__m128i vaddr0;
+	__m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	_mm_storel_epi64((void *)rxds, vaddr0);
+
+	rxq->rd_p = (rxq->rd_p + 1) & (rxq->rx_count - 1);
+}
+
+static inline void
+nfp_vec_avx2_recv_set_des4(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf **rxb)
+{
+	__m128i dma;
+	__m128i dma_hi;
+	__m128i vaddr0;
+	__m128i vaddr1;
+	__m128i vaddr2;
+	__m128i vaddr3;
+	__m128i vaddr0_1;
+	__m128i vaddr2_3;
+	__m256i vaddr0_3;
+	__m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[0]->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[1]->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr1 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[2]->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr2 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[3]->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr3 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	vaddr0_1 = _mm_unpacklo_epi64(vaddr0, vaddr1);
+	vaddr2_3 = _mm_unpacklo_epi64(vaddr2, vaddr3);
+
+	vaddr0_3 = _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0_1),
+			vaddr2_3, 1);
+
+	_mm256_store_si256((void *)rxds, vaddr0_3);
+
+	rxq->rd_p = (rxq->rd_p + 4) & (rxq->rx_count - 1);
+}
+
+static inline void
+nfp_vec_avx2_recv_set_rxpkt1(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf *rx_pkt)
+{
+	struct nfp_net_hw *hw = rxq->hw;
+	struct nfp_net_meta_parsed meta;
+
+	rx_pkt->data_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
+	/* Size of the whole packet. We just support 1 segment */
+	rx_pkt->pkt_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
+
+	/* Filling the received mbuf with packet info */
+	if (hw->rx_offset)
+		rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + hw->rx_offset;
+	else
+		rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + NFP_DESC_META_LEN(rxds);
+
+	rx_pkt->port = rxq->port_id;
+	rx_pkt->nb_segs = 1;
+	rx_pkt->next = NULL;
+
+	nfp_net_meta_parse(rxds, rxq, hw, rx_pkt, &meta);
+
+	/* Checking the checksum flag */
+	nfp_net_rx_cksum(rxq, rxds, rx_pkt);
+}
+
+static inline void
+nfp_vec_avx2_recv1(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf *rxb,
+		struct rte_mbuf *rx_pkt)
+{
+	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkt);
+
+	nfp_vec_avx2_recv_set_des1(rxq, rxds, rxb);
+}
+
+static inline void
+nfp_vec_avx2_recv4(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf **rxb,
+		struct rte_mbuf **rx_pkts)
+{
+	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkts[0]);
+	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 1, rx_pkts[1]);
+	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 2, rx_pkts[2]);
+	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 3, rx_pkts[3]);
+
+	nfp_vec_avx2_recv_set_des4(rxq, rxds, rxb);
+}
+
+static inline bool
+nfp_vec_avx2_recv_check_packets4(struct nfp_net_rx_desc *rxds)
+{
+	__m256i data = _mm256_loadu_si256((void *)rxds);
+
+	if ((_mm256_extract_epi8(data, 3) & PCIE_DESC_RX_DD) == 0 ||
+			(_mm256_extract_epi8(data, 11) & PCIE_DESC_RX_DD) == 0 ||
+			(_mm256_extract_epi8(data, 19) & PCIE_DESC_RX_DD) == 0 ||
+			(_mm256_extract_epi8(data, 27) & PCIE_DESC_RX_DD) == 0)
+		return false;
+
+	return true;
+}
+
+uint16_t
+nfp_net_vec_avx2_recv_pkts(void *rx_queue,
+		struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts)
+{
+	uint16_t avail;
+	uint16_t nb_hold;
+	bool burst_receive;
+	struct rte_mbuf **rxb;
+	struct nfp_net_rx_desc *rxds;
+	struct nfp_net_rxq *rxq = rx_queue;
+
+	if (unlikely(rxq == NULL)) {
+		PMD_RX_LOG(ERR, "RX Bad queue");
+		return 0;
+	}
+
+	avail = 0;
+	nb_hold = 0;
+	burst_receive = true;
+	while (avail < nb_pkts) {
+		rxds = &rxq->rxds[rxq->rd_p];
+		rxb = &rxq->rxbufs[rxq->rd_p].mbuf;
+		/* 8-byte load: one descriptor is 8 bytes, do not read past the ring end */
+		if ((_mm_extract_epi8(_mm_loadl_epi64((void *)(rxds)), 3)
+				& PCIE_DESC_RX_DD) == 0)
+			goto recv_end;
+
+		rte_prefetch0(rxq->rxbufs[rxq->rd_p].mbuf);
+
+		if ((rxq->rd_p & 0x3) == 0) {
+			rte_prefetch0(&rxq->rxds[rxq->rd_p]);
+			rte_prefetch0(&rxq->rxbufs[rxq->rd_p]);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 1].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 2].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 3].mbuf);
+		}
+
+		if ((rxq->rd_p & 0x7) == 0) {
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 4].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 5].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 6].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 7].mbuf);
+		}
+
+		/*
+		 * Fall back to receiving a single packet when a burst is not possible:
+		 * 1. Fewer than 4 descriptors remain before the end of the Rx ring.
+		 * 2. Fewer than 4 packets remain to be received.
+		 * 3. The descriptor address is not aligned on a 32-byte boundary.
+		 * 4. The ring did not hold 4 packets, or allocating 4 mbufs failed.
+		 */
+		if ((rxq->rx_count - rxq->rd_p) < 4 ||
+				(nb_pkts - avail) < 4 ||
+				((uintptr_t)rxds & 0x1F) != 0 ||
+				!burst_receive) {
+			_mm_storel_epi64((void *)&rx_pkts[avail],
+					_mm_loadl_epi64((void *)rxb)); /* one pointer only */
+
+			/* Allocate a new mbuf into the software ring. */
+			if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 1) < 0) {
+				PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u queue_id=%hu",
+						rxq->port_id, rxq->qidx);
+				nfp_net_mbuf_alloc_failed(rxq);
+				goto recv_end;
+			}
+
+			nfp_vec_avx2_recv1(rxq, rxds, *rxb, rx_pkts[avail]);
+
+			avail++;
+			nb_hold++;
+			continue;
+		}
+
+		burst_receive = nfp_vec_avx2_recv_check_packets4(rxds);
+		if (!burst_receive)
+			continue;
+
+		_mm256_storeu_si256((void *)&rx_pkts[avail],
+				_mm256_loadu_si256((void *)rxb));
+
+		/* Allocate 4 new mbufs into the software ring. */
+		if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 4) < 0) {
+			burst_receive = false;
+			continue;
+		}
+
+		nfp_vec_avx2_recv4(rxq, rxds, rxb, &rx_pkts[avail]);
+
+		avail += 4;
+		nb_hold += 4;
+	}
+
+recv_end:
+	if (nb_hold == 0)
+		return nb_hold;
+
+	PMD_RX_LOG(DEBUG, "RX port_id=%u queue_id=%u, %d packets received",
+			rxq->port_id, (unsigned int)rxq->qidx, nb_hold);
+
+	nb_hold += rxq->nb_rx_hold;
+
+	/*
+	 * FL descriptors needs to be written before incrementing the
+	 * FL queue WR pointer
+	 */
+	rte_wmb();
+	if (nb_hold > rxq->rx_free_thresh) {
+		PMD_RX_LOG(DEBUG, "port=%hu queue=%hu nb_hold=%hu avail=%hu",
+				rxq->port_id, rxq->qidx, nb_hold, avail);
+		nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, nb_hold);
+		nb_hold = 0;
+	}
+	rxq->nb_rx_hold = nb_hold;
+
+	return avail;
+}
diff --git a/drivers/net/nfp/nfp_rxtx_vec_stub.c b/drivers/net/nfp/nfp_rxtx_vec_stub.c
index 1bc55b67e0..c480f61ef0 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_stub.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_stub.c
@@ -6,6 +6,7 @@
 #include <stdbool.h>
 
 #include <rte_common.h>
+#include <rte_mbuf_core.h>
 
 #include "nfp_rxtx_vec.h"
 
@@ -14,3 +15,11 @@ nfp_net_get_avx2_supported(void)
 {
 	return false;
 }
+
+uint16_t __rte_weak
+nfp_net_vec_avx2_recv_pkts(__rte_unused void *rx_queue,
+		__rte_unused struct rte_mbuf **rx_pkts,
+		__rte_unused uint16_t nb_pkts)
+{
+	return 0;
+}
-- 
2.39.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v2 4/4] net/nfp: vector Rx function supports parsing ptype
  2024-07-08  5:58 ` [PATCH v2 " Chaoyong He
                     ` (2 preceding siblings ...)
  2024-07-08  5:58   ` [PATCH v2 3/4] net/nfp: support AVX2 Rx function Chaoyong He
@ 2024-07-08  5:58   ` Chaoyong He
  2024-07-08 11:45   ` [PATCH v2 0/4] support AVX2 instruction Rx/Tx function Ferruh Yigit
  2024-07-09  7:29   ` [PATCH v3 " Chaoyong He
  5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-08  5:58 UTC (permalink / raw)
  To: dev; +Cc: oss-drivers, Long Wu, Chaoyong He

From: Long Wu <long.wu@corigine.com>

The AVX2 vector Rx function now parses the packet type and sets it in the mbuf.

Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
 drivers/net/nfp/nfp_net_common.c    | 2 +-
 drivers/net/nfp/nfp_rxtx.c          | 2 +-
 drivers/net/nfp/nfp_rxtx.h          | 3 +++
 drivers/net/nfp/nfp_rxtx_vec_avx2.c | 2 ++
 4 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/net/nfp/nfp_net_common.c b/drivers/net/nfp/nfp_net_common.c
index 08693d5fba..3d916cd147 100644
--- a/drivers/net/nfp/nfp_net_common.c
+++ b/drivers/net/nfp/nfp_net_common.c
@@ -1455,7 +1455,7 @@ nfp_net_supported_ptypes_get(struct rte_eth_dev *dev, size_t *no_of_elements)
 		RTE_PTYPE_INNER_L4_SCTP,
 	};
 
-	if (dev->rx_pkt_burst != nfp_net_recv_pkts)
+	if (dev->rx_pkt_burst == NULL)
 		return NULL;
 
 	net_hw = dev->data->dev_private;
diff --git a/drivers/net/nfp/nfp_rxtx.c b/drivers/net/nfp/nfp_rxtx.c
index 4fc3374987..da41a0e663 100644
--- a/drivers/net/nfp/nfp_rxtx.c
+++ b/drivers/net/nfp/nfp_rxtx.c
@@ -350,7 +350,7 @@ nfp_net_set_ptype(const struct nfp_ptype_parsed *nfp_ptype,
  * @param mb
  *   Mbuf to set the packet type.
  */
-static void
+void
 nfp_net_parse_ptype(struct nfp_net_rxq *rxq,
 		struct nfp_net_rx_desc *rxds,
 		struct rte_mbuf *mb)
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index fff8371991..c717d97003 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -245,5 +245,8 @@ void nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
 		uint16_t queue_id,
 		struct rte_eth_txq_info *qinfo);
 void nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev);
+void nfp_net_parse_ptype(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf *mb);
 
 #endif /* __NFP_RXTX_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
index 7c18213624..508ec7faa5 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_avx2.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -111,6 +111,8 @@ nfp_vec_avx2_recv_set_rxpkt1(struct nfp_net_rxq *rxq,
 
 	nfp_net_meta_parse(rxds, rxq, hw, rx_pkt, &meta);
 
+	nfp_net_parse_ptype(rxq, rxds, rx_pkt);
+
 	/* Checking the checksum flag */
 	nfp_net_rx_cksum(rxq, rxds, rx_pkt);
 }
-- 
2.39.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 0/4] support AVX2 instruction Rx/Tx function
  2024-07-08  5:58 ` [PATCH v2 " Chaoyong He
                     ` (3 preceding siblings ...)
  2024-07-08  5:58   ` [PATCH v2 4/4] net/nfp: vector Rx function supports parsing ptype Chaoyong He
@ 2024-07-08 11:45   ` Ferruh Yigit
  2024-07-09  1:13     ` Chaoyong He
  2024-07-09  7:29   ` [PATCH v3 " Chaoyong He
  5 siblings, 1 reply; 26+ messages in thread
From: Ferruh Yigit @ 2024-07-08 11:45 UTC (permalink / raw)
  To: Chaoyong He, dev; +Cc: oss-drivers

On 7/8/2024 6:58 AM, Chaoyong He wrote:
> This patch series add the support of Rx/Tx function using the
> AVX2 instruction.
> 
> ---
> v2: rebase to the latest main branch.
> ---
> 
> Long Wu (4):
>   net/nfp: export more interfaces of NFDk
>   net/nfp: support AVX2 Tx function
>   net/nfp: support AVX2 Rx function
>   net/nfp: vector Rx function supports parsing ptype
>

I can see still fails to apply to main, I guess because of the trace
patchset, let me go one by one, first I will get trace one and later
will ask for a new version of this set again with a rebase.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [PATCH v2 0/4] support AVX2 instruction Rx/Tx function
  2024-07-08 11:45   ` [PATCH v2 0/4] support AVX2 instruction Rx/Tx function Ferruh Yigit
@ 2024-07-09  1:13     ` Chaoyong He
  0 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09  1:13 UTC (permalink / raw)
  To: Ferruh Yigit, dev; +Cc: oss-drivers

> On 7/8/2024 6:58 AM, Chaoyong He wrote:
> > This patch series add the support of Rx/Tx function using the
> > AVX2 instruction.
> >
> > ---
> > v2: rebase to the latest main branch.
> > ---
> >
> > Long Wu (4):
> >   net/nfp: export more interfaces of NFDk
> >   net/nfp: support AVX2 Tx function
> >   net/nfp: support AVX2 Rx function
> >   net/nfp: vector Rx function supports parsing ptype
> >
> 
> I can see still fails to apply to main, I guess because of the trace patchset, let
> me go one by one, first I will get trace one and later will ask for a new version
> of this set again with a rebase.

We have also found, in our local CI environment, the 'static_rte_common_nfp' problem you mentioned on the previous version of this patch series.
We will make sure it is solved before we send out a new version of the patch series.
Thanks.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v3 0/4] support AVX2 instruction Rx/Tx function
  2024-07-08  5:58 ` [PATCH v2 " Chaoyong He
                     ` (4 preceding siblings ...)
  2024-07-08 11:45   ` [PATCH v2 0/4] support AVX2 instruction Rx/Tx function Ferruh Yigit
@ 2024-07-09  7:29   ` Chaoyong He
  2024-07-09  7:29     ` [PATCH v3 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
                       ` (4 more replies)
  5 siblings, 5 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09  7:29 UTC (permalink / raw)
  To: dev; +Cc: oss-drivers, Chaoyong He

This patch series adds support for Rx/Tx functions using
AVX2 instructions.

---
v3:
* Fix the 'meson.build' file to solve the compile problem.
v2:
* Rebase to the latest main branch.
---

Long Wu (4):
  net/nfp: export more interfaces of NFDk
  net/nfp: support AVX2 Tx function
  net/nfp: support AVX2 Rx function
  net/nfp: vector Rx function supports parsing ptype

 drivers/net/nfp/meson.build                 |  20 +
 drivers/net/nfp/nfdk/nfp_nfdk.h             |  46 +++
 drivers/net/nfp/nfdk/nfp_nfdk_dp.c          |  55 +--
 drivers/net/nfp/nfdk/nfp_nfdk_vec.h         |  36 ++
 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c | 432 ++++++++++++++++++++
 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c    |  14 +
 drivers/net/nfp/nfp_ethdev.c                |   5 +-
 drivers/net/nfp/nfp_ethdev_vf.c             |   5 +-
 drivers/net/nfp/nfp_net_common.c            |   2 +-
 drivers/net/nfp/nfp_net_meta.c              |   1 +
 drivers/net/nfp/nfp_rxtx.c                  |  12 +-
 drivers/net/nfp/nfp_rxtx.h                  |   9 +-
 drivers/net/nfp/nfp_rxtx_vec.h              |  17 +
 drivers/net/nfp/nfp_rxtx_vec_avx2.c         | 275 +++++++++++++
 drivers/net/nfp/nfp_rxtx_vec_stub.c         |  25 ++
 15 files changed, 905 insertions(+), 49 deletions(-)
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec.h
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec.h
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec_avx2.c
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec_stub.c

-- 
2.39.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v3 1/4] net/nfp: export more interfaces of NFDk
  2024-07-09  7:29   ` [PATCH v3 " Chaoyong He
@ 2024-07-09  7:29     ` Chaoyong He
  2024-07-09  7:29     ` [PATCH v3 2/4] net/nfp: support AVX2 Tx function Chaoyong He
                       ` (3 subsequent siblings)
  4 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09  7:29 UTC (permalink / raw)
  To: dev; +Cc: oss-drivers, Long Wu, Chaoyong He

From: Long Wu <long.wu@corigine.com>

NFP will support an NFDk vector Tx function, so move some
functions to the header file for use by the vector Tx function.

Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
 drivers/net/nfp/nfdk/nfp_nfdk.h    | 45 ++++++++++++++++++++++++++++++
 drivers/net/nfp/nfdk/nfp_nfdk_dp.c | 43 +---------------------------
 2 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/drivers/net/nfp/nfdk/nfp_nfdk.h b/drivers/net/nfp/nfdk/nfp_nfdk.h
index 2767fd51cd..89a98d13f3 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk.h
+++ b/drivers/net/nfp/nfdk/nfp_nfdk.h
@@ -6,7 +6,10 @@
 #ifndef __NFP_NFDK_H__
 #define __NFP_NFDK_H__
 
+#include <nfp_platform.h>
+
 #include "../nfp_rxtx.h"
+#include "nfp_net_common.h"
 
 #define NFDK_TX_DESC_PER_SIMPLE_PKT     2
 
@@ -161,6 +164,45 @@ nfp_net_nfdk_txq_full(struct nfp_net_txq *txq)
 	return (nfp_net_nfdk_free_tx_desc(txq) < txq->tx_free_thresh);
 }
 
+static inline uint16_t
+nfp_net_nfdk_headlen_to_segs(uint16_t headlen)
+{
+	/* First descriptor fits less data, so adjust for that */
+	return DIV_ROUND_UP(headlen + NFDK_TX_MAX_DATA_PER_DESC - NFDK_TX_MAX_DATA_PER_HEAD,
+			NFDK_TX_MAX_DATA_PER_DESC);
+}
+
+/* Add the TX CSUM offload bits requested by 'mb' to 'flags' for a TX descriptor of nfdk */
+static inline uint64_t
+nfp_net_nfdk_tx_cksum(struct nfp_net_txq *txq,
+		struct rte_mbuf *mb,
+		uint64_t flags)
+{
+	uint64_t ol_flags;
+	struct nfp_net_hw *hw = txq->hw;
+
+	if ((hw->super.ctrl & NFP_NET_CFG_CTRL_TXCSUM) == 0)
+		return flags;
+
+	ol_flags = mb->ol_flags;
+
+	/* Set L4 csum offload if TSO/UFO enabled. */
+	if ((ol_flags & (RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_UDP_SEG)) != 0)
+		flags |= NFDK_DESC_TX_L4_CSUM;
+
+	if ((ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) != 0)
+		flags |= NFDK_DESC_TX_ENCAP;
+
+	/* IPv6 does not need checksum */
+	if ((ol_flags & RTE_MBUF_F_TX_IP_CKSUM) != 0)
+		flags |= NFDK_DESC_TX_L3_CSUM;
+
+	if ((ol_flags & RTE_MBUF_F_TX_L4_MASK) != 0)
+		flags |= NFDK_DESC_TX_L4_CSUM;
+
+	return flags;
+}
+
 uint32_t nfp_flower_nfdk_pkt_add_metadata(struct rte_mbuf *mbuf,
 		uint32_t port_id);
 uint16_t nfp_net_nfdk_xmit_pkts_common(void *tx_queue,
@@ -177,5 +219,8 @@ int nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
 		const struct rte_eth_txconf *tx_conf);
 int nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
 		struct rte_mbuf *pkt);
+int nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
+		struct nfp_net_txq *txq,
+		uint64_t *metadata);
 
 #endif /* __NFP_NFDK_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
index 8b8c191b82..173aabf0b9 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
@@ -6,7 +6,6 @@
 #include "nfp_nfdk.h"
 
 #include <bus_pci_driver.h>
-#include <nfp_platform.h>
 #include <rte_malloc.h>
 
 #include "../flower/nfp_flower.h"
@@ -15,38 +14,6 @@
 
 #define NFDK_TX_DESC_GATHER_MAX         17
 
-/* Set TX CSUM offload flags in TX descriptor of nfdk */
-static uint64_t
-nfp_net_nfdk_tx_cksum(struct nfp_net_txq *txq,
-		struct rte_mbuf *mb,
-		uint64_t flags)
-{
-	uint64_t ol_flags;
-	struct nfp_net_hw *hw = txq->hw;
-
-	if ((hw->super.ctrl & NFP_NET_CFG_CTRL_TXCSUM) == 0)
-		return flags;
-
-	ol_flags = mb->ol_flags;
-
-	/* Set L4 csum offload if TSO/UFO enabled. */
-	if ((ol_flags & RTE_MBUF_F_TX_TCP_SEG) != 0 ||
-			(ol_flags & RTE_MBUF_F_TX_UDP_SEG) != 0)
-		flags |= NFDK_DESC_TX_L4_CSUM;
-
-	if ((ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) != 0)
-		flags |= NFDK_DESC_TX_ENCAP;
-
-	/* IPv6 does not need checksum */
-	if ((ol_flags & RTE_MBUF_F_TX_IP_CKSUM) != 0)
-		flags |= NFDK_DESC_TX_L3_CSUM;
-
-	if ((ol_flags & RTE_MBUF_F_TX_L4_MASK) != 0)
-		flags |= NFDK_DESC_TX_L4_CSUM;
-
-	return flags;
-}
-
 /* Set TX descriptor for TSO of nfdk */
 static uint64_t
 nfp_net_nfdk_tx_tso(struct nfp_net_txq *txq,
@@ -100,14 +67,6 @@ nfp_flower_nfdk_pkt_add_metadata(struct rte_mbuf *mbuf,
 	return FLOWER_PKT_DATA_OFFSET;
 }
 
-static inline uint16_t
-nfp_net_nfdk_headlen_to_segs(uint16_t headlen)
-{
-	/* First descriptor fits less data, so adjust for that */
-	return DIV_ROUND_UP(headlen + NFDK_TX_MAX_DATA_PER_DESC - NFDK_TX_MAX_DATA_PER_HEAD,
-			NFDK_TX_MAX_DATA_PER_DESC);
-}
-
 static inline void
 nfp_net_nfdk_tx_close_block(struct nfp_net_txq *txq,
 		uint32_t nop_slots)
@@ -170,7 +129,7 @@ nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
 	return nop_slots;
 }
 
-static int
+int
 nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
 		struct nfp_net_txq *txq,
 		uint64_t *metadata)
-- 
2.39.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v3 2/4] net/nfp: support AVX2 Tx function
  2024-07-09  7:29   ` [PATCH v3 " Chaoyong He
  2024-07-09  7:29     ` [PATCH v3 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
@ 2024-07-09  7:29     ` Chaoyong He
  2024-07-09  7:29     ` [PATCH v3 3/4] net/nfp: support AVX2 Rx function Chaoyong He
                       ` (2 subsequent siblings)
  4 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09  7:29 UTC (permalink / raw)
  To: dev; +Cc: oss-drivers, Long Wu, Chaoyong He

From: Long Wu <long.wu@corigine.com>

Use AVX2 instructions to accelerate Tx performance. The
acceleration only works on x86 machines.

Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
 drivers/net/nfp/meson.build                 |  20 +
 drivers/net/nfp/nfdk/nfp_nfdk.h             |   1 +
 drivers/net/nfp/nfdk/nfp_nfdk_dp.c          |  12 +
 drivers/net/nfp/nfdk/nfp_nfdk_vec.h         |  36 ++
 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c | 432 ++++++++++++++++++++
 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c    |  14 +
 drivers/net/nfp/nfp_ethdev.c                |   3 +-
 drivers/net/nfp/nfp_ethdev_vf.c             |   3 +-
 drivers/net/nfp/nfp_rxtx.h                  |   5 +-
 drivers/net/nfp/nfp_rxtx_vec.h              |  13 +
 drivers/net/nfp/nfp_rxtx_vec_avx2.c         |  21 +
 drivers/net/nfp/nfp_rxtx_vec_stub.c         |  16 +
 12 files changed, 573 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec.h
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec.h
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec_avx2.c
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec_stub.c

diff --git a/drivers/net/nfp/meson.build b/drivers/net/nfp/meson.build
index d805644ec5..463a482a32 100644
--- a/drivers/net/nfp/meson.build
+++ b/drivers/net/nfp/meson.build
@@ -16,6 +16,7 @@ sources = files(
         'flower/nfp_flower_service.c',
         'nfd3/nfp_nfd3_dp.c',
         'nfdk/nfp_nfdk_dp.c',
+        'nfdk/nfp_nfdk_vec_stub.c',
         'nfpcore/nfp_cppcore.c',
         'nfpcore/nfp_crc.c',
         'nfpcore/nfp_elf.c',
@@ -43,7 +44,26 @@ sources = files(
         'nfp_net_flow.c',
         'nfp_net_meta.c',
         'nfp_rxtx.c',
+        'nfp_rxtx_vec_stub.c',
         'nfp_service.c',
 )
 
+if arch_subdir == 'x86'
+        includes += include_directories('../../common/nfp')
+
+        avx2_sources = files(
+                'nfdk/nfp_nfdk_vec_avx2_dp.c',
+                'nfp_rxtx_vec_avx2.c',
+        )
+
+        nfp_avx2_lib = static_library('nfp_avx2_lib',
+                avx2_sources,
+                dependencies: [static_rte_ethdev, static_rte_bus_pci],
+                include_directories: includes,
+                c_args: [cflags, '-mavx2']
+        )
+
+        objs += nfp_avx2_lib.extract_all_objects(recursive: true)
+endif
+
 deps += ['hash', 'security', 'common_nfp']
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk.h b/drivers/net/nfp/nfdk/nfp_nfdk.h
index 89a98d13f3..29d862f6f0 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk.h
+++ b/drivers/net/nfp/nfdk/nfp_nfdk.h
@@ -222,5 +222,6 @@ int nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
 int nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
 		struct nfp_net_txq *txq,
 		uint64_t *metadata);
+void nfp_net_nfdk_xmit_pkts_set(struct rte_eth_dev *eth_dev);
 
 #endif /* __NFP_NFDK_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
index 173aabf0b9..2cea5688b3 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
@@ -11,6 +11,8 @@
 #include "../flower/nfp_flower.h"
 #include "../nfp_logs.h"
 #include "../nfp_net_meta.h"
+#include "../nfp_rxtx_vec.h"
+#include "nfp_nfdk_vec.h"
 
 #define NFDK_TX_DESC_GATHER_MAX         17
 
@@ -511,6 +513,7 @@ nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
 	dev->data->tx_queues[queue_idx] = txq;
 	txq->hw = hw;
 	txq->hw_priv = dev->process_private;
+	txq->simple_always = true;
 
 	/*
 	 * Telling the HW about the physical address of the TX ring and number
@@ -521,3 +524,12 @@ nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
 
 	return 0;
 }
+
+void
+nfp_net_nfdk_xmit_pkts_set(struct rte_eth_dev *eth_dev)
+{
+	if (nfp_net_get_avx2_supported())
+		eth_dev->tx_pkt_burst = nfp_net_nfdk_vec_avx2_xmit_pkts;
+	else
+		eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+}
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec.h b/drivers/net/nfp/nfdk/nfp_nfdk_vec.h
new file mode 100644
index 0000000000..14319d6cf6
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_NFDK_VEC_H__
+#define __NFP_NFDK_VEC_H__
+
+#include <stdbool.h>
+
+#include <rte_mbuf_core.h>
+
+#include "../nfp_net_common.h"
+#include "nfp_nfdk.h"
+
+static inline bool
+nfp_net_nfdk_is_simple_packet(struct rte_mbuf *pkt,
+		struct nfp_net_hw *hw)
+{
+	if (pkt->data_len > NFDK_TX_MAX_DATA_PER_HEAD)
+		return false;
+
+	if ((hw->super.cap & NFP_NET_CFG_CTRL_LSO_ANY) == 0)
+		return true;
+
+	if ((pkt->ol_flags & (RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_UDP_SEG)) == 0)
+		return true; /* Neither TSO nor UFO is requested for this packet */
+
+	return false;
+}
+
+uint16_t nfp_net_nfdk_vec_avx2_xmit_pkts(void *tx_queue,
+		struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts);
+
+#endif /* __NFP_NFDK_VEC_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
new file mode 100644
index 0000000000..6d1359fdb1
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
@@ -0,0 +1,432 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <bus_pci_driver.h>
+#include <ethdev_driver.h>
+#include <rte_malloc.h>
+
+#include "../nfp_logs.h"
+#include "nfp_nfdk.h"
+#include "nfp_nfdk_vec.h"
+
+/*
+ * One simple packet needs 2 descriptors so if send 4 packets driver will use
+ * 8 descriptors at once.
+ */
+#define NFDK_SIMPLE_BURST_DES_NUM 8
+
+#define NFDK_SIMPLE_DES_TYPE (NFDK_DESC_TX_EOP | \
+		(NFDK_DESC_TX_TYPE_HEAD & (NFDK_DESC_TX_TYPE_SIMPLE << 12)))
+
+static inline int
+nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(struct rte_mbuf *pkt,
+		struct nfp_net_txq *txq,
+		uint64_t *des_addr,
+		uint64_t *des_meta,
+		bool repr_flag)
+{
+	int ret;
+	__m128i dma_addr;
+	__m128i dma_hi;
+	__m128i data_off;
+	__m128i dlen_type;
+	uint64_t metadata;
+
+	if (repr_flag) {
+		metadata = NFDK_DESC_TX_CHAIN_META;
+	} else {
+		ret = nfp_net_nfdk_set_meta_data(pkt, txq, &metadata);
+		if (unlikely(ret != 0))
+			return ret;
+	}
+
+	data_off = _mm_set_epi64x(0, pkt->data_off);
+	dma_addr = _mm_add_epi64(_mm_loadu_si128((__m128i *)&pkt->buf_addr), data_off);
+	dma_hi = _mm_srli_epi64(dma_addr, 32);
+
+	dlen_type = _mm_set_epi64x(0, (pkt->data_len - 1) | NFDK_SIMPLE_DES_TYPE);
+
+	*des_addr = _mm_extract_epi64(_mm_add_epi64(_mm_unpacklo_epi32(dma_hi, dma_addr),
+			_mm_slli_epi64(dlen_type, 16)), 0);
+
+	*des_meta = nfp_net_nfdk_tx_cksum(txq, pkt, metadata);
+
+	return 0;
+}
+
+static inline int
+nfp_net_nfdk_vec_avx2_xmit_simple_send1(struct nfp_net_txq *txq,
+		struct nfp_net_nfdk_tx_desc *txds,
+		struct rte_mbuf *pkt,
+		bool repr_flag)
+{
+	int ret;
+	__m128i des_data;
+	uint64_t des_addr;
+	uint64_t des_meta;
+
+	ret = nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(pkt, txq, &des_addr,
+			&des_meta, repr_flag);
+	if (unlikely(ret != 0))
+		return ret;
+
+	txq->wr_p = D_IDX(txq, txq->wr_p + NFDK_TX_DESC_PER_SIMPLE_PKT);
+	if ((txq->wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) != 0)
+		txq->data_pending += pkt->data_len;
+	else
+		txq->data_pending = 0;
+
+	des_data = _mm_set_epi64x(des_meta, des_addr);
+
+	_mm_store_si128((void *)txds, des_data);
+
+	return 0;
+}
+
+static inline int
+nfp_vec_avx2_nfdk_xmit_simple_send4(struct nfp_net_txq *txq,
+		struct nfp_net_nfdk_tx_desc *txds,
+		struct rte_mbuf **pkt,
+		bool repr_flag)
+{
+	int ret;
+	uint16_t i;
+	__m256i des_data0_1;
+	__m256i des_data2_3;
+	uint64_t des_addr[4];
+	uint64_t des_meta[4];
+
+	for (i = 0; i < 4; i++) {
+		ret = nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(pkt[i], txq,
+				&des_addr[i], &des_meta[i], repr_flag);
+		if (unlikely(ret != 0))
+			return ret;
+	}
+
+	for (i = 0; i < 4; i++) {
+		txq->wr_p = D_IDX(txq, txq->wr_p + NFDK_TX_DESC_PER_SIMPLE_PKT);
+		if ((txq->wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) != 0)
+			txq->data_pending += pkt[i]->data_len;
+		else
+			txq->data_pending = 0;
+	}
+
+	des_data0_1 = _mm256_set_epi64x(des_meta[1], des_addr[1], des_meta[0], des_addr[0]);
+	des_data2_3 = _mm256_set_epi64x(des_meta[3], des_addr[3], des_meta[2], des_addr[2]);
+
+	_mm256_store_si256((void *)txds, des_data0_1);
+	_mm256_store_si256((void *)(txds + 4), des_data2_3);
+
+	return 0;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_mbuf_store4(struct rte_mbuf **mbuf,
+		struct rte_mbuf **tx_pkts)
+{
+	__m256i mbuf_room0_1;
+	__m256i mbuf_room2_3;
+
+	mbuf_room0_1 = _mm256_set_epi64x(0, (uintptr_t)tx_pkts[1], 0,
+			(uintptr_t)tx_pkts[0]);
+	mbuf_room2_3 = _mm256_set_epi64x(0, (uintptr_t)tx_pkts[3], 0,
+			(uintptr_t)tx_pkts[2]);
+
+	_mm256_store_si256((void *)mbuf, mbuf_room0_1);
+	_mm256_store_si256((void *)(mbuf + 4), mbuf_room2_3);
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_simple_pkts(struct nfp_net_txq *txq,
+		struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts,
+		uint16_t simple_close,
+		bool repr_flag)
+{
+	int ret;
+	uint16_t npkts = 0;
+	uint16_t need_txds;
+	uint16_t free_descs;
+	struct rte_mbuf **lmbuf;
+	struct nfp_net_nfdk_tx_desc *ktxds;
+
+	PMD_TX_LOG(DEBUG, "Working for queue %hu at pos %u and %hu packets",
+			txq->qidx, txq->wr_p, nb_pkts);
+
+	need_txds = nb_pkts << 1;
+	if (nfp_net_nfdk_free_tx_desc(txq) < need_txds || nfp_net_nfdk_txq_full(txq))
+		nfp_net_tx_free_bufs(txq);
+
+	free_descs = nfp_net_nfdk_free_tx_desc(txq);
+	if (unlikely(free_descs < NFDK_TX_DESC_PER_SIMPLE_PKT)) {
+		if (unlikely(simple_close > 0))
+			goto xmit_end;
+
+		return 0;
+	}
+
+	PMD_TX_LOG(DEBUG, "Queue: %hu. Sending %hu packets", txq->qidx, nb_pkts);
+
+	/* Sending packets */
+	while (npkts < nb_pkts && free_descs >= NFDK_TX_DESC_PER_SIMPLE_PKT) {
+		ktxds = &txq->ktxds[txq->wr_p];
+		lmbuf = &txq->txbufs[txq->wr_p].mbuf;
+
+		/*
+		 * If can not send burst, just send one.
+		 * 1. Tx ring will come to the tail.
+		 * 2. Do not need to send 4 packets.
+		 * 3. If pointer address unaligned on 32-byte boundary.
+		 * 4. If free descriptors are not enough.
+		 */
+		if ((txq->tx_count - txq->wr_p) < NFDK_SIMPLE_BURST_DES_NUM ||
+				(nb_pkts - npkts) < 4 ||
+				((uintptr_t)ktxds & 0x1F) != 0 ||
+				free_descs < NFDK_SIMPLE_BURST_DES_NUM) {
+			ret = nfp_net_nfdk_vec_avx2_xmit_simple_send1(txq,
+					ktxds, tx_pkts[npkts], repr_flag);
+			if (unlikely(ret != 0))
+				goto xmit_end;
+
+			rte_pktmbuf_free(*lmbuf);
+
+			/* Load only 64 bits: a 128-bit load would read past the last packet */
+			_mm_storel_epi64((void *)lmbuf, _mm_loadl_epi64((void *)&tx_pkts[npkts]));
+			npkts++;
+			free_descs -= NFDK_TX_DESC_PER_SIMPLE_PKT;
+			continue;
+		}
+
+		ret = nfp_vec_avx2_nfdk_xmit_simple_send4(txq, ktxds,
+				&tx_pkts[npkts], repr_flag);
+		if (unlikely(ret != 0))
+			goto xmit_end;
+
+		rte_pktmbuf_free_bulk(lmbuf, NFDK_SIMPLE_BURST_DES_NUM);
+
+		nfp_net_nfdk_vec_avx2_xmit_mbuf_store4(lmbuf, &tx_pkts[npkts]);
+
+		npkts += 4;
+		free_descs -= NFDK_SIMPLE_BURST_DES_NUM;
+	}
+
+xmit_end:
+	/* Increment write pointers. Force memory write before we let HW know */
+	rte_wmb();
+	nfp_qcp_ptr_add(txq->qcp_q, NFP_QCP_WRITE_PTR, ((npkts << 1) + simple_close));
+
+	return npkts;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_simple_close_block(struct nfp_net_txq *txq,
+		uint16_t *simple_close)
+{
+	uint16_t i;
+	uint16_t nop_slots;
+	uint16_t wr_p = txq->wr_p;
+	__m128i zero_128 = _mm_setzero_si128();
+	__m256i zero_256 = _mm256_setzero_si256();
+
+	nop_slots = D_BLOCK_CPL(wr_p);
+
+	/* wr_p may not be a multiple of 4 here, so the 256-bit stores must be unaligned */
+	for (i = nop_slots; i >= 4; i -= 4, wr_p += 4) {
+		_mm256_storeu_si256((void *)&txq->ktxds[wr_p], zero_256);
+		rte_pktmbuf_free_bulk(&txq->txbufs[wr_p].mbuf, 4);
+		_mm256_storeu_si256((void *)&txq->txbufs[wr_p], zero_256);
+	}
+
+	for (; i >= 2; i -= 2, wr_p += 2) {
+		_mm_store_si128((void *)&txq->ktxds[wr_p], zero_128);
+		rte_pktmbuf_free_bulk(&txq->txbufs[wr_p].mbuf, 2);
+		_mm_store_si128((void *)&txq->txbufs[wr_p], zero_128);
+	}
+
+	for (; i >= 1; i--, wr_p++) {
+		_mm_storel_epi64((void *)&txq->ktxds[wr_p], zero_128);
+		rte_pktmbuf_free(txq->txbufs[wr_p].mbuf);
+		_mm_storel_epi64((void *)&txq->txbufs[wr_p], zero_128);
+	}
+
+	txq->data_pending = 0;
+	txq->wr_p = D_IDX(txq, txq->wr_p + nop_slots);
+
+	(*simple_close) += nop_slots;
+}
+
+static inline uint32_t
+nfp_net_nfdk_vec_avx2_xmit_simple_prepare(struct nfp_net_txq *txq,
+		uint16_t *simple_close)
+{
+	uint16_t wr_p;
+	__m128i zero_128 = _mm_setzero_si128();
+
+	wr_p = txq->wr_p;
+
+	_mm_storel_epi64((void *)&txq->ktxds[wr_p], zero_128);
+	rte_pktmbuf_free(txq->txbufs[wr_p].mbuf);
+	_mm_storel_epi64((void *)&txq->txbufs[wr_p], zero_128);
+
+	txq->wr_p = D_IDX(txq, wr_p + 1);
+	(*simple_close)++;
+
+	return txq->wr_p;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_simple_check(struct nfp_net_txq *txq,
+		struct rte_mbuf *pkt,
+		bool *simple_flag,
+		bool *pending_flag,
+		uint16_t *data_pending,
+		uint32_t *wr_p,
+		uint16_t *simple_close)
+{
+	uint32_t data_pending_temp;
+
+	/* Let the first descriptor index even before send simple packets */
+	if (!(*simple_flag)) {
+		if ((*wr_p & 0x1) == 0x1)
+			*wr_p = nfp_net_nfdk_vec_avx2_xmit_simple_prepare(txq, simple_close);
+
+		*simple_flag = true;
+	}
+
+	/* Simple packets only need one close block operation */
+	if (!(*pending_flag)) {
+		if ((*wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) == 0) {
+			*pending_flag = true;
+			return;
+		}
+
+		data_pending_temp = *data_pending + pkt->data_len;
+		if (data_pending_temp > NFDK_TX_MAX_DATA_PER_BLOCK) {
+			nfp_net_nfdk_vec_avx2_xmit_simple_close_block(txq, simple_close);
+			*pending_flag = true;
+			return;
+		}
+
+		*data_pending = data_pending_temp;
+
+		*wr_p += 2;
+	}
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_simple_count(struct nfp_net_txq *txq,
+		struct rte_mbuf **tx_pkts,
+		uint16_t head,
+		uint16_t nb_pkts,
+		uint16_t *simple_close)
+{
+	uint32_t wr_p;
+	uint16_t simple_idx;
+	struct rte_mbuf *pkt;
+	uint16_t data_pending;
+	bool simple_flag = false;
+	bool pending_flag = false;
+	uint16_t simple_count = 0;
+
+	*simple_close = 0;
+	wr_p = txq->wr_p;
+	data_pending = txq->data_pending;
+
+	for (simple_idx = head; simple_idx < nb_pkts; simple_idx++) {
+		pkt = tx_pkts[simple_idx];
+		if (!nfp_net_nfdk_is_simple_packet(pkt, txq->hw))
+			break;
+
+		simple_count++;
+		if (!txq->simple_always)
+			nfp_net_nfdk_vec_avx2_xmit_simple_check(txq, pkt, &simple_flag,
+					&pending_flag, &data_pending, &wr_p, simple_close);
+	}
+
+	return simple_count;
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_others_count(struct nfp_net_txq *txq,
+		struct rte_mbuf **tx_pkts,
+		uint16_t head,
+		uint16_t nb_pkts)
+{
+	uint16_t others_idx;
+	struct rte_mbuf *pkt;
+	uint16_t others_count = 0;
+
+	for (others_idx = head; others_idx < nb_pkts; others_idx++) {
+		pkt = tx_pkts[others_idx];
+		if (nfp_net_nfdk_is_simple_packet(pkt, txq->hw))
+			break;
+
+		others_count++;
+	}
+
+	return others_count;
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_common(void *tx_queue,
+		struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts)
+{
+	uint16_t i = 0;
+	uint16_t avail = 0;
+	uint16_t simple_close;
+	uint16_t simple_count;
+	uint16_t simple_avail;
+	uint16_t others_count;
+	uint16_t others_avail;
+	struct nfp_net_txq *txq = tx_queue;
+
+	/* 'i' is advanced by the length of each run, so the loop has no increment of its own */
+	while (i < nb_pkts) {
+		simple_count = nfp_net_nfdk_vec_avx2_xmit_simple_count(txq, tx_pkts, i,
+				nb_pkts, &simple_close);
+		if (simple_count > 0) {
+			if (!txq->simple_always)
+				txq->simple_always = true;
+
+			simple_avail = nfp_net_nfdk_vec_avx2_xmit_simple_pkts(txq,
+					tx_pkts + i, simple_count, simple_close, false);
+
+			avail += simple_avail;
+			if (simple_avail != simple_count)
+				break;
+
+			i += simple_count;
+		}
+
+		if (i == nb_pkts)
+			break;
+
+		others_count = nfp_net_nfdk_vec_avx2_xmit_others_count(txq, tx_pkts,
+				i, nb_pkts);
+
+		if (txq->simple_always)
+			txq->simple_always = false;
+
+		others_avail = nfp_net_nfdk_xmit_pkts_common(tx_queue,
+				tx_pkts + i, others_count, false);
+
+		avail += others_avail;
+		if (others_avail != others_count)
+			break;
+
+		i += others_count;
+	}
+
+	return avail;
+}
+
+uint16_t
+nfp_net_nfdk_vec_avx2_xmit_pkts(void *tx_queue,
+		struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts)
+{
+	return nfp_net_nfdk_vec_avx2_xmit_common(tx_queue, tx_pkts, nb_pkts);
+}
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c b/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
new file mode 100644
index 0000000000..146ec21d51
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include "nfp_nfdk_vec.h"
+
+uint16_t __rte_weak
+nfp_net_nfdk_vec_avx2_xmit_pkts(__rte_unused void *tx_queue,
+		__rte_unused struct rte_mbuf **tx_pkts,
+		__rte_unused uint16_t nb_pkts)
+{
+	return 0;
+}
diff --git a/drivers/net/nfp/nfp_ethdev.c b/drivers/net/nfp/nfp_ethdev.c
index 8c0cacd3fc..a7b40af712 100644
--- a/drivers/net/nfp/nfp_ethdev.c
+++ b/drivers/net/nfp/nfp_ethdev.c
@@ -28,6 +28,7 @@
 #include "nfp_ipsec.h"
 #include "nfp_logs.h"
 #include "nfp_net_flow.h"
+#include "nfp_rxtx_vec.h"
 
 /* 64-bit per app capabilities */
 #define NFP_NET_APP_CAP_SP_INDIFF       RTE_BIT64(0) /* Indifferent to port speed */
@@ -964,7 +965,7 @@ nfp_net_ethdev_ops_mount(struct nfp_net_hw *hw,
 	if (hw->ver.extend == NFP_NET_CFG_VERSION_DP_NFD3)
 		eth_dev->tx_pkt_burst = nfp_net_nfd3_xmit_pkts;
 	else
-		eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+		nfp_net_nfdk_xmit_pkts_set(eth_dev);
 
 	eth_dev->dev_ops = &nfp_net_eth_dev_ops;
 	eth_dev->rx_queue_count = nfp_net_rx_queue_count;
diff --git a/drivers/net/nfp/nfp_ethdev_vf.c b/drivers/net/nfp/nfp_ethdev_vf.c
index e7c18fe90a..b955624ed6 100644
--- a/drivers/net/nfp/nfp_ethdev_vf.c
+++ b/drivers/net/nfp/nfp_ethdev_vf.c
@@ -14,6 +14,7 @@
 
 #include "nfp_logs.h"
 #include "nfp_net_common.h"
+#include "nfp_rxtx_vec.h"
 
 #define NFP_VF_DRIVER_NAME net_nfp_vf
 
@@ -240,7 +241,7 @@ nfp_netvf_ethdev_ops_mount(struct nfp_net_hw *hw,
 	if (hw->ver.extend == NFP_NET_CFG_VERSION_DP_NFD3)
 		eth_dev->tx_pkt_burst = nfp_net_nfd3_xmit_pkts;
 	else
-		eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+		nfp_net_nfdk_xmit_pkts_set(eth_dev);
 
 	eth_dev->dev_ops = &nfp_netvf_eth_dev_ops;
 	eth_dev->rx_queue_count = nfp_net_rx_queue_count;
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index 9806384a63..3ddf717da0 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -69,9 +69,12 @@ struct __rte_aligned(64) nfp_net_txq {
 	/** Used by NFDk only */
 	uint16_t data_pending;
 
+	/** Used by NFDk vector xmit only */
+	bool simple_always;
+
 	/**
 	 * At this point 58 bytes have been used for all the fields in the
-	 * TX critical path. We have room for 6 bytes and still all placed
+	 * TX critical path. We have room for 5 bytes and still all placed
 	 * in a cache line.
 	 */
 	uint64_t dma;
diff --git a/drivers/net/nfp/nfp_rxtx_vec.h b/drivers/net/nfp/nfp_rxtx_vec.h
new file mode 100644
index 0000000000..c92660f963
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_RXTX_VEC_AVX2_H__
+#define __NFP_RXTX_VEC_AVX2_H__
+
+#include <stdbool.h>
+
+bool nfp_net_get_avx2_supported(void);
+
+#endif /* __NFP_RXTX_VEC_AVX2_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
new file mode 100644
index 0000000000..50638e74ab
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <stdbool.h>
+
+#include <rte_cpuflags.h>
+#include <rte_vect.h>
+
+#include "nfp_rxtx_vec.h"
+
+bool
+nfp_net_get_avx2_supported(void)
+{
+	if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 &&
+			rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
+		return true;
+
+	return false;
+}
diff --git a/drivers/net/nfp/nfp_rxtx_vec_stub.c b/drivers/net/nfp/nfp_rxtx_vec_stub.c
new file mode 100644
index 0000000000..1bc55b67e0
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec_stub.c
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <stdbool.h>
+
+#include <rte_common.h>
+
+#include "nfp_rxtx_vec.h"
+
+bool __rte_weak
+nfp_net_get_avx2_supported(void)
+{
+	return false;
+}
-- 
2.39.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v3 3/4] net/nfp: support AVX2 Rx function
  2024-07-09  7:29   ` [PATCH v3 " Chaoyong He
  2024-07-09  7:29     ` [PATCH v3 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
  2024-07-09  7:29     ` [PATCH v3 2/4] net/nfp: support AVX2 Tx function Chaoyong He
@ 2024-07-09  7:29     ` Chaoyong He
  2024-07-09  7:29     ` [PATCH v3 4/4] net/nfp: vector Rx function supports parsing ptype Chaoyong He
  2024-07-09  8:24     ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Chaoyong He
  4 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09  7:29 UTC (permalink / raw)
  To: dev; +Cc: oss-drivers, Long Wu, Peng Zhang, Chaoyong He

From: Long Wu <long.wu@corigine.com>

Use AVX2 instructions to accelerate Rx performance. The
acceleration only works on x86 machines.

Signed-off-by: Peng Zhang <peng.zhang@corigine.com>
Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
 drivers/net/nfp/nfp_ethdev.c        |   2 +-
 drivers/net/nfp/nfp_ethdev_vf.c     |   2 +-
 drivers/net/nfp/nfp_net_meta.c      |   1 +
 drivers/net/nfp/nfp_rxtx.c          |  10 ++
 drivers/net/nfp/nfp_rxtx.h          |   1 +
 drivers/net/nfp/nfp_rxtx_vec.h      |   4 +
 drivers/net/nfp/nfp_rxtx_vec_avx2.c | 252 ++++++++++++++++++++++++++++
 drivers/net/nfp/nfp_rxtx_vec_stub.c |   9 +
 8 files changed, 279 insertions(+), 2 deletions(-)

diff --git a/drivers/net/nfp/nfp_ethdev.c b/drivers/net/nfp/nfp_ethdev.c
index a7b40af712..bd35df2dc9 100644
--- a/drivers/net/nfp/nfp_ethdev.c
+++ b/drivers/net/nfp/nfp_ethdev.c
@@ -969,7 +969,7 @@ nfp_net_ethdev_ops_mount(struct nfp_net_hw *hw,
 
 	eth_dev->dev_ops = &nfp_net_eth_dev_ops;
 	eth_dev->rx_queue_count = nfp_net_rx_queue_count;
-	eth_dev->rx_pkt_burst = &nfp_net_recv_pkts;
+	nfp_net_recv_pkts_set(eth_dev);
 }
 
 static int
diff --git a/drivers/net/nfp/nfp_ethdev_vf.c b/drivers/net/nfp/nfp_ethdev_vf.c
index b955624ed6..cdf5da3af7 100644
--- a/drivers/net/nfp/nfp_ethdev_vf.c
+++ b/drivers/net/nfp/nfp_ethdev_vf.c
@@ -245,7 +245,7 @@ nfp_netvf_ethdev_ops_mount(struct nfp_net_hw *hw,
 
 	eth_dev->dev_ops = &nfp_netvf_eth_dev_ops;
 	eth_dev->rx_queue_count = nfp_net_rx_queue_count;
-	eth_dev->rx_pkt_burst = &nfp_net_recv_pkts;
+	nfp_net_recv_pkts_set(eth_dev);
 }
 
 static int
diff --git a/drivers/net/nfp/nfp_net_meta.c b/drivers/net/nfp/nfp_net_meta.c
index b31ef56f17..07c6758d33 100644
--- a/drivers/net/nfp/nfp_net_meta.c
+++ b/drivers/net/nfp/nfp_net_meta.c
@@ -80,6 +80,7 @@ nfp_net_meta_parse_single(uint8_t *meta_base,
 		rte_be32_t meta_header,
 		struct nfp_net_meta_parsed *meta)
 {
+	meta->flags = 0;
 	meta->flags |= (1 << NFP_NET_META_HASH);
 	meta->hash_type = rte_be_to_cpu_32(meta_header);
 	meta->hash = rte_be_to_cpu_32(*(rte_be32_t *)(meta_base + 4));
diff --git a/drivers/net/nfp/nfp_rxtx.c b/drivers/net/nfp/nfp_rxtx.c
index 1db79ad1cd..4fc3374987 100644
--- a/drivers/net/nfp/nfp_rxtx.c
+++ b/drivers/net/nfp/nfp_rxtx.c
@@ -17,6 +17,7 @@
 #include "nfp_ipsec.h"
 #include "nfp_logs.h"
 #include "nfp_net_meta.h"
+#include "nfp_rxtx_vec.h"
 
 /*
  * The bit format and map of nfp packet type for rxd.offload_info in Rx descriptor.
@@ -867,3 +868,12 @@ nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
 	info->conf.offloads = dev_info.tx_offload_capa &
 			dev->data->dev_conf.txmode.offloads;
 }
+
+void
+nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev)
+{
+	/* Use the AVX2 burst function only when the runtime check reports support. */
+	eth_dev->rx_pkt_burst = nfp_net_get_avx2_supported() ?
+			nfp_net_vec_avx2_recv_pkts : nfp_net_recv_pkts;
+}
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index 3ddf717da0..fff8371991 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -244,5 +244,6 @@ void nfp_net_rx_queue_info_get(struct rte_eth_dev *dev,
 void nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
 		uint16_t queue_id,
 		struct rte_eth_txq_info *qinfo);
+void nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev);
 
 #endif /* __NFP_RXTX_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec.h b/drivers/net/nfp/nfp_rxtx_vec.h
index c92660f963..8720662744 100644
--- a/drivers/net/nfp/nfp_rxtx_vec.h
+++ b/drivers/net/nfp/nfp_rxtx_vec.h
@@ -10,4 +10,8 @@
 
 bool nfp_net_get_avx2_supported(void);
 
+uint16_t nfp_net_vec_avx2_recv_pkts(void *rx_queue,
+		struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts);
+
 #endif /* __NFP_RXTX_VEC_AVX2_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
index 50638e74ab..7c18213624 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_avx2.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -5,9 +5,14 @@
 
 #include <stdbool.h>
 
+#include <bus_pci_driver.h>
+#include <ethdev_driver.h>
 #include <rte_cpuflags.h>
 #include <rte_vect.h>
 
+#include "nfp_logs.h"
+#include "nfp_net_common.h"
+#include "nfp_net_meta.h"
 #include "nfp_rxtx_vec.h"
 
 bool
@@ -19,3 +24,250 @@ nfp_net_get_avx2_supported(void)
 
 	return false;
 }
+
+/*
+ * Refill one Rx descriptor with the buffer address of a fresh mbuf.
+ *
+ * A 16-byte load starting at rxb->buf_addr is taken and RTE_PKTMBUF_HEADROOM
+ * is added to its low 64-bit lane only. The two 32-bit halves of that sum
+ * are then swapped (upper half first, lower half second) and the resulting
+ * 8 bytes are written over the descriptor. Finally rd_p advances by one and
+ * wraps using the (rx_count - 1) mask.
+ *
+ * NOTE(review): the address written comes from the low lane, i.e. the
+ * buf_addr field itself, not from the field that follows it in the mbuf.
+ * Confirm this is the intended DMA address when IOVA and virtual addresses
+ * differ.
+ */
+static inline void
+nfp_vec_avx2_recv_set_des1(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf *rxb)
+{
+	__m128i dma;
+	__m128i dma_hi;
+	__m128i vaddr0;
+	/* Headroom goes into the low 64-bit lane, zero into the high lane. */
+	__m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb->buf_addr), hdr_room);
+	/* Bring the upper 32 bits of each lane down to the lower 32 bits. */
+	dma_hi = _mm_srli_epi64(dma, 32);
+	/* Low 64 bits become { upper 32 bits, lower 32 bits } of the address. */
+	vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	/* Only the low 8 bytes are stored into the descriptor. */
+	_mm_storel_epi64((void *)rxds, vaddr0);
+
+	/* The mask-based wrap assumes rx_count is a power of two - TODO confirm. */
+	rxq->rd_p = (rxq->rd_p + 1) & (rxq->rx_count - 1);
+}
+
+/*
+ * Refill four consecutive Rx descriptors with the buffer addresses of four
+ * fresh mbufs, using a single 32-byte store.
+ *
+ * For each mbuf the same transformation as the single-descriptor variant is
+ * applied: load 16 bytes at buf_addr, add RTE_PKTMBUF_HEADROOM to the low
+ * 64-bit lane and swap the two 32-bit halves of that lane. The four 8-byte
+ * results are packed in order and written to rxds with an aligned store, so
+ * rxds must be 32-byte aligned (the caller checks this before taking the
+ * burst path). rd_p then advances by four, wrapping with (rx_count - 1).
+ *
+ * NOTE(review): as in the single-descriptor variant, the address written is
+ * derived from the buf_addr field itself - confirm this is the intended DMA
+ * address when IOVA and virtual addresses differ.
+ */
+static inline void
+nfp_vec_avx2_recv_set_des4(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf **rxb)
+{
+	__m128i dma;
+	__m128i dma_hi;
+	__m128i vaddr0;
+	__m128i vaddr1;
+	__m128i vaddr2;
+	__m128i vaddr3;
+	__m128i vaddr0_1;
+	__m128i vaddr2_3;
+	__m256i vaddr0_3;
+	/* Headroom goes into the low 64-bit lane, zero into the high lane. */
+	__m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[0]->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[1]->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr1 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[2]->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr2 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[3]->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr3 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	/* Keep only the low 8 bytes of each result and pack them pairwise. */
+	vaddr0_1 = _mm_unpacklo_epi64(vaddr0, vaddr1);
+	vaddr2_3 = _mm_unpacklo_epi64(vaddr2, vaddr3);
+
+	/* Descriptors 0-1 in the low 128 bits, descriptors 2-3 in the high 128 bits. */
+	vaddr0_3 = _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0_1),
+			vaddr2_3, 1);
+
+	/* Aligned store: rxds must sit on a 32-byte boundary. */
+	_mm256_store_si256((void *)rxds, vaddr0_3);
+
+	/* The mask-based wrap assumes rx_count is a power of two - TODO confirm. */
+	rxq->rd_p = (rxq->rd_p + 4) & (rxq->rx_count - 1);
+}
+
+/*
+ * Fill the mbuf of one received packet from its Rx descriptor.
+ *
+ * Sets the data and packet lengths (descriptor data length minus the
+ * metadata length), the data offset, the port, and the single-segment
+ * fields, then parses the metadata and the checksum flags. The descriptor
+ * is only read here, so this must run before the descriptor is refilled.
+ */
+static inline void
+nfp_vec_avx2_recv_set_rxpkt1(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf *rx_pkt)
+{
+	struct nfp_net_hw *hw = rxq->hw;
+	struct nfp_net_meta_parsed meta;
+
+	rx_pkt->data_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
+	/* Size of the whole packet. We just support 1 segment */
+	rx_pkt->pkt_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
+
+	/* Filling the received mbuf with packet info */
+	/* A non-zero hw->rx_offset takes precedence over the per-packet metadata length. */
+	if (hw->rx_offset)
+		rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + hw->rx_offset;
+	else
+		rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + NFP_DESC_META_LEN(rxds);
+
+	rx_pkt->port = rxq->port_id;
+	rx_pkt->nb_segs = 1;
+	rx_pkt->next = NULL;
+
+	/* 'meta' is an output of the parser; it is not read again in this function. */
+	nfp_net_meta_parse(rxds, rxq, hw, rx_pkt, &meta);
+
+	/* Checking the checksum flag */
+	nfp_net_rx_cksum(rxq, rxds, rx_pkt);
+}
+
+/*
+ * Receive one packet: fill rx_pkt from the descriptor, then refill the
+ * descriptor with the replacement mbuf rxb.
+ */
+static inline void
+nfp_vec_avx2_recv1(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf *rxb,
+		struct rte_mbuf *rx_pkt)
+{
+	/* Read the descriptor first; the refill below overwrites it. */
+	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkt);
+
+	nfp_vec_avx2_recv_set_des1(rxq, rxds, rxb);
+}
+
+/*
+ * Receive four packets: fill each mbuf from its own descriptor, then refill
+ * all four descriptors in one go with the replacement mbufs in rxb.
+ */
+static inline void
+nfp_vec_avx2_recv4(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf **rxb,
+		struct rte_mbuf **rx_pkts)
+{
+	uint16_t idx;
+
+	/* All descriptors are read before the refill below overwrites them. */
+	for (idx = 0; idx < 4; idx++)
+		nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + idx, rx_pkts[idx]);
+
+	nfp_vec_avx2_recv_set_des4(rxq, rxds, rxb);
+}
+
+/*
+ * Return true only when the PCIE_DESC_RX_DD bit is set in bytes 3, 11, 19
+ * and 27 of the 32 bytes starting at rxds (presumably the DD byte of four
+ * consecutive 8-byte descriptors - confirm against the descriptor layout).
+ */
+static inline bool
+nfp_vec_avx2_recv_check_packets4(struct nfp_net_rx_desc *rxds)
+{
+	__m256i descs = _mm256_loadu_si256((void *)rxds);
+
+	return (_mm256_extract_epi8(descs, 3) & PCIE_DESC_RX_DD) != 0 &&
+			(_mm256_extract_epi8(descs, 11) & PCIE_DESC_RX_DD) != 0 &&
+			(_mm256_extract_epi8(descs, 19) & PCIE_DESC_RX_DD) != 0 &&
+			(_mm256_extract_epi8(descs, 27) & PCIE_DESC_RX_DD) != 0;
+}
+
+/**
+ * AVX2 Rx burst function.
+ *
+ * Walks the Rx ring from rxq->rd_p. Every completed descriptor hands its
+ * mbuf to the caller and is refilled with a freshly allocated one, four
+ * descriptors at a time when possible and one at a time otherwise. The
+ * freelist write pointer is advanced once the number of held descriptors
+ * exceeds rxq->rx_free_thresh.
+ *
+ * @param rx_queue
+ *   The Rx queue (struct nfp_net_rxq *).
+ * @param rx_pkts
+ *   Array that receives the mbuf pointers of the received packets.
+ * @param nb_pkts
+ *   Maximum number of packets to receive.
+ *
+ * @return
+ *   Number of packets stored in rx_pkts.
+ */
+uint16_t
+nfp_net_vec_avx2_recv_pkts(void *rx_queue,
+		struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts)
+{
+	uint16_t avail;
+	uint16_t nb_hold;
+	bool burst_receive;
+	struct rte_mbuf **rxb;
+	struct nfp_net_rx_desc *rxds;
+	struct nfp_net_rxq *rxq = rx_queue;
+
+	if (unlikely(rxq == NULL)) {
+		PMD_RX_LOG(ERR, "RX Bad queue");
+		return 0;
+	}
+
+	avail = 0;
+	nb_hold = 0;
+	burst_receive = true;
+	while (avail < nb_pkts) {
+		rxds = &rxq->rxds[rxq->rd_p];
+		rxb = &rxq->rxbufs[rxq->rd_p].mbuf;
+
+		/*
+		 * Load only the 8 bytes of this descriptor: a 16-byte load
+		 * would read past the end of the ring at the last entry.
+		 */
+		if ((_mm_extract_epi8(_mm_loadl_epi64((void *)(rxds)), 3)
+				& PCIE_DESC_RX_DD) == 0)
+			goto recv_end;
+
+		rte_prefetch0(rxq->rxbufs[rxq->rd_p].mbuf);
+
+		if ((rxq->rd_p & 0x3) == 0) {
+			rte_prefetch0(&rxq->rxds[rxq->rd_p]);
+			rte_prefetch0(&rxq->rxbufs[rxq->rd_p]);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 1].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 2].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 3].mbuf);
+		}
+
+		if ((rxq->rd_p & 0x7) == 0) {
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 4].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 5].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 6].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 7].mbuf);
+		}
+
+		/*
+		 * Receive a single packet when a burst of 4 is not possible:
+		 * 1. Fewer than 4 descriptors remain before the end of the ring.
+		 * 2. Fewer than 4 packets are still requested.
+		 * 3. The descriptor address is not aligned on a 32-byte boundary.
+		 * 4. The ring did not hold 4 completed packets, or allocating
+		 *    4 mbufs failed, earlier in this call.
+		 */
+		if ((rxq->rx_count - rxq->rd_p) < 4 ||
+				(nb_pkts - avail) < 4 ||
+				((uintptr_t)rxds & 0x1F) != 0 ||
+				!burst_receive) {
+			/*
+			 * Copy exactly one mbuf pointer: a 16-byte load would
+			 * read past the end of rxbufs at the last ring entry.
+			 */
+			_mm_storel_epi64((void *)&rx_pkts[avail],
+					_mm_loadl_epi64((void *)rxb));
+
+			/* Allocate a new mbuf into the software ring. */
+			if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 1) < 0) {
+				PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u queue_id=%hu",
+						rxq->port_id, rxq->qidx);
+				nfp_net_mbuf_alloc_failed(rxq);
+				goto recv_end;
+			}
+
+			nfp_vec_avx2_recv1(rxq, rxds, *rxb, rx_pkts[avail]);
+
+			avail++;
+			nb_hold++;
+			continue;
+		}
+
+		/* Retry the same descriptor through the single-packet path. */
+		burst_receive = nfp_vec_avx2_recv_check_packets4(rxds);
+		if (!burst_receive)
+			continue;
+
+		/* Hand the four received mbuf pointers to the caller. */
+		_mm256_storeu_si256((void *)&rx_pkts[avail],
+				_mm256_loadu_si256((void *)rxb));
+
+		/* Allocate 4 new mbufs into the software ring. */
+		if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 4) < 0) {
+			burst_receive = false;
+			continue;
+		}
+
+		nfp_vec_avx2_recv4(rxq, rxds, rxb, &rx_pkts[avail]);
+
+		avail += 4;
+		nb_hold += 4;
+	}
+
+recv_end:
+	if (nb_hold == 0)
+		return nb_hold;
+
+	PMD_RX_LOG(DEBUG, "RX port_id=%u queue_id=%u, %d packets received",
+			rxq->port_id, (unsigned int)rxq->qidx, nb_hold);
+
+	nb_hold += rxq->nb_rx_hold;
+
+	/*
+	 * The FL descriptors need to be written before incrementing the
+	 * FL queue WR pointer.
+	 */
+	rte_wmb();
+	if (nb_hold > rxq->rx_free_thresh) {
+		PMD_RX_LOG(DEBUG, "port=%hu queue=%hu nb_hold=%hu avail=%hu",
+				rxq->port_id, rxq->qidx, nb_hold, avail);
+		nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, nb_hold);
+		nb_hold = 0;
+	}
+	rxq->nb_rx_hold = nb_hold;
+
+	return avail;
+}
diff --git a/drivers/net/nfp/nfp_rxtx_vec_stub.c b/drivers/net/nfp/nfp_rxtx_vec_stub.c
index 1bc55b67e0..c480f61ef0 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_stub.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_stub.c
@@ -6,6 +6,7 @@
 #include <stdbool.h>
 
 #include <rte_common.h>
+#include <rte_mbuf_core.h>
 
 #include "nfp_rxtx_vec.h"
 
@@ -14,3 +15,11 @@ nfp_net_get_avx2_supported(void)
 {
 	return false;
 }
+
+/*
+ * Weak fallback for builds without the AVX2 implementation; it receives
+ * nothing and always returns 0.
+ */
+uint16_t __rte_weak
+nfp_net_vec_avx2_recv_pkts(__rte_unused void *rx_queue,
+		__rte_unused struct rte_mbuf **rx_pkts,
+		__rte_unused uint16_t nb_pkts)
+{
+	return 0;
+}
-- 
2.39.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v3 4/4] net/nfp: vector Rx function supports parsing ptype
  2024-07-09  7:29   ` [PATCH v3 " Chaoyong He
                       ` (2 preceding siblings ...)
  2024-07-09  7:29     ` [PATCH v3 3/4] net/nfp: support AVX2 Rx function Chaoyong He
@ 2024-07-09  7:29     ` Chaoyong He
  2024-07-09  8:24     ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Chaoyong He
  4 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09  7:29 UTC (permalink / raw)
  To: dev; +Cc: oss-drivers, Long Wu, Chaoyong He

From: Long Wu <long.wu@corigine.com>

Make the vector AVX2 Rx function parse the packet type and set it in the mbuf.

Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
 drivers/net/nfp/nfp_net_common.c    | 2 +-
 drivers/net/nfp/nfp_rxtx.c          | 2 +-
 drivers/net/nfp/nfp_rxtx.h          | 3 +++
 drivers/net/nfp/nfp_rxtx_vec_avx2.c | 2 ++
 4 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/net/nfp/nfp_net_common.c b/drivers/net/nfp/nfp_net_common.c
index 08693d5fba..3d916cd147 100644
--- a/drivers/net/nfp/nfp_net_common.c
+++ b/drivers/net/nfp/nfp_net_common.c
@@ -1455,7 +1455,7 @@ nfp_net_supported_ptypes_get(struct rte_eth_dev *dev, size_t *no_of_elements)
 		RTE_PTYPE_INNER_L4_SCTP,
 	};
 
-	if (dev->rx_pkt_burst != nfp_net_recv_pkts)
+	if (dev->rx_pkt_burst == NULL)
 		return NULL;
 
 	net_hw = dev->data->dev_private;
diff --git a/drivers/net/nfp/nfp_rxtx.c b/drivers/net/nfp/nfp_rxtx.c
index 4fc3374987..da41a0e663 100644
--- a/drivers/net/nfp/nfp_rxtx.c
+++ b/drivers/net/nfp/nfp_rxtx.c
@@ -350,7 +350,7 @@ nfp_net_set_ptype(const struct nfp_ptype_parsed *nfp_ptype,
  * @param mb
  *   Mbuf to set the packet type.
  */
-static void
+void
 nfp_net_parse_ptype(struct nfp_net_rxq *rxq,
 		struct nfp_net_rx_desc *rxds,
 		struct rte_mbuf *mb)
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index fff8371991..c717d97003 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -245,5 +245,8 @@ void nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
 		uint16_t queue_id,
 		struct rte_eth_txq_info *qinfo);
 void nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev);
+void nfp_net_parse_ptype(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf *mb);
 
 #endif /* __NFP_RXTX_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
index 7c18213624..508ec7faa5 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_avx2.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -111,6 +111,8 @@ nfp_vec_avx2_recv_set_rxpkt1(struct nfp_net_rxq *rxq,
 
 	nfp_net_meta_parse(rxds, rxq, hw, rx_pkt, &meta);
 
+	nfp_net_parse_ptype(rxq, rxds, rx_pkt);
+
 	/* Checking the checksum flag */
 	nfp_net_rx_cksum(rxq, rxds, rx_pkt);
 }
-- 
2.39.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v4 0/5] support AVX2 instruction Rx/Tx function
  2024-07-09  7:29   ` [PATCH v3 " Chaoyong He
                       ` (3 preceding siblings ...)
  2024-07-09  7:29     ` [PATCH v3 4/4] net/nfp: vector Rx function supports parsing ptype Chaoyong He
@ 2024-07-09  8:24     ` Chaoyong He
  2024-07-09  8:24       ` [PATCH v4 1/5] net/nfp: fix compile fail on 32-bit OS Chaoyong He
                         ` (5 more replies)
  4 siblings, 6 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09  8:24 UTC (permalink / raw)
  To: dev; +Cc: oss-drivers, Chaoyong He

This patch series adds support for Rx/Tx functions that use
AVX2 instructions.

---
v4:
* Add commit to solve the compile problem on 32-bit OS.
v3:
* Fix the 'meson.build' file to solve the compile problem.
v2:
* Rebase to the latest main branch.
---

Long Wu (5):
  net/nfp: fix compile fail on 32-bit OS
  net/nfp: export more interfaces of NFDk
  net/nfp: support AVX2 Tx function
  net/nfp: support AVX2 Rx function
  net/nfp: vector Rx function supports parsing ptype

 drivers/net/nfp/meson.build                 |  21 +
 drivers/net/nfp/nfdk/nfp_nfdk.h             |  46 +++
 drivers/net/nfp/nfdk/nfp_nfdk_dp.c          |  55 +--
 drivers/net/nfp/nfdk/nfp_nfdk_vec.h         |  36 ++
 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c | 432 ++++++++++++++++++++
 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c    |  14 +
 drivers/net/nfp/nfp_ethdev.c                |   5 +-
 drivers/net/nfp/nfp_ethdev_vf.c             |   5 +-
 drivers/net/nfp/nfp_net_common.c            |   2 +-
 drivers/net/nfp/nfp_net_meta.c              |   1 +
 drivers/net/nfp/nfp_rxtx.c                  |  12 +-
 drivers/net/nfp/nfp_rxtx.h                  |   9 +-
 drivers/net/nfp/nfp_rxtx_vec.h              |  17 +
 drivers/net/nfp/nfp_rxtx_vec_avx2.c         | 275 +++++++++++++
 drivers/net/nfp/nfp_rxtx_vec_stub.c         |  25 ++
 15 files changed, 906 insertions(+), 49 deletions(-)
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec.h
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec.h
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec_avx2.c
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec_stub.c

-- 
2.39.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v4 1/5] net/nfp: fix compile fail on 32-bit OS
  2024-07-09  8:24     ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Chaoyong He
@ 2024-07-09  8:24       ` Chaoyong He
  2024-07-09  8:24       ` [PATCH v4 2/5] net/nfp: export more interfaces of NFDk Chaoyong He
                         ` (4 subsequent siblings)
  5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09  8:24 UTC (permalink / raw)
  To: dev; +Cc: oss-drivers, Long Wu, stable, Chaoyong He, Peng Zhang

From: Long Wu <long.wu@corigine.com>

The NFP PMD can only be compiled on 64-bit Linux, so stop processing
the build file in all other cases.

Fixes: 8741a9074536 ("net/nfp: disable for 32-bit meson builds")
Cc: stable@dpdk.org

Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
Reviewed-by: Peng Zhang <peng.zhang@corigine.com>
---
 drivers/net/nfp/meson.build | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/nfp/meson.build b/drivers/net/nfp/meson.build
index d805644ec5..7216c8dff9 100644
--- a/drivers/net/nfp/meson.build
+++ b/drivers/net/nfp/meson.build
@@ -4,6 +4,7 @@
 if not is_linux or not dpdk_conf.get('RTE_ARCH_64')
     build = false
     reason = 'only supported on 64-bit Linux'
+    subdir_done()
 endif
 
 sources = files(
-- 
2.39.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v4 2/5] net/nfp: export more interfaces of NFDk
  2024-07-09  8:24     ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Chaoyong He
  2024-07-09  8:24       ` [PATCH v4 1/5] net/nfp: fix compile fail on 32-bit OS Chaoyong He
@ 2024-07-09  8:24       ` Chaoyong He
  2024-07-09  8:24       ` [PATCH v4 3/5] net/nfp: support AVX2 Tx function Chaoyong He
                         ` (3 subsequent siblings)
  5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09  8:24 UTC (permalink / raw)
  To: dev; +Cc: oss-drivers, Long Wu, Chaoyong He

From: Long Wu <long.wu@corigine.com>

The NFDk vector Tx function needs some helpers that are currently
private to nfp_nfdk_dp.c, so move them to the header file.

Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
 drivers/net/nfp/nfdk/nfp_nfdk.h    | 45 ++++++++++++++++++++++++++++++
 drivers/net/nfp/nfdk/nfp_nfdk_dp.c | 43 +---------------------------
 2 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/drivers/net/nfp/nfdk/nfp_nfdk.h b/drivers/net/nfp/nfdk/nfp_nfdk.h
index 2767fd51cd..89a98d13f3 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk.h
+++ b/drivers/net/nfp/nfdk/nfp_nfdk.h
@@ -6,7 +6,10 @@
 #ifndef __NFP_NFDK_H__
 #define __NFP_NFDK_H__
 
+#include <nfp_platform.h>
+
 #include "../nfp_rxtx.h"
+#include "nfp_net_common.h"
 
 #define NFDK_TX_DESC_PER_SIMPLE_PKT     2
 
@@ -161,6 +164,45 @@ nfp_net_nfdk_txq_full(struct nfp_net_txq *txq)
 	return (nfp_net_nfdk_free_tx_desc(txq) < txq->tx_free_thresh);
 }
 
+/*
+ * Convert a head length in bytes into a descriptor count, dividing by
+ * NFDK_TX_MAX_DATA_PER_DESC after compensating for the smaller first
+ * descriptor (DIV_ROUND_UP presumably rounds the quotient up).
+ */
+static inline uint16_t
+nfp_net_nfdk_headlen_to_segs(uint16_t headlen)
+{
+	/* First descriptor fits less data, so adjust for that */
+	return DIV_ROUND_UP(headlen + NFDK_TX_MAX_DATA_PER_DESC - NFDK_TX_MAX_DATA_PER_HEAD,
+			NFDK_TX_MAX_DATA_PER_DESC);
+}
+
+/**
+ * Set TX CSUM offload flags in TX descriptor of nfdk
+ *
+ * @param txq
+ *   Tx queue the packet is sent on; its hw->super.ctrl decides whether
+ *   Tx checksum offload is enabled.
+ * @param mb
+ *   Mbuf whose ol_flags select the offloads.
+ * @param flags
+ *   Descriptor flags accumulated so far.
+ *
+ * @return
+ *   The flags with the NFDK_DESC_TX_* checksum bits added, or the flags
+ *   unchanged when Tx checksum offload is not enabled.
+ */
+static inline uint64_t
+nfp_net_nfdk_tx_cksum(struct nfp_net_txq *txq,
+		struct rte_mbuf *mb,
+		uint64_t flags)
+{
+	uint64_t ol_flags;
+	struct nfp_net_hw *hw = txq->hw;
+
+	/* Test the enabled control bits, as the code this was moved from does. */
+	if ((hw->super.ctrl & NFP_NET_CFG_CTRL_TXCSUM) == 0)
+		return flags;
+
+	ol_flags = mb->ol_flags;
+
+	/* Set L4 csum offload if TSO/UFO enabled. */
+	if ((ol_flags & RTE_MBUF_F_TX_TCP_SEG) != 0 ||
+			(ol_flags & RTE_MBUF_F_TX_UDP_SEG) != 0)
+		flags |= NFDK_DESC_TX_L4_CSUM;
+
+	if ((ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) != 0)
+		flags |= NFDK_DESC_TX_ENCAP;
+
+	/* IPv6 does not need checksum */
+	if ((ol_flags & RTE_MBUF_F_TX_IP_CKSUM) != 0)
+		flags |= NFDK_DESC_TX_L3_CSUM;
+
+	if ((ol_flags & RTE_MBUF_F_TX_L4_MASK) != 0)
+		flags |= NFDK_DESC_TX_L4_CSUM;
+
+	return flags;
+}
+
 uint32_t nfp_flower_nfdk_pkt_add_metadata(struct rte_mbuf *mbuf,
 		uint32_t port_id);
 uint16_t nfp_net_nfdk_xmit_pkts_common(void *tx_queue,
@@ -177,5 +219,8 @@ int nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
 		const struct rte_eth_txconf *tx_conf);
 int nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
 		struct rte_mbuf *pkt);
+int nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
+		struct nfp_net_txq *txq,
+		uint64_t *metadata);
 
 #endif /* __NFP_NFDK_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
index 8b8c191b82..173aabf0b9 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
@@ -6,7 +6,6 @@
 #include "nfp_nfdk.h"
 
 #include <bus_pci_driver.h>
-#include <nfp_platform.h>
 #include <rte_malloc.h>
 
 #include "../flower/nfp_flower.h"
@@ -15,38 +14,6 @@
 
 #define NFDK_TX_DESC_GATHER_MAX         17
 
-/* Set TX CSUM offload flags in TX descriptor of nfdk */
-static uint64_t
-nfp_net_nfdk_tx_cksum(struct nfp_net_txq *txq,
-		struct rte_mbuf *mb,
-		uint64_t flags)
-{
-	uint64_t ol_flags;
-	struct nfp_net_hw *hw = txq->hw;
-
-	if ((hw->super.ctrl & NFP_NET_CFG_CTRL_TXCSUM) == 0)
-		return flags;
-
-	ol_flags = mb->ol_flags;
-
-	/* Set L4 csum offload if TSO/UFO enabled. */
-	if ((ol_flags & RTE_MBUF_F_TX_TCP_SEG) != 0 ||
-			(ol_flags & RTE_MBUF_F_TX_UDP_SEG) != 0)
-		flags |= NFDK_DESC_TX_L4_CSUM;
-
-	if ((ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) != 0)
-		flags |= NFDK_DESC_TX_ENCAP;
-
-	/* IPv6 does not need checksum */
-	if ((ol_flags & RTE_MBUF_F_TX_IP_CKSUM) != 0)
-		flags |= NFDK_DESC_TX_L3_CSUM;
-
-	if ((ol_flags & RTE_MBUF_F_TX_L4_MASK) != 0)
-		flags |= NFDK_DESC_TX_L4_CSUM;
-
-	return flags;
-}
-
 /* Set TX descriptor for TSO of nfdk */
 static uint64_t
 nfp_net_nfdk_tx_tso(struct nfp_net_txq *txq,
@@ -100,14 +67,6 @@ nfp_flower_nfdk_pkt_add_metadata(struct rte_mbuf *mbuf,
 	return FLOWER_PKT_DATA_OFFSET;
 }
 
-static inline uint16_t
-nfp_net_nfdk_headlen_to_segs(uint16_t headlen)
-{
-	/* First descriptor fits less data, so adjust for that */
-	return DIV_ROUND_UP(headlen + NFDK_TX_MAX_DATA_PER_DESC - NFDK_TX_MAX_DATA_PER_HEAD,
-			NFDK_TX_MAX_DATA_PER_DESC);
-}
-
 static inline void
 nfp_net_nfdk_tx_close_block(struct nfp_net_txq *txq,
 		uint32_t nop_slots)
@@ -170,7 +129,7 @@ nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
 	return nop_slots;
 }
 
-static int
+int
 nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
 		struct nfp_net_txq *txq,
 		uint64_t *metadata)
-- 
2.39.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v4 3/5] net/nfp: support AVX2 Tx function
  2024-07-09  8:24     ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Chaoyong He
  2024-07-09  8:24       ` [PATCH v4 1/5] net/nfp: fix compile fail on 32-bit OS Chaoyong He
  2024-07-09  8:24       ` [PATCH v4 2/5] net/nfp: export more interfaces of NFDk Chaoyong He
@ 2024-07-09  8:24       ` Chaoyong He
  2024-07-09  8:24       ` [PATCH v4 4/5] net/nfp: support AVX2 Rx function Chaoyong He
                         ` (2 subsequent siblings)
  5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09  8:24 UTC (permalink / raw)
  To: dev; +Cc: oss-drivers, Long Wu, Chaoyong He

From: Long Wu <long.wu@corigine.com>

Use AVX2 instructions to accelerate Tx performance. The
acceleration is only available on x86 machines.

Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
 drivers/net/nfp/meson.build                 |  20 +
 drivers/net/nfp/nfdk/nfp_nfdk.h             |   1 +
 drivers/net/nfp/nfdk/nfp_nfdk_dp.c          |  12 +
 drivers/net/nfp/nfdk/nfp_nfdk_vec.h         |  36 ++
 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c | 432 ++++++++++++++++++++
 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c    |  14 +
 drivers/net/nfp/nfp_ethdev.c                |   3 +-
 drivers/net/nfp/nfp_ethdev_vf.c             |   3 +-
 drivers/net/nfp/nfp_rxtx.h                  |   5 +-
 drivers/net/nfp/nfp_rxtx_vec.h              |  13 +
 drivers/net/nfp/nfp_rxtx_vec_avx2.c         |  21 +
 drivers/net/nfp/nfp_rxtx_vec_stub.c         |  16 +
 12 files changed, 573 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec.h
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec.h
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec_avx2.c
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec_stub.c

diff --git a/drivers/net/nfp/meson.build b/drivers/net/nfp/meson.build
index 7216c8dff9..58a066c2e3 100644
--- a/drivers/net/nfp/meson.build
+++ b/drivers/net/nfp/meson.build
@@ -17,6 +17,7 @@ sources = files(
         'flower/nfp_flower_service.c',
         'nfd3/nfp_nfd3_dp.c',
         'nfdk/nfp_nfdk_dp.c',
+        'nfdk/nfp_nfdk_vec_stub.c',
         'nfpcore/nfp_cppcore.c',
         'nfpcore/nfp_crc.c',
         'nfpcore/nfp_elf.c',
@@ -44,7 +45,26 @@ sources = files(
         'nfp_net_flow.c',
         'nfp_net_meta.c',
         'nfp_rxtx.c',
+        'nfp_rxtx_vec_stub.c',
         'nfp_service.c',
 )
 
+# On x86, build the AVX2 data path sources as a separate static library so
+# that only these files are compiled with '-mavx2', then link their objects
+# into the driver.
+if arch_subdir == 'x86'
+        includes += include_directories('../../common/nfp')
+
+        avx2_sources = files(
+                'nfdk/nfp_nfdk_vec_avx2_dp.c',
+                'nfp_rxtx_vec_avx2.c',
+        )
+
+        nfp_avx2_lib = static_library('nfp_avx2_lib',
+                avx2_sources,
+                dependencies: [static_rte_ethdev, static_rte_bus_pci],
+                include_directories: includes,
+                c_args: [cflags, '-mavx2']
+        )
+
+        objs += nfp_avx2_lib.extract_all_objects(recursive: true)
+endif
+
 deps += ['hash', 'security', 'common_nfp']
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk.h b/drivers/net/nfp/nfdk/nfp_nfdk.h
index 89a98d13f3..29d862f6f0 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk.h
+++ b/drivers/net/nfp/nfdk/nfp_nfdk.h
@@ -222,5 +222,6 @@ int nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
 int nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
 		struct nfp_net_txq *txq,
 		uint64_t *metadata);
+void nfp_net_nfdk_xmit_pkts_set(struct rte_eth_dev *eth_dev);
 
 #endif /* __NFP_NFDK_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
index 173aabf0b9..2cea5688b3 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
@@ -11,6 +11,8 @@
 #include "../flower/nfp_flower.h"
 #include "../nfp_logs.h"
 #include "../nfp_net_meta.h"
+#include "../nfp_rxtx_vec.h"
+#include "nfp_nfdk_vec.h"
 
 #define NFDK_TX_DESC_GATHER_MAX         17
 
@@ -511,6 +513,7 @@ nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
 	dev->data->tx_queues[queue_idx] = txq;
 	txq->hw = hw;
 	txq->hw_priv = dev->process_private;
+	txq->simple_always = true;
 
 	/*
 	 * Telling the HW about the physical address of the TX ring and number
@@ -521,3 +524,12 @@ nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
 
 	return 0;
 }
+
+void
+nfp_net_nfdk_xmit_pkts_set(struct rte_eth_dev *eth_dev)
+{
+	/* Use the AVX2 burst function only when the runtime check reports support. */
+	eth_dev->tx_pkt_burst = nfp_net_get_avx2_supported() ?
+			nfp_net_nfdk_vec_avx2_xmit_pkts : nfp_net_nfdk_xmit_pkts;
+}
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec.h b/drivers/net/nfp/nfdk/nfp_nfdk_vec.h
new file mode 100644
index 0000000000..14319d6cf6
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_NFDK_VEC_H__
+#define __NFP_NFDK_VEC_H__
+
+#include <stdbool.h>
+
+#include <rte_mbuf_core.h>
+
+#include "../nfp_net_common.h"
+#include "nfp_nfdk.h"
+
+/*
+ * A packet is "simple" when its data length does not exceed
+ * NFDK_TX_MAX_DATA_PER_HEAD and it needs no TCP segmentation, i.e. either
+ * no LSO capability bit is set or the mbuf does not request
+ * RTE_MBUF_F_TX_TCP_SEG.
+ */
+static inline bool
+nfp_net_nfdk_is_simple_packet(struct rte_mbuf *pkt,
+		struct nfp_net_hw *hw)
+{
+	return pkt->data_len <= NFDK_TX_MAX_DATA_PER_HEAD &&
+			((hw->super.cap & NFP_NET_CFG_CTRL_LSO_ANY) == 0 ||
+			(pkt->ol_flags & RTE_MBUF_F_TX_TCP_SEG) == 0);
+}
+
+uint16_t nfp_net_nfdk_vec_avx2_xmit_pkts(void *tx_queue,
+		struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts);
+
+#endif /* __NFP_NFDK_VEC_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
new file mode 100644
index 0000000000..6d1359fdb1
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
@@ -0,0 +1,432 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <bus_pci_driver.h>
+#include <ethdev_driver.h>
+#include <rte_malloc.h>
+
+#include "../nfp_logs.h"
+#include "nfp_nfdk.h"
+#include "nfp_nfdk_vec.h"
+
+/*
+ * One simple packet needs 2 descriptors, so when sending 4 packets the driver
+ * will use 8 descriptors at once.
+ */
+#define NFDK_SIMPLE_BURST_DES_NUM 8
+
+#define NFDK_SIMPLE_DES_TYPE (NFDK_DESC_TX_EOP | \
+		(NFDK_DESC_TX_TYPE_HEAD & (NFDK_DESC_TX_TYPE_SIMPLE << 12)))
+
+static inline int
+nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(struct rte_mbuf *pkt,
+		struct nfp_net_txq *txq,
+		uint64_t *des_addr,
+		uint64_t *des_meta,
+		bool repr_flag)
+{
+	int ret;
+	__m128i dma_addr;
+	__m128i dma_hi;
+	__m128i data_off;
+	__m128i dlen_type;
+	uint64_t metadata;
+
+	if (repr_flag) {
+		metadata = NFDK_DESC_TX_CHAIN_META;
+	} else {
+		ret = nfp_net_nfdk_set_meta_data(pkt, txq, &metadata);
+		if (unlikely(ret != 0))
+			return ret;
+	}
+
+	data_off = _mm_set_epi64x(0, pkt->data_off);
+	dma_addr = _mm_add_epi64(_mm_loadu_si128((__m128i *)&pkt->buf_addr), data_off);
+	dma_hi = _mm_srli_epi64(dma_addr, 32);
+
+	dlen_type = _mm_set_epi64x(0, (pkt->data_len - 1) | NFDK_SIMPLE_DES_TYPE);
+
+	*des_addr = _mm_extract_epi64(_mm_add_epi64(_mm_unpacklo_epi32(dma_hi, dma_addr),
+			_mm_slli_epi64(dlen_type, 16)), 0);
+
+	*des_meta = nfp_net_nfdk_tx_cksum(txq, pkt, metadata);
+
+	return 0;
+}
+
+static inline int
+nfp_net_nfdk_vec_avx2_xmit_simple_send1(struct nfp_net_txq *txq,
+		struct nfp_net_nfdk_tx_desc *txds,
+		struct rte_mbuf *pkt,
+		bool repr_flag)
+{
+	int ret;
+	__m128i des_data;
+	uint64_t des_addr;
+	uint64_t des_meta;
+
+	ret = nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(pkt, txq, &des_addr,
+			&des_meta, repr_flag);
+	if (unlikely(ret != 0))
+		return ret;
+
+	txq->wr_p = D_IDX(txq, txq->wr_p + NFDK_TX_DESC_PER_SIMPLE_PKT);
+	if ((txq->wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) != 0)
+		txq->data_pending += pkt->data_len;
+	else
+		txq->data_pending = 0;
+
+	des_data = _mm_set_epi64x(des_meta, des_addr);
+
+	_mm_store_si128((void *)txds, des_data);
+
+	return 0;
+}
+
+static inline int
+nfp_vec_avx2_nfdk_xmit_simple_send4(struct nfp_net_txq *txq,
+		struct nfp_net_nfdk_tx_desc *txds,
+		struct rte_mbuf **pkt,
+		bool repr_flag)
+{
+	int ret;
+	uint16_t i;
+	__m256i des_data0_1;
+	__m256i des_data2_3;
+	uint64_t des_addr[4];
+	uint64_t des_meta[4];
+
+	for (i = 0; i < 4; i++) {
+		ret = nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(pkt[i], txq,
+				&des_addr[i], &des_meta[i], repr_flag);
+		if (unlikely(ret != 0))
+			return ret;
+	}
+
+	for (i = 0; i < 4; i++) {
+		txq->wr_p = D_IDX(txq, txq->wr_p + NFDK_TX_DESC_PER_SIMPLE_PKT);
+		if ((txq->wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) != 0)
+			txq->data_pending += pkt[i]->data_len;
+		else
+			txq->data_pending = 0;
+	}
+
+	des_data0_1 = _mm256_set_epi64x(des_meta[1], des_addr[1], des_meta[0], des_addr[0]);
+	des_data2_3 = _mm256_set_epi64x(des_meta[3], des_addr[3], des_meta[2], des_addr[2]);
+
+	_mm256_store_si256((void *)txds, des_data0_1);
+	_mm256_store_si256((void *)(txds + 4), des_data2_3);
+
+	return 0;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_mbuf_store4(struct rte_mbuf **mbuf,
+		struct rte_mbuf **tx_pkts)
+{
+	__m256i mbuf_room0_1;
+	__m256i mbuf_room2_3;
+
+	mbuf_room0_1 = _mm256_set_epi64x(0, (uintptr_t)tx_pkts[1], 0,
+			(uintptr_t)tx_pkts[0]);
+	mbuf_room2_3 = _mm256_set_epi64x(0, (uintptr_t)tx_pkts[3], 0,
+			(uintptr_t)tx_pkts[2]);
+
+	_mm256_store_si256((void *)mbuf, mbuf_room0_1);
+	_mm256_store_si256((void *)(mbuf + 4), mbuf_room2_3);
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_simple_pkts(struct nfp_net_txq *txq,
+		struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts,
+		uint16_t simple_close,
+		bool repr_flag)
+{
+	int ret;
+	uint16_t npkts = 0;
+	uint16_t need_txds;
+	uint16_t free_descs;
+	struct rte_mbuf **lmbuf;
+	struct nfp_net_nfdk_tx_desc *ktxds;
+
+	PMD_TX_LOG(DEBUG, "Working for queue %hu at pos %u and %hu packets",
+			txq->qidx, txq->wr_p, nb_pkts);
+
+	need_txds = nb_pkts << 1;
+	if (nfp_net_nfdk_free_tx_desc(txq) < need_txds || nfp_net_nfdk_txq_full(txq))
+		nfp_net_tx_free_bufs(txq);
+
+	free_descs = nfp_net_nfdk_free_tx_desc(txq);
+	if (unlikely(free_descs < NFDK_TX_DESC_PER_SIMPLE_PKT)) {
+		if (unlikely(simple_close > 0))
+			goto xmit_end;
+
+		return 0;
+	}
+
+	PMD_TX_LOG(DEBUG, "Queue: %hu. Sending %hu packets", txq->qidx, nb_pkts);
+
+	/* Sending packets */
+	while (npkts < nb_pkts && free_descs >= NFDK_TX_DESC_PER_SIMPLE_PKT) {
+		ktxds = &txq->ktxds[txq->wr_p];
+		lmbuf = &txq->txbufs[txq->wr_p].mbuf;
+
+		/*
+		 * If a burst cannot be sent, send just one packet. This happens when:
+		 * 1. The Tx ring is about to reach its tail.
+		 * 2. Fewer than 4 packets remain to be sent.
+		 * 3. The descriptor address is not aligned on a 32-byte boundary.
+		 * 4. There are not enough free descriptors.
+		 */
+		if ((txq->tx_count - txq->wr_p) < NFDK_SIMPLE_BURST_DES_NUM ||
+				(nb_pkts - npkts) < 4 ||
+				((uintptr_t)ktxds & 0x1F) != 0 ||
+				free_descs < NFDK_SIMPLE_BURST_DES_NUM) {
+			ret = nfp_net_nfdk_vec_avx2_xmit_simple_send1(txq,
+					ktxds, tx_pkts[npkts], repr_flag);
+			if (unlikely(ret != 0))
+				goto xmit_end;
+
+			rte_pktmbuf_free(*lmbuf);
+
+			_mm_storel_epi64((void *)lmbuf,
+					_mm_loadu_si128((void *)&tx_pkts[npkts]));
+			npkts++;
+			free_descs -= NFDK_TX_DESC_PER_SIMPLE_PKT;
+			continue;
+		}
+
+		ret = nfp_vec_avx2_nfdk_xmit_simple_send4(txq, ktxds,
+				&tx_pkts[npkts], repr_flag);
+		if (unlikely(ret != 0))
+			goto xmit_end;
+
+		rte_pktmbuf_free_bulk(lmbuf, NFDK_SIMPLE_BURST_DES_NUM);
+
+		nfp_net_nfdk_vec_avx2_xmit_mbuf_store4(lmbuf, &tx_pkts[npkts]);
+
+		npkts += 4;
+		free_descs -= NFDK_SIMPLE_BURST_DES_NUM;
+	}
+
+xmit_end:
+	/* Increment write pointers. Force memory write before we let HW know */
+	rte_wmb();
+	nfp_qcp_ptr_add(txq->qcp_q, NFP_QCP_WRITE_PTR, ((npkts << 1) + simple_close));
+
+	return npkts;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_simple_close_block(struct nfp_net_txq *txq,
+		uint16_t *simple_close)
+{
+	uint16_t i;
+	uint16_t wr_p;
+	uint16_t nop_slots;
+	__m128i zero_128 = _mm_setzero_si128();
+	__m256i zero_256 = _mm256_setzero_si256();
+
+	wr_p = txq->wr_p;
+	nop_slots = D_BLOCK_CPL(wr_p);
+
+	for (i = nop_slots; i >= 4; i -= 4, wr_p += 4) {
+		_mm256_store_si256((void *)&txq->ktxds[wr_p], zero_256);
+		rte_pktmbuf_free_bulk(&txq->txbufs[wr_p].mbuf, 4);
+		_mm256_store_si256((void *)&txq->txbufs[wr_p], zero_256);
+	}
+
+	for (; i >= 2; i -= 2, wr_p += 2) {
+		_mm_store_si128((void *)&txq->ktxds[wr_p], zero_128);
+		rte_pktmbuf_free_bulk(&txq->txbufs[wr_p].mbuf, 2);
+		_mm_store_si128((void *)&txq->txbufs[wr_p], zero_128);
+	}
+
+	for (; i >= 1; i--, wr_p++) {
+		_mm_storel_epi64((void *)&txq->ktxds[wr_p], zero_128);
+		rte_pktmbuf_free(txq->txbufs[wr_p].mbuf);
+		_mm_storel_epi64((void *)&txq->txbufs[wr_p], zero_128);
+	}
+
+	txq->data_pending = 0;
+	txq->wr_p = D_IDX(txq, txq->wr_p + nop_slots);
+
+	(*simple_close) += nop_slots;
+}
+
+static inline uint32_t
+nfp_net_nfdk_vec_avx2_xmit_simple_prepare(struct nfp_net_txq *txq,
+		uint16_t *simple_close)
+{
+	uint16_t wr_p;
+	__m128i zero_128 = _mm_setzero_si128();
+
+	wr_p = txq->wr_p;
+
+	_mm_storel_epi64((void *)&txq->ktxds[wr_p], zero_128);
+	rte_pktmbuf_free(txq->txbufs[wr_p].mbuf);
+	_mm_storel_epi64((void *)&txq->txbufs[wr_p], zero_128);
+
+	txq->wr_p = D_IDX(txq, wr_p + 1);
+	(*simple_close)++;
+
+	return txq->wr_p;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_simple_check(struct nfp_net_txq *txq,
+		struct rte_mbuf *pkt,
+		bool *simple_flag,
+		bool *pending_flag,
+		uint16_t *data_pending,
+		uint32_t *wr_p,
+		uint16_t *simple_close)
+{
+	uint32_t data_pending_temp;
+
+	/* Make the first descriptor index even before sending simple packets */
+	if (!(*simple_flag)) {
+		if ((*wr_p & 0x1) == 0x1)
+			*wr_p = nfp_net_nfdk_vec_avx2_xmit_simple_prepare(txq, simple_close);
+
+		*simple_flag = true;
+	}
+
+	/* Simple packets only need one close block operation */
+	if (!(*pending_flag)) {
+		if ((*wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) == 0) {
+			*pending_flag = true;
+			return;
+		}
+
+		data_pending_temp = *data_pending + pkt->data_len;
+		if (data_pending_temp > NFDK_TX_MAX_DATA_PER_BLOCK) {
+			nfp_net_nfdk_vec_avx2_xmit_simple_close_block(txq, simple_close);
+			*pending_flag = true;
+			return;
+		}
+
+		*data_pending = data_pending_temp;
+
+		*wr_p += 2;
+	}
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_simple_count(struct nfp_net_txq *txq,
+		struct rte_mbuf **tx_pkts,
+		uint16_t head,
+		uint16_t nb_pkts,
+		uint16_t *simple_close)
+{
+	uint32_t wr_p;
+	uint16_t simple_idx;
+	struct rte_mbuf *pkt;
+	uint16_t data_pending;
+	bool simple_flag = false;
+	bool pending_flag = false;
+	uint16_t simple_count = 0;
+
+	*simple_close = 0;
+	wr_p = txq->wr_p;
+	data_pending = txq->data_pending;
+
+	for (simple_idx = head; simple_idx < nb_pkts; simple_idx++) {
+		pkt = tx_pkts[simple_idx];
+		if (!nfp_net_nfdk_is_simple_packet(pkt, txq->hw))
+			break;
+
+		simple_count++;
+		if (!txq->simple_always)
+			nfp_net_nfdk_vec_avx2_xmit_simple_check(txq, pkt, &simple_flag,
+					&pending_flag, &data_pending, &wr_p, simple_close);
+	}
+
+	return simple_count;
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_others_count(struct nfp_net_txq *txq,
+		struct rte_mbuf **tx_pkts,
+		uint16_t head,
+		uint16_t nb_pkts)
+{
+	uint16_t others_idx;
+	struct rte_mbuf *pkt;
+	uint16_t others_count = 0;
+
+	for (others_idx = head; others_idx < nb_pkts; others_idx++) {
+		pkt = tx_pkts[others_idx];
+		if (nfp_net_nfdk_is_simple_packet(pkt, txq->hw))
+			break;
+
+		others_count++;
+	}
+
+	return others_count;
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_common(void *tx_queue,
+		struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts)
+{
+	uint16_t i;
+	uint16_t avail = 0;
+	uint16_t simple_close;
+	uint16_t simple_count;
+	uint16_t simple_avail;
+	uint16_t others_count;
+	uint16_t others_avail;
+	struct nfp_net_txq *txq = tx_queue;
+
+	for (i = 0; i < nb_pkts; i++) {
+		simple_count = nfp_net_nfdk_vec_avx2_xmit_simple_count(txq, tx_pkts, i,
+				nb_pkts, &simple_close);
+		if (simple_count > 0) {
+			if (!txq->simple_always)
+				txq->simple_always = true;
+
+			simple_avail = nfp_net_nfdk_vec_avx2_xmit_simple_pkts(txq,
+					tx_pkts + i, simple_count, simple_close,
+					false);
+
+			avail += simple_avail;
+			if (simple_avail != simple_count)
+				break;
+
+			i += simple_count;
+		}
+
+		if (i == nb_pkts)
+			break;
+
+		others_count = nfp_net_nfdk_vec_avx2_xmit_others_count(txq, tx_pkts,
+				i, nb_pkts);
+
+		if (txq->simple_always)
+			txq->simple_always = false;
+
+		others_avail = nfp_net_nfdk_xmit_pkts_common(tx_queue,
+				tx_pkts + i, others_count, false);
+
+		avail += others_avail;
+		if (others_avail != others_count)
+			break;
+
+		i += others_count;
+	}
+
+	return avail;
+}
+
+uint16_t
+nfp_net_nfdk_vec_avx2_xmit_pkts(void *tx_queue,
+		struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts)
+{
+	return nfp_net_nfdk_vec_avx2_xmit_common(tx_queue, tx_pkts, nb_pkts);
+}
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c b/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
new file mode 100644
index 0000000000..146ec21d51
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include "nfp_nfdk_vec.h"
+
+uint16_t __rte_weak
+nfp_net_nfdk_vec_avx2_xmit_pkts(__rte_unused void *tx_queue,
+		__rte_unused struct rte_mbuf **tx_pkts,
+		__rte_unused uint16_t nb_pkts)
+{
+	return 0;
+}
diff --git a/drivers/net/nfp/nfp_ethdev.c b/drivers/net/nfp/nfp_ethdev.c
index 8c0cacd3fc..a7b40af712 100644
--- a/drivers/net/nfp/nfp_ethdev.c
+++ b/drivers/net/nfp/nfp_ethdev.c
@@ -28,6 +28,7 @@
 #include "nfp_ipsec.h"
 #include "nfp_logs.h"
 #include "nfp_net_flow.h"
+#include "nfp_rxtx_vec.h"
 
 /* 64-bit per app capabilities */
 #define NFP_NET_APP_CAP_SP_INDIFF       RTE_BIT64(0) /* Indifferent to port speed */
@@ -964,7 +965,7 @@ nfp_net_ethdev_ops_mount(struct nfp_net_hw *hw,
 	if (hw->ver.extend == NFP_NET_CFG_VERSION_DP_NFD3)
 		eth_dev->tx_pkt_burst = nfp_net_nfd3_xmit_pkts;
 	else
-		eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+		nfp_net_nfdk_xmit_pkts_set(eth_dev);
 
 	eth_dev->dev_ops = &nfp_net_eth_dev_ops;
 	eth_dev->rx_queue_count = nfp_net_rx_queue_count;
diff --git a/drivers/net/nfp/nfp_ethdev_vf.c b/drivers/net/nfp/nfp_ethdev_vf.c
index e7c18fe90a..b955624ed6 100644
--- a/drivers/net/nfp/nfp_ethdev_vf.c
+++ b/drivers/net/nfp/nfp_ethdev_vf.c
@@ -14,6 +14,7 @@
 
 #include "nfp_logs.h"
 #include "nfp_net_common.h"
+#include "nfp_rxtx_vec.h"
 
 #define NFP_VF_DRIVER_NAME net_nfp_vf
 
@@ -240,7 +241,7 @@ nfp_netvf_ethdev_ops_mount(struct nfp_net_hw *hw,
 	if (hw->ver.extend == NFP_NET_CFG_VERSION_DP_NFD3)
 		eth_dev->tx_pkt_burst = nfp_net_nfd3_xmit_pkts;
 	else
-		eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+		nfp_net_nfdk_xmit_pkts_set(eth_dev);
 
 	eth_dev->dev_ops = &nfp_netvf_eth_dev_ops;
 	eth_dev->rx_queue_count = nfp_net_rx_queue_count;
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index 9806384a63..3ddf717da0 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -69,9 +69,12 @@ struct __rte_aligned(64) nfp_net_txq {
 	/** Used by NFDk only */
 	uint16_t data_pending;
 
+	/** Used by NFDk vector xmit only */
+	bool simple_always;
+
 	/**
 	 * At this point 58 bytes have been used for all the fields in the
-	 * TX critical path. We have room for 6 bytes and still all placed
+	 * TX critical path. We have room for 5 bytes and still all placed
 	 * in a cache line.
 	 */
 	uint64_t dma;
diff --git a/drivers/net/nfp/nfp_rxtx_vec.h b/drivers/net/nfp/nfp_rxtx_vec.h
new file mode 100644
index 0000000000..c92660f963
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_RXTX_VEC_AVX2_H__
+#define __NFP_RXTX_VEC_AVX2_H__
+
+#include <stdbool.h>
+
+bool nfp_net_get_avx2_supported(void);
+
+#endif /* __NFP_RXTX_VEC_AVX2_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
new file mode 100644
index 0000000000..50638e74ab
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <stdbool.h>
+
+#include <rte_cpuflags.h>
+#include <rte_vect.h>
+
+#include "nfp_rxtx_vec.h"
+
+bool
+nfp_net_get_avx2_supported(void)
+{
+	if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 &&
+			rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
+		return true;
+
+	return false;
+}
diff --git a/drivers/net/nfp/nfp_rxtx_vec_stub.c b/drivers/net/nfp/nfp_rxtx_vec_stub.c
new file mode 100644
index 0000000000..1bc55b67e0
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec_stub.c
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <stdbool.h>
+
+#include <rte_common.h>
+
+#include "nfp_rxtx_vec.h"
+
+bool __rte_weak
+nfp_net_get_avx2_supported(void)
+{
+	return false;
+}
-- 
2.39.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v4 4/5] net/nfp: support AVX2 Rx function
  2024-07-09  8:24     ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Chaoyong He
                         ` (2 preceding siblings ...)
  2024-07-09  8:24       ` [PATCH v4 3/5] net/nfp: support AVX2 Tx function Chaoyong He
@ 2024-07-09  8:24       ` Chaoyong He
  2024-07-09  8:24       ` [PATCH v4 5/5] net/nfp: vector Rx function supports parsing ptype Chaoyong He
  2024-07-09 13:06       ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Ferruh Yigit
  5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09  8:24 UTC (permalink / raw)
  To: dev; +Cc: oss-drivers, Long Wu, Peng Zhang, Chaoyong He

From: Long Wu <long.wu@corigine.com>

Use AVX2 instructions to accelerate Rx performance. The
acceleration only works on x86 machines.

Signed-off-by: Peng Zhang <peng.zhang@corigine.com>
Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
 drivers/net/nfp/nfp_ethdev.c        |   2 +-
 drivers/net/nfp/nfp_ethdev_vf.c     |   2 +-
 drivers/net/nfp/nfp_net_meta.c      |   1 +
 drivers/net/nfp/nfp_rxtx.c          |  10 ++
 drivers/net/nfp/nfp_rxtx.h          |   1 +
 drivers/net/nfp/nfp_rxtx_vec.h      |   4 +
 drivers/net/nfp/nfp_rxtx_vec_avx2.c | 252 ++++++++++++++++++++++++++++
 drivers/net/nfp/nfp_rxtx_vec_stub.c |   9 +
 8 files changed, 279 insertions(+), 2 deletions(-)

diff --git a/drivers/net/nfp/nfp_ethdev.c b/drivers/net/nfp/nfp_ethdev.c
index a7b40af712..bd35df2dc9 100644
--- a/drivers/net/nfp/nfp_ethdev.c
+++ b/drivers/net/nfp/nfp_ethdev.c
@@ -969,7 +969,7 @@ nfp_net_ethdev_ops_mount(struct nfp_net_hw *hw,
 
 	eth_dev->dev_ops = &nfp_net_eth_dev_ops;
 	eth_dev->rx_queue_count = nfp_net_rx_queue_count;
-	eth_dev->rx_pkt_burst = &nfp_net_recv_pkts;
+	nfp_net_recv_pkts_set(eth_dev);
 }
 
 static int
diff --git a/drivers/net/nfp/nfp_ethdev_vf.c b/drivers/net/nfp/nfp_ethdev_vf.c
index b955624ed6..cdf5da3af7 100644
--- a/drivers/net/nfp/nfp_ethdev_vf.c
+++ b/drivers/net/nfp/nfp_ethdev_vf.c
@@ -245,7 +245,7 @@ nfp_netvf_ethdev_ops_mount(struct nfp_net_hw *hw,
 
 	eth_dev->dev_ops = &nfp_netvf_eth_dev_ops;
 	eth_dev->rx_queue_count = nfp_net_rx_queue_count;
-	eth_dev->rx_pkt_burst = &nfp_net_recv_pkts;
+	nfp_net_recv_pkts_set(eth_dev);
 }
 
 static int
diff --git a/drivers/net/nfp/nfp_net_meta.c b/drivers/net/nfp/nfp_net_meta.c
index b31ef56f17..07c6758d33 100644
--- a/drivers/net/nfp/nfp_net_meta.c
+++ b/drivers/net/nfp/nfp_net_meta.c
@@ -80,6 +80,7 @@ nfp_net_meta_parse_single(uint8_t *meta_base,
 		rte_be32_t meta_header,
 		struct nfp_net_meta_parsed *meta)
 {
+	meta->flags = 0;
 	meta->flags |= (1 << NFP_NET_META_HASH);
 	meta->hash_type = rte_be_to_cpu_32(meta_header);
 	meta->hash = rte_be_to_cpu_32(*(rte_be32_t *)(meta_base + 4));
diff --git a/drivers/net/nfp/nfp_rxtx.c b/drivers/net/nfp/nfp_rxtx.c
index 1db79ad1cd..4fc3374987 100644
--- a/drivers/net/nfp/nfp_rxtx.c
+++ b/drivers/net/nfp/nfp_rxtx.c
@@ -17,6 +17,7 @@
 #include "nfp_ipsec.h"
 #include "nfp_logs.h"
 #include "nfp_net_meta.h"
+#include "nfp_rxtx_vec.h"
 
 /*
  * The bit format and map of nfp packet type for rxd.offload_info in Rx descriptor.
@@ -867,3 +868,12 @@ nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
 	info->conf.offloads = dev_info.tx_offload_capa &
 			dev->data->dev_conf.txmode.offloads;
 }
+
+void
+nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev)
+{
+	if (nfp_net_get_avx2_supported())
+		eth_dev->rx_pkt_burst = nfp_net_vec_avx2_recv_pkts;
+	else
+		eth_dev->rx_pkt_burst = nfp_net_recv_pkts;
+}
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index 3ddf717da0..fff8371991 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -244,5 +244,6 @@ void nfp_net_rx_queue_info_get(struct rte_eth_dev *dev,
 void nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
 		uint16_t queue_id,
 		struct rte_eth_txq_info *qinfo);
+void nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev);
 
 #endif /* __NFP_RXTX_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec.h b/drivers/net/nfp/nfp_rxtx_vec.h
index c92660f963..8720662744 100644
--- a/drivers/net/nfp/nfp_rxtx_vec.h
+++ b/drivers/net/nfp/nfp_rxtx_vec.h
@@ -10,4 +10,8 @@
 
 bool nfp_net_get_avx2_supported(void);
 
+uint16_t nfp_net_vec_avx2_recv_pkts(void *rx_queue,
+		struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts);
+
 #endif /* __NFP_RXTX_VEC_AVX2_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
index 50638e74ab..7c18213624 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_avx2.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -5,9 +5,14 @@
 
 #include <stdbool.h>
 
+#include <bus_pci_driver.h>
+#include <ethdev_driver.h>
 #include <rte_cpuflags.h>
 #include <rte_vect.h>
 
+#include "nfp_logs.h"
+#include "nfp_net_common.h"
+#include "nfp_net_meta.h"
 #include "nfp_rxtx_vec.h"
 
 bool
@@ -19,3 +24,250 @@ nfp_net_get_avx2_supported(void)
 
 	return false;
 }
+
+static inline void
+nfp_vec_avx2_recv_set_des1(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf *rxb)
+{
+	__m128i dma;
+	__m128i dma_hi;
+	__m128i vaddr0;
+	__m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	_mm_storel_epi64((void *)rxds, vaddr0);
+
+	rxq->rd_p = (rxq->rd_p + 1) & (rxq->rx_count - 1);
+}
+
+static inline void
+nfp_vec_avx2_recv_set_des4(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf **rxb)
+{
+	__m128i dma;
+	__m128i dma_hi;
+	__m128i vaddr0;
+	__m128i vaddr1;
+	__m128i vaddr2;
+	__m128i vaddr3;
+	__m128i vaddr0_1;
+	__m128i vaddr2_3;
+	__m256i vaddr0_3;
+	__m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[0]->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[1]->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr1 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[2]->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr2 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[3]->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr3 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	vaddr0_1 = _mm_unpacklo_epi64(vaddr0, vaddr1);
+	vaddr2_3 = _mm_unpacklo_epi64(vaddr2, vaddr3);
+
+	vaddr0_3 = _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0_1),
+			vaddr2_3, 1);
+
+	_mm256_store_si256((void *)rxds, vaddr0_3);
+
+	rxq->rd_p = (rxq->rd_p + 4) & (rxq->rx_count - 1);
+}
+
+static inline void
+nfp_vec_avx2_recv_set_rxpkt1(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf *rx_pkt)
+{
+	struct nfp_net_hw *hw = rxq->hw;
+	struct nfp_net_meta_parsed meta;
+
+	rx_pkt->data_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
+	/* Size of the whole packet. We just support 1 segment */
+	rx_pkt->pkt_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
+
+	/* Filling the received mbuf with packet info */
+	if (hw->rx_offset)
+		rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + hw->rx_offset;
+	else
+		rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + NFP_DESC_META_LEN(rxds);
+
+	rx_pkt->port = rxq->port_id;
+	rx_pkt->nb_segs = 1;
+	rx_pkt->next = NULL;
+
+	nfp_net_meta_parse(rxds, rxq, hw, rx_pkt, &meta);
+
+	/* Checking the checksum flag */
+	nfp_net_rx_cksum(rxq, rxds, rx_pkt);
+}
+
+static inline void
+nfp_vec_avx2_recv1(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf *rxb,
+		struct rte_mbuf *rx_pkt)
+{
+	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkt);
+
+	nfp_vec_avx2_recv_set_des1(rxq, rxds, rxb);
+}
+
+static inline void
+nfp_vec_avx2_recv4(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf **rxb,
+		struct rte_mbuf **rx_pkts)
+{
+	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkts[0]);
+	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 1, rx_pkts[1]);
+	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 2, rx_pkts[2]);
+	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 3, rx_pkts[3]);
+
+	nfp_vec_avx2_recv_set_des4(rxq, rxds, rxb);
+}
+
+static inline bool
+nfp_vec_avx2_recv_check_packets4(struct nfp_net_rx_desc *rxds)
+{
+	__m256i data = _mm256_loadu_si256((void *)rxds);
+
+	if ((_mm256_extract_epi8(data, 3) & PCIE_DESC_RX_DD) == 0 ||
+			(_mm256_extract_epi8(data, 11) & PCIE_DESC_RX_DD) == 0 ||
+			(_mm256_extract_epi8(data, 19) & PCIE_DESC_RX_DD) == 0 ||
+			(_mm256_extract_epi8(data, 27) & PCIE_DESC_RX_DD) == 0)
+		return false;
+
+	return true;
+}
+
+uint16_t
+nfp_net_vec_avx2_recv_pkts(void *rx_queue,
+		struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts)
+{
+	uint16_t avail;
+	uint16_t nb_hold;
+	bool burst_receive;
+	struct rte_mbuf **rxb;
+	struct nfp_net_rx_desc *rxds;
+	struct nfp_net_rxq *rxq = rx_queue;
+
+	if (unlikely(rxq == NULL)) {
+		PMD_RX_LOG(ERR, "RX Bad queue");
+		return 0;
+	}
+
+	avail = 0;
+	nb_hold = 0;
+	burst_receive = true;
+	while (avail < nb_pkts) {
+		rxds = &rxq->rxds[rxq->rd_p];
+		rxb = &rxq->rxbufs[rxq->rd_p].mbuf;
+
+		if ((_mm_extract_epi8(_mm_loadu_si128((void *)(rxds)), 3)
+				& PCIE_DESC_RX_DD) == 0)
+			goto recv_end;
+
+		rte_prefetch0(rxq->rxbufs[rxq->rd_p].mbuf);
+
+		if ((rxq->rd_p & 0x3) == 0) {
+			rte_prefetch0(&rxq->rxds[rxq->rd_p]);
+			rte_prefetch0(&rxq->rxbufs[rxq->rd_p]);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 1].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 2].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 3].mbuf);
+		}
+
+		if ((rxq->rd_p & 0x7) == 0) {
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 4].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 5].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 6].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 7].mbuf);
+		}
+
+		/*
+		 * If a burst cannot be received, receive just one packet. This happens when:
+		 * 1. The Rx ring is about to reach its tail.
+		 * 2. Fewer than 4 packets remain to be received.
+		 * 3. The descriptor address is not aligned on a 32-byte boundary.
+		 * 4. The Rx ring does not hold 4 packets, or allocating 4 mbufs failed.
+		 */
+		if ((rxq->rx_count - rxq->rd_p) < 4 ||
+				(nb_pkts - avail) < 4 ||
+				((uintptr_t)rxds & 0x1F) != 0 ||
+				!burst_receive) {
+			_mm_storel_epi64((void *)&rx_pkts[avail],
+					_mm_loadu_si128((void *)rxb));
+
+			/* Allocate a new mbuf into the software ring. */
+			if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 1) < 0) {
+				PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u queue_id=%hu",
+						rxq->port_id, rxq->qidx);
+				nfp_net_mbuf_alloc_failed(rxq);
+				goto recv_end;
+			}
+
+			nfp_vec_avx2_recv1(rxq, rxds, *rxb, rx_pkts[avail]);
+
+			avail++;
+			nb_hold++;
+			continue;
+		}
+
+		burst_receive = nfp_vec_avx2_recv_check_packets4(rxds);
+		if (!burst_receive)
+			continue;
+
+		_mm256_storeu_si256((void *)&rx_pkts[avail],
+				_mm256_loadu_si256((void *)rxb));
+
+		/* Allocate 4 new mbufs into the software ring. */
+		if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 4) < 0) {
+			burst_receive = false;
+			continue;
+		}
+
+		nfp_vec_avx2_recv4(rxq, rxds, rxb, &rx_pkts[avail]);
+
+		avail += 4;
+		nb_hold += 4;
+	}
+
+recv_end:
+	if (nb_hold == 0)
+		return nb_hold;
+
+	PMD_RX_LOG(DEBUG, "RX port_id=%u queue_id=%u, %d packets received",
+			rxq->port_id, (unsigned int)rxq->qidx, nb_hold);
+
+	nb_hold += rxq->nb_rx_hold;
+
+	/*
+	 * FL descriptors need to be written before incrementing the
+	 * FL queue WR pointer.
+	 */
+	rte_wmb();
+	if (nb_hold > rxq->rx_free_thresh) {
+		PMD_RX_LOG(DEBUG, "port=%hu queue=%hu nb_hold=%hu avail=%hu",
+				rxq->port_id, rxq->qidx, nb_hold, avail);
+		nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, nb_hold);
+		nb_hold = 0;
+	}
+	rxq->nb_rx_hold = nb_hold;
+
+	return avail;
+}
diff --git a/drivers/net/nfp/nfp_rxtx_vec_stub.c b/drivers/net/nfp/nfp_rxtx_vec_stub.c
index 1bc55b67e0..c480f61ef0 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_stub.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_stub.c
@@ -6,6 +6,7 @@
 #include <stdbool.h>
 
 #include <rte_common.h>
+#include <rte_mbuf_core.h>
 
 #include "nfp_rxtx_vec.h"
 
@@ -14,3 +15,11 @@ nfp_net_get_avx2_supported(void)
 {
 	return false;
 }
+
+uint16_t __rte_weak
+nfp_net_vec_avx2_recv_pkts(__rte_unused void *rx_queue,
+		__rte_unused struct rte_mbuf **rx_pkts,
+		__rte_unused uint16_t nb_pkts)
+{
+	return 0;
+}
-- 
2.39.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v4 5/5] net/nfp: vector Rx function supports parsing ptype
  2024-07-09  8:24     ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Chaoyong He
                         ` (3 preceding siblings ...)
  2024-07-09  8:24       ` [PATCH v4 4/5] net/nfp: support AVX2 Rx function Chaoyong He
@ 2024-07-09  8:24       ` Chaoyong He
  2024-07-09 13:06       ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Ferruh Yigit
  5 siblings, 0 replies; 26+ messages in thread
From: Chaoyong He @ 2024-07-09  8:24 UTC (permalink / raw)
  To: dev; +Cc: oss-drivers, Long Wu, Chaoyong He

From: Long Wu <long.wu@corigine.com>

The vector AVX2 Rx function now parses the packet type and sets it in the mbuf.

Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
 drivers/net/nfp/nfp_net_common.c    | 2 +-
 drivers/net/nfp/nfp_rxtx.c          | 2 +-
 drivers/net/nfp/nfp_rxtx.h          | 3 +++
 drivers/net/nfp/nfp_rxtx_vec_avx2.c | 2 ++
 4 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/net/nfp/nfp_net_common.c b/drivers/net/nfp/nfp_net_common.c
index 08693d5fba..3d916cd147 100644
--- a/drivers/net/nfp/nfp_net_common.c
+++ b/drivers/net/nfp/nfp_net_common.c
@@ -1455,7 +1455,7 @@ nfp_net_supported_ptypes_get(struct rte_eth_dev *dev, size_t *no_of_elements)
 		RTE_PTYPE_INNER_L4_SCTP,
 	};
 
-	if (dev->rx_pkt_burst != nfp_net_recv_pkts)
+	if (dev->rx_pkt_burst == NULL)
 		return NULL;
 
 	net_hw = dev->data->dev_private;
diff --git a/drivers/net/nfp/nfp_rxtx.c b/drivers/net/nfp/nfp_rxtx.c
index 4fc3374987..da41a0e663 100644
--- a/drivers/net/nfp/nfp_rxtx.c
+++ b/drivers/net/nfp/nfp_rxtx.c
@@ -350,7 +350,7 @@ nfp_net_set_ptype(const struct nfp_ptype_parsed *nfp_ptype,
  * @param mb
  *   Mbuf to set the packet type.
  */
-static void
+void
 nfp_net_parse_ptype(struct nfp_net_rxq *rxq,
 		struct nfp_net_rx_desc *rxds,
 		struct rte_mbuf *mb)
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index fff8371991..c717d97003 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -245,5 +245,8 @@ void nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
 		uint16_t queue_id,
 		struct rte_eth_txq_info *qinfo);
 void nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev);
+void nfp_net_parse_ptype(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf *mb);
 
 #endif /* __NFP_RXTX_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
index 7c18213624..508ec7faa5 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_avx2.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -111,6 +111,8 @@ nfp_vec_avx2_recv_set_rxpkt1(struct nfp_net_rxq *rxq,
 
 	nfp_net_meta_parse(rxds, rxq, hw, rx_pkt, &meta);
 
+	nfp_net_parse_ptype(rxq, rxds, rx_pkt);
+
 	/* Checking the checksum flag */
 	nfp_net_rx_cksum(rxq, rxds, rx_pkt);
 }
-- 
2.39.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v4 0/5] support AVX2 instruction Rx/Tx function
  2024-07-09  8:24     ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Chaoyong He
                         ` (4 preceding siblings ...)
  2024-07-09  8:24       ` [PATCH v4 5/5] net/nfp: vector Rx function supports parsing ptype Chaoyong He
@ 2024-07-09 13:06       ` Ferruh Yigit
  5 siblings, 0 replies; 26+ messages in thread
From: Ferruh Yigit @ 2024-07-09 13:06 UTC (permalink / raw)
  To: Chaoyong He, dev; +Cc: oss-drivers

On 7/9/2024 9:24 AM, Chaoyong He wrote:
> This patch series adds support for Rx/Tx functions using
> AVX2 instructions.
> 
> ---
> v4:
> * Add commit to solve the compile problem on 32-bit OS.
> v3:
> * Fix the 'meson.build' file to solve the compile problem.
> v2:
> * Rebase to the latest main branch.
> ---
> 
> Long Wu (5):
>   net/nfp: fix compile fail on 32-bit OS
>   net/nfp: export more interfaces of NFDk
>   net/nfp: support AVX2 Tx function
>   net/nfp: support AVX2 Rx function
>   net/nfp: vector Rx function supports parsing ptype
>

Series applied to dpdk-next-net/main, thanks.

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2024-07-09 13:07 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-19  2:59 [PATCH 0/4] support AVX2 instruction Rx/Tx function Chaoyong He
2024-06-19  2:59 ` [PATCH 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
2024-06-19  2:59 ` [PATCH 2/4] net/nfp: support AVX2 Tx function Chaoyong He
2024-06-19  2:59 ` [PATCH 3/4] net/nfp: support AVX2 Rx function Chaoyong He
2024-06-19  2:59 ` [PATCH 4/4] net/nfp: vector Rx function supports parsing ptype Chaoyong He
2024-07-06 18:51 ` [PATCH 0/4] support AVX2 instruction Rx/Tx function Ferruh Yigit
2024-07-08  5:52   ` Chaoyong He
2024-07-08  5:58 ` [PATCH v2 " Chaoyong He
2024-07-08  5:58   ` [PATCH v2 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
2024-07-08  5:58   ` [PATCH v2 2/4] net/nfp: support AVX2 Tx function Chaoyong He
2024-07-08  5:58   ` [PATCH v2 3/4] net/nfp: support AVX2 Rx function Chaoyong He
2024-07-08  5:58   ` [PATCH v2 4/4] net/nfp: vector Rx function supports parsing ptype Chaoyong He
2024-07-08 11:45   ` [PATCH v2 0/4] support AVX2 instruction Rx/Tx function Ferruh Yigit
2024-07-09  1:13     ` Chaoyong He
2024-07-09  7:29   ` [PATCH v3 " Chaoyong He
2024-07-09  7:29     ` [PATCH v3 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
2024-07-09  7:29     ` [PATCH v3 2/4] net/nfp: support AVX2 Tx function Chaoyong He
2024-07-09  7:29     ` [PATCH v3 3/4] net/nfp: support AVX2 Rx function Chaoyong He
2024-07-09  7:29     ` [PATCH v3 4/4] net/nfp: vector Rx function supports parsing ptype Chaoyong He
2024-07-09  8:24     ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Chaoyong He
2024-07-09  8:24       ` [PATCH v4 1/5] net/nfp: fix compile fail on 32-bit OS Chaoyong He
2024-07-09  8:24       ` [PATCH v4 2/5] net/nfp: export more interfaces of NFDk Chaoyong He
2024-07-09  8:24       ` [PATCH v4 3/5] net/nfp: support AVX2 Tx function Chaoyong He
2024-07-09  8:24       ` [PATCH v4 4/5] net/nfp: support AVX2 Rx function Chaoyong He
2024-07-09  8:24       ` [PATCH v4 5/5] net/nfp: vector Rx function supports parsing ptype Chaoyong He
2024-07-09 13:06       ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Ferruh Yigit

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).