DPDK patches and discussions
 help / color / mirror / Atom feed
From: Chaoyong He <chaoyong.he@corigine.com>
To: dev@dpdk.org
Cc: oss-drivers@corigine.com, Long Wu <long.wu@corigine.com>,
	Chaoyong He <chaoyong.he@corigine.com>
Subject: [PATCH v3 2/4] net/nfp: support AVX2 Tx function
Date: Tue,  9 Jul 2024 15:29:19 +0800	[thread overview]
Message-ID: <20240709072921.246520-3-chaoyong.he@corigine.com> (raw)
In-Reply-To: <20240709072921.246520-1-chaoyong.he@corigine.com>

From: Long Wu <long.wu@corigine.com>

Use AVX2 instructions to accelerate Tx performance. The
acceleration only works on x86 machines.

Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
 drivers/net/nfp/meson.build                 |  20 +
 drivers/net/nfp/nfdk/nfp_nfdk.h             |   1 +
 drivers/net/nfp/nfdk/nfp_nfdk_dp.c          |  12 +
 drivers/net/nfp/nfdk/nfp_nfdk_vec.h         |  36 ++
 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c | 432 ++++++++++++++++++++
 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c    |  14 +
 drivers/net/nfp/nfp_ethdev.c                |   3 +-
 drivers/net/nfp/nfp_ethdev_vf.c             |   3 +-
 drivers/net/nfp/nfp_rxtx.h                  |   5 +-
 drivers/net/nfp/nfp_rxtx_vec.h              |  13 +
 drivers/net/nfp/nfp_rxtx_vec_avx2.c         |  21 +
 drivers/net/nfp/nfp_rxtx_vec_stub.c         |  16 +
 12 files changed, 573 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec.h
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec.h
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec_avx2.c
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec_stub.c

diff --git a/drivers/net/nfp/meson.build b/drivers/net/nfp/meson.build
index d805644ec5..463a482a32 100644
--- a/drivers/net/nfp/meson.build
+++ b/drivers/net/nfp/meson.build
@@ -16,6 +16,7 @@ sources = files(
         'flower/nfp_flower_service.c',
         'nfd3/nfp_nfd3_dp.c',
         'nfdk/nfp_nfdk_dp.c',
+        'nfdk/nfp_nfdk_vec_stub.c',
         'nfpcore/nfp_cppcore.c',
         'nfpcore/nfp_crc.c',
         'nfpcore/nfp_elf.c',
@@ -43,7 +44,26 @@ sources = files(
         'nfp_net_flow.c',
         'nfp_net_meta.c',
         'nfp_rxtx.c',
+        'nfp_rxtx_vec_stub.c',
         'nfp_service.c',
 )
 
+if arch_subdir == 'x86'
+        includes += include_directories('../../common/nfp')
+
+        avx2_sources = files(
+                'nfdk/nfp_nfdk_vec_avx2_dp.c',
+                'nfp_rxtx_vec_avx2.c',
+        )
+
+        nfp_avx2_lib = static_library('nfp_avx2_lib',
+                avx2_sources,
+                dependencies: [static_rte_ethdev, static_rte_bus_pci],
+                include_directories: includes,
+                c_args: [cflags, '-mavx2']
+        )
+
+        objs += nfp_avx2_lib.extract_all_objects(recursive: true)
+endif
+
 deps += ['hash', 'security', 'common_nfp']
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk.h b/drivers/net/nfp/nfdk/nfp_nfdk.h
index 89a98d13f3..29d862f6f0 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk.h
+++ b/drivers/net/nfp/nfdk/nfp_nfdk.h
@@ -222,5 +222,6 @@ int nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
 int nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
 		struct nfp_net_txq *txq,
 		uint64_t *metadata);
+void nfp_net_nfdk_xmit_pkts_set(struct rte_eth_dev *eth_dev);
 
 #endif /* __NFP_NFDK_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
index 173aabf0b9..2cea5688b3 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
@@ -11,6 +11,8 @@
 #include "../flower/nfp_flower.h"
 #include "../nfp_logs.h"
 #include "../nfp_net_meta.h"
+#include "../nfp_rxtx_vec.h"
+#include "nfp_nfdk_vec.h"
 
 #define NFDK_TX_DESC_GATHER_MAX         17
 
@@ -511,6 +513,7 @@ nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
 	dev->data->tx_queues[queue_idx] = txq;
 	txq->hw = hw;
 	txq->hw_priv = dev->process_private;
+	txq->simple_always = true;
 
 	/*
 	 * Telling the HW about the physical address of the TX ring and number
@@ -521,3 +524,12 @@ nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
 
 	return 0;
 }
+
+void
+nfp_net_nfdk_xmit_pkts_set(struct rte_eth_dev *eth_dev)
+{
+	if (nfp_net_get_avx2_supported())
+		eth_dev->tx_pkt_burst = nfp_net_nfdk_vec_avx2_xmit_pkts;
+	else
+		eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+}
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec.h b/drivers/net/nfp/nfdk/nfp_nfdk_vec.h
new file mode 100644
index 0000000000..14319d6cf6
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_NFDK_VEC_H__
+#define __NFP_NFDK_VEC_H__
+
+#include <stdbool.h>
+
+#include <rte_mbuf_core.h>
+
+#include "../nfp_net_common.h"
+#include "nfp_nfdk.h"
+
+static inline bool
+nfp_net_nfdk_is_simple_packet(struct rte_mbuf *pkt,
+		struct nfp_net_hw *hw)
+{
+	if (pkt->data_len > NFDK_TX_MAX_DATA_PER_HEAD)
+		return false;
+
+	if ((hw->super.cap & NFP_NET_CFG_CTRL_LSO_ANY) == 0)
+		return true;
+
+	if ((pkt->ol_flags & RTE_MBUF_F_TX_TCP_SEG) == 0)
+		return true;
+
+	return false;
+}
+
+uint16_t nfp_net_nfdk_vec_avx2_xmit_pkts(void *tx_queue,
+		struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts);
+
+#endif /* __NFP_NFDK_VEC_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
new file mode 100644
index 0000000000..6d1359fdb1
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
@@ -0,0 +1,432 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <bus_pci_driver.h>
+#include <ethdev_driver.h>
+#include <rte_malloc.h>
+
+#include "../nfp_logs.h"
+#include "nfp_nfdk.h"
+#include "nfp_nfdk_vec.h"
+
+/*
+ * One simple packet needs 2 descriptors, so sending 4 packets in one burst
+ * uses 8 descriptors.
+ */
+#define NFDK_SIMPLE_BURST_DES_NUM 8
+
+#define NFDK_SIMPLE_DES_TYPE (NFDK_DESC_TX_EOP | \
+		(NFDK_DESC_TX_TYPE_HEAD & (NFDK_DESC_TX_TYPE_SIMPLE << 12)))
+
+static inline int
+nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(struct rte_mbuf *pkt,
+		struct nfp_net_txq *txq,
+		uint64_t *des_addr,
+		uint64_t *des_meta,
+		bool repr_flag)
+{
+	int ret;
+	__m128i dma_addr;
+	__m128i dma_hi;
+	__m128i data_off;
+	__m128i dlen_type;
+	uint64_t metadata;
+
+	if (repr_flag) {
+		metadata = NFDK_DESC_TX_CHAIN_META;
+	} else {
+		ret = nfp_net_nfdk_set_meta_data(pkt, txq, &metadata);
+		if (unlikely(ret != 0))
+			return ret;
+	}
+
+	data_off = _mm_set_epi64x(0, pkt->data_off);
+	dma_addr = _mm_add_epi64(_mm_loadu_si128((__m128i *)&pkt->buf_addr), data_off);
+	dma_hi = _mm_srli_epi64(dma_addr, 32);
+
+	dlen_type = _mm_set_epi64x(0, (pkt->data_len - 1) | NFDK_SIMPLE_DES_TYPE);
+
+	*des_addr = _mm_extract_epi64(_mm_add_epi64(_mm_unpacklo_epi32(dma_hi, dma_addr),
+			_mm_slli_epi64(dlen_type, 16)), 0);
+
+	*des_meta = nfp_net_nfdk_tx_cksum(txq, pkt, metadata);
+
+	return 0;
+}
+
+static inline int
+nfp_net_nfdk_vec_avx2_xmit_simple_send1(struct nfp_net_txq *txq,
+		struct nfp_net_nfdk_tx_desc *txds,
+		struct rte_mbuf *pkt,
+		bool repr_flag)
+{
+	int ret;
+	__m128i des_data;
+	uint64_t des_addr;
+	uint64_t des_meta;
+
+	ret = nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(pkt, txq, &des_addr,
+			&des_meta, repr_flag);
+	if (unlikely(ret != 0))
+		return ret;
+
+	txq->wr_p = D_IDX(txq, txq->wr_p + NFDK_TX_DESC_PER_SIMPLE_PKT);
+	if ((txq->wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) != 0)
+		txq->data_pending += pkt->data_len;
+	else
+		txq->data_pending = 0;
+
+	des_data = _mm_set_epi64x(des_meta, des_addr);
+
+	_mm_store_si128((void *)txds, des_data);
+
+	return 0;
+}
+
+static inline int
+nfp_vec_avx2_nfdk_xmit_simple_send4(struct nfp_net_txq *txq,
+		struct nfp_net_nfdk_tx_desc *txds,
+		struct rte_mbuf **pkt,
+		bool repr_flag)
+{
+	int ret;
+	uint16_t i;
+	__m256i des_data0_1;
+	__m256i des_data2_3;
+	uint64_t des_addr[4];
+	uint64_t des_meta[4];
+
+	for (i = 0; i < 4; i++) {
+		ret = nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(pkt[i], txq,
+				&des_addr[i], &des_meta[i], repr_flag);
+		if (unlikely(ret != 0))
+			return ret;
+	}
+
+	for (i = 0; i < 4; i++) {
+		txq->wr_p = D_IDX(txq, txq->wr_p + NFDK_TX_DESC_PER_SIMPLE_PKT);
+		if ((txq->wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) != 0)
+			txq->data_pending += pkt[i]->data_len;
+		else
+			txq->data_pending = 0;
+	}
+
+	des_data0_1 = _mm256_set_epi64x(des_meta[1], des_addr[1], des_meta[0], des_addr[0]);
+	des_data2_3 = _mm256_set_epi64x(des_meta[3], des_addr[3], des_meta[2], des_addr[2]);
+
+	_mm256_store_si256((void *)txds, des_data0_1);
+	_mm256_store_si256((void *)(txds + 4), des_data2_3);
+
+	return 0;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_mbuf_store4(struct rte_mbuf **mbuf,
+		struct rte_mbuf **tx_pkts)
+{
+	__m256i mbuf_room0_1;
+	__m256i mbuf_room2_3;
+
+	mbuf_room0_1 = _mm256_set_epi64x(0, (uintptr_t)tx_pkts[1], 0,
+			(uintptr_t)tx_pkts[0]);
+	mbuf_room2_3 = _mm256_set_epi64x(0, (uintptr_t)tx_pkts[3], 0,
+			(uintptr_t)tx_pkts[2]);
+
+	_mm256_store_si256((void *)mbuf, mbuf_room0_1);
+	_mm256_store_si256((void *)(mbuf + 4), mbuf_room2_3);
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_simple_pkts(struct nfp_net_txq *txq,
+		struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts,
+		uint16_t simple_close,
+		bool repr_flag)
+{
+	int ret;
+	uint16_t npkts = 0;
+	uint16_t need_txds;
+	uint16_t free_descs;
+	struct rte_mbuf **lmbuf;
+	struct nfp_net_nfdk_tx_desc *ktxds;
+
+	PMD_TX_LOG(DEBUG, "Working for queue %hu at pos %u and %hu packets",
+			txq->qidx, txq->wr_p, nb_pkts);
+
+	need_txds = nb_pkts << 1;
+	if (nfp_net_nfdk_free_tx_desc(txq) < need_txds || nfp_net_nfdk_txq_full(txq))
+		nfp_net_tx_free_bufs(txq);
+
+	free_descs = nfp_net_nfdk_free_tx_desc(txq);
+	if (unlikely(free_descs < NFDK_TX_DESC_PER_SIMPLE_PKT)) {
+		if (unlikely(simple_close > 0))
+			goto xmit_end;
+
+		return 0;
+	}
+
+	PMD_TX_LOG(DEBUG, "Queue: %hu. Sending %hu packets", txq->qidx, nb_pkts);
+
+	/* Sending packets */
+	while (npkts < nb_pkts && free_descs >= NFDK_TX_DESC_PER_SIMPLE_PKT) {
+		ktxds = &txq->ktxds[txq->wr_p];
+		lmbuf = &txq->txbufs[txq->wr_p].mbuf;
+
+		/*
+		 * If a burst of 4 cannot be sent, send a single packet instead:
+		 * 1. Fewer than 8 descriptors remain before the end of the Tx ring.
+		 * 2. Fewer than 4 packets remain to be sent.
+		 * 3. The descriptor address is not aligned on a 32-byte boundary.
+		 * 4. There are not enough free descriptors for a burst.
+		 */
+		if ((txq->tx_count - txq->wr_p) < NFDK_SIMPLE_BURST_DES_NUM ||
+				(nb_pkts - npkts) < 4 ||
+				((uintptr_t)ktxds & 0x1F) != 0 ||
+				free_descs < NFDK_SIMPLE_BURST_DES_NUM) {
+			ret = nfp_net_nfdk_vec_avx2_xmit_simple_send1(txq,
+					ktxds, tx_pkts[npkts], repr_flag);
+			if (unlikely(ret != 0))
+				goto xmit_end;
+
+			rte_pktmbuf_free(*lmbuf);
+
+			_mm_storel_epi64((void *)lmbuf,
+					_mm_loadu_si128((void *)&tx_pkts[npkts]));
+			npkts++;
+			free_descs -= NFDK_TX_DESC_PER_SIMPLE_PKT;
+			continue;
+		}
+
+		ret = nfp_vec_avx2_nfdk_xmit_simple_send4(txq, ktxds,
+				&tx_pkts[npkts], repr_flag);
+		if (unlikely(ret != 0))
+			goto xmit_end;
+
+		rte_pktmbuf_free_bulk(lmbuf, NFDK_SIMPLE_BURST_DES_NUM);
+
+		nfp_net_nfdk_vec_avx2_xmit_mbuf_store4(lmbuf, &tx_pkts[npkts]);
+
+		npkts += 4;
+		free_descs -= NFDK_SIMPLE_BURST_DES_NUM;
+	}
+
+xmit_end:
+	/* Increment write pointers. Force memory write before we let HW know */
+	rte_wmb();
+	nfp_qcp_ptr_add(txq->qcp_q, NFP_QCP_WRITE_PTR, ((npkts << 1) + simple_close));
+
+	return npkts;
+}
+
+/*
+ * Zero D_BLOCK_CPL(wr_p) descriptors from the write pointer on, free the mbufs
+ * recorded for them, reset data_pending, and advance wr_p and *simple_close.
+ */
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_simple_close_block(struct nfp_net_txq *txq,
+		uint16_t *simple_close)
+{
+	uint16_t i;
+	uint16_t wr_p = txq->wr_p;
+	uint16_t nop_slots = D_BLOCK_CPL(wr_p);
+	__m128i zero_128 = _mm_setzero_si128();
+	__m256i zero_256 = _mm256_setzero_si256();
+
+	/* No alignment check is done on wr_p here, so use unaligned stores. */
+	for (i = nop_slots; i >= 4; i -= 4, wr_p += 4) {
+		_mm256_storeu_si256((void *)&txq->ktxds[wr_p], zero_256);
+		rte_pktmbuf_free_bulk(&txq->txbufs[wr_p].mbuf, 4);
+		_mm256_storeu_si256((void *)&txq->txbufs[wr_p], zero_256);
+	}
+
+	for (; i >= 2; i -= 2, wr_p += 2) {
+		_mm_storeu_si128((void *)&txq->ktxds[wr_p], zero_128);
+		rte_pktmbuf_free_bulk(&txq->txbufs[wr_p].mbuf, 2);
+		_mm_storeu_si128((void *)&txq->txbufs[wr_p], zero_128);
+	}
+
+	for (; i >= 1; i--, wr_p++) {
+		_mm_storel_epi64((void *)&txq->ktxds[wr_p], zero_128);
+		rte_pktmbuf_free(txq->txbufs[wr_p].mbuf);
+		_mm_storel_epi64((void *)&txq->txbufs[wr_p], zero_128);
+	}
+	txq->data_pending = 0;
+	txq->wr_p = D_IDX(txq, txq->wr_p + nop_slots);
+	(*simple_close) += nop_slots;
+}
+
+static inline uint32_t
+nfp_net_nfdk_vec_avx2_xmit_simple_prepare(struct nfp_net_txq *txq,
+		uint16_t *simple_close)
+{
+	uint16_t wr_p;
+	__m128i zero_128 = _mm_setzero_si128();
+
+	wr_p = txq->wr_p;
+
+	_mm_storel_epi64((void *)&txq->ktxds[wr_p], zero_128);
+	rte_pktmbuf_free(txq->txbufs[wr_p].mbuf);
+	_mm_storel_epi64((void *)&txq->txbufs[wr_p], zero_128);
+
+	txq->wr_p = D_IDX(txq, wr_p + 1);
+	(*simple_close)++;
+
+	return txq->wr_p;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_simple_check(struct nfp_net_txq *txq,
+		struct rte_mbuf *pkt,
+		bool *simple_flag,
+		bool *pending_flag,
+		uint16_t *data_pending,
+		uint32_t *wr_p,
+		uint16_t *simple_close)
+{
+	uint32_t data_pending_temp;
+
+	/* Make the first descriptor index even before sending simple packets */
+	if (!(*simple_flag)) {
+		if ((*wr_p & 0x1) == 0x1)
+			*wr_p = nfp_net_nfdk_vec_avx2_xmit_simple_prepare(txq, simple_close);
+
+		*simple_flag = true;
+	}
+
+	/* Simple packets only need one close block operation */
+	if (!(*pending_flag)) {
+		if ((*wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) == 0) {
+			*pending_flag = true;
+			return;
+		}
+
+		data_pending_temp = *data_pending + pkt->data_len;
+		if (data_pending_temp > NFDK_TX_MAX_DATA_PER_BLOCK) {
+			nfp_net_nfdk_vec_avx2_xmit_simple_close_block(txq, simple_close);
+			*pending_flag = true;
+			return;
+		}
+
+		*data_pending = data_pending_temp;
+
+		*wr_p += 2;
+	}
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_simple_count(struct nfp_net_txq *txq,
+		struct rte_mbuf **tx_pkts,
+		uint16_t head,
+		uint16_t nb_pkts,
+		uint16_t *simple_close)
+{
+	uint32_t wr_p;
+	uint16_t simple_idx;
+	struct rte_mbuf *pkt;
+	uint16_t data_pending;
+	bool simple_flag = false;
+	bool pending_flag = false;
+	uint16_t simple_count = 0;
+
+	*simple_close = 0;
+	wr_p = txq->wr_p;
+	data_pending = txq->data_pending;
+
+	for (simple_idx = head; simple_idx < nb_pkts; simple_idx++) {
+		pkt = tx_pkts[simple_idx];
+		if (!nfp_net_nfdk_is_simple_packet(pkt, txq->hw))
+			break;
+
+		simple_count++;
+		if (!txq->simple_always)
+			nfp_net_nfdk_vec_avx2_xmit_simple_check(txq, pkt, &simple_flag,
+					&pending_flag, &data_pending, &wr_p, simple_close);
+	}
+
+	return simple_count;
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_others_count(struct nfp_net_txq *txq,
+		struct rte_mbuf **tx_pkts,
+		uint16_t head,
+		uint16_t nb_pkts)
+{
+	uint16_t others_idx;
+	struct rte_mbuf *pkt;
+	uint16_t others_count = 0;
+
+	for (others_idx = head; others_idx < nb_pkts; others_idx++) {
+		pkt = tx_pkts[others_idx];
+		if (nfp_net_nfdk_is_simple_packet(pkt, txq->hw))
+			break;
+
+		others_count++;
+	}
+
+	return others_count;
+}
+
+/*
+ * Send a burst as alternating runs of simple packets (AVX2 path) and other
+ * packets (nfp_net_nfdk_xmit_pkts_common()). Returns the number of packets
+ * sent from the head of tx_pkts; stops at the first run not fully sent.
+ */
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_common(void *tx_queue,
+		struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts)
+{
+	uint16_t i = 0;
+	uint16_t avail = 0;
+	uint16_t simple_close;
+	uint16_t simple_count;
+	uint16_t simple_avail;
+	uint16_t others_count;
+	uint16_t others_avail;
+	struct nfp_net_txq *txq = tx_queue;
+
+	/* 'i' advances only by the run lengths, so no packet is skipped between runs. */
+	while (i < nb_pkts) {
+		simple_count = nfp_net_nfdk_vec_avx2_xmit_simple_count(txq, tx_pkts, i,
+				nb_pkts, &simple_close);
+		if (simple_count > 0) {
+			txq->simple_always = true;
+			simple_avail = nfp_net_nfdk_vec_avx2_xmit_simple_pkts(txq,
+					tx_pkts + i, simple_count, simple_close, false);
+
+			avail += simple_avail;
+			if (simple_avail != simple_count)
+				break;
+
+			i += simple_count;
+		}
+
+		if (i == nb_pkts)
+			break;
+
+		others_count = nfp_net_nfdk_vec_avx2_xmit_others_count(txq, tx_pkts,
+				i, nb_pkts);
+		txq->simple_always = false;
+		others_avail = nfp_net_nfdk_xmit_pkts_common(tx_queue,
+				tx_pkts + i, others_count, false);
+
+		avail += others_avail;
+		if (others_avail != others_count)
+			break;
+
+		i += others_count;
+	}
+
+	return avail;
+}
+
+uint16_t
+nfp_net_nfdk_vec_avx2_xmit_pkts(void *tx_queue,
+		struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts)
+{
+	return nfp_net_nfdk_vec_avx2_xmit_common(tx_queue, tx_pkts, nb_pkts);
+}
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c b/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
new file mode 100644
index 0000000000..146ec21d51
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include "nfp_nfdk_vec.h"
+
+uint16_t __rte_weak
+nfp_net_nfdk_vec_avx2_xmit_pkts(__rte_unused void *tx_queue,
+		__rte_unused struct rte_mbuf **tx_pkts,
+		__rte_unused uint16_t nb_pkts)
+{
+	return 0;
+}
diff --git a/drivers/net/nfp/nfp_ethdev.c b/drivers/net/nfp/nfp_ethdev.c
index 8c0cacd3fc..a7b40af712 100644
--- a/drivers/net/nfp/nfp_ethdev.c
+++ b/drivers/net/nfp/nfp_ethdev.c
@@ -28,6 +28,7 @@
 #include "nfp_ipsec.h"
 #include "nfp_logs.h"
 #include "nfp_net_flow.h"
+#include "nfp_rxtx_vec.h"
 
 /* 64-bit per app capabilities */
 #define NFP_NET_APP_CAP_SP_INDIFF       RTE_BIT64(0) /* Indifferent to port speed */
@@ -964,7 +965,7 @@ nfp_net_ethdev_ops_mount(struct nfp_net_hw *hw,
 	if (hw->ver.extend == NFP_NET_CFG_VERSION_DP_NFD3)
 		eth_dev->tx_pkt_burst = nfp_net_nfd3_xmit_pkts;
 	else
-		eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+		nfp_net_nfdk_xmit_pkts_set(eth_dev);
 
 	eth_dev->dev_ops = &nfp_net_eth_dev_ops;
 	eth_dev->rx_queue_count = nfp_net_rx_queue_count;
diff --git a/drivers/net/nfp/nfp_ethdev_vf.c b/drivers/net/nfp/nfp_ethdev_vf.c
index e7c18fe90a..b955624ed6 100644
--- a/drivers/net/nfp/nfp_ethdev_vf.c
+++ b/drivers/net/nfp/nfp_ethdev_vf.c
@@ -14,6 +14,7 @@
 
 #include "nfp_logs.h"
 #include "nfp_net_common.h"
+#include "nfp_rxtx_vec.h"
 
 #define NFP_VF_DRIVER_NAME net_nfp_vf
 
@@ -240,7 +241,7 @@ nfp_netvf_ethdev_ops_mount(struct nfp_net_hw *hw,
 	if (hw->ver.extend == NFP_NET_CFG_VERSION_DP_NFD3)
 		eth_dev->tx_pkt_burst = nfp_net_nfd3_xmit_pkts;
 	else
-		eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+		nfp_net_nfdk_xmit_pkts_set(eth_dev);
 
 	eth_dev->dev_ops = &nfp_netvf_eth_dev_ops;
 	eth_dev->rx_queue_count = nfp_net_rx_queue_count;
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index 9806384a63..3ddf717da0 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -69,9 +69,12 @@ struct __rte_aligned(64) nfp_net_txq {
 	/** Used by NFDk only */
 	uint16_t data_pending;
 
+	/** Used by NFDk vector xmit only */
+	bool simple_always;
+
 	/**
 	 * At this point 58 bytes have been used for all the fields in the
-	 * TX critical path. We have room for 6 bytes and still all placed
+	 * TX critical path. We have room for 5 bytes and still all placed
 	 * in a cache line.
 	 */
 	uint64_t dma;
diff --git a/drivers/net/nfp/nfp_rxtx_vec.h b/drivers/net/nfp/nfp_rxtx_vec.h
new file mode 100644
index 0000000000..c92660f963
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_RXTX_VEC_AVX2_H__
+#define __NFP_RXTX_VEC_AVX2_H__
+
+#include <stdbool.h>
+
+bool nfp_net_get_avx2_supported(void);
+
+#endif /* __NFP_RXTX_VEC_AVX2_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
new file mode 100644
index 0000000000..50638e74ab
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <stdbool.h>
+
+#include <rte_cpuflags.h>
+#include <rte_vect.h>
+
+#include "nfp_rxtx_vec.h"
+
+bool
+nfp_net_get_avx2_supported(void)
+{
+	if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 &&
+			rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
+		return true;
+
+	return false;
+}
diff --git a/drivers/net/nfp/nfp_rxtx_vec_stub.c b/drivers/net/nfp/nfp_rxtx_vec_stub.c
new file mode 100644
index 0000000000..1bc55b67e0
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec_stub.c
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <stdbool.h>
+
+#include <rte_common.h>
+
+#include "nfp_rxtx_vec.h"
+
+bool __rte_weak
+nfp_net_get_avx2_supported(void)
+{
+	return false;
+}
-- 
2.39.1


  parent reply	other threads:[~2024-07-09  7:29 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-06-19  2:59 [PATCH 0/4] support AVX2 instruction Rx/Tx function Chaoyong He
2024-06-19  2:59 ` [PATCH 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
2024-06-19  2:59 ` [PATCH 2/4] net/nfp: support AVX2 Tx function Chaoyong He
2024-06-19  2:59 ` [PATCH 3/4] net/nfp: support AVX2 Rx function Chaoyong He
2024-06-19  2:59 ` [PATCH 4/4] net/nfp: vector Rx function supports parsing ptype Chaoyong He
2024-07-06 18:51 ` [PATCH 0/4] support AVX2 instruction Rx/Tx function Ferruh Yigit
2024-07-08  5:52   ` Chaoyong He
2024-07-08  5:58 ` [PATCH v2 " Chaoyong He
2024-07-08  5:58   ` [PATCH v2 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
2024-07-08  5:58   ` [PATCH v2 2/4] net/nfp: support AVX2 Tx function Chaoyong He
2024-07-08  5:58   ` [PATCH v2 3/4] net/nfp: support AVX2 Rx function Chaoyong He
2024-07-08  5:58   ` [PATCH v2 4/4] net/nfp: vector Rx function supports parsing ptype Chaoyong He
2024-07-08 11:45   ` [PATCH v2 0/4] support AVX2 instruction Rx/Tx function Ferruh Yigit
2024-07-09  1:13     ` Chaoyong He
2024-07-09  7:29   ` [PATCH v3 " Chaoyong He
2024-07-09  7:29     ` [PATCH v3 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
2024-07-09  7:29     ` Chaoyong He [this message]
2024-07-09  7:29     ` [PATCH v3 3/4] net/nfp: support AVX2 Rx function Chaoyong He
2024-07-09  7:29     ` [PATCH v3 4/4] net/nfp: vector Rx function supports parsing ptype Chaoyong He
2024-07-09  8:24     ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Chaoyong He
2024-07-09  8:24       ` [PATCH v4 1/5] net/nfp: fix compile fail on 32-bit OS Chaoyong He
2024-07-09  8:24       ` [PATCH v4 2/5] net/nfp: export more interfaces of NFDk Chaoyong He
2024-07-09  8:24       ` [PATCH v4 3/5] net/nfp: support AVX2 Tx function Chaoyong He
2024-07-09  8:24       ` [PATCH v4 4/5] net/nfp: support AVX2 Rx function Chaoyong He
2024-07-09  8:24       ` [PATCH v4 5/5] net/nfp: vector Rx function supports parsing ptype Chaoyong He
2024-07-09 13:06       ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Ferruh Yigit

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240709072921.246520-3-chaoyong.he@corigine.com \
    --to=chaoyong.he@corigine.com \
    --cc=dev@dpdk.org \
    --cc=long.wu@corigine.com \
    --cc=oss-drivers@corigine.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).