DPDK patches and discussions
 help / color / mirror / Atom feed
From: Chaoyong He <chaoyong.he@corigine.com>
To: dev@dpdk.org
Cc: oss-drivers@corigine.com, Long Wu <long.wu@corigine.com>,
	Peng Zhang <peng.zhang@corigine.com>,
	Chaoyong He <chaoyong.he@corigine.com>
Subject: [PATCH v4 4/5] net/nfp: support AVX2 Rx function
Date: Tue,  9 Jul 2024 16:24:04 +0800	[thread overview]
Message-ID: <20240709082405.248641-5-chaoyong.he@corigine.com> (raw)
In-Reply-To: <20240709082405.248641-1-chaoyong.he@corigine.com>

From: Long Wu <long.wu@corigine.com>

Use AVX2 instructions to accelerate Rx performance. The
acceleration only works on X86 machine.

Signed-off-by: Peng Zhang <peng.zhang@corigine.com>
Signed-off-by: Long Wu <long.wu@corigine.com>
Reviewed-by: Chaoyong He <chaoyong.he@corigine.com>
---
 drivers/net/nfp/nfp_ethdev.c        |   2 +-
 drivers/net/nfp/nfp_ethdev_vf.c     |   2 +-
 drivers/net/nfp/nfp_net_meta.c      |   1 +
 drivers/net/nfp/nfp_rxtx.c          |  10 ++
 drivers/net/nfp/nfp_rxtx.h          |   1 +
 drivers/net/nfp/nfp_rxtx_vec.h      |   4 +
 drivers/net/nfp/nfp_rxtx_vec_avx2.c | 252 ++++++++++++++++++++++++++++
 drivers/net/nfp/nfp_rxtx_vec_stub.c |   9 +
 8 files changed, 279 insertions(+), 2 deletions(-)

diff --git a/drivers/net/nfp/nfp_ethdev.c b/drivers/net/nfp/nfp_ethdev.c
index a7b40af712..bd35df2dc9 100644
--- a/drivers/net/nfp/nfp_ethdev.c
+++ b/drivers/net/nfp/nfp_ethdev.c
@@ -969,7 +969,7 @@ nfp_net_ethdev_ops_mount(struct nfp_net_hw *hw,
 
 	eth_dev->dev_ops = &nfp_net_eth_dev_ops;
 	eth_dev->rx_queue_count = nfp_net_rx_queue_count;
-	eth_dev->rx_pkt_burst = &nfp_net_recv_pkts;
+	nfp_net_recv_pkts_set(eth_dev);
 }
 
 static int
diff --git a/drivers/net/nfp/nfp_ethdev_vf.c b/drivers/net/nfp/nfp_ethdev_vf.c
index b955624ed6..cdf5da3af7 100644
--- a/drivers/net/nfp/nfp_ethdev_vf.c
+++ b/drivers/net/nfp/nfp_ethdev_vf.c
@@ -245,7 +245,7 @@ nfp_netvf_ethdev_ops_mount(struct nfp_net_hw *hw,
 
 	eth_dev->dev_ops = &nfp_netvf_eth_dev_ops;
 	eth_dev->rx_queue_count = nfp_net_rx_queue_count;
-	eth_dev->rx_pkt_burst = &nfp_net_recv_pkts;
+	nfp_net_recv_pkts_set(eth_dev);
 }
 
 static int
diff --git a/drivers/net/nfp/nfp_net_meta.c b/drivers/net/nfp/nfp_net_meta.c
index b31ef56f17..07c6758d33 100644
--- a/drivers/net/nfp/nfp_net_meta.c
+++ b/drivers/net/nfp/nfp_net_meta.c
@@ -80,6 +80,7 @@ nfp_net_meta_parse_single(uint8_t *meta_base,
 		rte_be32_t meta_header,
 		struct nfp_net_meta_parsed *meta)
 {
+	meta->flags = 0;
 	meta->flags |= (1 << NFP_NET_META_HASH);
 	meta->hash_type = rte_be_to_cpu_32(meta_header);
 	meta->hash = rte_be_to_cpu_32(*(rte_be32_t *)(meta_base + 4));
diff --git a/drivers/net/nfp/nfp_rxtx.c b/drivers/net/nfp/nfp_rxtx.c
index 1db79ad1cd..4fc3374987 100644
--- a/drivers/net/nfp/nfp_rxtx.c
+++ b/drivers/net/nfp/nfp_rxtx.c
@@ -17,6 +17,7 @@
 #include "nfp_ipsec.h"
 #include "nfp_logs.h"
 #include "nfp_net_meta.h"
+#include "nfp_rxtx_vec.h"
 
 /*
  * The bit format and map of nfp packet type for rxd.offload_info in Rx descriptor.
@@ -867,3 +868,12 @@ nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
 	info->conf.offloads = dev_info.tx_offload_capa &
 			dev->data->dev_conf.txmode.offloads;
 }
+
+void
+nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev)
+{
+	if (nfp_net_get_avx2_supported())
+		eth_dev->rx_pkt_burst = nfp_net_vec_avx2_recv_pkts;
+	else
+		eth_dev->rx_pkt_burst = nfp_net_recv_pkts;
+}
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index 3ddf717da0..fff8371991 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -244,5 +244,6 @@ void nfp_net_rx_queue_info_get(struct rte_eth_dev *dev,
 void nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
 		uint16_t queue_id,
 		struct rte_eth_txq_info *qinfo);
+void nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev);
 
 #endif /* __NFP_RXTX_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec.h b/drivers/net/nfp/nfp_rxtx_vec.h
index c92660f963..8720662744 100644
--- a/drivers/net/nfp/nfp_rxtx_vec.h
+++ b/drivers/net/nfp/nfp_rxtx_vec.h
@@ -10,4 +10,8 @@
 
 bool nfp_net_get_avx2_supported(void);
 
+uint16_t nfp_net_vec_avx2_recv_pkts(void *rx_queue,
+		struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts);
+
 #endif /* __NFP_RXTX_VEC_AVX2_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
index 50638e74ab..7c18213624 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_avx2.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -5,9 +5,14 @@
 
 #include <stdbool.h>
 
+#include <bus_pci_driver.h>
+#include <ethdev_driver.h>
 #include <rte_cpuflags.h>
 #include <rte_vect.h>
 
+#include "nfp_logs.h"
+#include "nfp_net_common.h"
+#include "nfp_net_meta.h"
 #include "nfp_rxtx_vec.h"
 
 bool
@@ -19,3 +24,250 @@ nfp_net_get_avx2_supported(void)
 
 	return false;
 }
+
+static inline void
+nfp_vec_avx2_recv_set_des1(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf *rxb)
+{
+	__m128i dma;
+	__m128i dma_hi;
+	__m128i vaddr0;
+	__m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	_mm_storel_epi64((void *)rxds, vaddr0);
+
+	rxq->rd_p = (rxq->rd_p + 1) & (rxq->rx_count - 1);
+}
+
+static inline void
+nfp_vec_avx2_recv_set_des4(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf **rxb)
+{
+	__m128i dma;
+	__m128i dma_hi;
+	__m128i vaddr0;
+	__m128i vaddr1;
+	__m128i vaddr2;
+	__m128i vaddr3;
+	__m128i vaddr0_1;
+	__m128i vaddr2_3;
+	__m256i vaddr0_3;
+	__m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[0]->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[1]->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr1 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[2]->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr2 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[3]->buf_addr), hdr_room);
+	dma_hi = _mm_srli_epi64(dma, 32);
+	vaddr3 = _mm_unpacklo_epi32(dma_hi, dma);
+
+	vaddr0_1 = _mm_unpacklo_epi64(vaddr0, vaddr1);
+	vaddr2_3 = _mm_unpacklo_epi64(vaddr2, vaddr3);
+
+	vaddr0_3 = _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0_1),
+			vaddr2_3, 1);
+
+	_mm256_store_si256((void *)rxds, vaddr0_3);
+
+	rxq->rd_p = (rxq->rd_p + 4) & (rxq->rx_count - 1);
+}
+
+static inline void
+nfp_vec_avx2_recv_set_rxpkt1(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf *rx_pkt)
+{
+	struct nfp_net_hw *hw = rxq->hw;
+	struct nfp_net_meta_parsed meta;
+
+	rx_pkt->data_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
+	/* Size of the whole packet. We just support 1 segment */
+	rx_pkt->pkt_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
+
+	/* Filling the received mbuf with packet info */
+	if (hw->rx_offset)
+		rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + hw->rx_offset;
+	else
+		rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + NFP_DESC_META_LEN(rxds);
+
+	rx_pkt->port = rxq->port_id;
+	rx_pkt->nb_segs = 1;
+	rx_pkt->next = NULL;
+
+	nfp_net_meta_parse(rxds, rxq, hw, rx_pkt, &meta);
+
+	/* Checking the checksum flag */
+	nfp_net_rx_cksum(rxq, rxds, rx_pkt);
+}
+
+static inline void
+nfp_vec_avx2_recv1(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf *rxb,
+		struct rte_mbuf *rx_pkt)
+{
+	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkt);
+
+	nfp_vec_avx2_recv_set_des1(rxq, rxds, rxb);
+}
+
+static inline void
+nfp_vec_avx2_recv4(struct nfp_net_rxq *rxq,
+		struct nfp_net_rx_desc *rxds,
+		struct rte_mbuf **rxb,
+		struct rte_mbuf **rx_pkts)
+{
+	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkts[0]);
+	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 1, rx_pkts[1]);
+	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 2, rx_pkts[2]);
+	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 3, rx_pkts[3]);
+
+	nfp_vec_avx2_recv_set_des4(rxq, rxds, rxb);
+}
+
+static inline bool
+nfp_vec_avx2_recv_check_packets4(struct nfp_net_rx_desc *rxds)
+{
+	__m256i data = _mm256_loadu_si256((void *)rxds);
+
+	if ((_mm256_extract_epi8(data, 3) & PCIE_DESC_RX_DD) == 0 ||
+			(_mm256_extract_epi8(data, 11) & PCIE_DESC_RX_DD) == 0 ||
+			(_mm256_extract_epi8(data, 19) & PCIE_DESC_RX_DD) == 0 ||
+			(_mm256_extract_epi8(data, 27) & PCIE_DESC_RX_DD) == 0)
+		return false;
+
+	return true;
+}
+
+uint16_t
+nfp_net_vec_avx2_recv_pkts(void *rx_queue,
+		struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts)
+{
+	uint16_t avail;
+	uint16_t nb_hold;
+	bool burst_receive;
+	struct rte_mbuf **rxb;
+	struct nfp_net_rx_desc *rxds;
+	struct nfp_net_rxq *rxq = rx_queue;
+
+	if (unlikely(rxq == NULL)) {
+		PMD_RX_LOG(ERR, "RX Bad queue");
+		return 0;
+	}
+
+	avail = 0;
+	nb_hold = 0;
+	burst_receive = true;
+	while (avail < nb_pkts) {
+		rxds = &rxq->rxds[rxq->rd_p];
+		rxb = &rxq->rxbufs[rxq->rd_p].mbuf;
+
+		if ((_mm_extract_epi8(_mm_loadu_si128((void *)(rxds)), 3)
+				& PCIE_DESC_RX_DD) == 0)
+			goto recv_end;
+
+		rte_prefetch0(rxq->rxbufs[rxq->rd_p].mbuf);
+
+		if ((rxq->rd_p & 0x3) == 0) {
+			rte_prefetch0(&rxq->rxds[rxq->rd_p]);
+			rte_prefetch0(&rxq->rxbufs[rxq->rd_p]);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 1].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 2].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 3].mbuf);
+		}
+
+		if ((rxq->rd_p & 0x7) == 0) {
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 4].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 5].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 6].mbuf);
+			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 7].mbuf);
+		}
+
+		/*
+		 * If can not receive burst, just receive one.
+		 * 1. Rx ring will coming to the tail.
+		 * 2. Do not need to receive 4 packets.
+		 * 3. If pointer address unaligned on 32-bit boundary.
+		 * 4. Rx ring does not have 4 packets or alloc 4 mbufs failed.
+		 */
+		if ((rxq->rx_count - rxq->rd_p) < 4 ||
+				(nb_pkts - avail) < 4 ||
+				((uintptr_t)rxds & 0x1F) != 0 ||
+				!burst_receive) {
+			_mm_storel_epi64((void *)&rx_pkts[avail],
+					_mm_loadu_si128((void *)rxb));
+
+			/* Allocate a new mbuf into the software ring. */
+			if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 1) < 0) {
+				PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u queue_id=%hu",
+						rxq->port_id, rxq->qidx);
+				nfp_net_mbuf_alloc_failed(rxq);
+				goto recv_end;
+			}
+
+			nfp_vec_avx2_recv1(rxq, rxds, *rxb, rx_pkts[avail]);
+
+			avail++;
+			nb_hold++;
+			continue;
+		}
+
+		burst_receive = nfp_vec_avx2_recv_check_packets4(rxds);
+		if (!burst_receive)
+			continue;
+
+		_mm256_storeu_si256((void *)&rx_pkts[avail],
+				_mm256_loadu_si256((void *)rxb));
+
+		/* Allocate 4 new mbufs into the software ring. */
+		if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 4) < 0) {
+			burst_receive = false;
+			continue;
+		}
+
+		nfp_vec_avx2_recv4(rxq, rxds, rxb, &rx_pkts[avail]);
+
+		avail += 4;
+		nb_hold += 4;
+	}
+
+recv_end:
+	if (nb_hold == 0)
+		return nb_hold;
+
+	PMD_RX_LOG(DEBUG, "RX port_id=%u queue_id=%u, %d packets received",
+			rxq->port_id, (unsigned int)rxq->qidx, nb_hold);
+
+	nb_hold += rxq->nb_rx_hold;
+
+	/*
+	 * FL descriptors needs to be written before incrementing the
+	 * FL queue WR pointer
+	 */
+	rte_wmb();
+	if (nb_hold > rxq->rx_free_thresh) {
+		PMD_RX_LOG(DEBUG, "port=%hu queue=%hu nb_hold=%hu avail=%hu",
+				rxq->port_id, rxq->qidx, nb_hold, avail);
+		nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, nb_hold);
+		nb_hold = 0;
+	}
+	rxq->nb_rx_hold = nb_hold;
+
+	return avail;
+}
diff --git a/drivers/net/nfp/nfp_rxtx_vec_stub.c b/drivers/net/nfp/nfp_rxtx_vec_stub.c
index 1bc55b67e0..c480f61ef0 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_stub.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_stub.c
@@ -6,6 +6,7 @@
 #include <stdbool.h>
 
 #include <rte_common.h>
+#include <rte_mbuf_core.h>
 
 #include "nfp_rxtx_vec.h"
 
@@ -14,3 +15,11 @@ nfp_net_get_avx2_supported(void)
 {
 	return false;
 }
+
+uint16_t __rte_weak
+nfp_net_vec_avx2_recv_pkts(__rte_unused void *rx_queue,
+		__rte_unused struct rte_mbuf **rx_pkts,
+		__rte_unused uint16_t nb_pkts)
+{
+	return 0;
+}
-- 
2.39.1


  parent reply	other threads:[~2024-07-09  8:24 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-06-19  2:59 [PATCH 0/4] support AVX2 instruction Rx/Tx function Chaoyong He
2024-06-19  2:59 ` [PATCH 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
2024-06-19  2:59 ` [PATCH 2/4] net/nfp: support AVX2 Tx function Chaoyong He
2024-06-19  2:59 ` [PATCH 3/4] net/nfp: support AVX2 Rx function Chaoyong He
2024-06-19  2:59 ` [PATCH 4/4] net/nfp: vector Rx function supports parsing ptype Chaoyong He
2024-07-06 18:51 ` [PATCH 0/4] support AVX2 instruction Rx/Tx function Ferruh Yigit
2024-07-08  5:52   ` Chaoyong He
2024-07-08  5:58 ` [PATCH v2 " Chaoyong He
2024-07-08  5:58   ` [PATCH v2 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
2024-07-08  5:58   ` [PATCH v2 2/4] net/nfp: support AVX2 Tx function Chaoyong He
2024-07-08  5:58   ` [PATCH v2 3/4] net/nfp: support AVX2 Rx function Chaoyong He
2024-07-08  5:58   ` [PATCH v2 4/4] net/nfp: vector Rx function supports parsing ptype Chaoyong He
2024-07-08 11:45   ` [PATCH v2 0/4] support AVX2 instruction Rx/Tx function Ferruh Yigit
2024-07-09  1:13     ` Chaoyong He
2024-07-09  7:29   ` [PATCH v3 " Chaoyong He
2024-07-09  7:29     ` [PATCH v3 1/4] net/nfp: export more interfaces of NFDk Chaoyong He
2024-07-09  7:29     ` [PATCH v3 2/4] net/nfp: support AVX2 Tx function Chaoyong He
2024-07-09  7:29     ` [PATCH v3 3/4] net/nfp: support AVX2 Rx function Chaoyong He
2024-07-09  7:29     ` [PATCH v3 4/4] net/nfp: vector Rx function supports parsing ptype Chaoyong He
2024-07-09  8:24     ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Chaoyong He
2024-07-09  8:24       ` [PATCH v4 1/5] net/nfp: fix compile fail on 32-bit OS Chaoyong He
2024-07-09  8:24       ` [PATCH v4 2/5] net/nfp: export more interfaces of NFDk Chaoyong He
2024-07-09  8:24       ` [PATCH v4 3/5] net/nfp: support AVX2 Tx function Chaoyong He
2024-07-09  8:24       ` Chaoyong He [this message]
2024-07-09  8:24       ` [PATCH v4 5/5] net/nfp: vector Rx function supports parsing ptype Chaoyong He
2024-07-09 13:06       ` [PATCH v4 0/5] support AVX2 instruction Rx/Tx function Ferruh Yigit

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240709082405.248641-5-chaoyong.he@corigine.com \
    --to=chaoyong.he@corigine.com \
    --cc=dev@dpdk.org \
    --cc=long.wu@corigine.com \
    --cc=oss-drivers@corigine.com \
    --cc=peng.zhang@corigine.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).