DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH 0/4] add Rx/Tx offload paths for IAVF AVX512
@ 2021-03-17  6:48 Wenzhuo Lu
  2021-03-17  6:48 ` [dpdk-dev] [PATCH 1/4] net/iavf: store offload flag of Rx queue Wenzhuo Lu
                   ` (5 more replies)
  0 siblings, 6 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-03-17  6:48 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add specific paths for RX/TX AVX512, called offload paths.
In these paths, support the HW offload features, like,
checksum, VLAN, RSS offload.
These paths are chosen automatically according to the
configuration.

The code for the above HW offload features, which are
supported by offload paths, is removed from the legacy path.

Wenzhuo Lu (4):
  net/iavf: store offload flag of Rx queue
  net/iavf: add offload path for Tx AVX512
  net/iavf: add offload path for Rx AVX512
  net/iavf: add offload path for Rx AVX512 flex desc

 doc/guides/rel_notes/release_21_05.rst  |    7 +
 drivers/net/iavf/iavf_rxtx.c            |  156 ++-
 drivers/net/iavf/iavf_rxtx.h            |   32 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 1646 ++++++++++++++++++++++++++-----
 drivers/net/iavf/iavf_rxtx_vec_common.h |  113 ++-
 5 files changed, 1661 insertions(+), 293 deletions(-)

-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH 1/4] net/iavf: store offload flag of Rx queue
  2021-03-17  6:48 [dpdk-dev] [PATCH 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
@ 2021-03-17  6:48 ` Wenzhuo Lu
  2021-03-17  6:48 ` [dpdk-dev] [PATCH 2/4] net/iavf: add offload path for Tx AVX512 Wenzhuo Lu
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-03-17  6:48 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add the offload flag for RX queues to know which offload
features are set.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 drivers/net/iavf/iavf_rxtx.c | 4 ++++
 drivers/net/iavf/iavf_rxtx.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 8fafe45..bf1064d 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -498,9 +498,12 @@
 	uint8_t proto_xtr;
 	uint16_t len;
 	uint16_t rx_free_thresh;
+	uint64_t offloads;
 
 	PMD_INIT_FUNC_TRACE();
 
+	offloads = rx_conf->offloads | dev->data->dev_conf.rxmode.offloads;
+
 	if (nb_desc % IAVF_ALIGN_RING_DESC != 0 ||
 	    nb_desc > IAVF_MAX_RING_DESC ||
 	    nb_desc < IAVF_MIN_RING_DESC) {
@@ -571,6 +574,7 @@
 	rxq->rx_deferred_start = rx_conf->rx_deferred_start;
 	rxq->rx_hdr_len = 0;
 	rxq->vsi = vsi;
+	rxq->offloads = offloads;
 
 	if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)
 		rxq->crc_len = RTE_ETHER_CRC_LEN;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 922ddad..06ff528 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -198,6 +198,7 @@ struct iavf_rx_queue {
 		/* flexible descriptor metadata extraction offload flag */
 	iavf_rxd_to_pkt_fields_t rxd_to_pkt_fields;
 				/* handle flexible descriptor by RXDID */
+	uint64_t offloads;
 };
 
 struct iavf_tx_entry {
-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH 2/4] net/iavf: add offload path for Tx AVX512
  2021-03-17  6:48 [dpdk-dev] [PATCH 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
  2021-03-17  6:48 ` [dpdk-dev] [PATCH 1/4] net/iavf: store offload flag of Rx queue Wenzhuo Lu
@ 2021-03-17  6:48 ` Wenzhuo Lu
  2021-03-17  6:48 ` [dpdk-dev] [PATCH 3/4] net/iavf: add offload path for Rx AVX512 Wenzhuo Lu
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-03-17  6:48 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add a specific path for TX AVX512.
In this path, support the HW offload features, like,
checksum insertion, VLAN insertion.
This path is chosen automatically according to the
configuration.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 drivers/net/iavf/iavf_rxtx.c            |  50 ++++++----
 drivers/net/iavf/iavf_rxtx.h            |  13 ++-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 165 ++++++++++++++++++++++++++++++++
 drivers/net/iavf/iavf_rxtx_vec_common.h |  98 +++++++++++++++++--
 4 files changed, 301 insertions(+), 25 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index bf1064d..4744c35 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -135,7 +135,7 @@
 static inline bool
 check_tx_vec_allow(struct iavf_tx_queue *txq)
 {
-	if (!(txq->offloads & IAVF_NO_VECTOR_FLAGS) &&
+	if (!(txq->offloads & IAVF_TX_NO_VECTOR_FLAGS) &&
 	    txq->rs_thresh >= IAVF_VPMD_TX_MAX_BURST &&
 	    txq->rs_thresh <= IAVF_VPMD_TX_MAX_FREE_BUF) {
 		PMD_INIT_LOG(DEBUG, "Vector tx can be enabled on this txq.");
@@ -2473,17 +2473,23 @@
 #ifdef RTE_ARCH_X86
 	struct iavf_tx_queue *txq;
 	int i;
+	int check_ret;
+	bool use_sse = false;
 	bool use_avx2 = false;
-#ifdef CC_AVX512_SUPPORT
 	bool use_avx512 = false;
-#endif
 
-	if (!iavf_tx_vec_dev_check(dev) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-		if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-		     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-				rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-			use_avx2 = true;
+	check_ret = iavf_tx_vec_dev_check(dev);
+
+	if (check_ret >= 0 &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+		/* SSE and AVX2 not support offload path yet. */
+		if (check_ret == IAVF_VECTOR_PATH) {
+			use_sse = true;
+			if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+			     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+			    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+				use_avx2 = true;
+		}
 #ifdef CC_AVX512_SUPPORT
 		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2491,15 +2497,24 @@
 			use_avx512 = true;
 #endif
 
-		PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
-			    use_avx2 ? "avx2 " : "",
-			    dev->data->port_id);
-		dev->tx_pkt_burst = use_avx2 ?
-				    iavf_xmit_pkts_vec_avx2 :
-				    iavf_xmit_pkts_vec;
+		if (!use_sse && !use_avx2 && !use_avx512)
+			goto normal;
+
+		if (!use_avx512) {
+			PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
+				    use_avx2 ? "avx2 " : "",
+				    dev->data->port_id);
+			dev->tx_pkt_burst = use_avx2 ?
+					    iavf_xmit_pkts_vec_avx2 :
+					    iavf_xmit_pkts_vec;
+		}
 #ifdef CC_AVX512_SUPPORT
-		if (use_avx512)
-			dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+		if (use_avx512) {
+			if (check_ret == IAVF_VECTOR_PATH)
+				dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+			else
+				dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512_offload;
+		}
 #endif
 		dev->tx_pkt_prepare = NULL;
 
@@ -2521,6 +2536,7 @@
 	}
 #endif
 
+normal:
 	PMD_DRV_LOG(DEBUG, "Using Basic Tx callback (port=%d).",
 		    dev->data->port_id);
 	dev->tx_pkt_burst = iavf_xmit_pkts;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 06ff528..da39f78 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -23,14 +23,20 @@
 #define IAVF_VPMD_DESCS_PER_LOOP  4
 #define IAVF_VPMD_TX_MAX_FREE_BUF 64
 
-#define IAVF_NO_VECTOR_FLAGS (				 \
+#define IAVF_TX_NO_VECTOR_FLAGS (				 \
 		DEV_TX_OFFLOAD_MULTI_SEGS |		 \
+		DEV_TX_OFFLOAD_TCP_TSO)
+
+#define IAVF_TX_VECTOR_OFFLOAD (				 \
 		DEV_TX_OFFLOAD_VLAN_INSERT |		 \
+		DEV_TX_OFFLOAD_QINQ_INSERT |		 \
 		DEV_TX_OFFLOAD_SCTP_CKSUM |		 \
 		DEV_TX_OFFLOAD_UDP_CKSUM |		 \
-		DEV_TX_OFFLOAD_TCP_TSO |		 \
 		DEV_TX_OFFLOAD_TCP_CKSUM)
 
+#define IAVF_VECTOR_PATH 0
+#define IAVF_VECTOR_OFFLOAD_PATH 1
+
 #define DEFAULT_TX_RS_THRESH     32
 #define DEFAULT_TX_FREE_THRESH   32
 
@@ -487,6 +493,9 @@ uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
 						      uint16_t nb_pkts);
 uint16_t iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 				   uint16_t nb_pkts);
+uint16_t iavf_xmit_pkts_vec_avx512_offload(void *tx_queue,
+					   struct rte_mbuf **tx_pkts,
+					   uint16_t nb_pkts);
 int iavf_txq_vec_setup_avx512(struct iavf_tx_queue *txq);
 
 uint8_t iavf_proto_xtr_type_to_rxdid(uint8_t xtr_type);
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 5cb4c7c..2891a1a 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -1706,3 +1706,168 @@
 	txq->ops = &avx512_vec_txq_ops;
 	return 0;
 }
+
+static inline void
+iavf_vtx1_offload(volatile struct iavf_tx_desc *txdp,
+		  struct rte_mbuf *pkt, uint64_t flags)
+{
+	uint64_t high_qw =
+		(IAVF_TX_DESC_DTYPE_DATA |
+		 ((uint64_t)flags  << IAVF_TXD_QW1_CMD_SHIFT) |
+		 ((uint64_t)pkt->data_len << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT));
+	iavf_txd_enable_offload(pkt, &high_qw);
+
+	__m128i descriptor = _mm_set_epi64x(high_qw,
+					    pkt->buf_iova + pkt->data_off);
+	_mm_storeu_si128((__m128i *)txdp, descriptor);
+}
+
+static inline void
+iavf_vtx_offload(volatile struct iavf_tx_desc *txdp,
+		 struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
+{
+	const uint64_t hi_qw_tmpl = (IAVF_TX_DESC_DTYPE_DATA |
+			((uint64_t)flags  << IAVF_TXD_QW1_CMD_SHIFT));
+
+	/* if unaligned on 32-bit boundary, do one to align */
+	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
+		iavf_vtx1_offload(txdp, *pkt, flags);
+		nb_pkts--, txdp++, pkt++;
+	}
+
+	/* do 4 at a time while possible, in bursts */
+	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
+		uint64_t hi_qw3 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[3]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		iavf_txd_enable_offload(pkt[3], &hi_qw3);
+		uint64_t hi_qw2 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[2]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		iavf_txd_enable_offload(pkt[2], &hi_qw2);
+		uint64_t hi_qw1 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[1]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		iavf_txd_enable_offload(pkt[1], &hi_qw1);
+		uint64_t hi_qw0 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[0]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		iavf_txd_enable_offload(pkt[0], &hi_qw0);
+
+		__m512i desc0_3 =
+			_mm512_set_epi64
+				(hi_qw3,
+				 pkt[3]->buf_iova + pkt[3]->data_off,
+				 hi_qw2,
+				 pkt[2]->buf_iova + pkt[2]->data_off,
+				 hi_qw1,
+				 pkt[1]->buf_iova + pkt[1]->data_off,
+				 hi_qw0,
+				 pkt[0]->buf_iova + pkt[0]->data_off);
+		_mm512_storeu_si512((void *)txdp, desc0_3);
+	}
+
+	/* do any last ones */
+	while (nb_pkts) {
+		iavf_vtx1_offload(txdp, *pkt, flags);
+		txdp++, pkt++, nb_pkts--;
+	}
+}
+
+static inline uint16_t
+iavf_xmit_fixed_burst_vec_avx512_offload(void *tx_queue,
+					 struct rte_mbuf **tx_pkts,
+					 uint16_t nb_pkts)
+{
+	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
+	volatile struct iavf_tx_desc *txdp;
+	struct iavf_tx_vec_entry *txep;
+	uint16_t n, nb_commit, tx_id;
+	/* bit2 is reserved and must be set to 1 according to Spec */
+	uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
+	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
+
+	/* cross rx_thresh boundary is not allowed */
+	nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh);
+
+	if (txq->nb_free < txq->free_thresh)
+		iavf_tx_free_bufs_avx512(txq);
+
+	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts);
+	if (unlikely(nb_pkts == 0))
+		return 0;
+
+	tx_id = txq->tx_tail;
+	txdp = &txq->tx_ring[tx_id];
+	txep = (void *)txq->sw_ring;
+	txep += tx_id;
+
+	txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts);
+
+	n = (uint16_t)(txq->nb_tx_desc - tx_id);
+	if (nb_commit >= n) {
+		tx_backlog_entry_avx512(txep, tx_pkts, n);
+
+		iavf_vtx_offload(txdp, tx_pkts, n - 1, flags);
+		tx_pkts += (n - 1);
+		txdp += (n - 1);
+
+		iavf_vtx1_offload(txdp, *tx_pkts++, rs);
+
+		nb_commit = (uint16_t)(nb_commit - n);
+
+		tx_id = 0;
+		txq->next_rs = (uint16_t)(txq->rs_thresh - 1);
+
+		/* avoid reach the end of ring */
+		txdp = &txq->tx_ring[tx_id];
+		txep = (void *)txq->sw_ring;
+		txep += tx_id;
+	}
+
+	tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);
+
+	iavf_vtx_offload(txdp, tx_pkts, nb_commit, flags);
+
+	tx_id = (uint16_t)(tx_id + nb_commit);
+	if (tx_id > txq->next_rs) {
+		txq->tx_ring[txq->next_rs].cmd_type_offset_bsz |=
+			rte_cpu_to_le_64(((uint64_t)IAVF_TX_DESC_CMD_RS) <<
+					 IAVF_TXD_QW1_CMD_SHIFT);
+		txq->next_rs =
+			(uint16_t)(txq->next_rs + txq->rs_thresh);
+	}
+
+	txq->tx_tail = tx_id;
+
+	IAVF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+
+	return nb_pkts;
+}
+
+uint16_t
+iavf_xmit_pkts_vec_avx512_offload(void *tx_queue, struct rte_mbuf **tx_pkts,
+				  uint16_t nb_pkts)
+{
+	uint16_t nb_tx = 0;
+	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
+
+	while (nb_pkts) {
+		uint16_t ret, num;
+
+		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
+		ret = iavf_xmit_fixed_burst_vec_avx512_offload(tx_queue,
+							       &tx_pkts[nb_tx],
+							       num);
+		nb_tx += ret;
+		nb_pkts -= ret;
+		if (ret < num)
+			break;
+	}
+
+	return nb_tx;
+}
diff --git a/drivers/net/iavf/iavf_rxtx_vec_common.h b/drivers/net/iavf/iavf_rxtx_vec_common.h
index 46a1873..54227b6 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_common.h
+++ b/drivers/net/iavf/iavf_rxtx_vec_common.h
@@ -236,14 +236,17 @@
 	if (!txq)
 		return -1;
 
-	if (txq->offloads & IAVF_NO_VECTOR_FLAGS)
-		return -1;
-
 	if (txq->rs_thresh < IAVF_VPMD_TX_MAX_BURST ||
 	    txq->rs_thresh > IAVF_VPMD_TX_MAX_FREE_BUF)
 		return -1;
 
-	return 0;
+	if (txq->offloads & IAVF_TX_NO_VECTOR_FLAGS)
+		return -1;
+
+	if (txq->offloads & IAVF_TX_VECTOR_OFFLOAD)
+		return IAVF_VECTOR_OFFLOAD_PATH;
+
+	return IAVF_VECTOR_PATH;
 }
 
 static inline int
@@ -266,14 +269,97 @@
 {
 	int i;
 	struct iavf_tx_queue *txq;
+	int ret;
+	int result = 0;
 
 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
 		txq = dev->data->tx_queues[i];
-		if (iavf_tx_vec_queue_default(txq))
+		ret = iavf_tx_vec_queue_default(txq);
+
+		if (ret < 0)
 			return -1;
+		else if (ret > result)
+			result = ret;
 	}
 
-	return 0;
+	return result;
+}
+
+/******************************************************************************
+ * If user knows a specific offload is not enabled by APP,
+ * the macro can be commented to save the effort of fast path.
+ * Currently below 2 features are supported in TX path,
+ * 1, checksum offload
+ * 2, VLAN/QINQ insertion
+ ******************************************************************************/
+#define IAVF_TX_CSUM_OFFLOAD
+#define IAVF_TX_VLAN_QINQ_OFFLOAD
+
+static inline void
+iavf_txd_enable_offload(__rte_unused struct rte_mbuf *tx_pkt,
+			uint64_t *txd_hi)
+{
+#if defined(IAVF_TX_CSUM_OFFLOAD) || defined(IAVF_TX_VLAN_QINQ_OFFLOAD)
+	uint64_t ol_flags = tx_pkt->ol_flags;
+#endif
+	uint32_t td_cmd = 0;
+#ifdef IAVF_TX_CSUM_OFFLOAD
+	uint32_t td_offset = 0;
+#endif
+
+#ifdef IAVF_TX_CSUM_OFFLOAD
+	/* Set MACLEN */
+	td_offset |= (tx_pkt->l2_len >> 1) <<
+		     IAVF_TX_DESC_LENGTH_MACLEN_SHIFT;
+
+	/* Enable L3 checksum offloads */
+	if (ol_flags & PKT_TX_IP_CKSUM) {
+		td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV4_CSUM;
+		td_offset |= (tx_pkt->l3_len >> 2) <<
+			     IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
+	} else if (ol_flags & PKT_TX_IPV4) {
+		td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV4;
+		td_offset |= (tx_pkt->l3_len >> 2) <<
+			     IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
+	} else if (ol_flags & PKT_TX_IPV6) {
+		td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV6;
+		td_offset |= (tx_pkt->l3_len >> 2) <<
+			     IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
+	}
+
+	/* Enable L4 checksum offloads */
+	switch (ol_flags & PKT_TX_L4_MASK) {
+	case PKT_TX_TCP_CKSUM:
+		td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_TCP;
+		td_offset |= (sizeof(struct rte_tcp_hdr) >> 2) <<
+			     IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+		break;
+	case PKT_TX_SCTP_CKSUM:
+		td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_SCTP;
+		td_offset |= (sizeof(struct rte_sctp_hdr) >> 2) <<
+			     IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+		break;
+	case PKT_TX_UDP_CKSUM:
+		td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_UDP;
+		td_offset |= (sizeof(struct rte_udp_hdr) >> 2) <<
+			     IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+		break;
+	default:
+		break;
+	}
+
+	*txd_hi |= ((uint64_t)td_offset) << IAVF_TXD_QW1_OFFSET_SHIFT;
+#endif
+
+#ifdef IAVF_TX_VLAN_QINQ_OFFLOAD
+	if (ol_flags & (PKT_TX_VLAN | PKT_TX_QINQ)) {
+		td_cmd |= IAVF_TX_DESC_CMD_IL2TAG1;
+		*txd_hi |= ((uint64_t)tx_pkt->vlan_tci <<
+			    IAVF_TXD_QW1_L2TAG1_SHIFT);
+	}
+#endif
+
+	*txd_hi |= ((uint64_t)td_cmd) << IAVF_TXD_QW1_CMD_SHIFT;
 }
 
 #endif
-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH 3/4] net/iavf: add offload path for Rx AVX512
  2021-03-17  6:48 [dpdk-dev] [PATCH 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
  2021-03-17  6:48 ` [dpdk-dev] [PATCH 1/4] net/iavf: store offload flag of Rx queue Wenzhuo Lu
  2021-03-17  6:48 ` [dpdk-dev] [PATCH 2/4] net/iavf: add offload path for Tx AVX512 Wenzhuo Lu
@ 2021-03-17  6:48 ` Wenzhuo Lu
  2021-03-17  6:48 ` [dpdk-dev] [PATCH 4/4] net/iavf: add offload path for Rx AVX512 flex desc Wenzhuo Lu
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-03-17  6:48 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add a specific path for RX AVX512 (traditional).
In this path, support the HW offload features, like,
checksum, VLAN stripping, RSS hash.
This path is chosen automatically according to the
configuration.

All the code for the above HW offload features is removed
from the legacy path.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 drivers/net/iavf/iavf_rxtx.c            |  81 ++--
 drivers/net/iavf/iavf_rxtx.h            |  12 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 722 ++++++++++++++++++++++++++------
 drivers/net/iavf/iavf_rxtx_vec_common.h |  15 +-
 4 files changed, 682 insertions(+), 148 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 4744c35..741c3cd 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2367,22 +2367,25 @@
 #ifdef RTE_ARCH_X86
 	struct iavf_rx_queue *rxq;
 	int i;
+	int check_ret;
+	bool use_sse = false;
 	bool use_avx2 = false;
 #ifdef CC_AVX512_SUPPORT
 	bool use_avx512 = false;
 #endif
+	bool use_flex = false;
 
-	if (!iavf_rx_vec_dev_check(dev) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-		for (i = 0; i < dev->data->nb_rx_queues; i++) {
-			rxq = dev->data->rx_queues[i];
-			(void)iavf_rxq_vec_setup(rxq);
+	check_ret = iavf_rx_vec_dev_check(dev);
+	if (check_ret >= 0 &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+		if (check_ret == IAVF_VECTOR_PATH) {
+			use_sse = true;
+			if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+			     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+			    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+				use_avx2 = true;
 		}
 
-		if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-		     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-				rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-			use_avx2 = true;
 #ifdef CC_AVX512_SUPPORT
 		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2390,13 +2393,28 @@
 			use_avx512 = true;
 #endif
 
+		if (!use_sse && !use_avx2 && !use_avx512)
+			goto normal;
+
+		if (vf->vf_res->vf_cap_flags &
+			VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			use_flex = true;
+			if (use_avx512 && check_ret == IAVF_VECTOR_OFFLOAD_PATH)
+				use_flex = false;
+		}
+
+		for (i = 0; i < dev->data->nb_rx_queues; i++) {
+			rxq = dev->data->rx_queues[i];
+			(void)iavf_rxq_vec_setup(rxq);
+		}
+
 		if (dev->data->scattered_rx) {
-			PMD_DRV_LOG(DEBUG,
-				    "Using %sVector Scattered Rx (port %d).",
-				    use_avx2 ? "avx2 " : "",
-				    dev->data->port_id);
-			if (vf->vf_res->vf_cap_flags &
-				VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			if (!use_avx512)
+				PMD_DRV_LOG(DEBUG,
+					    "Using %sVector Scattered Rx (port %d).",
+					    use_avx2 ? "avx2 " : "",
+					    dev->data->port_id);
+			if (use_flex) {
 				dev->rx_pkt_burst = use_avx2 ?
 					iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
 					iavf_recv_scattered_pkts_vec_flex_rxd;
@@ -2410,17 +2428,22 @@
 					iavf_recv_scattered_pkts_vec_avx2 :
 					iavf_recv_scattered_pkts_vec;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_scattered_pkts_vec_avx512;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512_offload;
+				}
 #endif
 			}
 		} else {
-			PMD_DRV_LOG(DEBUG, "Using %sVector Rx (port %d).",
-				    use_avx2 ? "avx2 " : "",
-				    dev->data->port_id);
-			if (vf->vf_res->vf_cap_flags &
-				VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			if (!use_avx512)
+				PMD_DRV_LOG(DEBUG, "Using %sVector Rx (port %d).",
+					    use_avx2 ? "avx2 " : "",
+					    dev->data->port_id);
+			if (use_flex) {
 				dev->rx_pkt_burst = use_avx2 ?
 					iavf_recv_pkts_vec_avx2_flex_rxd :
 					iavf_recv_pkts_vec_flex_rxd;
@@ -2434,9 +2457,14 @@
 					iavf_recv_pkts_vec_avx2 :
 					iavf_recv_pkts_vec;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_pkts_vec_avx512;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512_offload;
+				}
 #endif
 			}
 		}
@@ -2445,6 +2473,7 @@
 	}
 #endif
 
+normal:
 	if (dev->data->scattered_rx) {
 		PMD_DRV_LOG(DEBUG, "Using a Scattered Rx callback (port=%d).",
 			    dev->data->port_id);
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index da39f78..b8c90f8 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -34,6 +34,12 @@
 		DEV_TX_OFFLOAD_UDP_CKSUM |		 \
 		DEV_TX_OFFLOAD_TCP_CKSUM)
 
+#define IAVF_RX_VECTOR_OFFLOAD (				 \
+		DEV_RX_OFFLOAD_CHECKSUM |		 \
+		DEV_RX_OFFLOAD_SCTP_CKSUM |		 \
+		DEV_RX_OFFLOAD_VLAN |		 \
+		DEV_RX_OFFLOAD_RSS_HASH)
+
 #define IAVF_VECTOR_PATH 0
 #define IAVF_VECTOR_OFFLOAD_PATH 1
 
@@ -482,12 +488,18 @@ uint16_t iavf_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 int iavf_txq_vec_setup(struct iavf_tx_queue *txq);
 uint16_t iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
 				   uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx512_offload(void *rx_queue,
+					   struct rte_mbuf **rx_pkts,
+					   uint16_t nb_pkts);
 uint16_t iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue,
 					    struct rte_mbuf **rx_pkts,
 					    uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 					     struct rte_mbuf **rx_pkts,
 					     uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
+						     struct rte_mbuf **rx_pkts,
+						     uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
 						      struct rte_mbuf **rx_pkts,
 						      uint16_t nb_pkts);
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 2891a1a..a8338c1 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -13,6 +13,22 @@
 #define IAVF_DESCS_PER_LOOP_AVX 8
 #define PKTLEN_SHIFT 10
 
+/******************************************************************************
+ * If user knows a specific offload is not enabled by APP,
+ * the macro can be commented to save the effort of fast path.
+ * Currently below 2 features are supported in RX path,
+ * 1, checksum offload
+ * 2, VLAN/QINQ stripping
+ * 3, RSS hash
+ * 4, packet type analysis
+ * 5, flow director ID report
+ ******************************************************************************/
+#define IAVF_RX_CSUM_OFFLOAD
+#define IAVF_RX_VLAN_OFFLOAD
+#define IAVF_RX_RSS_OFFLOAD
+#define IAVF_RX_PTYPE_OFFLOAD
+#define IAVF_RX_FDIR_OFFLOAD
+
 static inline void
 iavf_rxq_rearm(struct iavf_rx_queue *rxq)
 {
@@ -146,7 +162,9 @@
 			       struct rte_mbuf **rx_pkts,
 			       uint16_t nb_pkts, uint8_t *split_packet)
 {
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+#endif
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
 						    rxq->mbuf_initializer);
@@ -249,71 +267,6 @@
 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
 
-	/* Status/Error flag masks */
-	/**
-	 * mask everything except RSS, flow director and VLAN flags
-	 * bit2 is for VLAN tag, bit11 for flow director indication
-	 * bit13:12 for RSS indication. Bits 3-5 of error
-	 * field (bits 22-24) are for IP/L4 checksum errors
-	 */
-	const __m256i flags_mask =
-		_mm256_set1_epi32((1 << 2) | (1 << 11) |
-				  (3 << 12) | (7 << 22));
-	/**
-	 * data to be shuffled by result of flag mask. If VLAN bit is set,
-	 * (bit 2), then position 4 in this array will be used in the
-	 * destination
-	 */
-	const __m256i vlan_flags_shuf =
-		_mm256_set_epi32(0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
-				 0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0);
-	/**
-	 * data to be shuffled by result of flag mask, shifted down 11.
-	 * If RSS/FDIR bits are set, shuffle moves appropriate flags in
-	 * place.
-	 */
-	const __m256i rss_flags_shuf =
-		_mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-				PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
-				0, 0, 0, 0, PKT_RX_FDIR, 0,/* end up 128-bits */
-				0, 0, 0, 0, 0, 0, 0, 0,
-				PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
-				0, 0, 0, 0, PKT_RX_FDIR, 0);
-
-	/**
-	 * data to be shuffled by the result of the flags mask shifted by 22
-	 * bits.  This gives use the l3_l4 flags.
-	 */
-	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-			/* shift right 1 bit to make sure it not exceed 255 */
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
-			 PKT_RX_L4_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
-			PKT_RX_IP_CKSUM_BAD >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
-			/* second 128-bits */
-			0, 0, 0, 0, 0, 0, 0, 0,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
-			 PKT_RX_L4_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
-			PKT_RX_IP_CKSUM_BAD >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);
-
-	const __m256i cksum_mask =
-		_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
-				  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
-				  PKT_RX_OUTER_IP_CKSUM_BAD);
-
 	uint16_t i, received;
 
 	for (i = 0, received = 0; i < nb_pkts;
@@ -381,6 +334,7 @@
 		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);
 
 		mb4_7 = _mm512_add_epi16(mb4_7, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/**
 		 * to get packet types, shift 64-bit values down 30 bits
 		 * and so ptype is in lower 8-bits in each
@@ -399,6 +353,7 @@
 			 0, 0, 0, type_table[ptype5],
 			 0, 0, 0, type_table[ptype4]);
 		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
+#endif
 
 		/**
 		 * convert descriptors 0-3 into mbufs, adjusting length and
@@ -412,6 +367,7 @@
 		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);
 
 		mb0_3 = _mm512_add_epi16(mb0_3, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/* get the packet types */
 		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 30);
 		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
@@ -427,6 +383,7 @@
 			 0, 0, 0, type_table[ptype1],
 			 0, 0, 0, type_table[ptype0]);
 		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
+#endif
 
 		/**
 		 * use permute/extract to get status content
@@ -446,27 +403,6 @@
 
 		/* now do flag manipulation */
 
-		/* get only flag/error bits we want */
-		const __m256i flag_bits =
-			_mm256_and_si256(status0_7, flags_mask);
-		/* set vlan and rss flags */
-		const __m256i vlan_flags =
-			_mm256_shuffle_epi8(vlan_flags_shuf, flag_bits);
-		const __m256i rss_flags =
-			_mm256_shuffle_epi8(rss_flags_shuf,
-					    _mm256_srli_epi32(flag_bits, 11));
-		/**
-		 * l3_l4_error flags, shuffle, then shift to correct adjustment
-		 * of flags in flags_shuf, and finally mask out extra bits
-		 */
-		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
-						_mm256_srli_epi32(flag_bits, 22));
-		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
-		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
-
-		/* merge flags */
-		const __m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
-				_mm256_or_si256(rss_flags, vlan_flags));
 		/**
 		 * At this point, we have the 8 sets of flags in the low 16-bits
 		 * of each 32-bit value in vlan0.
@@ -493,21 +429,11 @@
 		const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
 		const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
 
-		rearm6 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(mbuf_flags, 8),
-					    0x04);
-		rearm4 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(mbuf_flags, 4),
-					    0x04);
-		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
-		rearm0 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_srli_si256(mbuf_flags, 4),
-					    0x04);
-		/* permute to add in the rx_descriptor e.g. rss fields */
-		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
-		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
-		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
-		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
+		/* permute to add in the rx_descriptor */
+		rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20);
+		rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20);
+		rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20);
+		rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20);
 		/* write to mbuf */
 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
 				    rearm6);
@@ -519,24 +445,10 @@
 				    rearm0);
 
 		/* repeat for the odd mbufs */
-		const __m256i odd_flags =
-			_mm256_castsi128_si256
-				(_mm256_extracti128_si256(mbuf_flags, 1));
-		rearm7 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(odd_flags, 8),
-					    0x04);
-		rearm5 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(odd_flags, 4),
-					    0x04);
-		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
-		rearm1 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_srli_si256(odd_flags, 4),
-					    0x04);
-		/* since odd mbufs are already in hi 128-bits use blend */
-		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
-		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
-		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
-		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
+		rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0);
+		rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0);
+		rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0);
+		rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0);
 		/* again write to mbufs */
 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
 				    rearm7);
@@ -1397,6 +1309,578 @@
 				rx_pkts + retval, nb_pkts);
 }
 
+static inline uint16_t
+_iavf_recv_raw_pkts_vec_avx512_offload(struct iavf_rx_queue *rxq,
+				       struct rte_mbuf **rx_pkts,
+				       uint16_t nb_pkts, uint8_t *split_packet)
+{
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+#endif
+
+	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
+						    rxq->mbuf_initializer);
+	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
+	volatile union iavf_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
+
+	rte_prefetch0(rxdp);
+
+	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
+
+	/* See if we need to rearm the RX queue - gives the prefetch a bit
+	 * of time to act
+	 */
+	if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH)
+		iavf_rxq_rearm(rxq);
+
+	/* Before we start moving massive data around, check to see if
+	 * there is actually a packet available
+	 */
+	if (!(rxdp->wb.qword1.status_error_len &
+	      rte_cpu_to_le_32(1 << IAVF_RX_DESC_STATUS_DD_SHIFT)))
+		return 0;
+
+	/* constants used in processing loop */
+	const __m512i crc_adjust =
+		_mm512_set_epi32
+			(/* 1st descriptor */
+			 0,             /* ignore non-length fields */
+			 -rxq->crc_len, /* sub crc on data_len */
+			 -rxq->crc_len, /* sub crc on pkt_len */
+			 0,             /* ignore pkt_type field */
+			 /* 2nd descriptor */
+			 0,             /* ignore non-length fields */
+			 -rxq->crc_len, /* sub crc on data_len */
+			 -rxq->crc_len, /* sub crc on pkt_len */
+			 0,             /* ignore pkt_type field */
+			 /* 3rd descriptor */
+			 0,             /* ignore non-length fields */
+			 -rxq->crc_len, /* sub crc on data_len */
+			 -rxq->crc_len, /* sub crc on pkt_len */
+			 0,             /* ignore pkt_type field */
+			 /* 4th descriptor */
+			 0,             /* ignore non-length fields */
+			 -rxq->crc_len, /* sub crc on data_len */
+			 -rxq->crc_len, /* sub crc on pkt_len */
+			 0              /* ignore pkt_type field */
+			);
+
+	/* 8 packets DD mask, LSB in each 32-bit value */
+	const __m256i dd_check = _mm256_set1_epi32(1);
+
+	/* 8 packets EOP mask, second-LSB in each 32-bit value */
+	const __m256i eop_check = _mm256_slli_epi32(dd_check,
+			IAVF_RX_DESC_STATUS_EOF_SHIFT);
+
+	/* mask to shuffle from desc. to mbuf (4 descriptors)*/
+	const __m512i shuf_msk =
+		_mm512_set_epi32
+			(/* 1st descriptor */
+			 0x07060504,    /* octet 4~7, 32bits rss */
+			 0x03020F0E,    /* octet 2~3, low 16 bits vlan_macip */
+					/* octet 15~14, 16 bits data_len */
+			 0xFFFF0F0E,    /* skip high 16 bits pkt_len, zero out */
+					/* octet 15~14, low 16 bits pkt_len */
+			 0xFFFFFFFF,    /* pkt_type set as unknown */
+			 /* 2nd descriptor */
+			 0x07060504,    /* octet 4~7, 32bits rss */
+			 0x03020F0E,    /* octet 2~3, low 16 bits vlan_macip */
+					/* octet 15~14, 16 bits data_len */
+			 0xFFFF0F0E,    /* skip high 16 bits pkt_len, zero out */
+					/* octet 15~14, low 16 bits pkt_len */
+			 0xFFFFFFFF,    /* pkt_type set as unknown */
+			 /* 3rd descriptor */
+			 0x07060504,    /* octet 4~7, 32bits rss */
+			 0x03020F0E,    /* octet 2~3, low 16 bits vlan_macip */
+					/* octet 15~14, 16 bits data_len */
+			 0xFFFF0F0E,    /* skip high 16 bits pkt_len, zero out */
+					/* octet 15~14, low 16 bits pkt_len */
+			 0xFFFFFFFF,    /* pkt_type set as unknown */
+			 /* 4th descriptor */
+			 0x07060504,    /* octet 4~7, 32bits rss */
+			 0x03020F0E,    /* octet 2~3, low 16 bits vlan_macip */
+					/* octet 15~14, 16 bits data_len */
+			 0xFFFF0F0E,    /* skip high 16 bits pkt_len, zero out */
+					/* octet 15~14, low 16 bits pkt_len */
+			 0xFFFFFFFF     /* pkt_type set as unknown */
+			);
+	/**
+	 * compile-time check the above crc and shuffle layout is correct.
+	 * NOTE: the first field (lowest address) is given last in set_epi
+	 * calls above.
+	 */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
+
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+	/* Status/Error flag masks */
+	/**
+	 * mask everything except RSS, flow director and VLAN flags
+	 * bit2 is for VLAN tag, bit11 for flow director indication
+	 * bit13:12 for RSS indication. Bits 3-5 of error
+	 * field (bits 22-24) are for IP/L4 checksum errors
+	 */
+	const __m256i flags_mask =
+		_mm256_set1_epi32((1 << 2) | (1 << 11) |
+				  (3 << 12) | (7 << 22));
+#endif
+
+#ifdef IAVF_RX_VLAN_OFFLOAD
+	/**
+	 * data to be shuffled by result of flag mask. If VLAN bit is set,
+	 * (bit 2), then position 4 in this array will be used in the
+	 * destination
+	 */
+	const __m256i vlan_flags_shuf =
+		_mm256_set_epi32(0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
+				 0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0);
+#endif
+
+#ifdef IAVF_RX_RSS_OFFLOAD
+	/**
+	 * data to be shuffled by result of flag mask, shifted down 11.
+	 * If RSS/FDIR bits are set, shuffle moves appropriate flags in
+	 * place.
+	 */
+	const __m256i rss_flags_shuf =
+		_mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+				PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
+				0, 0, 0, 0, PKT_RX_FDIR, 0,/* end up 128-bits */
+				0, 0, 0, 0, 0, 0, 0, 0,
+				PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
+				0, 0, 0, 0, PKT_RX_FDIR, 0);
+#endif
+
+#ifdef IAVF_RX_CSUM_OFFLOAD
+	/**
+	 * data to be shuffled by the result of the flags mask shifted by 22
+	 * bits.  This gives use the l3_l4 flags.
+	 */
+	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+			/* shift right 1 bit to make sure it not exceed 255 */
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+			 PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
+			 PKT_RX_L4_CKSUM_BAD) >> 1,
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
+			PKT_RX_IP_CKSUM_BAD >> 1,
+			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
+			/* second 128-bits */
+			0, 0, 0, 0, 0, 0, 0, 0,
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+			 PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
+			 PKT_RX_L4_CKSUM_BAD) >> 1,
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
+			PKT_RX_IP_CKSUM_BAD >> 1,
+			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);
+
+	const __m256i cksum_mask =
+		_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
+				  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
+				  PKT_RX_OUTER_IP_CKSUM_BAD);
+#endif
+
+	uint16_t i, received;
+
+	for (i = 0, received = 0; i < nb_pkts;
+	     i += IAVF_DESCS_PER_LOOP_AVX,
+	     rxdp += IAVF_DESCS_PER_LOOP_AVX) {
+		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
+		_mm256_storeu_si256((void *)&rx_pkts[i],
+				    _mm256_loadu_si256((void *)&sw_ring[i]));
+#ifdef RTE_ARCH_X86_64
+		_mm256_storeu_si256
+			((void *)&rx_pkts[i + 4],
+			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
+#endif
+
+		__m512i raw_desc0_3, raw_desc4_7;
+		const __m128i raw_desc7 =
+			_mm_load_si128((void *)(rxdp + 7));
+		rte_compiler_barrier();
+		const __m128i raw_desc6 =
+			_mm_load_si128((void *)(rxdp + 6));
+		rte_compiler_barrier();
+		const __m128i raw_desc5 =
+			_mm_load_si128((void *)(rxdp + 5));
+		rte_compiler_barrier();
+		const __m128i raw_desc4 =
+			_mm_load_si128((void *)(rxdp + 4));
+		rte_compiler_barrier();
+		const __m128i raw_desc3 =
+			_mm_load_si128((void *)(rxdp + 3));
+		rte_compiler_barrier();
+		const __m128i raw_desc2 =
+			_mm_load_si128((void *)(rxdp + 2));
+		rte_compiler_barrier();
+		const __m128i raw_desc1 =
+			_mm_load_si128((void *)(rxdp + 1));
+		rte_compiler_barrier();
+		const __m128i raw_desc0 =
+			_mm_load_si128((void *)(rxdp + 0));
+
+		raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4);
+		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1);
+		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2);
+		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3);
+		raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0);
+		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1);
+		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2);
+		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3);
+
+		if (split_packet) {
+			int j;
+
+			for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++)
+				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
+		}
+
+		/**
+		 * convert descriptors 4-7 into mbufs, adjusting length and
+		 * re-arranging fields. Then write into the mbuf
+		 */
+		const __m512i len4_7 = _mm512_slli_epi32(raw_desc4_7,
+							 PKTLEN_SHIFT);
+		const __m512i desc4_7 = _mm512_mask_blend_epi16(IAVF_RX_LEN_MASK,
+								raw_desc4_7,
+								len4_7);
+		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);
+
+		mb4_7 = _mm512_add_epi16(mb4_7, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+		/**
+		 * to get packet types, shift 64-bit values down 30 bits
+		 * and so ptype is in lower 8-bits in each
+		 */
+		const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 30);
+		const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
+		const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
+		const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 24);
+		const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 8);
+		const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 24);
+		const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 8);
+
+		const __m512i ptype4_7 = _mm512_set_epi32
+			(0, 0, 0, type_table[ptype7],
+			 0, 0, 0, type_table[ptype6],
+			 0, 0, 0, type_table[ptype5],
+			 0, 0, 0, type_table[ptype4]);
+		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
+#endif
+
+		/**
+		 * convert descriptors 0-3 into mbufs, adjusting length and
+		 * re-arranging fields. Then write into the mbuf
+		 */
+		const __m512i len0_3 = _mm512_slli_epi32(raw_desc0_3,
+							 PKTLEN_SHIFT);
+		const __m512i desc0_3 = _mm512_mask_blend_epi16(IAVF_RX_LEN_MASK,
+								raw_desc0_3,
+								len0_3);
+		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);
+
+		mb0_3 = _mm512_add_epi16(mb0_3, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+		/* get the packet types */
+		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 30);
+		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
+		const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
+		const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 24);
+		const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 8);
+		const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 24);
+		const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 8);
+
+		const __m512i ptype0_3 = _mm512_set_epi32
+			(0, 0, 0, type_table[ptype3],
+			 0, 0, 0, type_table[ptype2],
+			 0, 0, 0, type_table[ptype1],
+			 0, 0, 0, type_table[ptype0]);
+		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
+#endif
+
+		/**
+		 * use permute/extract to get status content
+		 * After the operations, the packets status flags are in the
+		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
+		 */
+		/* merge the status bits into one register */
+		const __m512i status_permute_msk = _mm512_set_epi32
+			(0, 0, 0, 0,
+			 0, 0, 0, 0,
+			 22, 30, 6, 14,
+			 18, 26, 2, 10);
+		const __m512i raw_status0_7 = _mm512_permutex2var_epi32
+			(raw_desc4_7, status_permute_msk, raw_desc0_3);
+		__m256i status0_7 = _mm512_extracti64x4_epi64
+			(raw_status0_7, 0);
+
+		/* now do flag manipulation */
+
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+		/* get only flag/error bits we want */
+		const __m256i flag_bits =
+			_mm256_and_si256(status0_7, flags_mask);
+#endif
+		/* set vlan and rss flags */
+#ifdef IAVF_RX_VLAN_OFFLOAD
+		const __m256i vlan_flags =
+			_mm256_shuffle_epi8(vlan_flags_shuf, flag_bits);
+#endif
+#ifdef IAVF_RX_RSS_OFFLOAD
+		const __m256i rss_flags =
+			_mm256_shuffle_epi8(rss_flags_shuf,
+					    _mm256_srli_epi32(flag_bits, 11));
+#endif
+#ifdef IAVF_RX_CSUM_OFFLOAD
+		/**
+		 * l3_l4_error flags, shuffle, then shift to correct adjustment
+		 * of flags in flags_shuf, and finally mask out extra bits
+		 */
+		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
+						_mm256_srli_epi32(flag_bits, 22));
+		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
+		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
+#endif
+
+		/* merge flags */
+		__m256i mbuf_flags = _mm256_set1_epi32(0);
+#ifdef IAVF_RX_CSUM_OFFLOAD
+		mbuf_flags = _mm256_or_si256(mbuf_flags, l3_l4_flags);
+#endif
+#ifdef IAVF_RX_RSS_OFFLOAD
+		mbuf_flags = _mm256_or_si256(mbuf_flags, rss_flags);
+#endif
+#ifdef IAVF_RX_VLAN_OFFLOAD
+		mbuf_flags = _mm256_or_si256(mbuf_flags, vlan_flags);
+#endif
+
+		/**
+		 * At this point, we have the 8 sets of flags in the low 16-bits
+		 * of each 32-bit value in vlan0.
+		 * We want to extract these, and merge them with the mbuf init
+		 * data so we can do a single write to the mbuf to set the flags
+		 * and all the other initialization fields. Extracting the
+		 * appropriate flags means that we have to do a shift and blend
+		 * for each mbuf before we do the write. However, we can also
+		 * add in the previously computed rx_descriptor fields to
+		 * make a single 256-bit write per mbuf
+		 */
+		/* check the structure matches expectations */
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
+				 offsetof(struct rte_mbuf, rearm_data) + 8);
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
+				 RTE_ALIGN(offsetof(struct rte_mbuf,
+						    rearm_data),
+						    16));
+		/* build up data and do writes */
+		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
+			rearm6, rearm7;
+		const __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
+		const __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
+		const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
+		const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
+
+		rearm6 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(mbuf_flags, 8),
+					    0x04);
+		rearm4 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(mbuf_flags, 4),
+					    0x04);
+		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
+		rearm0 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_srli_si256(mbuf_flags, 4),
+					    0x04);
+		/* permute to add in the rx_descriptor e.g. rss fields */
+		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
+		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
+		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
+		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
+		/* write to mbuf */
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
+				    rearm6);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
+				    rearm4);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
+				    rearm2);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
+				    rearm0);
+
+		/* repeat for the odd mbufs */
+		const __m256i odd_flags =
+			_mm256_castsi128_si256
+				(_mm256_extracti128_si256(mbuf_flags, 1));
+		rearm7 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(odd_flags, 8),
+					    0x04);
+		rearm5 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(odd_flags, 4),
+					    0x04);
+		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
+		rearm1 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_srli_si256(odd_flags, 4),
+					    0x04);
+		/* since odd mbufs are already in hi 128-bits use blend */
+		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
+		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
+		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
+		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
+		/* again write to mbufs */
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
+				    rearm7);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
+				    rearm5);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
+				    rearm3);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
+				    rearm1);
+
+		/* extract and record EOP bit */
+		if (split_packet) {
+			const __m128i eop_mask =
+				_mm_set1_epi16(1 << IAVF_RX_DESC_STATUS_EOF_SHIFT);
+			const __m256i eop_bits256 = _mm256_and_si256(status0_7,
+								     eop_check);
+			/* pack status bits into a single 128-bit register */
+			const __m128i eop_bits =
+				_mm_packus_epi32
+					(_mm256_castsi256_si128(eop_bits256),
+					 _mm256_extractf128_si256(eop_bits256,
+								  1));
+			/**
+			 * flip bits, and mask out the EOP bit, which is now
+			 * a split-packet bit i.e. !EOP, rather than EOP one.
+			 */
+			__m128i split_bits = _mm_andnot_si128(eop_bits,
+							      eop_mask);
+			/**
+			 * eop bits are out of order, so we need to shuffle them
+			 * back into order again. In doing so, only use low 8
+			 * bits, which acts like another pack instruction
+			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
+			 * [Since we use epi8, the 16-bit positions are
+			 * multiplied by 2 in the eop_shuffle value.]
+			 */
+			__m128i eop_shuffle =
+				_mm_set_epi8(/* zero hi 64b */
+					     0xFF, 0xFF, 0xFF, 0xFF,
+					     0xFF, 0xFF, 0xFF, 0xFF,
+					     /* move values to lo 64b */
+					     8, 0, 10, 2,
+					     12, 4, 14, 6);
+			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
+			*(uint64_t *)split_packet =
+				_mm_cvtsi128_si64(split_bits);
+			split_packet += IAVF_DESCS_PER_LOOP_AVX;
+		}
+
+		/* perform dd_check */
+		status0_7 = _mm256_and_si256(status0_7, dd_check);
+		status0_7 = _mm256_packs_epi32(status0_7,
+					       _mm256_setzero_si256());
+
+		uint64_t burst = __builtin_popcountll
+					(_mm_cvtsi128_si64
+						(_mm256_extracti128_si256
+							(status0_7, 1)));
+		burst += __builtin_popcountll
+				(_mm_cvtsi128_si64
+					(_mm256_castsi256_si128(status0_7)));
+		received += burst;
+		if (burst != IAVF_DESCS_PER_LOOP_AVX)
+			break;
+	}
+
+	/* update tail pointers */
+	rxq->rx_tail += received;
+	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
+	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */
+		rxq->rx_tail--;
+		received--;
+	}
+	rxq->rxrearm_nb += received;
+	return received;
+}
+
+uint16_t
+iavf_recv_pkts_vec_avx512_offload(void *rx_queue, struct rte_mbuf **rx_pkts,
+				  uint16_t nb_pkts)
+{
+	return _iavf_recv_raw_pkts_vec_avx512_offload(rx_queue, rx_pkts,
+						      nb_pkts, NULL);
+}
+
+static uint16_t
+iavf_recv_scattered_burst_vec_avx512_offload(void *rx_queue,
+					     struct rte_mbuf **rx_pkts,
+					     uint16_t nb_pkts)
+{
+	struct iavf_rx_queue *rxq = rx_queue;
+	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
+
+	/* get some new buffers */
+	uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx512_offload(rxq, rx_pkts,
+								  nb_pkts,
+								  split_flags);
+	if (nb_bufs == 0)
+		return 0;
+
+	/* happy day case, full burst + no packets to be joined */
+	const uint64_t *split_fl64 = (uint64_t *)split_flags;
+
+	if (!rxq->pkt_first_seg &&
+	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
+	    split_fl64[2] == 0 && split_fl64[3] == 0)
+		return nb_bufs;
+
+	/* reassemble any packets that need reassembly*/
+	unsigned int i = 0;
+
+	if (!rxq->pkt_first_seg) {
+		/* find the first split flag, and only reassemble then*/
+		while (i < nb_bufs && !split_flags[i])
+			i++;
+		if (i == nb_bufs)
+			return nb_bufs;
+		rxq->pkt_first_seg = rx_pkts[i];
+	}
+	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
+				      &split_flags[i]);
+}
+
+uint16_t
+iavf_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
+					    struct rte_mbuf **rx_pkts,
+					    uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
+		uint16_t burst =
+			iavf_recv_scattered_burst_vec_avx512_offload(rx_queue,
+				rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < IAVF_VPMD_RX_MAX_BURST)
+			return retval;
+	}
+	return retval + iavf_recv_scattered_burst_vec_avx512_offload(rx_queue,
+				rx_pkts + retval, nb_pkts);
+}
+
 static __rte_always_inline int
 iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 {
diff --git a/drivers/net/iavf/iavf_rxtx_vec_common.h b/drivers/net/iavf/iavf_rxtx_vec_common.h
index 54227b6..68f03c8 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_common.h
+++ b/drivers/net/iavf/iavf_rxtx_vec_common.h
@@ -227,7 +227,10 @@
 	if (rxq->proto_xtr != IAVF_PROTO_XTR_NONE)
 		return -1;
 
-	return 0;
+	if (rxq->offloads & IAVF_RX_VECTOR_OFFLOAD)
+		return IAVF_VECTOR_OFFLOAD_PATH;
+
+	return IAVF_VECTOR_PATH;
 }
 
 static inline int
@@ -254,14 +257,20 @@
 {
 	int i;
 	struct iavf_rx_queue *rxq;
+	int ret;
+	int result = 0;
 
 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
 		rxq = dev->data->rx_queues[i];
-		if (iavf_rx_vec_queue_default(rxq))
+		ret = iavf_rx_vec_queue_default(rxq);
+
+		if (ret < 0)
 			return -1;
+		else if (ret > result)
+			result = ret;
 	}
 
-	return 0;
+	return result;
 }
 
 static inline int
-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH 4/4] net/iavf: add offload path for Rx AVX512 flex desc
  2021-03-17  6:48 [dpdk-dev] [PATCH 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
                   ` (2 preceding siblings ...)
  2021-03-17  6:48 ` [dpdk-dev] [PATCH 3/4] net/iavf: add offload path for Rx AVX512 Wenzhuo Lu
@ 2021-03-17  6:48 ` Wenzhuo Lu
  2021-03-18  5:24 ` [dpdk-dev] [PATCH v2 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
  2021-03-26  1:31 ` [dpdk-dev] [PATCH v3 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
  5 siblings, 0 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-03-17  6:48 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add a specific path for RX AVX512 (flexible descriptor).
In this path, support the HW offload features, like,
checksum, VLAN stripping, RSS hash.
This path is chosen automatically according to the
configuration.

All the code for the above HW offload features is removed
from the legacy path.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 doc/guides/rel_notes/release_21_05.rst  |   7 +
 drivers/net/iavf/iavf_rxtx.c            |  27 +-
 drivers/net/iavf/iavf_rxtx.h            |   6 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 899 +++++++++++++++++++++++++-------
 4 files changed, 746 insertions(+), 193 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_05.rst b/doc/guides/rel_notes/release_21_05.rst
index 21dc6d2..a7769f8 100644
--- a/doc/guides/rel_notes/release_21_05.rst
+++ b/doc/guides/rel_notes/release_21_05.rst
@@ -83,6 +83,13 @@ New Features
   * Added command to display Rx queue used descriptor count.
     ``show port (port_id) rxq (queue_id) desc used count``
 
+* **Added the offload paths for IAVF AVX512.**
+
+  * Added the new RX and TX paths to use the HW offload features. When the HW
+    offload features are configured to be used, the offload paths are chosen
+    automatically.
+  * The code of HW offload features is removed from the legacy paths.
+
 
 Removed Items
 -------------
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 741c3cd..a39c110 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2397,11 +2397,8 @@
 			goto normal;
 
 		if (vf->vf_res->vf_cap_flags &
-			VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
 			use_flex = true;
-			if (use_avx512 && check_ret == IAVF_VECTOR_OFFLOAD_PATH)
-				use_flex = false;
-		}
 
 		for (i = 0; i < dev->data->nb_rx_queues; i++) {
 			rxq = dev->data->rx_queues[i];
@@ -2419,9 +2416,14 @@
 					iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
 					iavf_recv_scattered_pkts_vec_flex_rxd;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload;
+				}
 #endif
 			} else {
 				dev->rx_pkt_burst = use_avx2 ?
@@ -2448,9 +2450,14 @@
 					iavf_recv_pkts_vec_avx2_flex_rxd :
 					iavf_recv_pkts_vec_flex_rxd;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_pkts_vec_avx512_flex_rxd;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512_flex_rxd;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512_flex_rxd_offload;
+				}
 #endif
 			} else {
 				dev->rx_pkt_burst = use_avx2 ?
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index b8c90f8..42e9f79 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -494,6 +494,9 @@ uint16_t iavf_recv_pkts_vec_avx512_offload(void *rx_queue,
 uint16_t iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue,
 					    struct rte_mbuf **rx_pkts,
 					    uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+						    struct rte_mbuf **rx_pkts,
+						    uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 					     struct rte_mbuf **rx_pkts,
 					     uint16_t nb_pkts);
@@ -503,6 +506,9 @@ uint16_t iavf_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
 uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
 						      struct rte_mbuf **rx_pkts,
 						      uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+							      struct rte_mbuf **rx_pkts,
+							      uint16_t nb_pkts);
 uint16_t iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 				   uint16_t nb_pkts);
 uint16_t iavf_xmit_pkts_vec_avx512_offload(void *tx_queue,
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index a8338c1..2dd8c65 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -550,7 +550,9 @@
 					struct rte_mbuf **rx_pkts,
 					uint16_t nb_pkts, uint8_t *split_packet)
 {
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+#endif
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
 						    rxq->mbuf_initializer);
@@ -654,71 +656,6 @@
 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
 
-	/* Status/Error flag masks */
-	/**
-	 * mask everything except Checksum Reports, RSS indication
-	 * and VLAN indication.
-	 * bit6:4 for IP/L4 checksum errors.
-	 * bit12 is for RSS indication.
-	 * bit13 is for VLAN indication.
-	 */
-	const __m256i flags_mask =
-		_mm256_set1_epi32((7 << 4) | (1 << 12) | (1 << 13));
-	/**
-	 * data to be shuffled by the result of the flags mask shifted by 4
-	 * bits.  This gives use the l3_l4 flags.
-	 */
-	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-			/* shift right 1 bit to make sure it not exceed 255 */
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
-			 PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1,
-			/* second 128-bits */
-			0, 0, 0, 0, 0, 0, 0, 0,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
-			 PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1);
-	const __m256i cksum_mask =
-		_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
-				  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
-				  PKT_RX_OUTER_IP_CKSUM_BAD);
-	/**
-	 * data to be shuffled by result of flag mask, shifted down 12.
-	 * If RSS(bit12)/VLAN(bit13) are set,
-	 * shuffle moves appropriate flags in place.
-	 */
-	const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
-			0, 0, 0, 0,
-			0, 0, 0, 0,
-			PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			PKT_RX_RSS_HASH, 0,
-			/* end up 128-bits */
-			0, 0, 0, 0,
-			0, 0, 0, 0,
-			0, 0, 0, 0,
-			PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			PKT_RX_RSS_HASH, 0);
-
 	uint16_t i, received;
 
 	for (i = 0, received = 0; i < nb_pkts;
@@ -782,6 +719,7 @@
 		__m512i mb4_7 = _mm512_shuffle_epi8(raw_desc4_7, shuf_msk);
 
 		mb4_7 = _mm512_add_epi16(mb4_7, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/**
 		 * to get packet types, ptype is located in bit16-25
 		 * of each 128bits
@@ -803,6 +741,7 @@
 			 0, 0, 0, type_table[ptype5],
 			 0, 0, 0, type_table[ptype4]);
 		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
+#endif
 
 		/**
 		 * convert descriptors 0-3 into mbufs, re-arrange fields.
@@ -811,6 +750,7 @@
 		__m512i mb0_3 = _mm512_shuffle_epi8(raw_desc0_3, shuf_msk);
 
 		mb0_3 = _mm512_add_epi16(mb0_3, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/**
 		 * to get packet types, ptype is located in bit16-25
 		 * of each 128bits
@@ -830,6 +770,7 @@
 			 0, 0, 0, type_table[ptype1],
 			 0, 0, 0, type_table[ptype0]);
 		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
+#endif
 
 		/**
 		 * use permute/extract to get status content
@@ -847,30 +788,9 @@
 		__m256i status0_7 = _mm512_extracti64x4_epi64
 			(raw_status0_7, 0);
 
-		/* now do flag manipulation */
-
-		/* get only flag/error bits we want */
-		const __m256i flag_bits =
-			_mm256_and_si256(status0_7, flags_mask);
-		/**
-		 * l3_l4_error flags, shuffle, then shift to correct adjustment
-		 * of flags in flags_shuf, and finally mask out extra bits
-		 */
-		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
-				_mm256_srli_epi32(flag_bits, 4));
-		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
-		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
-		/* set rss and vlan flags */
-		const __m256i rss_vlan_flag_bits =
-			_mm256_srli_epi32(flag_bits, 12);
-		const __m256i rss_vlan_flags =
-			_mm256_shuffle_epi8(rss_vlan_flags_shuf,
-					    rss_vlan_flag_bits);
-
-		/* merge flags */
-		__m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
-						     rss_vlan_flags);
+		__m256i mbuf_flags = _mm256_set1_epi32(0);
 
+#ifdef IAVF_RX_FDIR_OFFLOAD
 		if (rxq->fdir_enabled) {
 			const __m512i fdir_permute_mask = _mm512_set_epi32
 				(0, 0, 0, 0,
@@ -881,12 +801,9 @@
 				(raw_desc0_3, fdir_permute_mask, raw_desc4_7);
 			const __m256i fdir_id0_7 = _mm512_extracti64x4_epi64
 				(fdir_tmp, 0);
-			const __m256i fdir_flags =
+			mbuf_flags =
 				flex_rxd_to_fdir_flags_vec_avx512(fdir_id0_7);
 
-			/* merge with fdir_flags */
-			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_flags);
-
 			/* write to mbuf: have to use scalar store here */
 			rx_pkts[i + 0]->hash.fdir.hi =
 				_mm256_extract_epi32(fdir_id0_7, 3);
@@ -912,102 +829,13 @@
 			rx_pkts[i + 7]->hash.fdir.hi =
 				_mm256_extract_epi32(fdir_id0_7, 4);
 		} /* if() on fdir_enabled */
+#endif
 
 		__m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
 		__m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
 		__m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
 		__m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
 
-#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
-		/**
-		 * needs to load 2nd 16B of each desc for RSS hash parsing,
-		 * will cause performance drop to get into this context.
-		 */
-		if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads &
-		    DEV_RX_OFFLOAD_RSS_HASH) {
-			/* load bottom half of every 32B desc */
-			const __m128i raw_desc_bh7 =
-				_mm_load_si128
-					((void *)(&rxdp[7].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh6 =
-				_mm_load_si128
-					((void *)(&rxdp[6].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh5 =
-				_mm_load_si128
-					((void *)(&rxdp[5].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh4 =
-				_mm_load_si128
-					((void *)(&rxdp[4].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh3 =
-				_mm_load_si128
-					((void *)(&rxdp[3].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh2 =
-				_mm_load_si128
-					((void *)(&rxdp[2].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh1 =
-				_mm_load_si128
-					((void *)(&rxdp[1].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh0 =
-				_mm_load_si128
-					((void *)(&rxdp[0].wb.status_error1));
-
-			__m256i raw_desc_bh6_7 =
-				_mm256_inserti128_si256
-					(_mm256_castsi128_si256(raw_desc_bh6),
-					 raw_desc_bh7, 1);
-			__m256i raw_desc_bh4_5 =
-				_mm256_inserti128_si256
-					(_mm256_castsi128_si256(raw_desc_bh4),
-					 raw_desc_bh5, 1);
-			__m256i raw_desc_bh2_3 =
-				_mm256_inserti128_si256
-					(_mm256_castsi128_si256(raw_desc_bh2),
-					 raw_desc_bh3, 1);
-			__m256i raw_desc_bh0_1 =
-				_mm256_inserti128_si256
-					(_mm256_castsi128_si256(raw_desc_bh0),
-					 raw_desc_bh1, 1);
-
-			/**
-			 * to shift the 32b RSS hash value to the
-			 * highest 32b of each 128b before mask
-			 */
-			__m256i rss_hash6_7 =
-				_mm256_slli_epi64(raw_desc_bh6_7, 32);
-			__m256i rss_hash4_5 =
-				_mm256_slli_epi64(raw_desc_bh4_5, 32);
-			__m256i rss_hash2_3 =
-				_mm256_slli_epi64(raw_desc_bh2_3, 32);
-			__m256i rss_hash0_1 =
-				_mm256_slli_epi64(raw_desc_bh0_1, 32);
-
-			__m256i rss_hash_msk =
-				_mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
-						 0xFFFFFFFF, 0, 0, 0);
-
-			rss_hash6_7 = _mm256_and_si256
-					(rss_hash6_7, rss_hash_msk);
-			rss_hash4_5 = _mm256_and_si256
-					(rss_hash4_5, rss_hash_msk);
-			rss_hash2_3 = _mm256_and_si256
-					(rss_hash2_3, rss_hash_msk);
-			rss_hash0_1 = _mm256_and_si256
-					(rss_hash0_1, rss_hash_msk);
-
-			mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
-			mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
-			mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
-			mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
-		} /* if() on RSS hash parsing */
-#endif
-
 		/**
 		 * At this point, we have the 8 sets of flags in the low 16-bits
 		 * of each 32-bit value in vlan0.
@@ -1039,7 +867,7 @@
 		rearm0 = _mm256_blend_epi32(mbuf_init,
 					    _mm256_srli_si256(mbuf_flags, 4),
 					    0x04);
-		/* permute to add in the rx_descriptor e.g. rss fields */
+		/* permute to add in the rx_descriptor */
 		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
 		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
 		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
@@ -1881,6 +1709,711 @@
 				rx_pkts + retval, nb_pkts);
 }
 
+static inline uint16_t
+_iavf_recv_raw_pkts_vec_avx512_flex_rxd_offload(struct iavf_rx_queue *rxq,
+						struct rte_mbuf **rx_pkts,
+						uint16_t nb_pkts,
+						uint8_t *split_packet)
+{
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+#endif
+
+	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
+						    rxq->mbuf_initializer);
+	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
+	volatile union iavf_rx_flex_desc *rxdp =
+		(union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
+
+	rte_prefetch0(rxdp);
+
+	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
+
+	/* See if we need to rearm the RX queue - gives the prefetch a bit
+	 * of time to act
+	 */
+	if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH)
+		iavf_rxq_rearm(rxq);
+
+	/* Before we start moving massive data around, check to see if
+	 * there is actually a packet available
+	 */
+	if (!(rxdp->wb.status_error0 &
+	      rte_cpu_to_le_32(1 << IAVF_RX_FLEX_DESC_STATUS0_DD_S)))
+		return 0;
+
+	/* constants used in processing loop */
+	const __m512i crc_adjust =
+		_mm512_set_epi32
+			(/* 1st descriptor */
+			 0,             /* ignore non-length fields */
+			 -rxq->crc_len, /* sub crc on data_len */
+			 -rxq->crc_len, /* sub crc on pkt_len */
+			 0,             /* ignore pkt_type field */
+			 /* 2nd descriptor */
+			 0,             /* ignore non-length fields */
+			 -rxq->crc_len, /* sub crc on data_len */
+			 -rxq->crc_len, /* sub crc on pkt_len */
+			 0,             /* ignore pkt_type field */
+			 /* 3rd descriptor */
+			 0,             /* ignore non-length fields */
+			 -rxq->crc_len, /* sub crc on data_len */
+			 -rxq->crc_len, /* sub crc on pkt_len */
+			 0,             /* ignore pkt_type field */
+			 /* 4th descriptor */
+			 0,             /* ignore non-length fields */
+			 -rxq->crc_len, /* sub crc on data_len */
+			 -rxq->crc_len, /* sub crc on pkt_len */
+			 0              /* ignore pkt_type field */
+			);
+
+	/* 8 packets DD mask, LSB in each 32-bit value */
+	const __m256i dd_check = _mm256_set1_epi32(1);
+
+	/* 8 packets EOP mask, second-LSB in each 32-bit value */
+	const __m256i eop_check = _mm256_slli_epi32(dd_check,
+			IAVF_RX_FLEX_DESC_STATUS0_EOF_S);
+
+	/* mask to shuffle from desc. to mbuf (4 descriptors)*/
+	const __m512i shuf_msk =
+		_mm512_set_epi32
+			(/* 1st descriptor */
+			 0xFFFFFFFF,    /* rss hash parsed separately */
+			 0x0B0A0504,    /* octet 10~11, 16 bits vlan_macip */
+					/* octet 4~5, 16 bits data_len */
+			 0xFFFF0504,    /* skip hi 16 bits pkt_len, zero out */
+					/* octet 4~5, 16 bits pkt_len */
+			 0xFFFFFFFF,    /* pkt_type set as unknown */
+			 /* 2nd descriptor */
+			 0xFFFFFFFF,    /* rss hash parsed separately */
+			 0x0B0A0504,    /* octet 10~11, 16 bits vlan_macip */
+					/* octet 4~5, 16 bits data_len */
+			 0xFFFF0504,    /* skip hi 16 bits pkt_len, zero out */
+					/* octet 4~5, 16 bits pkt_len */
+			 0xFFFFFFFF,    /* pkt_type set as unknown */
+			 /* 3rd descriptor */
+			 0xFFFFFFFF,    /* rss hash parsed separately */
+			 0x0B0A0504,    /* octet 10~11, 16 bits vlan_macip */
+					/* octet 4~5, 16 bits data_len */
+			 0xFFFF0504,    /* skip hi 16 bits pkt_len, zero out */
+					/* octet 4~5, 16 bits pkt_len */
+			 0xFFFFFFFF,    /* pkt_type set as unknown */
+			 /* 4th descriptor */
+			 0xFFFFFFFF,    /* rss hash parsed separately */
+			 0x0B0A0504,    /* octet 10~11, 16 bits vlan_macip */
+					/* octet 4~5, 16 bits data_len */
+			 0xFFFF0504,    /* skip hi 16 bits pkt_len, zero out */
+					/* octet 4~5, 16 bits pkt_len */
+			 0xFFFFFFFF     /* pkt_type set as unknown */
+			);
+	/**
+	 * compile-time check the above crc and shuffle layout is correct.
+	 * NOTE: the first field (lowest address) is given last in set_epi
+	 * calls above.
+	 */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
+
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+	/* Status/Error flag masks */
+	/**
+	 * mask everything except Checksum Reports, RSS indication
+	 * and VLAN indication.
+	 * bit6:4 for IP/L4 checksum errors.
+	 * bit12 is for RSS indication.
+	 * bit13 is for VLAN indication.
+	 */
+	const __m256i flags_mask =
+		_mm256_set1_epi32((7 << 4) | (1 << 12) | (1 << 13));
+#endif
+#ifdef IAVF_RX_CSUM_OFFLOAD
+	/**
+	 * data to be shuffled by the result of the flags mask shifted by 4
+	 * bits.  This gives use the l3_l4 flags.
+	 */
+	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+			/* shift right 1 bit to make sure it not exceed 255 */
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+			 PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+			 PKT_RX_IP_CKSUM_GOOD) >> 1,
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+			 PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+			 PKT_RX_IP_CKSUM_GOOD) >> 1,
+			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
+			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1,
+			/* second 128-bits */
+			0, 0, 0, 0, 0, 0, 0, 0,
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+			 PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+			 PKT_RX_IP_CKSUM_GOOD) >> 1,
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+			 PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+			 PKT_RX_IP_CKSUM_GOOD) >> 1,
+			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
+			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1);
+	const __m256i cksum_mask =
+		_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
+				  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
+				  PKT_RX_OUTER_IP_CKSUM_BAD);
+#endif
+#if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+	/**
+	 * data to be shuffled by result of flag mask, shifted down 12.
+	 * If RSS(bit12)/VLAN(bit13) are set,
+	 * shuffle moves appropriate flags in place.
+	 */
+	const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
+			0, 0, 0, 0,
+			0, 0, 0, 0,
+			PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+			PKT_RX_RSS_HASH, 0,
+			/* end up 128-bits */
+			0, 0, 0, 0,
+			0, 0, 0, 0,
+			0, 0, 0, 0,
+			PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+			PKT_RX_RSS_HASH, 0);
+#endif
+
+	uint16_t i, received;
+
+	for (i = 0, received = 0; i < nb_pkts;
+	     i += IAVF_DESCS_PER_LOOP_AVX,
+	     rxdp += IAVF_DESCS_PER_LOOP_AVX) {
+		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
+		_mm256_storeu_si256((void *)&rx_pkts[i],
+				    _mm256_loadu_si256((void *)&sw_ring[i]));
+#ifdef RTE_ARCH_X86_64
+		_mm256_storeu_si256
+			((void *)&rx_pkts[i + 4],
+			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
+#endif
+
+		__m512i raw_desc0_3, raw_desc4_7;
+
+		const __m128i raw_desc7 =
+			_mm_load_si128((void *)(rxdp + 7));
+		rte_compiler_barrier();
+		const __m128i raw_desc6 =
+			_mm_load_si128((void *)(rxdp + 6));
+		rte_compiler_barrier();
+		const __m128i raw_desc5 =
+			_mm_load_si128((void *)(rxdp + 5));
+		rte_compiler_barrier();
+		const __m128i raw_desc4 =
+			_mm_load_si128((void *)(rxdp + 4));
+		rte_compiler_barrier();
+		const __m128i raw_desc3 =
+			_mm_load_si128((void *)(rxdp + 3));
+		rte_compiler_barrier();
+		const __m128i raw_desc2 =
+			_mm_load_si128((void *)(rxdp + 2));
+		rte_compiler_barrier();
+		const __m128i raw_desc1 =
+			_mm_load_si128((void *)(rxdp + 1));
+		rte_compiler_barrier();
+		const __m128i raw_desc0 =
+			_mm_load_si128((void *)(rxdp + 0));
+
+		raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4);
+		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1);
+		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2);
+		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3);
+		raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0);
+		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1);
+		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2);
+		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3);
+
+		if (split_packet) {
+			int j;
+
+			for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++)
+				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
+		}
+
+		/**
+		 * convert descriptors 4-7 into mbufs, re-arrange fields.
+		 * Then write into the mbuf.
+		 */
+		__m512i mb4_7 = _mm512_shuffle_epi8(raw_desc4_7, shuf_msk);
+
+		mb4_7 = _mm512_add_epi16(mb4_7, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+		/**
+		 * to get packet types, ptype is located in bit16-25
+		 * of each 128bits
+		 */
+		const __m512i ptype_mask =
+			_mm512_set1_epi16(IAVF_RX_FLEX_DESC_PTYPE_M);
+		const __m512i ptypes4_7 =
+			_mm512_and_si512(raw_desc4_7, ptype_mask);
+		const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
+		const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
+		const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 9);
+		const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 1);
+		const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 9);
+		const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 1);
+
+		const __m512i ptype4_7 = _mm512_set_epi32
+			(0, 0, 0, type_table[ptype7],
+			 0, 0, 0, type_table[ptype6],
+			 0, 0, 0, type_table[ptype5],
+			 0, 0, 0, type_table[ptype4]);
+		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
+#endif
+
+		/**
+		 * convert descriptors 0-3 into mbufs, re-arrange fields.
+		 * Then write into the mbuf.
+		 */
+		__m512i mb0_3 = _mm512_shuffle_epi8(raw_desc0_3, shuf_msk);
+
+		mb0_3 = _mm512_add_epi16(mb0_3, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+		/**
+		 * to get packet types, ptype is located in bit16-25
+		 * of each 128bits
+		 */
+		const __m512i ptypes0_3 =
+			_mm512_and_si512(raw_desc0_3, ptype_mask);
+		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
+		const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
+		const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 9);
+		const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 1);
+		const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 9);
+		const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 1);
+
+		const __m512i ptype0_3 = _mm512_set_epi32
+			(0, 0, 0, type_table[ptype3],
+			 0, 0, 0, type_table[ptype2],
+			 0, 0, 0, type_table[ptype1],
+			 0, 0, 0, type_table[ptype0]);
+		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
+#endif
+
+		/**
+		 * use permute/extract to get status content
+		 * After the operations, the packets status flags are in the
+		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
+		 */
+		/* merge the status bits into one register */
+		const __m512i status_permute_msk = _mm512_set_epi32
+			(0, 0, 0, 0,
+			 0, 0, 0, 0,
+			 22, 30, 6, 14,
+			 18, 26, 2, 10);
+		const __m512i raw_status0_7 = _mm512_permutex2var_epi32
+			(raw_desc4_7, status_permute_msk, raw_desc0_3);
+		__m256i status0_7 = _mm512_extracti64x4_epi64
+			(raw_status0_7, 0);
+
+		/* now do flag manipulation */
+
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+		/* get only flag/error bits we want */
+		const __m256i flag_bits =
+			_mm256_and_si256(status0_7, flags_mask);
+#endif
+#ifdef IAVF_RX_CSUM_OFFLOAD
+		/**
+		 * l3_l4_error flags, shuffle, then shift to correct adjustment
+		 * of flags in flags_shuf, and finally mask out extra bits
+		 */
+		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
+				_mm256_srli_epi32(flag_bits, 4));
+		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
+		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
+#endif
+#if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+		/* set rss and vlan flags */
+		const __m256i rss_vlan_flag_bits =
+			_mm256_srli_epi32(flag_bits, 12);
+		const __m256i rss_vlan_flags =
+			_mm256_shuffle_epi8(rss_vlan_flags_shuf,
+					    rss_vlan_flag_bits);
+#endif
+
+		/* merge flags */
+		__m256i mbuf_flags = _mm256_set1_epi32(0);
+#ifdef IAVF_RX_CSUM_OFFLOAD
+		mbuf_flags = _mm256_or_si256(mbuf_flags, l3_l4_flags);
+#endif
+#if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+		mbuf_flags = _mm256_or_si256(mbuf_flags, rss_vlan_flags);
+#endif
+
+#ifdef IAVF_RX_FDIR_OFFLOAD
+		if (rxq->fdir_enabled) {
+			const __m512i fdir_permute_mask = _mm512_set_epi32
+				(0, 0, 0, 0,
+				 0, 0, 0, 0,
+				 7, 15, 23, 31,
+				 3, 11, 19, 27);
+			__m512i fdir_tmp = _mm512_permutex2var_epi32
+				(raw_desc0_3, fdir_permute_mask, raw_desc4_7);
+			const __m256i fdir_id0_7 = _mm512_extracti64x4_epi64
+				(fdir_tmp, 0);
+			const __m256i fdir_flags =
+				flex_rxd_to_fdir_flags_vec_avx512(fdir_id0_7);
+
+			/* merge with fdir_flags */
+			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_flags);
+
+			/* write to mbuf: have to use scalar store here */
+			rx_pkts[i + 0]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_id0_7, 3);
+
+			rx_pkts[i + 1]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_id0_7, 7);
+
+			rx_pkts[i + 2]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_id0_7, 2);
+
+			rx_pkts[i + 3]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_id0_7, 6);
+
+			rx_pkts[i + 4]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_id0_7, 1);
+
+			rx_pkts[i + 5]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_id0_7, 5);
+
+			rx_pkts[i + 6]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_id0_7, 0);
+
+			rx_pkts[i + 7]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_id0_7, 4);
+		} /* if() on fdir_enabled */
+#endif
+
+		__m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
+		__m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
+		__m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
+		__m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
+
+#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
+#ifdef IAVF_RX_RSS_OFFLOAD
+		/**
+		 * needs to load 2nd 16B of each desc for RSS hash parsing,
+		 * will cause performance drop to get into this context.
+		 */
+		if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads &
+		    DEV_RX_OFFLOAD_RSS_HASH) {
+			/* load bottom half of every 32B desc */
+			const __m128i raw_desc_bh7 =
+				_mm_load_si128
+					((void *)(&rxdp[7].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh6 =
+				_mm_load_si128
+					((void *)(&rxdp[6].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh5 =
+				_mm_load_si128
+					((void *)(&rxdp[5].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh4 =
+				_mm_load_si128
+					((void *)(&rxdp[4].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh3 =
+				_mm_load_si128
+					((void *)(&rxdp[3].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh2 =
+				_mm_load_si128
+					((void *)(&rxdp[2].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh1 =
+				_mm_load_si128
+					((void *)(&rxdp[1].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh0 =
+				_mm_load_si128
+					((void *)(&rxdp[0].wb.status_error1));
+
+			__m256i raw_desc_bh6_7 =
+				_mm256_inserti128_si256
+					(_mm256_castsi128_si256(raw_desc_bh6),
+					 raw_desc_bh7, 1);
+			__m256i raw_desc_bh4_5 =
+				_mm256_inserti128_si256
+					(_mm256_castsi128_si256(raw_desc_bh4),
+					 raw_desc_bh5, 1);
+			__m256i raw_desc_bh2_3 =
+				_mm256_inserti128_si256
+					(_mm256_castsi128_si256(raw_desc_bh2),
+					 raw_desc_bh3, 1);
+			__m256i raw_desc_bh0_1 =
+				_mm256_inserti128_si256
+					(_mm256_castsi128_si256(raw_desc_bh0),
+					 raw_desc_bh1, 1);
+
+			/**
+			 * to shift the 32b RSS hash value to the
+			 * highest 32b of each 128b before mask
+			 */
+			__m256i rss_hash6_7 =
+				_mm256_slli_epi64(raw_desc_bh6_7, 32);
+			__m256i rss_hash4_5 =
+				_mm256_slli_epi64(raw_desc_bh4_5, 32);
+			__m256i rss_hash2_3 =
+				_mm256_slli_epi64(raw_desc_bh2_3, 32);
+			__m256i rss_hash0_1 =
+				_mm256_slli_epi64(raw_desc_bh0_1, 32);
+
+			__m256i rss_hash_msk =
+				_mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
+						 0xFFFFFFFF, 0, 0, 0);
+
+			rss_hash6_7 = _mm256_and_si256
+					(rss_hash6_7, rss_hash_msk);
+			rss_hash4_5 = _mm256_and_si256
+					(rss_hash4_5, rss_hash_msk);
+			rss_hash2_3 = _mm256_and_si256
+					(rss_hash2_3, rss_hash_msk);
+			rss_hash0_1 = _mm256_and_si256
+					(rss_hash0_1, rss_hash_msk);
+
+			mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
+			mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
+			mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
+			mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
+		} /* if() on RSS hash parsing */
+#endif
+#endif
+
+		/**
+		 * At this point, we have the 8 sets of flags in the low 16-bits
+		 * of each 32-bit value in vlan0.
+		 * We want to extract these, and merge them with the mbuf init
+		 * data so we can do a single write to the mbuf to set the flags
+		 * and all the other initialization fields. Extracting the
+		 * appropriate flags means that we have to do a shift and blend
+		 * for each mbuf before we do the write. However, we can also
+		 * add in the previously computed rx_descriptor fields to
+		 * make a single 256-bit write per mbuf
+		 */
+		/* check the structure matches expectations */
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
+				 offsetof(struct rte_mbuf, rearm_data) + 8);
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
+				 RTE_ALIGN(offsetof(struct rte_mbuf,
+						    rearm_data),
+						    16));
+		/* build up data and do writes */
+		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
+			rearm6, rearm7;
+		rearm6 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(mbuf_flags, 8),
+					    0x04);
+		rearm4 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(mbuf_flags, 4),
+					    0x04);
+		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
+		rearm0 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_srli_si256(mbuf_flags, 4),
+					    0x04);
+		/* permute to add in the rx_descriptor e.g. rss fields */
+		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
+		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
+		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
+		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
+		/* write to mbuf */
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
+				    rearm6);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
+				    rearm4);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
+				    rearm2);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
+				    rearm0);
+
+		/* repeat for the odd mbufs */
+		const __m256i odd_flags =
+			_mm256_castsi128_si256
+				(_mm256_extracti128_si256(mbuf_flags, 1));
+		rearm7 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(odd_flags, 8),
+					    0x04);
+		rearm5 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(odd_flags, 4),
+					    0x04);
+		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
+		rearm1 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_srli_si256(odd_flags, 4),
+					    0x04);
+		/* since odd mbufs are already in hi 128-bits use blend */
+		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
+		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
+		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
+		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
+		/* again write to mbufs */
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
+				    rearm7);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
+				    rearm5);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
+				    rearm3);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
+				    rearm1);
+
+		/* extract and record EOP bit */
+		if (split_packet) {
+			const __m128i eop_mask =
+				_mm_set1_epi16(1 <<
+					       IAVF_RX_FLEX_DESC_STATUS0_EOF_S);
+			const __m256i eop_bits256 = _mm256_and_si256(status0_7,
+								     eop_check);
+			/* pack status bits into a single 128-bit register */
+			const __m128i eop_bits =
+				_mm_packus_epi32
+					(_mm256_castsi256_si128(eop_bits256),
+					 _mm256_extractf128_si256(eop_bits256,
+								  1));
+			/**
+			 * flip bits, and mask out the EOP bit, which is now
+			 * a split-packet bit i.e. !EOP, rather than EOP one.
+			 */
+			__m128i split_bits = _mm_andnot_si128(eop_bits,
+							      eop_mask);
+			/**
+			 * eop bits are out of order, so we need to shuffle them
+			 * back into order again. In doing so, only use low 8
+			 * bits, which acts like another pack instruction
+			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
+			 * [Since we use epi8, the 16-bit positions are
+			 * multiplied by 2 in the eop_shuffle value.]
+			 */
+			__m128i eop_shuffle =
+				_mm_set_epi8(/* zero hi 64b */
+					     0xFF, 0xFF, 0xFF, 0xFF,
+					     0xFF, 0xFF, 0xFF, 0xFF,
+					     /* move values to lo 64b */
+					     8, 0, 10, 2,
+					     12, 4, 14, 6);
+			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
+			*(uint64_t *)split_packet =
+				_mm_cvtsi128_si64(split_bits);
+			split_packet += IAVF_DESCS_PER_LOOP_AVX;
+		}
+
+		/* perform dd_check */
+		status0_7 = _mm256_and_si256(status0_7, dd_check);
+		status0_7 = _mm256_packs_epi32(status0_7,
+					       _mm256_setzero_si256());
+
+		uint64_t burst = __builtin_popcountll
+					(_mm_cvtsi128_si64
+						(_mm256_extracti128_si256
+							(status0_7, 1)));
+		burst += __builtin_popcountll
+				(_mm_cvtsi128_si64
+					(_mm256_castsi256_si128(status0_7)));
+		received += burst;
+		if (burst != IAVF_DESCS_PER_LOOP_AVX)
+			break;
+	}
+
+	/* update tail pointers */
+	rxq->rx_tail += received;
+	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
+	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */
+		rxq->rx_tail--;
+		received--;
+	}
+	rxq->rxrearm_nb += received;
+	return received;
+}
+
+uint16_t
+iavf_recv_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+					   struct rte_mbuf **rx_pkts,
+					   uint16_t nb_pkts)
+{
+	return _iavf_recv_raw_pkts_vec_avx512_flex_rxd_offload(rx_queue,
+							       rx_pkts,
+							       nb_pkts, NULL);
+}
+
+static uint16_t
+iavf_recv_scattered_burst_vec_avx512_flex_rxd_offload(void *rx_queue,
+						      struct rte_mbuf **rx_pkts,
+						      uint16_t nb_pkts)
+{
+	struct iavf_rx_queue *rxq = rx_queue;
+	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
+
+	/* get some new buffers */
+	uint16_t nb_bufs =
+		_iavf_recv_raw_pkts_vec_avx512_flex_rxd_offload(rxq,
+								rx_pkts, nb_pkts,
+								split_flags);
+	if (nb_bufs == 0)
+		return 0;
+
+	/* happy day case, full burst + no packets to be joined */
+	const uint64_t *split_fl64 = (uint64_t *)split_flags;
+
+	if (!rxq->pkt_first_seg &&
+	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
+	    split_fl64[2] == 0 && split_fl64[3] == 0)
+		return nb_bufs;
+
+	/* reassemble any packets that need reassembly*/
+	unsigned int i = 0;
+
+	if (!rxq->pkt_first_seg) {
+		/* find the first split flag, and only reassemble then*/
+		while (i < nb_bufs && !split_flags[i])
+			i++;
+		if (i == nb_bufs)
+			return nb_bufs;
+		rxq->pkt_first_seg = rx_pkts[i];
+	}
+	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
+				      &split_flags[i]);
+}
+
+uint16_t
+iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+						     struct rte_mbuf **rx_pkts,
+						     uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
+		uint16_t burst =
+			iavf_recv_scattered_burst_vec_avx512_flex_rxd_offload
+				(rx_queue, rx_pkts + retval,
+				 IAVF_VPMD_RX_MAX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < IAVF_VPMD_RX_MAX_BURST)
+			return retval;
+	}
+	return retval +
+		iavf_recv_scattered_burst_vec_avx512_flex_rxd_offload(rx_queue,
+			rx_pkts + retval, nb_pkts);
+}
+
 static __rte_always_inline int
 iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 {
-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v2 0/4] add Rx/Tx offload paths for IAVF AVX512
  2021-03-17  6:48 [dpdk-dev] [PATCH 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
                   ` (3 preceding siblings ...)
  2021-03-17  6:48 ` [dpdk-dev] [PATCH 4/4] net/iavf: add offload path for Rx AVX512 flex desc Wenzhuo Lu
@ 2021-03-18  5:24 ` Wenzhuo Lu
  2021-03-18  5:24   ` [dpdk-dev] [PATCH v2 1/4] net/iavf: store offload flag of Rx queue Wenzhuo Lu
                     ` (3 more replies)
  2021-03-26  1:31 ` [dpdk-dev] [PATCH v3 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
  5 siblings, 4 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-03-18  5:24 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add specific paths for RX/TX AVX512, called offload paths.
In these paths, support the HW offload features, like,
checksum, VLAN, RSS offload.
These paths are chosen automatically according to the
configuration.

The code for the above HW offload features, which are
supported by offload paths, is removed from the legacy path.

v2:
 - Fixed compile error.

Wenzhuo Lu (4):
  net/iavf: store offload flag of Rx queue
  net/iavf: add offload path for Tx AVX512
  net/iavf: add offload path for Rx AVX512
  net/iavf: add offload path for Rx AVX512 flex desc

 doc/guides/rel_notes/release_21_05.rst  |    7 +
 drivers/net/iavf/iavf_rxtx.c            |  158 ++-
 drivers/net/iavf/iavf_rxtx.h            |   32 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 1646 ++++++++++++++++++++++++++-----
 drivers/net/iavf/iavf_rxtx_vec_common.h |  113 ++-
 5 files changed, 1661 insertions(+), 295 deletions(-)

-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v2 1/4] net/iavf: store offload flag of Rx queue
  2021-03-18  5:24 ` [dpdk-dev] [PATCH v2 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
@ 2021-03-18  5:24   ` Wenzhuo Lu
  2021-03-18  5:24   ` [dpdk-dev] [PATCH v2 2/4] net/iavf: add offload path for Tx AVX512 Wenzhuo Lu
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-03-18  5:24 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add the offload flag for RX queues to know which offload
features are set.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 drivers/net/iavf/iavf_rxtx.c | 4 ++++
 drivers/net/iavf/iavf_rxtx.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 8fafe45..bf1064d 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -498,9 +498,12 @@
 	uint8_t proto_xtr;
 	uint16_t len;
 	uint16_t rx_free_thresh;
+	uint64_t offloads;
 
 	PMD_INIT_FUNC_TRACE();
 
+	offloads = rx_conf->offloads | dev->data->dev_conf.rxmode.offloads;
+
 	if (nb_desc % IAVF_ALIGN_RING_DESC != 0 ||
 	    nb_desc > IAVF_MAX_RING_DESC ||
 	    nb_desc < IAVF_MIN_RING_DESC) {
@@ -571,6 +574,7 @@
 	rxq->rx_deferred_start = rx_conf->rx_deferred_start;
 	rxq->rx_hdr_len = 0;
 	rxq->vsi = vsi;
+	rxq->offloads = offloads;
 
 	if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)
 		rxq->crc_len = RTE_ETHER_CRC_LEN;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 922ddad..06ff528 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -198,6 +198,7 @@ struct iavf_rx_queue {
 		/* flexible descriptor metadata extraction offload flag */
 	iavf_rxd_to_pkt_fields_t rxd_to_pkt_fields;
 				/* handle flexible descriptor by RXDID */
+	uint64_t offloads;
 };
 
 struct iavf_tx_entry {
-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v2 2/4] net/iavf: add offload path for Tx AVX512
  2021-03-18  5:24 ` [dpdk-dev] [PATCH v2 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
  2021-03-18  5:24   ` [dpdk-dev] [PATCH v2 1/4] net/iavf: store offload flag of Rx queue Wenzhuo Lu
@ 2021-03-18  5:24   ` Wenzhuo Lu
  2021-03-18  5:24   ` [dpdk-dev] [PATCH v2 3/4] net/iavf: add offload path for Rx AVX512 Wenzhuo Lu
  2021-03-18  5:24   ` [dpdk-dev] [PATCH v2 4/4] net/iavf: add offload path for Rx AVX512 flex desc Wenzhuo Lu
  3 siblings, 0 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-03-18  5:24 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add a specific path for TX AVX512.
In this path, support the HW offload features, like,
checksum insertion, VLAN insertion.
This path is chosen automatically according to the
configuration.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 drivers/net/iavf/iavf_rxtx.c            |  50 ++++++----
 drivers/net/iavf/iavf_rxtx.h            |  13 ++-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 165 ++++++++++++++++++++++++++++++++
 drivers/net/iavf/iavf_rxtx_vec_common.h |  98 +++++++++++++++++--
 4 files changed, 301 insertions(+), 25 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index bf1064d..4744c35 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -135,7 +135,7 @@
 static inline bool
 check_tx_vec_allow(struct iavf_tx_queue *txq)
 {
-	if (!(txq->offloads & IAVF_NO_VECTOR_FLAGS) &&
+	if (!(txq->offloads & IAVF_TX_NO_VECTOR_FLAGS) &&
 	    txq->rs_thresh >= IAVF_VPMD_TX_MAX_BURST &&
 	    txq->rs_thresh <= IAVF_VPMD_TX_MAX_FREE_BUF) {
 		PMD_INIT_LOG(DEBUG, "Vector tx can be enabled on this txq.");
@@ -2473,17 +2473,23 @@
 #ifdef RTE_ARCH_X86
 	struct iavf_tx_queue *txq;
 	int i;
+	int check_ret;
+	bool use_sse = false;
 	bool use_avx2 = false;
-#ifdef CC_AVX512_SUPPORT
 	bool use_avx512 = false;
-#endif
 
-	if (!iavf_tx_vec_dev_check(dev) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-		if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-		     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-				rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-			use_avx2 = true;
+	check_ret = iavf_tx_vec_dev_check(dev);
+
+	if (check_ret >= 0 &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+		/* SSE and AVX2 not support offload path yet. */
+		if (check_ret == IAVF_VECTOR_PATH) {
+			use_sse = true;
+			if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+			     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+			    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+				use_avx2 = true;
+		}
 #ifdef CC_AVX512_SUPPORT
 		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2491,15 +2497,24 @@
 			use_avx512 = true;
 #endif
 
-		PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
-			    use_avx2 ? "avx2 " : "",
-			    dev->data->port_id);
-		dev->tx_pkt_burst = use_avx2 ?
-				    iavf_xmit_pkts_vec_avx2 :
-				    iavf_xmit_pkts_vec;
+		if (!use_sse && !use_avx2 && !use_avx512)
+			goto normal;
+
+		if (!use_avx512) {
+			PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
+				    use_avx2 ? "avx2 " : "",
+				    dev->data->port_id);
+			dev->tx_pkt_burst = use_avx2 ?
+					    iavf_xmit_pkts_vec_avx2 :
+					    iavf_xmit_pkts_vec;
+		}
 #ifdef CC_AVX512_SUPPORT
-		if (use_avx512)
-			dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+		if (use_avx512) {
+			if (check_ret == IAVF_VECTOR_PATH)
+				dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+			else
+				dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512_offload;
+		}
 #endif
 		dev->tx_pkt_prepare = NULL;
 
@@ -2521,6 +2536,7 @@
 	}
 #endif
 
+normal:
 	PMD_DRV_LOG(DEBUG, "Using Basic Tx callback (port=%d).",
 		    dev->data->port_id);
 	dev->tx_pkt_burst = iavf_xmit_pkts;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 06ff528..da39f78 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -23,14 +23,20 @@
 #define IAVF_VPMD_DESCS_PER_LOOP  4
 #define IAVF_VPMD_TX_MAX_FREE_BUF 64
 
-#define IAVF_NO_VECTOR_FLAGS (				 \
+#define IAVF_TX_NO_VECTOR_FLAGS (				 \
 		DEV_TX_OFFLOAD_MULTI_SEGS |		 \
+		DEV_TX_OFFLOAD_TCP_TSO)
+
+#define IAVF_TX_VECTOR_OFFLOAD (				 \
 		DEV_TX_OFFLOAD_VLAN_INSERT |		 \
+		DEV_TX_OFFLOAD_QINQ_INSERT |		 \
 		DEV_TX_OFFLOAD_SCTP_CKSUM |		 \
 		DEV_TX_OFFLOAD_UDP_CKSUM |		 \
-		DEV_TX_OFFLOAD_TCP_TSO |		 \
 		DEV_TX_OFFLOAD_TCP_CKSUM)
 
+#define IAVF_VECTOR_PATH 0
+#define IAVF_VECTOR_OFFLOAD_PATH 1
+
 #define DEFAULT_TX_RS_THRESH     32
 #define DEFAULT_TX_FREE_THRESH   32
 
@@ -487,6 +493,9 @@ uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
 						      uint16_t nb_pkts);
 uint16_t iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 				   uint16_t nb_pkts);
+uint16_t iavf_xmit_pkts_vec_avx512_offload(void *tx_queue,
+					   struct rte_mbuf **tx_pkts,
+					   uint16_t nb_pkts);
 int iavf_txq_vec_setup_avx512(struct iavf_tx_queue *txq);
 
 uint8_t iavf_proto_xtr_type_to_rxdid(uint8_t xtr_type);
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 5cb4c7c..2891a1a 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -1706,3 +1706,168 @@
 	txq->ops = &avx512_vec_txq_ops;
 	return 0;
 }
+
+static inline void
+iavf_vtx1_offload(volatile struct iavf_tx_desc *txdp,
+		  struct rte_mbuf *pkt, uint64_t flags)
+{
+	uint64_t high_qw =
+		(IAVF_TX_DESC_DTYPE_DATA |
+		 ((uint64_t)flags  << IAVF_TXD_QW1_CMD_SHIFT) |
+		 ((uint64_t)pkt->data_len << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT));
+	iavf_txd_enable_offload(pkt, &high_qw);
+
+	__m128i descriptor = _mm_set_epi64x(high_qw,
+					    pkt->buf_iova + pkt->data_off);
+	_mm_storeu_si128((__m128i *)txdp, descriptor);
+}
+
+static inline void
+iavf_vtx_offload(volatile struct iavf_tx_desc *txdp,
+		 struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
+{
+	const uint64_t hi_qw_tmpl = (IAVF_TX_DESC_DTYPE_DATA |
+			((uint64_t)flags  << IAVF_TXD_QW1_CMD_SHIFT));
+
+	/* if unaligned on 32-bit boundary, do one to align */
+	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
+		iavf_vtx1_offload(txdp, *pkt, flags);
+		nb_pkts--, txdp++, pkt++;
+	}
+
+	/* do 4 at a time while possible, in bursts */
+	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
+		uint64_t hi_qw3 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[3]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		iavf_txd_enable_offload(pkt[3], &hi_qw3);
+		uint64_t hi_qw2 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[2]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		iavf_txd_enable_offload(pkt[2], &hi_qw2);
+		uint64_t hi_qw1 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[1]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		iavf_txd_enable_offload(pkt[1], &hi_qw1);
+		uint64_t hi_qw0 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[0]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		iavf_txd_enable_offload(pkt[0], &hi_qw0);
+
+		__m512i desc0_3 =
+			_mm512_set_epi64
+				(hi_qw3,
+				 pkt[3]->buf_iova + pkt[3]->data_off,
+				 hi_qw2,
+				 pkt[2]->buf_iova + pkt[2]->data_off,
+				 hi_qw1,
+				 pkt[1]->buf_iova + pkt[1]->data_off,
+				 hi_qw0,
+				 pkt[0]->buf_iova + pkt[0]->data_off);
+		_mm512_storeu_si512((void *)txdp, desc0_3);
+	}
+
+	/* do any last ones */
+	while (nb_pkts) {
+		iavf_vtx1_offload(txdp, *pkt, flags);
+		txdp++, pkt++, nb_pkts--;
+	}
+}
+
+static inline uint16_t
+iavf_xmit_fixed_burst_vec_avx512_offload(void *tx_queue,
+					 struct rte_mbuf **tx_pkts,
+					 uint16_t nb_pkts)
+{
+	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
+	volatile struct iavf_tx_desc *txdp;
+	struct iavf_tx_vec_entry *txep;
+	uint16_t n, nb_commit, tx_id;
+	/* bit2 is reserved and must be set to 1 according to Spec */
+	uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
+	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
+
+	/* cross rx_thresh boundary is not allowed */
+	nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh);
+
+	if (txq->nb_free < txq->free_thresh)
+		iavf_tx_free_bufs_avx512(txq);
+
+	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts);
+	if (unlikely(nb_pkts == 0))
+		return 0;
+
+	tx_id = txq->tx_tail;
+	txdp = &txq->tx_ring[tx_id];
+	txep = (void *)txq->sw_ring;
+	txep += tx_id;
+
+	txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts);
+
+	n = (uint16_t)(txq->nb_tx_desc - tx_id);
+	if (nb_commit >= n) {
+		tx_backlog_entry_avx512(txep, tx_pkts, n);
+
+		iavf_vtx_offload(txdp, tx_pkts, n - 1, flags);
+		tx_pkts += (n - 1);
+		txdp += (n - 1);
+
+		iavf_vtx1_offload(txdp, *tx_pkts++, rs);
+
+		nb_commit = (uint16_t)(nb_commit - n);
+
+		tx_id = 0;
+		txq->next_rs = (uint16_t)(txq->rs_thresh - 1);
+
+		/* avoid reach the end of ring */
+		txdp = &txq->tx_ring[tx_id];
+		txep = (void *)txq->sw_ring;
+		txep += tx_id;
+	}
+
+	tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);
+
+	iavf_vtx_offload(txdp, tx_pkts, nb_commit, flags);
+
+	tx_id = (uint16_t)(tx_id + nb_commit);
+	if (tx_id > txq->next_rs) {
+		txq->tx_ring[txq->next_rs].cmd_type_offset_bsz |=
+			rte_cpu_to_le_64(((uint64_t)IAVF_TX_DESC_CMD_RS) <<
+					 IAVF_TXD_QW1_CMD_SHIFT);
+		txq->next_rs =
+			(uint16_t)(txq->next_rs + txq->rs_thresh);
+	}
+
+	txq->tx_tail = tx_id;
+
+	IAVF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+
+	return nb_pkts;
+}
+
+uint16_t
+iavf_xmit_pkts_vec_avx512_offload(void *tx_queue, struct rte_mbuf **tx_pkts,
+				  uint16_t nb_pkts)
+{
+	uint16_t nb_tx = 0;
+	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
+
+	while (nb_pkts) {
+		uint16_t ret, num;
+
+		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
+		ret = iavf_xmit_fixed_burst_vec_avx512_offload(tx_queue,
+							       &tx_pkts[nb_tx],
+							       num);
+		nb_tx += ret;
+		nb_pkts -= ret;
+		if (ret < num)
+			break;
+	}
+
+	return nb_tx;
+}
diff --git a/drivers/net/iavf/iavf_rxtx_vec_common.h b/drivers/net/iavf/iavf_rxtx_vec_common.h
index 46a1873..54227b6 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_common.h
+++ b/drivers/net/iavf/iavf_rxtx_vec_common.h
@@ -236,14 +236,17 @@
 	if (!txq)
 		return -1;
 
-	if (txq->offloads & IAVF_NO_VECTOR_FLAGS)
-		return -1;
-
 	if (txq->rs_thresh < IAVF_VPMD_TX_MAX_BURST ||
 	    txq->rs_thresh > IAVF_VPMD_TX_MAX_FREE_BUF)
 		return -1;
 
-	return 0;
+	if (txq->offloads & IAVF_TX_NO_VECTOR_FLAGS)
+		return -1;
+
+	if (txq->offloads & IAVF_TX_VECTOR_OFFLOAD)
+		return IAVF_VECTOR_OFFLOAD_PATH;
+
+	return IAVF_VECTOR_PATH;
 }
 
 static inline int
@@ -266,14 +269,97 @@
 {
 	int i;
 	struct iavf_tx_queue *txq;
+	int ret;
+	int result = 0;
 
 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
 		txq = dev->data->tx_queues[i];
-		if (iavf_tx_vec_queue_default(txq))
+		ret = iavf_tx_vec_queue_default(txq);
+
+		if (ret < 0)
 			return -1;
+		else if (ret > result)
+			result = ret;
 	}
 
-	return 0;
+	return result;
+}
+
+/******************************************************************************
+ * If user knows a specific offload is not enabled by APP,
+ * the macro can be commented to save the effort of fast path.
+ * Currently below 2 features are supported in TX path,
+ * 1, checksum offload
+ * 2, VLAN/QINQ insertion
+ ******************************************************************************/
+#define IAVF_TX_CSUM_OFFLOAD
+#define IAVF_TX_VLAN_QINQ_OFFLOAD
+
+static inline void
+iavf_txd_enable_offload(__rte_unused struct rte_mbuf *tx_pkt,
+			uint64_t *txd_hi)
+{
+#if defined(IAVF_TX_CSUM_OFFLOAD) || defined(IAVF_TX_VLAN_QINQ_OFFLOAD)
+	uint64_t ol_flags = tx_pkt->ol_flags;
+#endif
+	uint32_t td_cmd = 0;
+#ifdef IAVF_TX_CSUM_OFFLOAD
+	uint32_t td_offset = 0;
+#endif
+
+#ifdef IAVF_TX_CSUM_OFFLOAD
+	/* Set MACLEN */
+	td_offset |= (tx_pkt->l2_len >> 1) <<
+		     IAVF_TX_DESC_LENGTH_MACLEN_SHIFT;
+
+	/* Enable L3 checksum offloads */
+	if (ol_flags & PKT_TX_IP_CKSUM) {
+		td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV4_CSUM;
+		td_offset |= (tx_pkt->l3_len >> 2) <<
+			     IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
+	} else if (ol_flags & PKT_TX_IPV4) {
+		td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV4;
+		td_offset |= (tx_pkt->l3_len >> 2) <<
+			     IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
+	} else if (ol_flags & PKT_TX_IPV6) {
+		td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV6;
+		td_offset |= (tx_pkt->l3_len >> 2) <<
+			     IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
+	}
+
+	/* Enable L4 checksum offloads */
+	switch (ol_flags & PKT_TX_L4_MASK) {
+	case PKT_TX_TCP_CKSUM:
+		td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_TCP;
+		td_offset |= (sizeof(struct rte_tcp_hdr) >> 2) <<
+			     IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+		break;
+	case PKT_TX_SCTP_CKSUM:
+		td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_SCTP;
+		td_offset |= (sizeof(struct rte_sctp_hdr) >> 2) <<
+			     IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+		break;
+	case PKT_TX_UDP_CKSUM:
+		td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_UDP;
+		td_offset |= (sizeof(struct rte_udp_hdr) >> 2) <<
+			     IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+		break;
+	default:
+		break;
+	}
+
+	*txd_hi |= ((uint64_t)td_offset) << IAVF_TXD_QW1_OFFSET_SHIFT;
+#endif
+
+#ifdef IAVF_TX_VLAN_QINQ_OFFLOAD
+	if (ol_flags & (PKT_TX_VLAN | PKT_TX_QINQ)) {
+		td_cmd |= IAVF_TX_DESC_CMD_IL2TAG1;
+		*txd_hi |= ((uint64_t)tx_pkt->vlan_tci <<
+			    IAVF_TXD_QW1_L2TAG1_SHIFT);
+	}
+#endif
+
+	*txd_hi |= ((uint64_t)td_cmd) << IAVF_TXD_QW1_CMD_SHIFT;
 }
 
 #endif
-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v2 3/4] net/iavf: add offload path for Rx AVX512
  2021-03-18  5:24 ` [dpdk-dev] [PATCH v2 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
  2021-03-18  5:24   ` [dpdk-dev] [PATCH v2 1/4] net/iavf: store offload flag of Rx queue Wenzhuo Lu
  2021-03-18  5:24   ` [dpdk-dev] [PATCH v2 2/4] net/iavf: add offload path for Tx AVX512 Wenzhuo Lu
@ 2021-03-18  5:24   ` Wenzhuo Lu
  2021-03-18  5:24   ` [dpdk-dev] [PATCH v2 4/4] net/iavf: add offload path for Rx AVX512 flex desc Wenzhuo Lu
  3 siblings, 0 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-03-18  5:24 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add a specific path for RX AVX512 (traditional).
In this path, support the HW offload features, like,
checksum, VLAN stripping, RSS hash.
This path is chosen automatically according to the
configuration.

All the code for the above HW offload features is removed
from the legacy path.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 drivers/net/iavf/iavf_rxtx.c            |  83 ++--
 drivers/net/iavf/iavf_rxtx.h            |  12 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 722 ++++++++++++++++++++++++++------
 drivers/net/iavf/iavf_rxtx_vec_common.h |  15 +-
 4 files changed, 682 insertions(+), 150 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 4744c35..9b3f8be 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2367,22 +2367,23 @@
 #ifdef RTE_ARCH_X86
 	struct iavf_rx_queue *rxq;
 	int i;
+	int check_ret;
+	bool use_sse = false;
 	bool use_avx2 = false;
-#ifdef CC_AVX512_SUPPORT
 	bool use_avx512 = false;
-#endif
+	bool use_flex = false;
 
-	if (!iavf_rx_vec_dev_check(dev) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-		for (i = 0; i < dev->data->nb_rx_queues; i++) {
-			rxq = dev->data->rx_queues[i];
-			(void)iavf_rxq_vec_setup(rxq);
+	check_ret = iavf_rx_vec_dev_check(dev);
+	if (check_ret >= 0 &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+		if (check_ret == IAVF_VECTOR_PATH) {
+			use_sse = true;
+			if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+			     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+			    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+				use_avx2 = true;
 		}
 
-		if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-		     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-				rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-			use_avx2 = true;
 #ifdef CC_AVX512_SUPPORT
 		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2390,13 +2391,28 @@
 			use_avx512 = true;
 #endif
 
+		if (!use_sse && !use_avx2 && !use_avx512)
+			goto normal;
+
+		if (vf->vf_res->vf_cap_flags &
+			VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			use_flex = true;
+			if (use_avx512 && check_ret == IAVF_VECTOR_OFFLOAD_PATH)
+				use_flex = false;
+		}
+
+		for (i = 0; i < dev->data->nb_rx_queues; i++) {
+			rxq = dev->data->rx_queues[i];
+			(void)iavf_rxq_vec_setup(rxq);
+		}
+
 		if (dev->data->scattered_rx) {
-			PMD_DRV_LOG(DEBUG,
-				    "Using %sVector Scattered Rx (port %d).",
-				    use_avx2 ? "avx2 " : "",
-				    dev->data->port_id);
-			if (vf->vf_res->vf_cap_flags &
-				VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			if (!use_avx512)
+				PMD_DRV_LOG(DEBUG,
+					    "Using %sVector Scattered Rx (port %d).",
+					    use_avx2 ? "avx2 " : "",
+					    dev->data->port_id);
+			if (use_flex) {
 				dev->rx_pkt_burst = use_avx2 ?
 					iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
 					iavf_recv_scattered_pkts_vec_flex_rxd;
@@ -2410,17 +2426,22 @@
 					iavf_recv_scattered_pkts_vec_avx2 :
 					iavf_recv_scattered_pkts_vec;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_scattered_pkts_vec_avx512;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512_offload;
+				}
 #endif
 			}
 		} else {
-			PMD_DRV_LOG(DEBUG, "Using %sVector Rx (port %d).",
-				    use_avx2 ? "avx2 " : "",
-				    dev->data->port_id);
-			if (vf->vf_res->vf_cap_flags &
-				VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			if (!use_avx512)
+				PMD_DRV_LOG(DEBUG, "Using %sVector Rx (port %d).",
+					    use_avx2 ? "avx2 " : "",
+					    dev->data->port_id);
+			if (use_flex) {
 				dev->rx_pkt_burst = use_avx2 ?
 					iavf_recv_pkts_vec_avx2_flex_rxd :
 					iavf_recv_pkts_vec_flex_rxd;
@@ -2434,9 +2455,14 @@
 					iavf_recv_pkts_vec_avx2 :
 					iavf_recv_pkts_vec;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_pkts_vec_avx512;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512_offload;
+				}
 #endif
 			}
 		}
@@ -2445,6 +2471,7 @@
 	}
 #endif
 
+normal:
 	if (dev->data->scattered_rx) {
 		PMD_DRV_LOG(DEBUG, "Using a Scattered Rx callback (port=%d).",
 			    dev->data->port_id);
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index da39f78..b8c90f8 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -34,6 +34,12 @@
 		DEV_TX_OFFLOAD_UDP_CKSUM |		 \
 		DEV_TX_OFFLOAD_TCP_CKSUM)
 
+#define IAVF_RX_VECTOR_OFFLOAD (				 \
+		DEV_RX_OFFLOAD_CHECKSUM |		 \
+		DEV_RX_OFFLOAD_SCTP_CKSUM |		 \
+		DEV_RX_OFFLOAD_VLAN |		 \
+		DEV_RX_OFFLOAD_RSS_HASH)
+
 #define IAVF_VECTOR_PATH 0
 #define IAVF_VECTOR_OFFLOAD_PATH 1
 
@@ -482,12 +488,18 @@ uint16_t iavf_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 int iavf_txq_vec_setup(struct iavf_tx_queue *txq);
 uint16_t iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
 				   uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx512_offload(void *rx_queue,
+					   struct rte_mbuf **rx_pkts,
+					   uint16_t nb_pkts);
 uint16_t iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue,
 					    struct rte_mbuf **rx_pkts,
 					    uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 					     struct rte_mbuf **rx_pkts,
 					     uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
+						     struct rte_mbuf **rx_pkts,
+						     uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
 						      struct rte_mbuf **rx_pkts,
 						      uint16_t nb_pkts);
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 2891a1a..a8338c1 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -13,6 +13,22 @@
 #define IAVF_DESCS_PER_LOOP_AVX 8
 #define PKTLEN_SHIFT 10
 
+/******************************************************************************
+ * If user knows a specific offload is not enabled by APP,
+ * the macro can be commented to save the effort of fast path.
+ * Currently below 2 features are supported in RX path,
+ * 1, checksum offload
+ * 2, VLAN/QINQ stripping
+ * 3, RSS hash
+ * 4, packet type analysis
+ * 5, flow director ID report
+ ******************************************************************************/
+#define IAVF_RX_CSUM_OFFLOAD
+#define IAVF_RX_VLAN_OFFLOAD
+#define IAVF_RX_RSS_OFFLOAD
+#define IAVF_RX_PTYPE_OFFLOAD
+#define IAVF_RX_FDIR_OFFLOAD
+
 static inline void
 iavf_rxq_rearm(struct iavf_rx_queue *rxq)
 {
@@ -146,7 +162,9 @@
 			       struct rte_mbuf **rx_pkts,
 			       uint16_t nb_pkts, uint8_t *split_packet)
 {
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+#endif
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
 						    rxq->mbuf_initializer);
@@ -249,71 +267,6 @@
 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
 
-	/* Status/Error flag masks */
-	/**
-	 * mask everything except RSS, flow director and VLAN flags
-	 * bit2 is for VLAN tag, bit11 for flow director indication
-	 * bit13:12 for RSS indication. Bits 3-5 of error
-	 * field (bits 22-24) are for IP/L4 checksum errors
-	 */
-	const __m256i flags_mask =
-		_mm256_set1_epi32((1 << 2) | (1 << 11) |
-				  (3 << 12) | (7 << 22));
-	/**
-	 * data to be shuffled by result of flag mask. If VLAN bit is set,
-	 * (bit 2), then position 4 in this array will be used in the
-	 * destination
-	 */
-	const __m256i vlan_flags_shuf =
-		_mm256_set_epi32(0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
-				 0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0);
-	/**
-	 * data to be shuffled by result of flag mask, shifted down 11.
-	 * If RSS/FDIR bits are set, shuffle moves appropriate flags in
-	 * place.
-	 */
-	const __m256i rss_flags_shuf =
-		_mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-				PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
-				0, 0, 0, 0, PKT_RX_FDIR, 0,/* end up 128-bits */
-				0, 0, 0, 0, 0, 0, 0, 0,
-				PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
-				0, 0, 0, 0, PKT_RX_FDIR, 0);
-
-	/**
-	 * data to be shuffled by the result of the flags mask shifted by 22
-	 * bits.  This gives use the l3_l4 flags.
-	 */
-	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-			/* shift right 1 bit to make sure it not exceed 255 */
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
-			 PKT_RX_L4_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
-			PKT_RX_IP_CKSUM_BAD >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
-			/* second 128-bits */
-			0, 0, 0, 0, 0, 0, 0, 0,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
-			 PKT_RX_L4_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
-			PKT_RX_IP_CKSUM_BAD >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);
-
-	const __m256i cksum_mask =
-		_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
-				  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
-				  PKT_RX_OUTER_IP_CKSUM_BAD);
-
 	uint16_t i, received;
 
 	for (i = 0, received = 0; i < nb_pkts;
@@ -381,6 +334,7 @@
 		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);
 
 		mb4_7 = _mm512_add_epi16(mb4_7, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/**
 		 * to get packet types, shift 64-bit values down 30 bits
 		 * and so ptype is in lower 8-bits in each
@@ -399,6 +353,7 @@
 			 0, 0, 0, type_table[ptype5],
 			 0, 0, 0, type_table[ptype4]);
 		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
+#endif
 
 		/**
 		 * convert descriptors 0-3 into mbufs, adjusting length and
@@ -412,6 +367,7 @@
 		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);
 
 		mb0_3 = _mm512_add_epi16(mb0_3, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/* get the packet types */
 		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 30);
 		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
@@ -427,6 +383,7 @@
 			 0, 0, 0, type_table[ptype1],
 			 0, 0, 0, type_table[ptype0]);
 		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
+#endif
 
 		/**
 		 * use permute/extract to get status content
@@ -446,27 +403,6 @@
 
 		/* now do flag manipulation */
 
-		/* get only flag/error bits we want */
-		const __m256i flag_bits =
-			_mm256_and_si256(status0_7, flags_mask);
-		/* set vlan and rss flags */
-		const __m256i vlan_flags =
-			_mm256_shuffle_epi8(vlan_flags_shuf, flag_bits);
-		const __m256i rss_flags =
-			_mm256_shuffle_epi8(rss_flags_shuf,
-					    _mm256_srli_epi32(flag_bits, 11));
-		/**
-		 * l3_l4_error flags, shuffle, then shift to correct adjustment
-		 * of flags in flags_shuf, and finally mask out extra bits
-		 */
-		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
-						_mm256_srli_epi32(flag_bits, 22));
-		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
-		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
-
-		/* merge flags */
-		const __m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
-				_mm256_or_si256(rss_flags, vlan_flags));
 		/**
 		 * At this point, we have the 8 sets of flags in the low 16-bits
 		 * of each 32-bit value in vlan0.
@@ -493,21 +429,11 @@
 		const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
 		const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
 
-		rearm6 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(mbuf_flags, 8),
-					    0x04);
-		rearm4 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(mbuf_flags, 4),
-					    0x04);
-		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
-		rearm0 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_srli_si256(mbuf_flags, 4),
-					    0x04);
-		/* permute to add in the rx_descriptor e.g. rss fields */
-		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
-		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
-		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
-		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
+		/* permute to add in the rx_descriptor */
+		rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20);
+		rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20);
+		rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20);
+		rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20);
 		/* write to mbuf */
 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
 				    rearm6);
@@ -519,24 +445,10 @@
 				    rearm0);
 
 		/* repeat for the odd mbufs */
-		const __m256i odd_flags =
-			_mm256_castsi128_si256
-				(_mm256_extracti128_si256(mbuf_flags, 1));
-		rearm7 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(odd_flags, 8),
-					    0x04);
-		rearm5 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(odd_flags, 4),
-					    0x04);
-		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
-		rearm1 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_srli_si256(odd_flags, 4),
-					    0x04);
-		/* since odd mbufs are already in hi 128-bits use blend */
-		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
-		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
-		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
-		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
+		rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0);
+		rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0);
+		rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0);
+		rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0);
 		/* again write to mbufs */
 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
 				    rearm7);
@@ -1397,6 +1309,578 @@
 				rx_pkts + retval, nb_pkts);
 }
 
+static inline uint16_t
+_iavf_recv_raw_pkts_vec_avx512_offload(struct iavf_rx_queue *rxq,
+				       struct rte_mbuf **rx_pkts,
+				       uint16_t nb_pkts, uint8_t *split_packet)
+{
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+#endif
+
+	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
+						    rxq->mbuf_initializer);
+	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
+	volatile union iavf_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
+
+	rte_prefetch0(rxdp);
+
+	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
+
+	/* See if we need to rearm the RX queue - gives the prefetch a bit
+	 * of time to act
+	 */
+	if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH)
+		iavf_rxq_rearm(rxq);
+
+	/* Before we start moving massive data around, check to see if
+	 * there is actually a packet available
+	 */
+	if (!(rxdp->wb.qword1.status_error_len &
+	      rte_cpu_to_le_32(1 << IAVF_RX_DESC_STATUS_DD_SHIFT)))
+		return 0;
+
+	/* constants used in processing loop */
+	const __m512i crc_adjust =
+		_mm512_set_epi32
+			(/* 1st descriptor */
+			 0,             /* ignore non-length fields */
+			 -rxq->crc_len, /* sub crc on data_len */
+			 -rxq->crc_len, /* sub crc on pkt_len */
+			 0,             /* ignore pkt_type field */
+			 /* 2nd descriptor */
+			 0,             /* ignore non-length fields */
+			 -rxq->crc_len, /* sub crc on data_len */
+			 -rxq->crc_len, /* sub crc on pkt_len */
+			 0,             /* ignore pkt_type field */
+			 /* 3rd descriptor */
+			 0,             /* ignore non-length fields */
+			 -rxq->crc_len, /* sub crc on data_len */
+			 -rxq->crc_len, /* sub crc on pkt_len */
+			 0,             /* ignore pkt_type field */
+			 /* 4th descriptor */
+			 0,             /* ignore non-length fields */
+			 -rxq->crc_len, /* sub crc on data_len */
+			 -rxq->crc_len, /* sub crc on pkt_len */
+			 0              /* ignore pkt_type field */
+			);
+
+	/* 8 packets DD mask, LSB in each 32-bit value */
+	const __m256i dd_check = _mm256_set1_epi32(1);
+
+	/* 8 packets EOP mask, second-LSB in each 32-bit value */
+	const __m256i eop_check = _mm256_slli_epi32(dd_check,
+			IAVF_RX_DESC_STATUS_EOF_SHIFT);
+
+	/* mask to shuffle from desc. to mbuf (4 descriptors)*/
+	const __m512i shuf_msk =
+		_mm512_set_epi32
+			(/* 1st descriptor */
+			 0x07060504,    /* octet 4~7, 32bits rss */
+			 0x03020F0E,    /* octet 2~3, low 16 bits vlan_macip */
+					/* octet 15~14, 16 bits data_len */
+			 0xFFFF0F0E,    /* skip high 16 bits pkt_len, zero out */
+					/* octet 15~14, low 16 bits pkt_len */
+			 0xFFFFFFFF,    /* pkt_type set as unknown */
+			 /* 2nd descriptor */
+			 0x07060504,    /* octet 4~7, 32bits rss */
+			 0x03020F0E,    /* octet 2~3, low 16 bits vlan_macip */
+					/* octet 15~14, 16 bits data_len */
+			 0xFFFF0F0E,    /* skip high 16 bits pkt_len, zero out */
+					/* octet 15~14, low 16 bits pkt_len */
+			 0xFFFFFFFF,    /* pkt_type set as unknown */
+			 /* 3rd descriptor */
+			 0x07060504,    /* octet 4~7, 32bits rss */
+			 0x03020F0E,    /* octet 2~3, low 16 bits vlan_macip */
+					/* octet 15~14, 16 bits data_len */
+			 0xFFFF0F0E,    /* skip high 16 bits pkt_len, zero out */
+					/* octet 15~14, low 16 bits pkt_len */
+			 0xFFFFFFFF,    /* pkt_type set as unknown */
+			 /* 4th descriptor */
+			 0x07060504,    /* octet 4~7, 32bits rss */
+			 0x03020F0E,    /* octet 2~3, low 16 bits vlan_macip */
+					/* octet 15~14, 16 bits data_len */
+			 0xFFFF0F0E,    /* skip high 16 bits pkt_len, zero out */
+					/* octet 15~14, low 16 bits pkt_len */
+			 0xFFFFFFFF     /* pkt_type set as unknown */
+			);
+	/**
+	 * compile-time check the above crc and shuffle layout is correct.
+	 * NOTE: the first field (lowest address) is given last in set_epi
+	 * calls above.
+	 */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
+
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+	/* Status/Error flag masks */
+	/**
+	 * mask everything except RSS, flow director and VLAN flags
+	 * bit2 is for VLAN tag, bit11 for flow director indication
+	 * bit13:12 for RSS indication. Bits 3-5 of error
+	 * field (bits 22-24) are for IP/L4 checksum errors
+	 */
+	const __m256i flags_mask =
+		_mm256_set1_epi32((1 << 2) | (1 << 11) |
+				  (3 << 12) | (7 << 22));
+#endif
+
+#ifdef IAVF_RX_VLAN_OFFLOAD
+	/**
+	 * data to be shuffled by result of flag mask. If VLAN bit is set,
+	 * (bit 2), then position 4 in this array will be used in the
+	 * destination
+	 */
+	const __m256i vlan_flags_shuf =
+		_mm256_set_epi32(0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
+				 0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0);
+#endif
+
+#ifdef IAVF_RX_RSS_OFFLOAD
+	/**
+	 * data to be shuffled by result of flag mask, shifted down 11.
+	 * If RSS/FDIR bits are set, shuffle moves appropriate flags in
+	 * place.
+	 */
+	const __m256i rss_flags_shuf =
+		_mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+				PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
+				0, 0, 0, 0, PKT_RX_FDIR, 0,/* end up 128-bits */
+				0, 0, 0, 0, 0, 0, 0, 0,
+				PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
+				0, 0, 0, 0, PKT_RX_FDIR, 0);
+#endif
+
+#ifdef IAVF_RX_CSUM_OFFLOAD
+	/**
+	 * data to be shuffled by the result of the flags mask shifted by 22
+	 * bits.  This gives use the l3_l4 flags.
+	 */
+	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+			/* shift right 1 bit to make sure it not exceed 255 */
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+			 PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
+			 PKT_RX_L4_CKSUM_BAD) >> 1,
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
+			PKT_RX_IP_CKSUM_BAD >> 1,
+			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
+			/* second 128-bits */
+			0, 0, 0, 0, 0, 0, 0, 0,
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+			 PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
+			 PKT_RX_L4_CKSUM_BAD) >> 1,
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
+			PKT_RX_IP_CKSUM_BAD >> 1,
+			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);
+
+	const __m256i cksum_mask =
+		_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
+				  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
+				  PKT_RX_OUTER_IP_CKSUM_BAD);
+#endif
+
+	uint16_t i, received;
+
+	for (i = 0, received = 0; i < nb_pkts;
+	     i += IAVF_DESCS_PER_LOOP_AVX,
+	     rxdp += IAVF_DESCS_PER_LOOP_AVX) {
+		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
+		_mm256_storeu_si256((void *)&rx_pkts[i],
+				    _mm256_loadu_si256((void *)&sw_ring[i]));
+#ifdef RTE_ARCH_X86_64
+		_mm256_storeu_si256
+			((void *)&rx_pkts[i + 4],
+			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
+#endif
+
+		__m512i raw_desc0_3, raw_desc4_7;
+		const __m128i raw_desc7 =
+			_mm_load_si128((void *)(rxdp + 7));
+		rte_compiler_barrier();
+		const __m128i raw_desc6 =
+			_mm_load_si128((void *)(rxdp + 6));
+		rte_compiler_barrier();
+		const __m128i raw_desc5 =
+			_mm_load_si128((void *)(rxdp + 5));
+		rte_compiler_barrier();
+		const __m128i raw_desc4 =
+			_mm_load_si128((void *)(rxdp + 4));
+		rte_compiler_barrier();
+		const __m128i raw_desc3 =
+			_mm_load_si128((void *)(rxdp + 3));
+		rte_compiler_barrier();
+		const __m128i raw_desc2 =
+			_mm_load_si128((void *)(rxdp + 2));
+		rte_compiler_barrier();
+		const __m128i raw_desc1 =
+			_mm_load_si128((void *)(rxdp + 1));
+		rte_compiler_barrier();
+		const __m128i raw_desc0 =
+			_mm_load_si128((void *)(rxdp + 0));
+
+		raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4);
+		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1);
+		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2);
+		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3);
+		raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0);
+		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1);
+		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2);
+		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3);
+
+		if (split_packet) {
+			int j;
+
+			for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++)
+				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
+		}
+
+		/**
+		 * convert descriptors 4-7 into mbufs, adjusting length and
+		 * re-arranging fields. Then write into the mbuf
+		 */
+		const __m512i len4_7 = _mm512_slli_epi32(raw_desc4_7,
+							 PKTLEN_SHIFT);
+		const __m512i desc4_7 = _mm512_mask_blend_epi16(IAVF_RX_LEN_MASK,
+								raw_desc4_7,
+								len4_7);
+		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);
+
+		mb4_7 = _mm512_add_epi16(mb4_7, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+		/**
+		 * to get packet types, shift 64-bit values down 30 bits
+		 * and so ptype is in lower 8-bits in each
+		 */
+		const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 30);
+		const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
+		const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
+		const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 24);
+		const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 8);
+		const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 24);
+		const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 8);
+
+		const __m512i ptype4_7 = _mm512_set_epi32
+			(0, 0, 0, type_table[ptype7],
+			 0, 0, 0, type_table[ptype6],
+			 0, 0, 0, type_table[ptype5],
+			 0, 0, 0, type_table[ptype4]);
+		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
+#endif
+
+		/**
+		 * convert descriptors 0-3 into mbufs, adjusting length and
+		 * re-arranging fields. Then write into the mbuf
+		 */
+		const __m512i len0_3 = _mm512_slli_epi32(raw_desc0_3,
+							 PKTLEN_SHIFT);
+		const __m512i desc0_3 = _mm512_mask_blend_epi16(IAVF_RX_LEN_MASK,
+								raw_desc0_3,
+								len0_3);
+		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);
+
+		mb0_3 = _mm512_add_epi16(mb0_3, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+		/* get the packet types */
+		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 30);
+		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
+		const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
+		const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 24);
+		const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 8);
+		const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 24);
+		const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 8);
+
+		const __m512i ptype0_3 = _mm512_set_epi32
+			(0, 0, 0, type_table[ptype3],
+			 0, 0, 0, type_table[ptype2],
+			 0, 0, 0, type_table[ptype1],
+			 0, 0, 0, type_table[ptype0]);
+		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
+#endif
+
+		/**
+		 * use permute/extract to get status content
+		 * After the operations, the packets status flags are in the
+		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
+		 */
+		/* merge the status bits into one register */
+		const __m512i status_permute_msk = _mm512_set_epi32
+			(0, 0, 0, 0,
+			 0, 0, 0, 0,
+			 22, 30, 6, 14,
+			 18, 26, 2, 10);
+		const __m512i raw_status0_7 = _mm512_permutex2var_epi32
+			(raw_desc4_7, status_permute_msk, raw_desc0_3);
+		__m256i status0_7 = _mm512_extracti64x4_epi64
+			(raw_status0_7, 0);
+
+		/* now do flag manipulation */
+
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+		/* get only flag/error bits we want */
+		const __m256i flag_bits =
+			_mm256_and_si256(status0_7, flags_mask);
+#endif
+		/* set vlan and rss flags */
+#ifdef IAVF_RX_VLAN_OFFLOAD
+		const __m256i vlan_flags =
+			_mm256_shuffle_epi8(vlan_flags_shuf, flag_bits);
+#endif
+#ifdef IAVF_RX_RSS_OFFLOAD
+		const __m256i rss_flags =
+			_mm256_shuffle_epi8(rss_flags_shuf,
+					    _mm256_srli_epi32(flag_bits, 11));
+#endif
+#ifdef IAVF_RX_CSUM_OFFLOAD
+		/**
+		 * l3_l4_error flags, shuffle, then shift to correct adjustment
+		 * of flags in flags_shuf, and finally mask out extra bits
+		 */
+		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
+						_mm256_srli_epi32(flag_bits, 22));
+		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
+		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
+#endif
+
+		/* merge flags */
+		__m256i mbuf_flags = _mm256_set1_epi32(0);
+#ifdef IAVF_RX_CSUM_OFFLOAD
+		mbuf_flags = _mm256_or_si256(mbuf_flags, l3_l4_flags);
+#endif
+#ifdef IAVF_RX_RSS_OFFLOAD
+		mbuf_flags = _mm256_or_si256(mbuf_flags, rss_flags);
+#endif
+#ifdef IAVF_RX_VLAN_OFFLOAD
+		mbuf_flags = _mm256_or_si256(mbuf_flags, vlan_flags);
+#endif
+
+		/**
+		 * At this point, we have the 8 sets of flags in the low 16-bits
+		 * of each 32-bit value in vlan0.
+		 * We want to extract these, and merge them with the mbuf init
+		 * data so we can do a single write to the mbuf to set the flags
+		 * and all the other initialization fields. Extracting the
+		 * appropriate flags means that we have to do a shift and blend
+		 * for each mbuf before we do the write. However, we can also
+		 * add in the previously computed rx_descriptor fields to
+		 * make a single 256-bit write per mbuf
+		 */
+		/* check the structure matches expectations */
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
+				 offsetof(struct rte_mbuf, rearm_data) + 8);
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
+				 RTE_ALIGN(offsetof(struct rte_mbuf,
+						    rearm_data),
+						    16));
+		/* build up data and do writes */
+		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
+			rearm6, rearm7;
+		const __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
+		const __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
+		const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
+		const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
+
+		rearm6 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(mbuf_flags, 8),
+					    0x04);
+		rearm4 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(mbuf_flags, 4),
+					    0x04);
+		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
+		rearm0 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_srli_si256(mbuf_flags, 4),
+					    0x04);
+		/* permute to add in the rx_descriptor e.g. rss fields */
+		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
+		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
+		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
+		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
+		/* write to mbuf */
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
+				    rearm6);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
+				    rearm4);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
+				    rearm2);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
+				    rearm0);
+
+		/* repeat for the odd mbufs */
+		const __m256i odd_flags =
+			_mm256_castsi128_si256
+				(_mm256_extracti128_si256(mbuf_flags, 1));
+		rearm7 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(odd_flags, 8),
+					    0x04);
+		rearm5 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(odd_flags, 4),
+					    0x04);
+		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
+		rearm1 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_srli_si256(odd_flags, 4),
+					    0x04);
+		/* since odd mbufs are already in hi 128-bits use blend */
+		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
+		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
+		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
+		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
+		/* again write to mbufs */
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
+				    rearm7);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
+				    rearm5);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
+				    rearm3);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
+				    rearm1);
+
+		/* extract and record EOP bit */
+		if (split_packet) {
+			const __m128i eop_mask =
+				_mm_set1_epi16(1 << IAVF_RX_DESC_STATUS_EOF_SHIFT);
+			const __m256i eop_bits256 = _mm256_and_si256(status0_7,
+								     eop_check);
+			/* pack status bits into a single 128-bit register */
+			const __m128i eop_bits =
+				_mm_packus_epi32
+					(_mm256_castsi256_si128(eop_bits256),
+					 _mm256_extractf128_si256(eop_bits256,
+								  1));
+			/**
+			 * flip bits, and mask out the EOP bit, which is now
+			 * a split-packet bit i.e. !EOP, rather than EOP one.
+			 */
+			__m128i split_bits = _mm_andnot_si128(eop_bits,
+							      eop_mask);
+			/**
+			 * eop bits are out of order, so we need to shuffle them
+			 * back into order again. In doing so, only use low 8
+			 * bits, which acts like another pack instruction
+			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
+			 * [Since we use epi8, the 16-bit positions are
+			 * multiplied by 2 in the eop_shuffle value.]
+			 */
+			__m128i eop_shuffle =
+				_mm_set_epi8(/* zero hi 64b */
+					     0xFF, 0xFF, 0xFF, 0xFF,
+					     0xFF, 0xFF, 0xFF, 0xFF,
+					     /* move values to lo 64b */
+					     8, 0, 10, 2,
+					     12, 4, 14, 6);
+			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
+			*(uint64_t *)split_packet =
+				_mm_cvtsi128_si64(split_bits);
+			split_packet += IAVF_DESCS_PER_LOOP_AVX;
+		}
+
+		/* perform dd_check */
+		status0_7 = _mm256_and_si256(status0_7, dd_check);
+		status0_7 = _mm256_packs_epi32(status0_7,
+					       _mm256_setzero_si256());
+
+		uint64_t burst = __builtin_popcountll
+					(_mm_cvtsi128_si64
+						(_mm256_extracti128_si256
+							(status0_7, 1)));
+		burst += __builtin_popcountll
+				(_mm_cvtsi128_si64
+					(_mm256_castsi256_si128(status0_7)));
+		received += burst;
+		if (burst != IAVF_DESCS_PER_LOOP_AVX)
+			break;
+	}
+
+	/* update tail pointers */
+	rxq->rx_tail += received;
+	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
+	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */
+		rxq->rx_tail--;
+		received--;
+	}
+	rxq->rxrearm_nb += received;
+	return received;
+}
+
+uint16_t
+iavf_recv_pkts_vec_avx512_offload(void *rx_queue, struct rte_mbuf **rx_pkts,
+				  uint16_t nb_pkts)
+{
+	return _iavf_recv_raw_pkts_vec_avx512_offload(rx_queue, rx_pkts,
+						      nb_pkts, NULL);
+}
+
+static uint16_t
+iavf_recv_scattered_burst_vec_avx512_offload(void *rx_queue,
+					     struct rte_mbuf **rx_pkts,
+					     uint16_t nb_pkts)
+{
+	struct iavf_rx_queue *rxq = rx_queue;
+	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
+
+	/* get some new buffers */
+	uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx512_offload(rxq, rx_pkts,
+								  nb_pkts,
+								  split_flags);
+	if (nb_bufs == 0)
+		return 0;
+
+	/* happy day case, full burst + no packets to be joined */
+	const uint64_t *split_fl64 = (uint64_t *)split_flags;
+
+	if (!rxq->pkt_first_seg &&
+	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
+	    split_fl64[2] == 0 && split_fl64[3] == 0)
+		return nb_bufs;
+
+	/* reassemble any packets that need reassembly*/
+	unsigned int i = 0;
+
+	if (!rxq->pkt_first_seg) {
+		/* find the first split flag, and only reassemble then*/
+		while (i < nb_bufs && !split_flags[i])
+			i++;
+		if (i == nb_bufs)
+			return nb_bufs;
+		rxq->pkt_first_seg = rx_pkts[i];
+	}
+	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
+				      &split_flags[i]);
+}
+
+uint16_t
+iavf_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
+					    struct rte_mbuf **rx_pkts,
+					    uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
+		uint16_t burst =
+			iavf_recv_scattered_burst_vec_avx512_offload(rx_queue,
+				rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < IAVF_VPMD_RX_MAX_BURST)
+			return retval;
+	}
+	return retval + iavf_recv_scattered_burst_vec_avx512_offload(rx_queue,
+				rx_pkts + retval, nb_pkts);
+}
+
 static __rte_always_inline int
 iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 {
diff --git a/drivers/net/iavf/iavf_rxtx_vec_common.h b/drivers/net/iavf/iavf_rxtx_vec_common.h
index 54227b6..68f03c8 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_common.h
+++ b/drivers/net/iavf/iavf_rxtx_vec_common.h
@@ -227,7 +227,10 @@
 	if (rxq->proto_xtr != IAVF_PROTO_XTR_NONE)
 		return -1;
 
-	return 0;
+	if (rxq->offloads & IAVF_RX_VECTOR_OFFLOAD)
+		return IAVF_VECTOR_OFFLOAD_PATH;
+
+	return IAVF_VECTOR_PATH;
 }
 
 static inline int
@@ -254,14 +257,20 @@
 {
 	int i;
 	struct iavf_rx_queue *rxq;
+	int ret;
+	int result = 0;
 
 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
 		rxq = dev->data->rx_queues[i];
-		if (iavf_rx_vec_queue_default(rxq))
+		ret = iavf_rx_vec_queue_default(rxq);
+
+		if (ret < 0)
 			return -1;
+		else if (ret > result)
+			result = ret;
 	}
 
-	return 0;
+	return result;
 }
 
 static inline int
-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v2 4/4] net/iavf: add offload path for Rx AVX512 flex desc
  2021-03-18  5:24 ` [dpdk-dev] [PATCH v2 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
                     ` (2 preceding siblings ...)
  2021-03-18  5:24   ` [dpdk-dev] [PATCH v2 3/4] net/iavf: add offload path for Rx AVX512 Wenzhuo Lu
@ 2021-03-18  5:24   ` Wenzhuo Lu
  3 siblings, 0 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-03-18  5:24 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add a specific path for RX AVX512 (flexible descriptor).
In this path, support the HW offload features, like,
checksum, VLAN stripping, RSS hash.
This path is chosen automatically according to the
configuration.

All the code for the above HW offload features is removed
from the legacy path.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 doc/guides/rel_notes/release_21_05.rst  |   7 +
 drivers/net/iavf/iavf_rxtx.c            |  27 +-
 drivers/net/iavf/iavf_rxtx.h            |   6 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 899 +++++++++++++++++++++++++-------
 4 files changed, 746 insertions(+), 193 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_05.rst b/doc/guides/rel_notes/release_21_05.rst
index 21dc6d2..a7769f8 100644
--- a/doc/guides/rel_notes/release_21_05.rst
+++ b/doc/guides/rel_notes/release_21_05.rst
@@ -83,6 +83,13 @@ New Features
   * Added command to display Rx queue used descriptor count.
     ``show port (port_id) rxq (queue_id) desc used count``
 
+* **Added the offload paths for IAVF AVX512.**
+
+  * Added the new RX and TX paths to use the HW offload features. When the HW
+    offload features are configured to be used, the offload paths are chosen
+    automatically.
+  * The code of HW offload features is removed from the legacy paths.
+
 
 Removed Items
 -------------
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 9b3f8be..5aeb19d 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2395,11 +2395,8 @@
 			goto normal;
 
 		if (vf->vf_res->vf_cap_flags &
-			VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
 			use_flex = true;
-			if (use_avx512 && check_ret == IAVF_VECTOR_OFFLOAD_PATH)
-				use_flex = false;
-		}
 
 		for (i = 0; i < dev->data->nb_rx_queues; i++) {
 			rxq = dev->data->rx_queues[i];
@@ -2417,9 +2414,14 @@
 					iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
 					iavf_recv_scattered_pkts_vec_flex_rxd;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload;
+				}
 #endif
 			} else {
 				dev->rx_pkt_burst = use_avx2 ?
@@ -2446,9 +2448,14 @@
 					iavf_recv_pkts_vec_avx2_flex_rxd :
 					iavf_recv_pkts_vec_flex_rxd;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_pkts_vec_avx512_flex_rxd;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512_flex_rxd;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512_flex_rxd_offload;
+				}
 #endif
 			} else {
 				dev->rx_pkt_burst = use_avx2 ?
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index b8c90f8..42e9f79 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -494,6 +494,9 @@ uint16_t iavf_recv_pkts_vec_avx512_offload(void *rx_queue,
 uint16_t iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue,
 					    struct rte_mbuf **rx_pkts,
 					    uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+						    struct rte_mbuf **rx_pkts,
+						    uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 					     struct rte_mbuf **rx_pkts,
 					     uint16_t nb_pkts);
@@ -503,6 +506,9 @@ uint16_t iavf_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
 uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
 						      struct rte_mbuf **rx_pkts,
 						      uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+							      struct rte_mbuf **rx_pkts,
+							      uint16_t nb_pkts);
 uint16_t iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 				   uint16_t nb_pkts);
 uint16_t iavf_xmit_pkts_vec_avx512_offload(void *tx_queue,
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index a8338c1..2dd8c65 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -550,7 +550,9 @@
 					struct rte_mbuf **rx_pkts,
 					uint16_t nb_pkts, uint8_t *split_packet)
 {
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+#endif
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
 						    rxq->mbuf_initializer);
@@ -654,71 +656,6 @@
 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
 
-	/* Status/Error flag masks */
-	/**
-	 * mask everything except Checksum Reports, RSS indication
-	 * and VLAN indication.
-	 * bit6:4 for IP/L4 checksum errors.
-	 * bit12 is for RSS indication.
-	 * bit13 is for VLAN indication.
-	 */
-	const __m256i flags_mask =
-		_mm256_set1_epi32((7 << 4) | (1 << 12) | (1 << 13));
-	/**
-	 * data to be shuffled by the result of the flags mask shifted by 4
-	 * bits.  This gives use the l3_l4 flags.
-	 */
-	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-			/* shift right 1 bit to make sure it not exceed 255 */
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
-			 PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1,
-			/* second 128-bits */
-			0, 0, 0, 0, 0, 0, 0, 0,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
-			 PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1);
-	const __m256i cksum_mask =
-		_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
-				  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
-				  PKT_RX_OUTER_IP_CKSUM_BAD);
-	/**
-	 * data to be shuffled by result of flag mask, shifted down 12.
-	 * If RSS(bit12)/VLAN(bit13) are set,
-	 * shuffle moves appropriate flags in place.
-	 */
-	const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
-			0, 0, 0, 0,
-			0, 0, 0, 0,
-			PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			PKT_RX_RSS_HASH, 0,
-			/* end up 128-bits */
-			0, 0, 0, 0,
-			0, 0, 0, 0,
-			0, 0, 0, 0,
-			PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			PKT_RX_RSS_HASH, 0);
-
 	uint16_t i, received;
 
 	for (i = 0, received = 0; i < nb_pkts;
@@ -782,6 +719,7 @@
 		__m512i mb4_7 = _mm512_shuffle_epi8(raw_desc4_7, shuf_msk);
 
 		mb4_7 = _mm512_add_epi16(mb4_7, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/**
 		 * to get packet types, ptype is located in bit16-25
 		 * of each 128bits
@@ -803,6 +741,7 @@
 			 0, 0, 0, type_table[ptype5],
 			 0, 0, 0, type_table[ptype4]);
 		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
+#endif
 
 		/**
 		 * convert descriptors 0-3 into mbufs, re-arrange fields.
@@ -811,6 +750,7 @@
 		__m512i mb0_3 = _mm512_shuffle_epi8(raw_desc0_3, shuf_msk);
 
 		mb0_3 = _mm512_add_epi16(mb0_3, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/**
 		 * to get packet types, ptype is located in bit16-25
 		 * of each 128bits
@@ -830,6 +770,7 @@
 			 0, 0, 0, type_table[ptype1],
 			 0, 0, 0, type_table[ptype0]);
 		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
+#endif
 
 		/**
 		 * use permute/extract to get status content
@@ -847,30 +788,9 @@
 		__m256i status0_7 = _mm512_extracti64x4_epi64
 			(raw_status0_7, 0);
 
-		/* now do flag manipulation */
-
-		/* get only flag/error bits we want */
-		const __m256i flag_bits =
-			_mm256_and_si256(status0_7, flags_mask);
-		/**
-		 * l3_l4_error flags, shuffle, then shift to correct adjustment
-		 * of flags in flags_shuf, and finally mask out extra bits
-		 */
-		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
-				_mm256_srli_epi32(flag_bits, 4));
-		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
-		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
-		/* set rss and vlan flags */
-		const __m256i rss_vlan_flag_bits =
-			_mm256_srli_epi32(flag_bits, 12);
-		const __m256i rss_vlan_flags =
-			_mm256_shuffle_epi8(rss_vlan_flags_shuf,
-					    rss_vlan_flag_bits);
-
-		/* merge flags */
-		__m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
-						     rss_vlan_flags);
+		__m256i mbuf_flags = _mm256_set1_epi32(0);
 
+#ifdef IAVF_RX_FDIR_OFFLOAD
 		if (rxq->fdir_enabled) {
 			const __m512i fdir_permute_mask = _mm512_set_epi32
 				(0, 0, 0, 0,
@@ -881,12 +801,9 @@
 				(raw_desc0_3, fdir_permute_mask, raw_desc4_7);
 			const __m256i fdir_id0_7 = _mm512_extracti64x4_epi64
 				(fdir_tmp, 0);
-			const __m256i fdir_flags =
+			mbuf_flags =
 				flex_rxd_to_fdir_flags_vec_avx512(fdir_id0_7);
 
-			/* merge with fdir_flags */
-			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_flags);
-
 			/* write to mbuf: have to use scalar store here */
 			rx_pkts[i + 0]->hash.fdir.hi =
 				_mm256_extract_epi32(fdir_id0_7, 3);
@@ -912,102 +829,13 @@
 			rx_pkts[i + 7]->hash.fdir.hi =
 				_mm256_extract_epi32(fdir_id0_7, 4);
 		} /* if() on fdir_enabled */
+#endif
 
 		__m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
 		__m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
 		__m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
 		__m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
 
-#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
-		/**
-		 * needs to load 2nd 16B of each desc for RSS hash parsing,
-		 * will cause performance drop to get into this context.
-		 */
-		if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads &
-		    DEV_RX_OFFLOAD_RSS_HASH) {
-			/* load bottom half of every 32B desc */
-			const __m128i raw_desc_bh7 =
-				_mm_load_si128
-					((void *)(&rxdp[7].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh6 =
-				_mm_load_si128
-					((void *)(&rxdp[6].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh5 =
-				_mm_load_si128
-					((void *)(&rxdp[5].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh4 =
-				_mm_load_si128
-					((void *)(&rxdp[4].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh3 =
-				_mm_load_si128
-					((void *)(&rxdp[3].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh2 =
-				_mm_load_si128
-					((void *)(&rxdp[2].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh1 =
-				_mm_load_si128
-					((void *)(&rxdp[1].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh0 =
-				_mm_load_si128
-					((void *)(&rxdp[0].wb.status_error1));
-
-			__m256i raw_desc_bh6_7 =
-				_mm256_inserti128_si256
-					(_mm256_castsi128_si256(raw_desc_bh6),
-					 raw_desc_bh7, 1);
-			__m256i raw_desc_bh4_5 =
-				_mm256_inserti128_si256
-					(_mm256_castsi128_si256(raw_desc_bh4),
-					 raw_desc_bh5, 1);
-			__m256i raw_desc_bh2_3 =
-				_mm256_inserti128_si256
-					(_mm256_castsi128_si256(raw_desc_bh2),
-					 raw_desc_bh3, 1);
-			__m256i raw_desc_bh0_1 =
-				_mm256_inserti128_si256
-					(_mm256_castsi128_si256(raw_desc_bh0),
-					 raw_desc_bh1, 1);
-
-			/**
-			 * to shift the 32b RSS hash value to the
-			 * highest 32b of each 128b before mask
-			 */
-			__m256i rss_hash6_7 =
-				_mm256_slli_epi64(raw_desc_bh6_7, 32);
-			__m256i rss_hash4_5 =
-				_mm256_slli_epi64(raw_desc_bh4_5, 32);
-			__m256i rss_hash2_3 =
-				_mm256_slli_epi64(raw_desc_bh2_3, 32);
-			__m256i rss_hash0_1 =
-				_mm256_slli_epi64(raw_desc_bh0_1, 32);
-
-			__m256i rss_hash_msk =
-				_mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
-						 0xFFFFFFFF, 0, 0, 0);
-
-			rss_hash6_7 = _mm256_and_si256
-					(rss_hash6_7, rss_hash_msk);
-			rss_hash4_5 = _mm256_and_si256
-					(rss_hash4_5, rss_hash_msk);
-			rss_hash2_3 = _mm256_and_si256
-					(rss_hash2_3, rss_hash_msk);
-			rss_hash0_1 = _mm256_and_si256
-					(rss_hash0_1, rss_hash_msk);
-
-			mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
-			mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
-			mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
-			mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
-		} /* if() on RSS hash parsing */
-#endif
-
 		/**
 		 * At this point, we have the 8 sets of flags in the low 16-bits
 		 * of each 32-bit value in vlan0.
@@ -1039,7 +867,7 @@
 		rearm0 = _mm256_blend_epi32(mbuf_init,
 					    _mm256_srli_si256(mbuf_flags, 4),
 					    0x04);
-		/* permute to add in the rx_descriptor e.g. rss fields */
+		/* permute to add in the rx_descriptor */
 		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
 		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
 		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
@@ -1881,6 +1709,711 @@
 				rx_pkts + retval, nb_pkts);
 }
 
+static inline uint16_t
+_iavf_recv_raw_pkts_vec_avx512_flex_rxd_offload(struct iavf_rx_queue *rxq,
+						struct rte_mbuf **rx_pkts,
+						uint16_t nb_pkts,
+						uint8_t *split_packet)
+{
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+#endif
+
+	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
+						    rxq->mbuf_initializer);
+	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
+	volatile union iavf_rx_flex_desc *rxdp =
+		(union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
+
+	rte_prefetch0(rxdp);
+
+	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
+
+	/* See if we need to rearm the RX queue - gives the prefetch a bit
+	 * of time to act
+	 */
+	if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH)
+		iavf_rxq_rearm(rxq);
+
+	/* Before we start moving massive data around, check to see if
+	 * there is actually a packet available
+	 */
+	if (!(rxdp->wb.status_error0 &
+	      rte_cpu_to_le_32(1 << IAVF_RX_FLEX_DESC_STATUS0_DD_S)))
+		return 0;
+
+	/* constants used in processing loop */
+	const __m512i crc_adjust =
+		_mm512_set_epi32
+			(/* 1st descriptor */
+			 0,             /* ignore non-length fields */
+			 -rxq->crc_len, /* sub crc on data_len */
+			 -rxq->crc_len, /* sub crc on pkt_len */
+			 0,             /* ignore pkt_type field */
+			 /* 2nd descriptor */
+			 0,             /* ignore non-length fields */
+			 -rxq->crc_len, /* sub crc on data_len */
+			 -rxq->crc_len, /* sub crc on pkt_len */
+			 0,             /* ignore pkt_type field */
+			 /* 3rd descriptor */
+			 0,             /* ignore non-length fields */
+			 -rxq->crc_len, /* sub crc on data_len */
+			 -rxq->crc_len, /* sub crc on pkt_len */
+			 0,             /* ignore pkt_type field */
+			 /* 4th descriptor */
+			 0,             /* ignore non-length fields */
+			 -rxq->crc_len, /* sub crc on data_len */
+			 -rxq->crc_len, /* sub crc on pkt_len */
+			 0              /* ignore pkt_type field */
+			);
+
+	/* 8 packets DD mask, LSB in each 32-bit value */
+	const __m256i dd_check = _mm256_set1_epi32(1);
+
+	/* 8 packets EOP mask, second-LSB in each 32-bit value */
+	const __m256i eop_check = _mm256_slli_epi32(dd_check,
+			IAVF_RX_FLEX_DESC_STATUS0_EOF_S);
+
+	/* mask to shuffle from desc. to mbuf (4 descriptors)*/
+	const __m512i shuf_msk =
+		_mm512_set_epi32
+			(/* 1st descriptor */
+			 0xFFFFFFFF,    /* rss hash parsed separately */
+			 0x0B0A0504,    /* octet 10~11, 16 bits vlan_macip */
+					/* octet 4~5, 16 bits data_len */
+			 0xFFFF0504,    /* skip hi 16 bits pkt_len, zero out */
+					/* octet 4~5, 16 bits pkt_len */
+			 0xFFFFFFFF,    /* pkt_type set as unknown */
+			 /* 2nd descriptor */
+			 0xFFFFFFFF,    /* rss hash parsed separately */
+			 0x0B0A0504,    /* octet 10~11, 16 bits vlan_macip */
+					/* octet 4~5, 16 bits data_len */
+			 0xFFFF0504,    /* skip hi 16 bits pkt_len, zero out */
+					/* octet 4~5, 16 bits pkt_len */
+			 0xFFFFFFFF,    /* pkt_type set as unknown */
+			 /* 3rd descriptor */
+			 0xFFFFFFFF,    /* rss hash parsed separately */
+			 0x0B0A0504,    /* octet 10~11, 16 bits vlan_macip */
+					/* octet 4~5, 16 bits data_len */
+			 0xFFFF0504,    /* skip hi 16 bits pkt_len, zero out */
+					/* octet 4~5, 16 bits pkt_len */
+			 0xFFFFFFFF,    /* pkt_type set as unknown */
+			 /* 4th descriptor */
+			 0xFFFFFFFF,    /* rss hash parsed separately */
+			 0x0B0A0504,    /* octet 10~11, 16 bits vlan_macip */
+					/* octet 4~5, 16 bits data_len */
+			 0xFFFF0504,    /* skip hi 16 bits pkt_len, zero out */
+					/* octet 4~5, 16 bits pkt_len */
+			 0xFFFFFFFF     /* pkt_type set as unknown */
+			);
+	/**
+	 * compile-time check the above crc and shuffle layout is correct.
+	 * NOTE: the first field (lowest address) is given last in set_epi
+	 * calls above.
+	 */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
+
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+	/* Status/Error flag masks */
+	/**
+	 * mask everything except Checksum Reports, RSS indication
+	 * and VLAN indication.
+	 * bit6:4 for IP/L4 checksum errors.
+	 * bit12 is for RSS indication.
+	 * bit13 is for VLAN indication.
+	 */
+	const __m256i flags_mask =
+		_mm256_set1_epi32((7 << 4) | (1 << 12) | (1 << 13));
+#endif
+#ifdef IAVF_RX_CSUM_OFFLOAD
+	/**
+	 * data to be shuffled by the result of the flags mask shifted by 4
+	 * bits.  This gives use the l3_l4 flags.
+	 */
+	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+			/* shift right 1 bit to make sure it not exceed 255 */
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+			 PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+			 PKT_RX_IP_CKSUM_GOOD) >> 1,
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+			 PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+			 PKT_RX_IP_CKSUM_GOOD) >> 1,
+			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
+			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1,
+			/* second 128-bits */
+			0, 0, 0, 0, 0, 0, 0, 0,
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+			 PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+			 PKT_RX_IP_CKSUM_GOOD) >> 1,
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+			 PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+			 PKT_RX_IP_CKSUM_GOOD) >> 1,
+			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
+			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1);
+	const __m256i cksum_mask =
+		_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
+				  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
+				  PKT_RX_OUTER_IP_CKSUM_BAD);
+#endif
+#if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+	/**
+	 * data to be shuffled by result of flag mask, shifted down 12.
+	 * If RSS(bit12)/VLAN(bit13) are set,
+	 * shuffle moves appropriate flags in place.
+	 */
+	const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
+			0, 0, 0, 0,
+			0, 0, 0, 0,
+			PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+			PKT_RX_RSS_HASH, 0,
+			/* end up 128-bits */
+			0, 0, 0, 0,
+			0, 0, 0, 0,
+			0, 0, 0, 0,
+			PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+			PKT_RX_RSS_HASH, 0);
+#endif
+
+	uint16_t i, received;
+
+	for (i = 0, received = 0; i < nb_pkts;
+	     i += IAVF_DESCS_PER_LOOP_AVX,
+	     rxdp += IAVF_DESCS_PER_LOOP_AVX) {
+		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
+		_mm256_storeu_si256((void *)&rx_pkts[i],
+				    _mm256_loadu_si256((void *)&sw_ring[i]));
+#ifdef RTE_ARCH_X86_64
+		_mm256_storeu_si256
+			((void *)&rx_pkts[i + 4],
+			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
+#endif
+
+		__m512i raw_desc0_3, raw_desc4_7;
+
+		const __m128i raw_desc7 =
+			_mm_load_si128((void *)(rxdp + 7));
+		rte_compiler_barrier();
+		const __m128i raw_desc6 =
+			_mm_load_si128((void *)(rxdp + 6));
+		rte_compiler_barrier();
+		const __m128i raw_desc5 =
+			_mm_load_si128((void *)(rxdp + 5));
+		rte_compiler_barrier();
+		const __m128i raw_desc4 =
+			_mm_load_si128((void *)(rxdp + 4));
+		rte_compiler_barrier();
+		const __m128i raw_desc3 =
+			_mm_load_si128((void *)(rxdp + 3));
+		rte_compiler_barrier();
+		const __m128i raw_desc2 =
+			_mm_load_si128((void *)(rxdp + 2));
+		rte_compiler_barrier();
+		const __m128i raw_desc1 =
+			_mm_load_si128((void *)(rxdp + 1));
+		rte_compiler_barrier();
+		const __m128i raw_desc0 =
+			_mm_load_si128((void *)(rxdp + 0));
+
+		raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4);
+		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1);
+		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2);
+		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3);
+		raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0);
+		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1);
+		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2);
+		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3);
+
+		if (split_packet) {
+			int j;
+
+			for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++)
+				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
+		}
+
+		/**
+		 * convert descriptors 4-7 into mbufs, re-arrange fields.
+		 * Then write into the mbuf.
+		 */
+		__m512i mb4_7 = _mm512_shuffle_epi8(raw_desc4_7, shuf_msk);
+
+		mb4_7 = _mm512_add_epi16(mb4_7, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+		/**
+		 * to get packet types, ptype is located in bit16-25
+		 * of each 128bits
+		 */
+		const __m512i ptype_mask =
+			_mm512_set1_epi16(IAVF_RX_FLEX_DESC_PTYPE_M);
+		const __m512i ptypes4_7 =
+			_mm512_and_si512(raw_desc4_7, ptype_mask);
+		const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
+		const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
+		const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 9);
+		const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 1);
+		const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 9);
+		const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 1);
+
+		const __m512i ptype4_7 = _mm512_set_epi32
+			(0, 0, 0, type_table[ptype7],
+			 0, 0, 0, type_table[ptype6],
+			 0, 0, 0, type_table[ptype5],
+			 0, 0, 0, type_table[ptype4]);
+		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
+#endif
+
+		/**
+		 * convert descriptors 0-3 into mbufs, re-arrange fields.
+		 * Then write into the mbuf.
+		 */
+		__m512i mb0_3 = _mm512_shuffle_epi8(raw_desc0_3, shuf_msk);
+
+		mb0_3 = _mm512_add_epi16(mb0_3, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+		/**
+		 * to get packet types, ptype is located in bit16-25
+		 * of each 128bits
+		 */
+		const __m512i ptypes0_3 =
+			_mm512_and_si512(raw_desc0_3, ptype_mask);
+		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
+		const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
+		const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 9);
+		const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 1);
+		const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 9);
+		const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 1);
+
+		const __m512i ptype0_3 = _mm512_set_epi32
+			(0, 0, 0, type_table[ptype3],
+			 0, 0, 0, type_table[ptype2],
+			 0, 0, 0, type_table[ptype1],
+			 0, 0, 0, type_table[ptype0]);
+		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
+#endif
+
+		/**
+		 * use permute/extract to get status content
+		 * After the operations, the packets status flags are in the
+		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
+		 */
+		/* merge the status bits into one register */
+		const __m512i status_permute_msk = _mm512_set_epi32
+			(0, 0, 0, 0,
+			 0, 0, 0, 0,
+			 22, 30, 6, 14,
+			 18, 26, 2, 10);
+		const __m512i raw_status0_7 = _mm512_permutex2var_epi32
+			(raw_desc4_7, status_permute_msk, raw_desc0_3);
+		__m256i status0_7 = _mm512_extracti64x4_epi64
+			(raw_status0_7, 0);
+
+		/* now do flag manipulation */
+
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+		/* get only flag/error bits we want */
+		const __m256i flag_bits =
+			_mm256_and_si256(status0_7, flags_mask);
+#endif
+#ifdef IAVF_RX_CSUM_OFFLOAD
+		/**
+		 * l3_l4_error flags, shuffle, then shift to correct adjustment
+		 * of flags in flags_shuf, and finally mask out extra bits
+		 */
+		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
+				_mm256_srli_epi32(flag_bits, 4));
+		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
+		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
+#endif
+#if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+		/* set rss and vlan flags */
+		const __m256i rss_vlan_flag_bits =
+			_mm256_srli_epi32(flag_bits, 12);
+		const __m256i rss_vlan_flags =
+			_mm256_shuffle_epi8(rss_vlan_flags_shuf,
+					    rss_vlan_flag_bits);
+#endif
+
+		/* merge flags */
+		__m256i mbuf_flags = _mm256_set1_epi32(0);
+#ifdef IAVF_RX_CSUM_OFFLOAD
+		mbuf_flags = _mm256_or_si256(mbuf_flags, l3_l4_flags);
+#endif
+#if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+		mbuf_flags = _mm256_or_si256(mbuf_flags, rss_vlan_flags);
+#endif
+
+#ifdef IAVF_RX_FDIR_OFFLOAD
+		if (rxq->fdir_enabled) {
+			const __m512i fdir_permute_mask = _mm512_set_epi32
+				(0, 0, 0, 0,
+				 0, 0, 0, 0,
+				 7, 15, 23, 31,
+				 3, 11, 19, 27);
+			__m512i fdir_tmp = _mm512_permutex2var_epi32
+				(raw_desc0_3, fdir_permute_mask, raw_desc4_7);
+			const __m256i fdir_id0_7 = _mm512_extracti64x4_epi64
+				(fdir_tmp, 0);
+			const __m256i fdir_flags =
+				flex_rxd_to_fdir_flags_vec_avx512(fdir_id0_7);
+
+			/* merge with fdir_flags */
+			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_flags);
+
+			/* write to mbuf: have to use scalar store here */
+			rx_pkts[i + 0]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_id0_7, 3);
+
+			rx_pkts[i + 1]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_id0_7, 7);
+
+			rx_pkts[i + 2]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_id0_7, 2);
+
+			rx_pkts[i + 3]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_id0_7, 6);
+
+			rx_pkts[i + 4]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_id0_7, 1);
+
+			rx_pkts[i + 5]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_id0_7, 5);
+
+			rx_pkts[i + 6]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_id0_7, 0);
+
+			rx_pkts[i + 7]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_id0_7, 4);
+		} /* if() on fdir_enabled */
+#endif
+
+		__m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
+		__m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
+		__m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
+		__m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
+
+#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
+#ifdef IAVF_RX_RSS_OFFLOAD
+		/**
+		 * needs to load 2nd 16B of each desc for RSS hash parsing,
+		 * will cause performance drop to get into this context.
+		 */
+		if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads &
+		    DEV_RX_OFFLOAD_RSS_HASH) {
+			/* load bottom half of every 32B desc */
+			const __m128i raw_desc_bh7 =
+				_mm_load_si128
+					((void *)(&rxdp[7].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh6 =
+				_mm_load_si128
+					((void *)(&rxdp[6].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh5 =
+				_mm_load_si128
+					((void *)(&rxdp[5].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh4 =
+				_mm_load_si128
+					((void *)(&rxdp[4].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh3 =
+				_mm_load_si128
+					((void *)(&rxdp[3].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh2 =
+				_mm_load_si128
+					((void *)(&rxdp[2].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh1 =
+				_mm_load_si128
+					((void *)(&rxdp[1].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh0 =
+				_mm_load_si128
+					((void *)(&rxdp[0].wb.status_error1));
+
+			__m256i raw_desc_bh6_7 =
+				_mm256_inserti128_si256
+					(_mm256_castsi128_si256(raw_desc_bh6),
+					 raw_desc_bh7, 1);
+			__m256i raw_desc_bh4_5 =
+				_mm256_inserti128_si256
+					(_mm256_castsi128_si256(raw_desc_bh4),
+					 raw_desc_bh5, 1);
+			__m256i raw_desc_bh2_3 =
+				_mm256_inserti128_si256
+					(_mm256_castsi128_si256(raw_desc_bh2),
+					 raw_desc_bh3, 1);
+			__m256i raw_desc_bh0_1 =
+				_mm256_inserti128_si256
+					(_mm256_castsi128_si256(raw_desc_bh0),
+					 raw_desc_bh1, 1);
+
+			/**
+			 * to shift the 32b RSS hash value to the
+			 * highest 32b of each 128b before mask
+			 */
+			__m256i rss_hash6_7 =
+				_mm256_slli_epi64(raw_desc_bh6_7, 32);
+			__m256i rss_hash4_5 =
+				_mm256_slli_epi64(raw_desc_bh4_5, 32);
+			__m256i rss_hash2_3 =
+				_mm256_slli_epi64(raw_desc_bh2_3, 32);
+			__m256i rss_hash0_1 =
+				_mm256_slli_epi64(raw_desc_bh0_1, 32);
+
+			__m256i rss_hash_msk =
+				_mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
+						 0xFFFFFFFF, 0, 0, 0);
+
+			rss_hash6_7 = _mm256_and_si256
+					(rss_hash6_7, rss_hash_msk);
+			rss_hash4_5 = _mm256_and_si256
+					(rss_hash4_5, rss_hash_msk);
+			rss_hash2_3 = _mm256_and_si256
+					(rss_hash2_3, rss_hash_msk);
+			rss_hash0_1 = _mm256_and_si256
+					(rss_hash0_1, rss_hash_msk);
+
+			mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
+			mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
+			mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
+			mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
+		} /* if() on RSS hash parsing */
+#endif
+#endif
+
+		/**
+		 * At this point, we have the 8 sets of flags in the low 16-bits
+		 * of each 32-bit value in vlan0.
+		 * We want to extract these, and merge them with the mbuf init
+		 * data so we can do a single write to the mbuf to set the flags
+		 * and all the other initialization fields. Extracting the
+		 * appropriate flags means that we have to do a shift and blend
+		 * for each mbuf before we do the write. However, we can also
+		 * add in the previously computed rx_descriptor fields to
+		 * make a single 256-bit write per mbuf
+		 */
+		/* check the structure matches expectations */
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
+				 offsetof(struct rte_mbuf, rearm_data) + 8);
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
+				 RTE_ALIGN(offsetof(struct rte_mbuf,
+						    rearm_data),
+						    16));
+		/* build up data and do writes */
+		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
+			rearm6, rearm7;
+		rearm6 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(mbuf_flags, 8),
+					    0x04);
+		rearm4 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(mbuf_flags, 4),
+					    0x04);
+		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
+		rearm0 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_srli_si256(mbuf_flags, 4),
+					    0x04);
+		/* permute to add in the rx_descriptor e.g. rss fields */
+		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
+		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
+		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
+		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
+		/* write to mbuf */
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
+				    rearm6);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
+				    rearm4);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
+				    rearm2);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
+				    rearm0);
+
+		/* repeat for the odd mbufs */
+		const __m256i odd_flags =
+			_mm256_castsi128_si256
+				(_mm256_extracti128_si256(mbuf_flags, 1));
+		rearm7 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(odd_flags, 8),
+					    0x04);
+		rearm5 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(odd_flags, 4),
+					    0x04);
+		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
+		rearm1 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_srli_si256(odd_flags, 4),
+					    0x04);
+		/* since odd mbufs are already in hi 128-bits use blend */
+		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
+		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
+		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
+		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
+		/* again write to mbufs */
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
+				    rearm7);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
+				    rearm5);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
+				    rearm3);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
+				    rearm1);
+
+		/* extract and record EOP bit */
+		if (split_packet) {
+			const __m128i eop_mask =
+				_mm_set1_epi16(1 <<
+					       IAVF_RX_FLEX_DESC_STATUS0_EOF_S);
+			const __m256i eop_bits256 = _mm256_and_si256(status0_7,
+								     eop_check);
+			/* pack status bits into a single 128-bit register */
+			const __m128i eop_bits =
+				_mm_packus_epi32
+					(_mm256_castsi256_si128(eop_bits256),
+					 _mm256_extractf128_si256(eop_bits256,
+								  1));
+			/**
+			 * flip bits, and mask out the EOP bit, which is now
+			 * a split-packet bit i.e. !EOP, rather than EOP one.
+			 */
+			__m128i split_bits = _mm_andnot_si128(eop_bits,
+							      eop_mask);
+			/**
+			 * eop bits are out of order, so we need to shuffle them
+			 * back into order again. In doing so, only use low 8
+			 * bits, which acts like another pack instruction
+			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
+			 * [Since we use epi8, the 16-bit positions are
+			 * multiplied by 2 in the eop_shuffle value.]
+			 */
+			__m128i eop_shuffle =
+				_mm_set_epi8(/* zero hi 64b */
+					     0xFF, 0xFF, 0xFF, 0xFF,
+					     0xFF, 0xFF, 0xFF, 0xFF,
+					     /* move values to lo 64b */
+					     8, 0, 10, 2,
+					     12, 4, 14, 6);
+			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
+			*(uint64_t *)split_packet =
+				_mm_cvtsi128_si64(split_bits);
+			split_packet += IAVF_DESCS_PER_LOOP_AVX;
+		}
+
+		/* perform dd_check */
+		status0_7 = _mm256_and_si256(status0_7, dd_check);
+		status0_7 = _mm256_packs_epi32(status0_7,
+					       _mm256_setzero_si256());
+
+		uint64_t burst = __builtin_popcountll
+					(_mm_cvtsi128_si64
+						(_mm256_extracti128_si256
+							(status0_7, 1)));
+		burst += __builtin_popcountll
+				(_mm_cvtsi128_si64
+					(_mm256_castsi256_si128(status0_7)));
+		received += burst;
+		if (burst != IAVF_DESCS_PER_LOOP_AVX)
+			break;
+	}
+
+	/* update tail pointers */
+	rxq->rx_tail += received;
+	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
+	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */
+		rxq->rx_tail--;
+		received--;
+	}
+	rxq->rxrearm_nb += received;
+	return received;
+}
+
+uint16_t
+iavf_recv_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+					   struct rte_mbuf **rx_pkts,
+					   uint16_t nb_pkts)
+{
+	return _iavf_recv_raw_pkts_vec_avx512_flex_rxd_offload(rx_queue,
+							       rx_pkts,
+							       nb_pkts, NULL);
+}
+
+static uint16_t
+iavf_recv_scattered_burst_vec_avx512_flex_rxd_offload(void *rx_queue,
+						      struct rte_mbuf **rx_pkts,
+						      uint16_t nb_pkts)
+{
+	struct iavf_rx_queue *rxq = rx_queue;
+	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
+
+	/* get some new buffers */
+	uint16_t nb_bufs =
+		_iavf_recv_raw_pkts_vec_avx512_flex_rxd_offload(rxq,
+								rx_pkts, nb_pkts,
+								split_flags);
+	if (nb_bufs == 0)
+		return 0;
+
+	/* happy day case, full burst + no packets to be joined */
+	const uint64_t *split_fl64 = (uint64_t *)split_flags;
+
+	if (!rxq->pkt_first_seg &&
+	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
+	    split_fl64[2] == 0 && split_fl64[3] == 0)
+		return nb_bufs;
+
+	/* reassemble any packets that need reassembly*/
+	unsigned int i = 0;
+
+	if (!rxq->pkt_first_seg) {
+		/* find the first split flag, and only reassemble then*/
+		while (i < nb_bufs && !split_flags[i])
+			i++;
+		if (i == nb_bufs)
+			return nb_bufs;
+		rxq->pkt_first_seg = rx_pkts[i];
+	}
+	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
+				      &split_flags[i]);
+}
+
+uint16_t
+iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+						     struct rte_mbuf **rx_pkts,
+						     uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
+		uint16_t burst =
+			iavf_recv_scattered_burst_vec_avx512_flex_rxd_offload
+				(rx_queue, rx_pkts + retval,
+				 IAVF_VPMD_RX_MAX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < IAVF_VPMD_RX_MAX_BURST)
+			return retval;
+	}
+	return retval +
+		iavf_recv_scattered_burst_vec_avx512_flex_rxd_offload(rx_queue,
+			rx_pkts + retval, nb_pkts);
+}
+
 static __rte_always_inline int
 iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 {
-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v3 0/4] add Rx/Tx offload paths for IAVF AVX512
  2021-03-17  6:48 [dpdk-dev] [PATCH 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
                   ` (4 preceding siblings ...)
  2021-03-18  5:24 ` [dpdk-dev] [PATCH v2 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
@ 2021-03-26  1:31 ` Wenzhuo Lu
  2021-03-26  1:31   ` [dpdk-dev] [PATCH v3 1/4] net/iavf: store offload flag of Rx queue Wenzhuo Lu
                     ` (5 more replies)
  5 siblings, 6 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-03-26  1:31 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add specific paths for RX/TX AVX512, called offload paths.
In these paths, support the HW offload features, like,
checksum, VLAN, RSS offload.
These paths are chosen automatically according to the
configuration.

v2:
 - Fixed compile error.

v3:
 - Used 'inline' to drop the duplicate code.
 - some minor change.

Wenzhuo Lu (4):
  net/iavf: store offload flag of Rx queue
  net/iavf: add offload path for Tx AVX512
  net/iavf: add offload path for Rx AVX512
  net/iavf: add offload path for Rx AVX512 flex desc

 doc/guides/rel_notes/release_21_05.rst  |   7 +
 drivers/net/iavf/iavf_rxtx.c            | 187 +++++--
 drivers/net/iavf/iavf_rxtx.h            |  33 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 901 +++++++++++++++++++-------------
 drivers/net/iavf/iavf_rxtx_vec_common.h | 115 +++-
 5 files changed, 820 insertions(+), 423 deletions(-)

-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v3 1/4] net/iavf: store offload flag of Rx queue
  2021-03-26  1:31 ` [dpdk-dev] [PATCH v3 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
@ 2021-03-26  1:31   ` Wenzhuo Lu
  2021-03-26  1:32   ` [dpdk-dev] [PATCH v3 2/4] net/iavf: add offload path for Tx AVX512 Wenzhuo Lu
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-03-26  1:31 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add the offload flag for RX queues to know which offload
features are set.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 drivers/net/iavf/iavf_rxtx.c | 4 ++++
 drivers/net/iavf/iavf_rxtx.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 8fafe45..bf1064d 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -498,9 +498,12 @@
 	uint8_t proto_xtr;
 	uint16_t len;
 	uint16_t rx_free_thresh;
+	uint64_t offloads;
 
 	PMD_INIT_FUNC_TRACE();
 
+	offloads = rx_conf->offloads | dev->data->dev_conf.rxmode.offloads;
+
 	if (nb_desc % IAVF_ALIGN_RING_DESC != 0 ||
 	    nb_desc > IAVF_MAX_RING_DESC ||
 	    nb_desc < IAVF_MIN_RING_DESC) {
@@ -571,6 +574,7 @@
 	rxq->rx_deferred_start = rx_conf->rx_deferred_start;
 	rxq->rx_hdr_len = 0;
 	rxq->vsi = vsi;
+	rxq->offloads = offloads;
 
 	if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)
 		rxq->crc_len = RTE_ETHER_CRC_LEN;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 922ddad..06ff528 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -198,6 +198,7 @@ struct iavf_rx_queue {
 		/* flexible descriptor metadata extraction offload flag */
 	iavf_rxd_to_pkt_fields_t rxd_to_pkt_fields;
 				/* handle flexible descriptor by RXDID */
+	uint64_t offloads;
 };
 
 struct iavf_tx_entry {
-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v3 2/4] net/iavf: add offload path for Tx AVX512
  2021-03-26  1:31 ` [dpdk-dev] [PATCH v3 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
  2021-03-26  1:31   ` [dpdk-dev] [PATCH v3 1/4] net/iavf: store offload flag of Rx queue Wenzhuo Lu
@ 2021-03-26  1:32   ` Wenzhuo Lu
  2021-03-26  1:32   ` [dpdk-dev] [PATCH v3 3/4] net/iavf: add offload path for Rx AVX512 Wenzhuo Lu
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-03-26  1:32 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add a specific path for TX AVX512.
In this path, support the HW offload features, like,
checksum insertion, VLAN insertion.
This path is chosen automatically according to the
configuration.

'inline' is used, then the duplicate code is generated
by the compiler.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 drivers/net/iavf/iavf_rxtx.c            |  57 +++++++++++------
 drivers/net/iavf/iavf_rxtx.h            |  14 +++-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 110 +++++++++++++++++++-------------
 drivers/net/iavf/iavf_rxtx_vec_common.h |  98 ++++++++++++++++++++++++++--
 4 files changed, 210 insertions(+), 69 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index bf1064d..4d9b3b9 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -135,7 +135,7 @@
 static inline bool
 check_tx_vec_allow(struct iavf_tx_queue *txq)
 {
-	if (!(txq->offloads & IAVF_NO_VECTOR_FLAGS) &&
+	if (!(txq->offloads & IAVF_TX_NO_VECTOR_FLAGS) &&
 	    txq->rs_thresh >= IAVF_VPMD_TX_MAX_BURST &&
 	    txq->rs_thresh <= IAVF_VPMD_TX_MAX_FREE_BUF) {
 		PMD_INIT_LOG(DEBUG, "Vector tx can be enabled on this txq.");
@@ -2473,17 +2473,23 @@
 #ifdef RTE_ARCH_X86
 	struct iavf_tx_queue *txq;
 	int i;
+	int check_ret;
+	bool use_sse = false;
 	bool use_avx2 = false;
-#ifdef CC_AVX512_SUPPORT
 	bool use_avx512 = false;
-#endif
 
-	if (!iavf_tx_vec_dev_check(dev) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-		if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-		     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-				rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-			use_avx2 = true;
+	check_ret = iavf_tx_vec_dev_check(dev);
+
+	if (check_ret >= 0 &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+		/* SSE and AVX2 not support offload path yet. */
+		if (check_ret == IAVF_VECTOR_PATH) {
+			use_sse = true;
+			if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+			     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+			    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+				use_avx2 = true;
+		}
 #ifdef CC_AVX512_SUPPORT
 		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2491,15 +2497,29 @@
 			use_avx512 = true;
 #endif
 
-		PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
-			    use_avx2 ? "avx2 " : "",
-			    dev->data->port_id);
-		dev->tx_pkt_burst = use_avx2 ?
-				    iavf_xmit_pkts_vec_avx2 :
-				    iavf_xmit_pkts_vec;
+		if (!use_sse && !use_avx2 && !use_avx512)
+			goto normal;
+
+		if (!use_avx512) {
+			PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
+				    use_avx2 ? "avx2 " : "",
+				    dev->data->port_id);
+			dev->tx_pkt_burst = use_avx2 ?
+					    iavf_xmit_pkts_vec_avx2 :
+					    iavf_xmit_pkts_vec;
+		}
 #ifdef CC_AVX512_SUPPORT
-		if (use_avx512)
-			dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+		if (use_avx512) {
+			if (check_ret == IAVF_VECTOR_PATH) {
+				dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+				PMD_DRV_LOG(DEBUG, "Using AVX512 Vector Tx (port %d).",
+					    dev->data->port_id);
+			} else {
+				dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512_offload;
+				PMD_DRV_LOG(DEBUG, "Using AVX512 OFFLOAD Vector Tx (port %d).",
+					    dev->data->port_id);
+			}
+		}
 #endif
 		dev->tx_pkt_prepare = NULL;
 
@@ -2519,8 +2539,9 @@
 
 		return;
 	}
-#endif
 
+normal:
+#endif
 	PMD_DRV_LOG(DEBUG, "Using Basic Tx callback (port=%d).",
 		    dev->data->port_id);
 	dev->tx_pkt_burst = iavf_xmit_pkts;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 06ff528..821e791 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -23,14 +23,21 @@
 #define IAVF_VPMD_DESCS_PER_LOOP  4
 #define IAVF_VPMD_TX_MAX_FREE_BUF 64
 
-#define IAVF_NO_VECTOR_FLAGS (				 \
+#define IAVF_TX_NO_VECTOR_FLAGS (				 \
 		DEV_TX_OFFLOAD_MULTI_SEGS |		 \
+		DEV_TX_OFFLOAD_TCP_TSO)
+
+#define IAVF_TX_VECTOR_OFFLOAD (				 \
 		DEV_TX_OFFLOAD_VLAN_INSERT |		 \
+		DEV_TX_OFFLOAD_QINQ_INSERT |		 \
+		DEV_TX_OFFLOAD_IPV4_CKSUM |		 \
 		DEV_TX_OFFLOAD_SCTP_CKSUM |		 \
 		DEV_TX_OFFLOAD_UDP_CKSUM |		 \
-		DEV_TX_OFFLOAD_TCP_TSO |		 \
 		DEV_TX_OFFLOAD_TCP_CKSUM)
 
+#define IAVF_VECTOR_PATH 0
+#define IAVF_VECTOR_OFFLOAD_PATH 1
+
 #define DEFAULT_TX_RS_THRESH     32
 #define DEFAULT_TX_FREE_THRESH   32
 
@@ -487,6 +494,9 @@ uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
 						      uint16_t nb_pkts);
 uint16_t iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 				   uint16_t nb_pkts);
+uint16_t iavf_xmit_pkts_vec_avx512_offload(void *tx_queue,
+					   struct rte_mbuf **tx_pkts,
+					   uint16_t nb_pkts);
 int iavf_txq_vec_setup_avx512(struct iavf_tx_queue *txq);
 
 uint8_t iavf_proto_xtr_type_to_rxdid(uint8_t xtr_type);
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 5cb4c7c..c30f3f0 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -1515,14 +1515,16 @@
 		txep[i].mbuf = tx_pkts[i];
 }
 
-static inline void
+static __rte_always_inline void
 iavf_vtx1(volatile struct iavf_tx_desc *txdp,
-	  struct rte_mbuf *pkt, uint64_t flags)
+	  struct rte_mbuf *pkt, uint64_t flags, bool offload)
 {
 	uint64_t high_qw =
 		(IAVF_TX_DESC_DTYPE_DATA |
 		 ((uint64_t)flags  << IAVF_TXD_QW1_CMD_SHIFT) |
 		 ((uint64_t)pkt->data_len << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT));
+	if (offload)
+		iavf_txd_enable_offload(pkt, &high_qw);
 
 	__m128i descriptor = _mm_set_epi64x(high_qw,
 					    pkt->buf_iova + pkt->data_off);
@@ -1531,62 +1533,70 @@
 
 #define IAVF_TX_LEN_MASK 0xAA
 #define IAVF_TX_OFF_MASK 0x55
-static inline void
+static __rte_always_inline void
 iavf_vtx(volatile struct iavf_tx_desc *txdp,
-	 struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
+	 struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags,
+	 bool offload)
 {
 	const uint64_t hi_qw_tmpl = (IAVF_TX_DESC_DTYPE_DATA |
 			((uint64_t)flags  << IAVF_TXD_QW1_CMD_SHIFT));
 
 	/* if unaligned on 32-bit boundary, do one to align */
 	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
-		iavf_vtx1(txdp, *pkt, flags);
+		iavf_vtx1(txdp, *pkt, flags, offload);
 		nb_pkts--, txdp++, pkt++;
 	}
 
 	/* do 4 at a time while possible, in bursts */
 	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
-		__m512i desc4 =
-			_mm512_set_epi64
-				((uint64_t)pkt[3]->data_len,
-				 pkt[3]->buf_iova,
-				 (uint64_t)pkt[2]->data_len,
-				 pkt[2]->buf_iova,
-				 (uint64_t)pkt[1]->data_len,
-				 pkt[1]->buf_iova,
-				 (uint64_t)pkt[0]->data_len,
-				 pkt[0]->buf_iova);
-		__m512i hi_qw_tmpl_4 = _mm512_set1_epi64(hi_qw_tmpl);
-		__m512i data_off_4 =
+		uint64_t hi_qw3 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[3]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		if (offload)
+			iavf_txd_enable_offload(pkt[3], &hi_qw3);
+		uint64_t hi_qw2 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[2]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		if (offload)
+			iavf_txd_enable_offload(pkt[2], &hi_qw2);
+		uint64_t hi_qw1 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[1]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		if (offload)
+			iavf_txd_enable_offload(pkt[1], &hi_qw1);
+		uint64_t hi_qw0 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[0]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		if (offload)
+			iavf_txd_enable_offload(pkt[0], &hi_qw0);
+
+		__m512i desc0_3 =
 			_mm512_set_epi64
-				(0,
-				 pkt[3]->data_off,
-				 0,
-				 pkt[2]->data_off,
-				 0,
-				 pkt[1]->data_off,
-				 0,
-				 pkt[0]->data_off);
-
-		desc4 = _mm512_mask_slli_epi64(desc4, IAVF_TX_LEN_MASK, desc4,
-					       IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
-		desc4 = _mm512_mask_or_epi64(desc4, IAVF_TX_LEN_MASK, desc4,
-					     hi_qw_tmpl_4);
-		desc4 = _mm512_mask_add_epi64(desc4, IAVF_TX_OFF_MASK, desc4,
-					      data_off_4);
-		_mm512_storeu_si512((void *)txdp, desc4);
+				(hi_qw3,
+				 pkt[3]->buf_iova + pkt[3]->data_off,
+				 hi_qw2,
+				 pkt[2]->buf_iova + pkt[2]->data_off,
+				 hi_qw1,
+				 pkt[1]->buf_iova + pkt[1]->data_off,
+				 hi_qw0,
+				 pkt[0]->buf_iova + pkt[0]->data_off);
+		_mm512_storeu_si512((void *)txdp, desc0_3);
 	}
 
 	/* do any last ones */
 	while (nb_pkts) {
-		iavf_vtx1(txdp, *pkt, flags);
+		iavf_vtx1(txdp, *pkt, flags, offload);
 		txdp++, pkt++, nb_pkts--;
 	}
 }
 
-static inline uint16_t
+static __rte_always_inline uint16_t
 iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
-				 uint16_t nb_pkts)
+				 uint16_t nb_pkts, bool offload)
 {
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 	volatile struct iavf_tx_desc *txdp;
@@ -1617,11 +1627,11 @@
 	if (nb_commit >= n) {
 		tx_backlog_entry_avx512(txep, tx_pkts, n);
 
-		iavf_vtx(txdp, tx_pkts, n - 1, flags);
+		iavf_vtx(txdp, tx_pkts, n - 1, flags, offload);
 		tx_pkts += (n - 1);
 		txdp += (n - 1);
 
-		iavf_vtx1(txdp, *tx_pkts++, rs);
+		iavf_vtx1(txdp, *tx_pkts++, rs, offload);
 
 		nb_commit = (uint16_t)(nb_commit - n);
 
@@ -1636,7 +1646,7 @@
 
 	tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);
 
-	iavf_vtx(txdp, tx_pkts, nb_commit, flags);
+	iavf_vtx(txdp, tx_pkts, nb_commit, flags, offload);
 
 	tx_id = (uint16_t)(tx_id + nb_commit);
 	if (tx_id > txq->next_rs) {
@@ -1654,9 +1664,9 @@
 	return nb_pkts;
 }
 
-uint16_t
-iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
-			  uint16_t nb_pkts)
+static __rte_always_inline uint16_t
+iavf_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
+			      uint16_t nb_pkts, bool offload)
 {
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
@@ -1666,7 +1676,7 @@
 
 		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
 		ret = iavf_xmit_fixed_burst_vec_avx512(tx_queue, &tx_pkts[nb_tx],
-						       num);
+						       num, offload);
 		nb_tx += ret;
 		nb_pkts -= ret;
 		if (ret < num)
@@ -1676,6 +1686,13 @@
 	return nb_tx;
 }
 
+uint16_t
+iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
+			  uint16_t nb_pkts)
+{
+	return iavf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts, false);
+}
+
 static inline void
 iavf_tx_queue_release_mbufs_avx512(struct iavf_tx_queue *txq)
 {
@@ -1706,3 +1723,10 @@
 	txq->ops = &avx512_vec_txq_ops;
 	return 0;
 }
+
+uint16_t
+iavf_xmit_pkts_vec_avx512_offload(void *tx_queue, struct rte_mbuf **tx_pkts,
+				  uint16_t nb_pkts)
+{
+	return iavf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts, true);
+}
diff --git a/drivers/net/iavf/iavf_rxtx_vec_common.h b/drivers/net/iavf/iavf_rxtx_vec_common.h
index 46a1873..bd7efca 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_common.h
+++ b/drivers/net/iavf/iavf_rxtx_vec_common.h
@@ -236,14 +236,17 @@
 	if (!txq)
 		return -1;
 
-	if (txq->offloads & IAVF_NO_VECTOR_FLAGS)
-		return -1;
-
 	if (txq->rs_thresh < IAVF_VPMD_TX_MAX_BURST ||
 	    txq->rs_thresh > IAVF_VPMD_TX_MAX_FREE_BUF)
 		return -1;
 
-	return 0;
+	if (txq->offloads & IAVF_TX_NO_VECTOR_FLAGS)
+		return -1;
+
+	if (txq->offloads & IAVF_TX_VECTOR_OFFLOAD)
+		return IAVF_VECTOR_OFFLOAD_PATH;
+
+	return IAVF_VECTOR_PATH;
 }
 
 static inline int
@@ -266,14 +269,97 @@
 {
 	int i;
 	struct iavf_tx_queue *txq;
+	int ret;
+	int result = 0;
 
 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
 		txq = dev->data->tx_queues[i];
-		if (iavf_tx_vec_queue_default(txq))
+		ret = iavf_tx_vec_queue_default(txq);
+
+		if (ret < 0)
 			return -1;
+		if (ret > result)
+			result = ret;
 	}
 
-	return 0;
+	return result;
+}
+
+/******************************************************************************
+ * If user knows a specific offload is not enabled by APP,
+ * the macro can be commented to save the effort of fast path.
+ * Currently below 2 features are supported in TX path,
+ * 1, checksum offload
+ * 2, VLAN/QINQ insertion
+ ******************************************************************************/
+#define IAVF_TX_CSUM_OFFLOAD
+#define IAVF_TX_VLAN_QINQ_OFFLOAD
+
+static __rte_always_inline void
+iavf_txd_enable_offload(__rte_unused struct rte_mbuf *tx_pkt,
+			uint64_t *txd_hi)
+{
+#if defined(IAVF_TX_CSUM_OFFLOAD) || defined(IAVF_TX_VLAN_QINQ_OFFLOAD)
+	uint64_t ol_flags = tx_pkt->ol_flags;
+#endif
+	uint32_t td_cmd = 0;
+#ifdef IAVF_TX_CSUM_OFFLOAD
+	uint32_t td_offset = 0;
+#endif
+
+#ifdef IAVF_TX_CSUM_OFFLOAD
+	/* Set MACLEN */
+	td_offset |= (tx_pkt->l2_len >> 1) <<
+		     IAVF_TX_DESC_LENGTH_MACLEN_SHIFT;
+
+	/* Enable L3 checksum offloads */
+	if (ol_flags & PKT_TX_IP_CKSUM) {
+		td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV4_CSUM;
+		td_offset |= (tx_pkt->l3_len >> 2) <<
+			     IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
+	} else if (ol_flags & PKT_TX_IPV4) {
+		td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV4;
+		td_offset |= (tx_pkt->l3_len >> 2) <<
+			     IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
+	} else if (ol_flags & PKT_TX_IPV6) {
+		td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV6;
+		td_offset |= (tx_pkt->l3_len >> 2) <<
+			     IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
+	}
+
+	/* Enable L4 checksum offloads */
+	switch (ol_flags & PKT_TX_L4_MASK) {
+	case PKT_TX_TCP_CKSUM:
+		td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_TCP;
+		td_offset |= (sizeof(struct rte_tcp_hdr) >> 2) <<
+			     IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+		break;
+	case PKT_TX_SCTP_CKSUM:
+		td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_SCTP;
+		td_offset |= (sizeof(struct rte_sctp_hdr) >> 2) <<
+			     IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+		break;
+	case PKT_TX_UDP_CKSUM:
+		td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_UDP;
+		td_offset |= (sizeof(struct rte_udp_hdr) >> 2) <<
+			     IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+		break;
+	default:
+		break;
+	}
+
+	*txd_hi |= ((uint64_t)td_offset) << IAVF_TXD_QW1_OFFSET_SHIFT;
+#endif
+
+#ifdef IAVF_TX_VLAN_QINQ_OFFLOAD
+	if (ol_flags & (PKT_TX_VLAN | PKT_TX_QINQ)) {
+		td_cmd |= IAVF_TX_DESC_CMD_IL2TAG1;
+		*txd_hi |= ((uint64_t)tx_pkt->vlan_tci <<
+			    IAVF_TXD_QW1_L2TAG1_SHIFT);
+	}
+#endif
+
+	*txd_hi |= ((uint64_t)td_cmd) << IAVF_TXD_QW1_CMD_SHIFT;
 }
 
 #endif
-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v3 3/4] net/iavf: add offload path for Rx AVX512
  2021-03-26  1:31 ` [dpdk-dev] [PATCH v3 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
  2021-03-26  1:31   ` [dpdk-dev] [PATCH v3 1/4] net/iavf: store offload flag of Rx queue Wenzhuo Lu
  2021-03-26  1:32   ` [dpdk-dev] [PATCH v3 2/4] net/iavf: add offload path for Tx AVX512 Wenzhuo Lu
@ 2021-03-26  1:32   ` Wenzhuo Lu
  2021-03-26  1:32   ` [dpdk-dev] [PATCH v3 4/4] net/iavf: add offload path for Rx AVX512 flex desc Wenzhuo Lu
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-03-26  1:32 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add a specific path for RX AVX512 (traditional).
In this path, support the HW offload features, like,
checksum, VLAN stripping, RSS hash.
This path is chosen automatically according to the
configuration.

'inline' is used, then the duplicate code is generated
by the compiler.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 drivers/net/iavf/iavf_rxtx.c            | 105 +++++++---
 drivers/net/iavf/iavf_rxtx.h            |  12 ++
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 355 ++++++++++++++++++++------------
 drivers/net/iavf/iavf_rxtx_vec_common.h |  17 +-
 4 files changed, 325 insertions(+), 164 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 4d9b3b9..1a03d97 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2367,22 +2367,23 @@
 #ifdef RTE_ARCH_X86
 	struct iavf_rx_queue *rxq;
 	int i;
+	int check_ret;
+	bool use_sse = false;
 	bool use_avx2 = false;
-#ifdef CC_AVX512_SUPPORT
 	bool use_avx512 = false;
-#endif
+	bool use_flex = false;
 
-	if (!iavf_rx_vec_dev_check(dev) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-		for (i = 0; i < dev->data->nb_rx_queues; i++) {
-			rxq = dev->data->rx_queues[i];
-			(void)iavf_rxq_vec_setup(rxq);
+	check_ret = iavf_rx_vec_dev_check(dev);
+	if (check_ret >= 0 &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+		if (check_ret == IAVF_VECTOR_PATH) {
+			use_sse = true;
+			if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+			     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+			    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+				use_avx2 = true;
 		}
 
-		if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-		     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-				rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-			use_avx2 = true;
 #ifdef CC_AVX512_SUPPORT
 		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2390,13 +2391,38 @@
 			use_avx512 = true;
 #endif
 
+		if (!use_sse && !use_avx2 && !use_avx512)
+			goto normal;
+
+		if (vf->vf_res->vf_cap_flags &
+			VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			use_flex = true;
+			if (use_avx512 && check_ret == IAVF_VECTOR_OFFLOAD_PATH)
+				use_flex = false;
+		}
+
+		for (i = 0; i < dev->data->nb_rx_queues; i++) {
+			rxq = dev->data->rx_queues[i];
+			(void)iavf_rxq_vec_setup(rxq);
+		}
+
 		if (dev->data->scattered_rx) {
-			PMD_DRV_LOG(DEBUG,
-				    "Using %sVector Scattered Rx (port %d).",
-				    use_avx2 ? "avx2 " : "",
-				    dev->data->port_id);
-			if (vf->vf_res->vf_cap_flags &
-				VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			if (!use_avx512) {
+				PMD_DRV_LOG(DEBUG,
+					    "Using %sVector Scattered Rx (port %d).",
+					    use_avx2 ? "avx2 " : "",
+					    dev->data->port_id);
+			} else {
+				if (check_ret == IAVF_VECTOR_PATH)
+					PMD_DRV_LOG(DEBUG,
+						    "Using AVX512 Vector Scattered Rx (port %d).",
+						    dev->data->port_id);
+				else
+					PMD_DRV_LOG(DEBUG,
+						    "Using AVX512 OFFLOAD Vector Scattered Rx (port %d).",
+						    dev->data->port_id);
+			}
+			if (use_flex) {
 				dev->rx_pkt_burst = use_avx2 ?
 					iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
 					iavf_recv_scattered_pkts_vec_flex_rxd;
@@ -2410,17 +2436,32 @@
 					iavf_recv_scattered_pkts_vec_avx2 :
 					iavf_recv_scattered_pkts_vec;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_scattered_pkts_vec_avx512;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512_offload;
+				}
 #endif
 			}
 		} else {
-			PMD_DRV_LOG(DEBUG, "Using %sVector Rx (port %d).",
-				    use_avx2 ? "avx2 " : "",
-				    dev->data->port_id);
-			if (vf->vf_res->vf_cap_flags &
-				VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			if (!use_avx512) {
+				PMD_DRV_LOG(DEBUG, "Using %sVector Rx (port %d).",
+					    use_avx2 ? "avx2 " : "",
+					    dev->data->port_id);
+			} else {
+				if (check_ret == IAVF_VECTOR_PATH)
+					PMD_DRV_LOG(DEBUG,
+						    "Using AVX512 Vector Rx (port %d).",
+						    dev->data->port_id);
+				else
+					PMD_DRV_LOG(DEBUG,
+						    "Using AVX512 OFFLOAD Vector Rx (port %d).",
+						    dev->data->port_id);
+			}
+			if (use_flex) {
 				dev->rx_pkt_burst = use_avx2 ?
 					iavf_recv_pkts_vec_avx2_flex_rxd :
 					iavf_recv_pkts_vec_flex_rxd;
@@ -2434,17 +2475,23 @@
 					iavf_recv_pkts_vec_avx2 :
 					iavf_recv_pkts_vec;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_pkts_vec_avx512;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512_offload;
+				}
 #endif
 			}
 		}
 
 		return;
 	}
-#endif
 
+normal:
+#endif
 	if (dev->data->scattered_rx) {
 		PMD_DRV_LOG(DEBUG, "Using a Scattered Rx callback (port=%d).",
 			    dev->data->port_id);
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 821e791..d0e5909 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -35,6 +35,12 @@
 		DEV_TX_OFFLOAD_UDP_CKSUM |		 \
 		DEV_TX_OFFLOAD_TCP_CKSUM)
 
+#define IAVF_RX_VECTOR_OFFLOAD (				 \
+		DEV_RX_OFFLOAD_CHECKSUM |		 \
+		DEV_RX_OFFLOAD_SCTP_CKSUM |		 \
+		DEV_RX_OFFLOAD_VLAN |		 \
+		DEV_RX_OFFLOAD_RSS_HASH)
+
 #define IAVF_VECTOR_PATH 0
 #define IAVF_VECTOR_OFFLOAD_PATH 1
 
@@ -483,12 +489,18 @@ uint16_t iavf_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 int iavf_txq_vec_setup(struct iavf_tx_queue *txq);
 uint16_t iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
 				   uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx512_offload(void *rx_queue,
+					   struct rte_mbuf **rx_pkts,
+					   uint16_t nb_pkts);
 uint16_t iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue,
 					    struct rte_mbuf **rx_pkts,
 					    uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 					     struct rte_mbuf **rx_pkts,
 					     uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
+						     struct rte_mbuf **rx_pkts,
+						     uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
 						      struct rte_mbuf **rx_pkts,
 						      uint16_t nb_pkts);
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index c30f3f0..da2742e 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -13,7 +13,23 @@
 #define IAVF_DESCS_PER_LOOP_AVX 8
 #define PKTLEN_SHIFT 10
 
-static inline void
+/******************************************************************************
+ * If user knows a specific offload is not enabled by APP,
+ * the macro can be commented to save the effort of fast path.
+ * Currently below 2 features are supported in RX path,
+ * 1, checksum offload
+ * 2, VLAN/QINQ stripping
+ * 3, RSS hash
+ * 4, packet type analysis
+ * 5, flow director ID report
+ ******************************************************************************/
+#define IAVF_RX_CSUM_OFFLOAD
+#define IAVF_RX_VLAN_OFFLOAD
+#define IAVF_RX_RSS_OFFLOAD
+#define IAVF_RX_PTYPE_OFFLOAD
+#define IAVF_RX_FDIR_OFFLOAD
+
+static __rte_always_inline void
 iavf_rxq_rearm(struct iavf_rx_queue *rxq)
 {
 	int i;
@@ -141,12 +157,15 @@
 }
 
 #define IAVF_RX_LEN_MASK 0x80808080
-static inline uint16_t
+static __rte_always_inline uint16_t
 _iavf_recv_raw_pkts_vec_avx512(struct iavf_rx_queue *rxq,
 			       struct rte_mbuf **rx_pkts,
-			       uint16_t nb_pkts, uint8_t *split_packet)
+			       uint16_t nb_pkts, uint8_t *split_packet,
+			       bool offload)
 {
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+#endif
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
 						    rxq->mbuf_initializer);
@@ -249,71 +268,6 @@
 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
 
-	/* Status/Error flag masks */
-	/**
-	 * mask everything except RSS, flow director and VLAN flags
-	 * bit2 is for VLAN tag, bit11 for flow director indication
-	 * bit13:12 for RSS indication. Bits 3-5 of error
-	 * field (bits 22-24) are for IP/L4 checksum errors
-	 */
-	const __m256i flags_mask =
-		_mm256_set1_epi32((1 << 2) | (1 << 11) |
-				  (3 << 12) | (7 << 22));
-	/**
-	 * data to be shuffled by result of flag mask. If VLAN bit is set,
-	 * (bit 2), then position 4 in this array will be used in the
-	 * destination
-	 */
-	const __m256i vlan_flags_shuf =
-		_mm256_set_epi32(0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
-				 0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0);
-	/**
-	 * data to be shuffled by result of flag mask, shifted down 11.
-	 * If RSS/FDIR bits are set, shuffle moves appropriate flags in
-	 * place.
-	 */
-	const __m256i rss_flags_shuf =
-		_mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-				PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
-				0, 0, 0, 0, PKT_RX_FDIR, 0,/* end up 128-bits */
-				0, 0, 0, 0, 0, 0, 0, 0,
-				PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
-				0, 0, 0, 0, PKT_RX_FDIR, 0);
-
-	/**
-	 * data to be shuffled by the result of the flags mask shifted by 22
-	 * bits.  This gives use the l3_l4 flags.
-	 */
-	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-			/* shift right 1 bit to make sure it not exceed 255 */
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
-			 PKT_RX_L4_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
-			PKT_RX_IP_CKSUM_BAD >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
-			/* second 128-bits */
-			0, 0, 0, 0, 0, 0, 0, 0,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
-			 PKT_RX_L4_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
-			PKT_RX_IP_CKSUM_BAD >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);
-
-	const __m256i cksum_mask =
-		_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
-				  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
-				  PKT_RX_OUTER_IP_CKSUM_BAD);
-
 	uint16_t i, received;
 
 	for (i = 0, received = 0; i < nb_pkts;
@@ -381,6 +335,7 @@
 		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);
 
 		mb4_7 = _mm512_add_epi16(mb4_7, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/**
 		 * to get packet types, shift 64-bit values down 30 bits
 		 * and so ptype is in lower 8-bits in each
@@ -399,6 +354,7 @@
 			 0, 0, 0, type_table[ptype5],
 			 0, 0, 0, type_table[ptype4]);
 		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
+#endif
 
 		/**
 		 * convert descriptors 0-3 into mbufs, adjusting length and
@@ -412,6 +368,7 @@
 		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);
 
 		mb0_3 = _mm512_add_epi16(mb0_3, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/* get the packet types */
 		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 30);
 		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
@@ -427,6 +384,7 @@
 			 0, 0, 0, type_table[ptype1],
 			 0, 0, 0, type_table[ptype0]);
 		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
+#endif
 
 		/**
 		 * use permute/extract to get status content
@@ -446,27 +404,122 @@
 
 		/* now do flag manipulation */
 
-		/* get only flag/error bits we want */
-		const __m256i flag_bits =
-			_mm256_and_si256(status0_7, flags_mask);
-		/* set vlan and rss flags */
-		const __m256i vlan_flags =
-			_mm256_shuffle_epi8(vlan_flags_shuf, flag_bits);
-		const __m256i rss_flags =
-			_mm256_shuffle_epi8(rss_flags_shuf,
-					    _mm256_srli_epi32(flag_bits, 11));
-		/**
-		 * l3_l4_error flags, shuffle, then shift to correct adjustment
-		 * of flags in flags_shuf, and finally mask out extra bits
-		 */
-		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
-						_mm256_srli_epi32(flag_bits, 22));
-		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
-		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
-
 		/* merge flags */
-		const __m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
-				_mm256_or_si256(rss_flags, vlan_flags));
+		__m256i mbuf_flags = _mm256_set1_epi32(0);
+
+		if (offload) {
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			/* Status/Error flag masks */
+			/**
+			 * mask everything except RSS, flow director and VLAN flags
+			 * bit2 is for VLAN tag, bit11 for flow director indication
+			 * bit13:12 for RSS indication. Bits 3-5 of error
+			 * field (bits 22-24) are for IP/L4 checksum errors
+			 */
+			const __m256i flags_mask =
+				_mm256_set1_epi32((1 << 2) | (1 << 11) |
+						  (3 << 12) | (7 << 22));
+#endif
+
+#ifdef IAVF_RX_VLAN_OFFLOAD
+			/**
+			 * data to be shuffled by result of flag mask. If VLAN bit is set,
+			 * (bit 2), then position 4 in this array will be used in the
+			 * destination
+			 */
+			const __m256i vlan_flags_shuf =
+				_mm256_set_epi32(0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
+						 0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0);
+#endif
+
+#ifdef IAVF_RX_RSS_OFFLOAD
+			/**
+			 * data to be shuffled by result of flag mask, shifted down 11.
+			 * If RSS/FDIR bits are set, shuffle moves appropriate flags in
+			 * place.
+			 */
+			const __m256i rss_flags_shuf =
+				_mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+						PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
+						0, 0, 0, 0, PKT_RX_FDIR, 0,/* end up 128-bits */
+						0, 0, 0, 0, 0, 0, 0, 0,
+						PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
+						0, 0, 0, 0, PKT_RX_FDIR, 0);
+#endif
+
+#ifdef IAVF_RX_CSUM_OFFLOAD
+			/**
+			 * data to be shuffled by the result of the flags mask shifted by 22
+			 * bits.  This gives use the l3_l4 flags.
+			 */
+			const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+					/* shift right 1 bit to make sure it not exceed 255 */
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+					 PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
+					 PKT_RX_L4_CKSUM_BAD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
+					PKT_RX_IP_CKSUM_BAD >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
+					/* second 128-bits */
+					0, 0, 0, 0, 0, 0, 0, 0,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+					 PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
+					 PKT_RX_L4_CKSUM_BAD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
+					PKT_RX_IP_CKSUM_BAD >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);
+
+			const __m256i cksum_mask =
+				_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
+						  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
+						  PKT_RX_OUTER_IP_CKSUM_BAD);
+#endif
+
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			/* get only flag/error bits we want */
+			const __m256i flag_bits =
+				_mm256_and_si256(status0_7, flags_mask);
+#endif
+			/* set vlan and rss flags */
+#ifdef IAVF_RX_VLAN_OFFLOAD
+			const __m256i vlan_flags =
+				_mm256_shuffle_epi8(vlan_flags_shuf, flag_bits);
+#endif
+#ifdef IAVF_RX_RSS_OFFLOAD
+			const __m256i rss_flags =
+				_mm256_shuffle_epi8(rss_flags_shuf,
+						    _mm256_srli_epi32(flag_bits, 11));
+#endif
+#ifdef IAVF_RX_CSUM_OFFLOAD
+			/**
+			 * l3_l4_error flags, shuffle, then shift to correct adjustment
+			 * of flags in flags_shuf, and finally mask out extra bits
+			 */
+			__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
+							_mm256_srli_epi32(flag_bits, 22));
+			l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
+			l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
+#endif
+
+#ifdef IAVF_RX_CSUM_OFFLOAD
+			mbuf_flags = _mm256_or_si256(mbuf_flags, l3_l4_flags);
+#endif
+#ifdef IAVF_RX_RSS_OFFLOAD
+			mbuf_flags = _mm256_or_si256(mbuf_flags, rss_flags);
+#endif
+#ifdef IAVF_RX_VLAN_OFFLOAD
+			mbuf_flags = _mm256_or_si256(mbuf_flags, vlan_flags);
+#endif
+		}
+
 		/**
 		 * At this point, we have the 8 sets of flags in the low 16-bits
 		 * of each 32-bit value in vlan0.
@@ -484,7 +537,7 @@
 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
 				 RTE_ALIGN(offsetof(struct rte_mbuf,
 						    rearm_data),
-					   16));
+						    16));
 		/* build up data and do writes */
 		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
 			rearm6, rearm7;
@@ -493,21 +546,28 @@
 		const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
 		const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
 
-		rearm6 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(mbuf_flags, 8),
-					    0x04);
-		rearm4 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(mbuf_flags, 4),
-					    0x04);
-		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
-		rearm0 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_srli_si256(mbuf_flags, 4),
-					    0x04);
-		/* permute to add in the rx_descriptor e.g. rss fields */
-		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
-		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
-		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
-		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
+		if (offload) {
+			rearm6 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_slli_si256(mbuf_flags, 8),
+						    0x04);
+			rearm4 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_slli_si256(mbuf_flags, 4),
+						    0x04);
+			rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
+			rearm0 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_srli_si256(mbuf_flags, 4),
+						    0x04);
+			/* permute to add in the rx_descriptor e.g. rss fields */
+			rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
+			rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
+			rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
+			rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
+		} else {
+			rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20);
+			rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20);
+			rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20);
+			rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20);
+		}
 		/* write to mbuf */
 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
 				    rearm6);
@@ -519,24 +579,31 @@
 				    rearm0);
 
 		/* repeat for the odd mbufs */
-		const __m256i odd_flags =
-			_mm256_castsi128_si256
-				(_mm256_extracti128_si256(mbuf_flags, 1));
-		rearm7 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(odd_flags, 8),
-					    0x04);
-		rearm5 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(odd_flags, 4),
-					    0x04);
-		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
-		rearm1 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_srli_si256(odd_flags, 4),
-					    0x04);
-		/* since odd mbufs are already in hi 128-bits use blend */
-		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
-		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
-		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
-		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
+		if (offload) {
+			const __m256i odd_flags =
+				_mm256_castsi128_si256
+					(_mm256_extracti128_si256(mbuf_flags, 1));
+			rearm7 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_slli_si256(odd_flags, 8),
+						    0x04);
+			rearm5 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_slli_si256(odd_flags, 4),
+						    0x04);
+			rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
+			rearm1 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_srli_si256(odd_flags, 4),
+						    0x04);
+			/* since odd mbufs are already in hi 128-bits use blend */
+			rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
+			rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
+			rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
+			rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
+		} else {
+			rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0);
+			rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0);
+			rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0);
+			rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0);
+		}
 		/* again write to mbufs */
 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
 				    rearm7);
@@ -1247,7 +1314,8 @@
 iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
 			  uint16_t nb_pkts)
 {
-	return _iavf_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts, nb_pkts, NULL);
+	return _iavf_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts, nb_pkts,
+					      NULL, false);
 }
 
 /**
@@ -1267,16 +1335,16 @@
  * Notice:
  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  */
-static uint16_t
+static __rte_always_inline uint16_t
 iavf_recv_scattered_burst_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
-				     uint16_t nb_pkts)
+				     uint16_t nb_pkts, bool offload)
 {
 	struct iavf_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 
 	/* get some new buffers */
 	uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx512(rxq, rx_pkts, nb_pkts,
-							  split_flags);
+							  split_flags, offload);
 	if (nb_bufs == 0)
 		return 0;
 
@@ -1309,22 +1377,30 @@
  * Notice:
  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  */
-uint16_t
-iavf_recv_scattered_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
-				    uint16_t nb_pkts)
+static __rte_always_inline uint16_t
+iavf_recv_scattered_pkts_vec_avx512_cmn(void *rx_queue, struct rte_mbuf **rx_pkts,
+					uint16_t nb_pkts, bool offload)
 {
 	uint16_t retval = 0;
 
 	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
 		uint16_t burst = iavf_recv_scattered_burst_vec_avx512(rx_queue,
-				rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST);
+				rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST, offload);
 		retval += burst;
 		nb_pkts -= burst;
 		if (burst < IAVF_VPMD_RX_MAX_BURST)
 			return retval;
 	}
 	return retval + iavf_recv_scattered_burst_vec_avx512(rx_queue,
-				rx_pkts + retval, nb_pkts);
+				rx_pkts + retval, nb_pkts, offload);
+}
+
+uint16_t
+iavf_recv_scattered_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
+				    uint16_t nb_pkts)
+{
+	return iavf_recv_scattered_pkts_vec_avx512_cmn(rx_queue, rx_pkts,
+						       nb_pkts, false);
 }
 
 /**
@@ -1397,6 +1473,23 @@
 				rx_pkts + retval, nb_pkts);
 }
 
+uint16_t
+iavf_recv_pkts_vec_avx512_offload(void *rx_queue, struct rte_mbuf **rx_pkts,
+				  uint16_t nb_pkts)
+{
+	return _iavf_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts,
+					      nb_pkts, NULL, true);
+}
+
+uint16_t
+iavf_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
+					    struct rte_mbuf **rx_pkts,
+					    uint16_t nb_pkts)
+{
+	return iavf_recv_scattered_pkts_vec_avx512_cmn(rx_queue, rx_pkts,
+						       nb_pkts, true);
+}
+
 static __rte_always_inline int
 iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 {
diff --git a/drivers/net/iavf/iavf_rxtx_vec_common.h b/drivers/net/iavf/iavf_rxtx_vec_common.h
index bd7efca..40bf56b 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_common.h
+++ b/drivers/net/iavf/iavf_rxtx_vec_common.h
@@ -11,7 +11,7 @@
 #include "iavf.h"
 #include "iavf_rxtx.h"
 
-static inline uint16_t
+static __rte_always_inline uint16_t
 reassemble_packets(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_bufs,
 		   uint16_t nb_bufs, uint8_t *split_flags)
 {
@@ -227,7 +227,10 @@
 	if (rxq->proto_xtr != IAVF_PROTO_XTR_NONE)
 		return -1;
 
-	return 0;
+	if (rxq->offloads & IAVF_RX_VECTOR_OFFLOAD)
+		return IAVF_VECTOR_OFFLOAD_PATH;
+
+	return IAVF_VECTOR_PATH;
 }
 
 static inline int
@@ -254,14 +257,20 @@
 {
 	int i;
 	struct iavf_rx_queue *rxq;
+	int ret;
+	int result = 0;
 
 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
 		rxq = dev->data->rx_queues[i];
-		if (iavf_rx_vec_queue_default(rxq))
+		ret = iavf_rx_vec_queue_default(rxq);
+
+		if (ret < 0)
 			return -1;
+		if (ret > result)
+			result = ret;
 	}
 
-	return 0;
+	return result;
 }
 
 static inline int
-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v3 4/4] net/iavf: add offload path for Rx AVX512 flex desc
  2021-03-26  1:31 ` [dpdk-dev] [PATCH v3 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
                     ` (2 preceding siblings ...)
  2021-03-26  1:32   ` [dpdk-dev] [PATCH v3 3/4] net/iavf: add offload path for Rx AVX512 Wenzhuo Lu
@ 2021-03-26  1:32   ` Wenzhuo Lu
       [not found]   ` <DM5PR11MB1787A1E056D50128FDDDDA5D8C749@DM5PR11MB1787.namprd11.prod.outlook.com>
  2021-04-09  5:59   ` [dpdk-dev] [PATCH v4 " Wenzhuo Lu
  5 siblings, 0 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-03-26  1:32 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add a specific path for RX AVX512 (flexible descriptor).
In this path, support the HW offload features, like,
checksum, VLAN stripping, RSS hash.
This path is chosen automatically according to the
configuration.

'inline' is used, then the duplicate code is generated
by the compiler.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 doc/guides/rel_notes/release_21_05.rst  |   7 +
 drivers/net/iavf/iavf_rxtx.c            |  27 +-
 drivers/net/iavf/iavf_rxtx.h            |   6 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 436 ++++++++++++++++++--------------
 4 files changed, 283 insertions(+), 193 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_05.rst b/doc/guides/rel_notes/release_21_05.rst
index 21dc6d2..a7769f8 100644
--- a/doc/guides/rel_notes/release_21_05.rst
+++ b/doc/guides/rel_notes/release_21_05.rst
@@ -83,6 +83,13 @@ New Features
   * Added command to display Rx queue used descriptor count.
     ``show port (port_id) rxq (queue_id) desc used count``
 
+* **Added the offload paths for IAVF AVX512.**
+
+  * Added the new RX and TX paths to use the HW offload features. When the HW
+    offload features are configured to be used, the offload paths are chosen
+    automatically.
+  * The code of HW offload features is removed from the legacy paths.
+
 
 Removed Items
 -------------
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 1a03d97..d72393c 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2395,11 +2395,8 @@
 			goto normal;
 
 		if (vf->vf_res->vf_cap_flags &
-			VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
 			use_flex = true;
-			if (use_avx512 && check_ret == IAVF_VECTOR_OFFLOAD_PATH)
-				use_flex = false;
-		}
 
 		for (i = 0; i < dev->data->nb_rx_queues; i++) {
 			rxq = dev->data->rx_queues[i];
@@ -2427,9 +2424,14 @@
 					iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
 					iavf_recv_scattered_pkts_vec_flex_rxd;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload;
+				}
 #endif
 			} else {
 				dev->rx_pkt_burst = use_avx2 ?
@@ -2466,9 +2468,14 @@
 					iavf_recv_pkts_vec_avx2_flex_rxd :
 					iavf_recv_pkts_vec_flex_rxd;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_pkts_vec_avx512_flex_rxd;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512_flex_rxd;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512_flex_rxd_offload;
+				}
 #endif
 			} else {
 				dev->rx_pkt_burst = use_avx2 ?
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index d0e5909..fb16d18 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -495,6 +495,9 @@ uint16_t iavf_recv_pkts_vec_avx512_offload(void *rx_queue,
 uint16_t iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue,
 					    struct rte_mbuf **rx_pkts,
 					    uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+						    struct rte_mbuf **rx_pkts,
+						    uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 					     struct rte_mbuf **rx_pkts,
 					     uint16_t nb_pkts);
@@ -504,6 +507,9 @@ uint16_t iavf_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
 uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
 						      struct rte_mbuf **rx_pkts,
 						      uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+							      struct rte_mbuf **rx_pkts,
+							      uint16_t nb_pkts);
 uint16_t iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 				   uint16_t nb_pkts);
 uint16_t iavf_xmit_pkts_vec_avx512_offload(void *tx_queue,
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index da2742e..343eb36 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -681,7 +681,7 @@
 	return received;
 }
 
-static inline __m256i
+static __rte_always_inline __m256i
 flex_rxd_to_fdir_flags_vec_avx512(const __m256i fdir_id0_7)
 {
 #define FDID_MIS_MAGIC 0xFFFFFFFF
@@ -700,12 +700,16 @@
 	return fdir_flags;
 }
 
-static inline uint16_t
+static __rte_always_inline uint16_t
 _iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq,
 					struct rte_mbuf **rx_pkts,
-					uint16_t nb_pkts, uint8_t *split_packet)
+					uint16_t nb_pkts,
+					uint8_t *split_packet,
+					bool offload)
 {
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+#endif
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
 						    rxq->mbuf_initializer);
@@ -809,71 +813,6 @@
 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
 
-	/* Status/Error flag masks */
-	/**
-	 * mask everything except Checksum Reports, RSS indication
-	 * and VLAN indication.
-	 * bit6:4 for IP/L4 checksum errors.
-	 * bit12 is for RSS indication.
-	 * bit13 is for VLAN indication.
-	 */
-	const __m256i flags_mask =
-		_mm256_set1_epi32((7 << 4) | (1 << 12) | (1 << 13));
-	/**
-	 * data to be shuffled by the result of the flags mask shifted by 4
-	 * bits.  This gives use the l3_l4 flags.
-	 */
-	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-			/* shift right 1 bit to make sure it not exceed 255 */
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
-			 PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1,
-			/* second 128-bits */
-			0, 0, 0, 0, 0, 0, 0, 0,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
-			 PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1);
-	const __m256i cksum_mask =
-		_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
-				  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
-				  PKT_RX_OUTER_IP_CKSUM_BAD);
-	/**
-	 * data to be shuffled by result of flag mask, shifted down 12.
-	 * If RSS(bit12)/VLAN(bit13) are set,
-	 * shuffle moves appropriate flags in place.
-	 */
-	const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
-			0, 0, 0, 0,
-			0, 0, 0, 0,
-			PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			PKT_RX_RSS_HASH, 0,
-			/* end up 128-bits */
-			0, 0, 0, 0,
-			0, 0, 0, 0,
-			0, 0, 0, 0,
-			PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			PKT_RX_RSS_HASH, 0);
-
 	uint16_t i, received;
 
 	for (i = 0, received = 0; i < nb_pkts;
@@ -937,6 +876,7 @@
 		__m512i mb4_7 = _mm512_shuffle_epi8(raw_desc4_7, shuf_msk);
 
 		mb4_7 = _mm512_add_epi16(mb4_7, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/**
 		 * to get packet types, ptype is located in bit16-25
 		 * of each 128bits
@@ -958,6 +898,7 @@
 			 0, 0, 0, type_table[ptype5],
 			 0, 0, 0, type_table[ptype4]);
 		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
+#endif
 
 		/**
 		 * convert descriptors 0-3 into mbufs, re-arrange fields.
@@ -966,6 +907,7 @@
 		__m512i mb0_3 = _mm512_shuffle_epi8(raw_desc0_3, shuf_msk);
 
 		mb0_3 = _mm512_add_epi16(mb0_3, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/**
 		 * to get packet types, ptype is located in bit16-25
 		 * of each 128bits
@@ -985,6 +927,7 @@
 			 0, 0, 0, type_table[ptype1],
 			 0, 0, 0, type_table[ptype0]);
 		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
+#endif
 
 		/**
 		 * use permute/extract to get status content
@@ -1004,28 +947,114 @@
 
 		/* now do flag manipulation */
 
-		/* get only flag/error bits we want */
-		const __m256i flag_bits =
-			_mm256_and_si256(status0_7, flags_mask);
-		/**
-		 * l3_l4_error flags, shuffle, then shift to correct adjustment
-		 * of flags in flags_shuf, and finally mask out extra bits
-		 */
-		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
-				_mm256_srli_epi32(flag_bits, 4));
-		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
-		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
-		/* set rss and vlan flags */
-		const __m256i rss_vlan_flag_bits =
-			_mm256_srli_epi32(flag_bits, 12);
-		const __m256i rss_vlan_flags =
-			_mm256_shuffle_epi8(rss_vlan_flags_shuf,
-					    rss_vlan_flag_bits);
-
 		/* merge flags */
-		__m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
-						     rss_vlan_flags);
+		__m256i mbuf_flags = _mm256_set1_epi32(0);
+
+		if (offload) {
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			/* Status/Error flag masks */
+			/**
+			 * mask everything except Checksum Reports, RSS indication
+			 * and VLAN indication.
+			 * bit6:4 for IP/L4 checksum errors.
+			 * bit12 is for RSS indication.
+			 * bit13 is for VLAN indication.
+			 */
+			const __m256i flags_mask =
+				_mm256_set1_epi32((7 << 4) | (1 << 12) | (1 << 13));
+#endif
+#ifdef IAVF_RX_CSUM_OFFLOAD
+			/**
+			 * data to be shuffled by the result of the flags mask shifted by 4
+			 * bits.  This gives use the l3_l4 flags.
+			 */
+			const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+					/* shift right 1 bit to make sure it not exceed 255 */
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+					 PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+					 PKT_RX_IP_CKSUM_GOOD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+					 PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+					 PKT_RX_IP_CKSUM_GOOD) >> 1,
+					(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
+					(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1,
+					/* second 128-bits */
+					0, 0, 0, 0, 0, 0, 0, 0,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+					 PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+					 PKT_RX_IP_CKSUM_GOOD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+					 PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+					 PKT_RX_IP_CKSUM_GOOD) >> 1,
+					(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
+					(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1);
+			const __m256i cksum_mask =
+				_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
+						  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
+						  PKT_RX_OUTER_IP_CKSUM_BAD);
+#endif
+#if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			/**
+			 * data to be shuffled by result of flag mask, shifted down 12.
+			 * If RSS(bit12)/VLAN(bit13) are set,
+			 * shuffle moves appropriate flags in place.
+			 */
+			const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
+					0, 0, 0, 0,
+					0, 0, 0, 0,
+					PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+					PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+					PKT_RX_RSS_HASH, 0,
+					/* end up 128-bits */
+					0, 0, 0, 0,
+					0, 0, 0, 0,
+					0, 0, 0, 0,
+					PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+					PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+					PKT_RX_RSS_HASH, 0);
+#endif
 
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			/* get only flag/error bits we want */
+			const __m256i flag_bits =
+				_mm256_and_si256(status0_7, flags_mask);
+#endif
+#ifdef IAVF_RX_CSUM_OFFLOAD
+			/**
+			 * l3_l4_error flags, shuffle, then shift to correct adjustment
+			 * of flags in flags_shuf, and finally mask out extra bits
+			 */
+			__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
+					_mm256_srli_epi32(flag_bits, 4));
+			l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
+			l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
+#endif
+#if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			/* set rss and vlan flags */
+			const __m256i rss_vlan_flag_bits =
+				_mm256_srli_epi32(flag_bits, 12);
+			const __m256i rss_vlan_flags =
+				_mm256_shuffle_epi8(rss_vlan_flags_shuf,
+						    rss_vlan_flag_bits);
+#endif
+
+#ifdef IAVF_RX_CSUM_OFFLOAD
+			mbuf_flags = _mm256_or_si256(mbuf_flags, l3_l4_flags);
+#endif
+#if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			mbuf_flags = _mm256_or_si256(mbuf_flags, rss_vlan_flags);
+#endif
+		}
+
+#ifdef IAVF_RX_FDIR_OFFLOAD
 		if (rxq->fdir_enabled) {
 			const __m512i fdir_permute_mask = _mm512_set_epi32
 				(0, 0, 0, 0,
@@ -1067,6 +1096,7 @@
 			rx_pkts[i + 7]->hash.fdir.hi =
 				_mm256_extract_epi32(fdir_id0_7, 4);
 		} /* if() on fdir_enabled */
+#endif
 
 		__m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
 		__m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
@@ -1074,93 +1104,97 @@
 		__m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
 
 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
-		/**
-		 * needs to load 2nd 16B of each desc for RSS hash parsing,
-		 * will cause performance drop to get into this context.
-		 */
-		if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads &
-		    DEV_RX_OFFLOAD_RSS_HASH) {
-			/* load bottom half of every 32B desc */
-			const __m128i raw_desc_bh7 =
-				_mm_load_si128
-					((void *)(&rxdp[7].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh6 =
-				_mm_load_si128
-					((void *)(&rxdp[6].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh5 =
-				_mm_load_si128
-					((void *)(&rxdp[5].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh4 =
-				_mm_load_si128
-					((void *)(&rxdp[4].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh3 =
-				_mm_load_si128
-					((void *)(&rxdp[3].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh2 =
-				_mm_load_si128
-					((void *)(&rxdp[2].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh1 =
-				_mm_load_si128
-					((void *)(&rxdp[1].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh0 =
-				_mm_load_si128
-					((void *)(&rxdp[0].wb.status_error1));
-
-			__m256i raw_desc_bh6_7 =
-				_mm256_inserti128_si256
-					(_mm256_castsi128_si256(raw_desc_bh6),
-					 raw_desc_bh7, 1);
-			__m256i raw_desc_bh4_5 =
-				_mm256_inserti128_si256
-					(_mm256_castsi128_si256(raw_desc_bh4),
-					 raw_desc_bh5, 1);
-			__m256i raw_desc_bh2_3 =
-				_mm256_inserti128_si256
-					(_mm256_castsi128_si256(raw_desc_bh2),
-					 raw_desc_bh3, 1);
-			__m256i raw_desc_bh0_1 =
-				_mm256_inserti128_si256
-					(_mm256_castsi128_si256(raw_desc_bh0),
-					 raw_desc_bh1, 1);
-
+		if (offload) {
+#ifdef IAVF_RX_RSS_OFFLOAD
 			/**
-			 * to shift the 32b RSS hash value to the
-			 * highest 32b of each 128b before mask
+			 * needs to load 2nd 16B of each desc for RSS hash parsing,
+			 * will cause performance drop to get into this context.
 			 */
-			__m256i rss_hash6_7 =
-				_mm256_slli_epi64(raw_desc_bh6_7, 32);
-			__m256i rss_hash4_5 =
-				_mm256_slli_epi64(raw_desc_bh4_5, 32);
-			__m256i rss_hash2_3 =
-				_mm256_slli_epi64(raw_desc_bh2_3, 32);
-			__m256i rss_hash0_1 =
-				_mm256_slli_epi64(raw_desc_bh0_1, 32);
-
-			__m256i rss_hash_msk =
-				_mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
-						 0xFFFFFFFF, 0, 0, 0);
-
-			rss_hash6_7 = _mm256_and_si256
-					(rss_hash6_7, rss_hash_msk);
-			rss_hash4_5 = _mm256_and_si256
-					(rss_hash4_5, rss_hash_msk);
-			rss_hash2_3 = _mm256_and_si256
-					(rss_hash2_3, rss_hash_msk);
-			rss_hash0_1 = _mm256_and_si256
-					(rss_hash0_1, rss_hash_msk);
-
-			mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
-			mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
-			mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
-			mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
-		} /* if() on RSS hash parsing */
+			if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads &
+			    DEV_RX_OFFLOAD_RSS_HASH) {
+				/* load bottom half of every 32B desc */
+				const __m128i raw_desc_bh7 =
+					_mm_load_si128
+						((void *)(&rxdp[7].wb.status_error1));
+				rte_compiler_barrier();
+				const __m128i raw_desc_bh6 =
+					_mm_load_si128
+						((void *)(&rxdp[6].wb.status_error1));
+				rte_compiler_barrier();
+				const __m128i raw_desc_bh5 =
+					_mm_load_si128
+						((void *)(&rxdp[5].wb.status_error1));
+				rte_compiler_barrier();
+				const __m128i raw_desc_bh4 =
+					_mm_load_si128
+						((void *)(&rxdp[4].wb.status_error1));
+				rte_compiler_barrier();
+				const __m128i raw_desc_bh3 =
+					_mm_load_si128
+						((void *)(&rxdp[3].wb.status_error1));
+				rte_compiler_barrier();
+				const __m128i raw_desc_bh2 =
+					_mm_load_si128
+						((void *)(&rxdp[2].wb.status_error1));
+				rte_compiler_barrier();
+				const __m128i raw_desc_bh1 =
+					_mm_load_si128
+						((void *)(&rxdp[1].wb.status_error1));
+				rte_compiler_barrier();
+				const __m128i raw_desc_bh0 =
+					_mm_load_si128
+						((void *)(&rxdp[0].wb.status_error1));
+
+				__m256i raw_desc_bh6_7 =
+					_mm256_inserti128_si256
+						(_mm256_castsi128_si256(raw_desc_bh6),
+						 raw_desc_bh7, 1);
+				__m256i raw_desc_bh4_5 =
+					_mm256_inserti128_si256
+						(_mm256_castsi128_si256(raw_desc_bh4),
+						 raw_desc_bh5, 1);
+				__m256i raw_desc_bh2_3 =
+					_mm256_inserti128_si256
+						(_mm256_castsi128_si256(raw_desc_bh2),
+						 raw_desc_bh3, 1);
+				__m256i raw_desc_bh0_1 =
+					_mm256_inserti128_si256
+						(_mm256_castsi128_si256(raw_desc_bh0),
+						 raw_desc_bh1, 1);
+
+				/**
+				 * to shift the 32b RSS hash value to the
+				 * highest 32b of each 128b before mask
+				 */
+				__m256i rss_hash6_7 =
+					_mm256_slli_epi64(raw_desc_bh6_7, 32);
+				__m256i rss_hash4_5 =
+					_mm256_slli_epi64(raw_desc_bh4_5, 32);
+				__m256i rss_hash2_3 =
+					_mm256_slli_epi64(raw_desc_bh2_3, 32);
+				__m256i rss_hash0_1 =
+					_mm256_slli_epi64(raw_desc_bh0_1, 32);
+
+				__m256i rss_hash_msk =
+					_mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
+							 0xFFFFFFFF, 0, 0, 0);
+
+				rss_hash6_7 = _mm256_and_si256
+						(rss_hash6_7, rss_hash_msk);
+				rss_hash4_5 = _mm256_and_si256
+						(rss_hash4_5, rss_hash_msk);
+				rss_hash2_3 = _mm256_and_si256
+						(rss_hash2_3, rss_hash_msk);
+				rss_hash0_1 = _mm256_and_si256
+						(rss_hash0_1, rss_hash_msk);
+
+				mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
+				mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
+				mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
+				mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
+			} /* if() on RSS hash parsing */
+#endif
+		}
 #endif
 
 		/**
@@ -1327,7 +1361,7 @@
 				   uint16_t nb_pkts)
 {
 	return _iavf_recv_raw_pkts_vec_avx512_flex_rxd(rx_queue, rx_pkts,
-						       nb_pkts, NULL);
+						       nb_pkts, NULL, false);
 }
 
 /**
@@ -1409,17 +1443,18 @@
  * Notice:
  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  */
-static uint16_t
+static __rte_always_inline uint16_t
 iavf_recv_scattered_burst_vec_avx512_flex_rxd(void *rx_queue,
 					      struct rte_mbuf **rx_pkts,
-					      uint16_t nb_pkts)
+					      uint16_t nb_pkts,
+					      bool offload)
 {
 	struct iavf_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 
 	/* get some new buffers */
 	uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx512_flex_rxd(rxq,
-					rx_pkts, nb_pkts, split_flags);
+					rx_pkts, nb_pkts, split_flags, offload);
 	if (nb_bufs == 0)
 		return 0;
 
@@ -1452,10 +1487,11 @@
  * Notice:
  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  */
-uint16_t
-iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
-					     struct rte_mbuf **rx_pkts,
-					     uint16_t nb_pkts)
+static __rte_always_inline uint16_t
+iavf_recv_scattered_pkts_vec_avx512_flex_rxd_cmn(void *rx_queue,
+						 struct rte_mbuf **rx_pkts,
+						 uint16_t nb_pkts,
+						 bool offload)
 {
 	uint16_t retval = 0;
 
@@ -1463,14 +1499,25 @@
 		uint16_t burst =
 			iavf_recv_scattered_burst_vec_avx512_flex_rxd
 				(rx_queue, rx_pkts + retval,
-				 IAVF_VPMD_RX_MAX_BURST);
+				 IAVF_VPMD_RX_MAX_BURST, offload);
 		retval += burst;
 		nb_pkts -= burst;
 		if (burst < IAVF_VPMD_RX_MAX_BURST)
 			return retval;
 	}
 	return retval + iavf_recv_scattered_burst_vec_avx512_flex_rxd(rx_queue,
-				rx_pkts + retval, nb_pkts);
+				rx_pkts + retval, nb_pkts, offload);
+}
+
+uint16_t
+iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
+					     struct rte_mbuf **rx_pkts,
+					     uint16_t nb_pkts)
+{
+	return iavf_recv_scattered_pkts_vec_avx512_flex_rxd_cmn(rx_queue,
+								rx_pkts,
+								nb_pkts,
+								false);
 }
 
 uint16_t
@@ -1490,6 +1537,29 @@
 						       nb_pkts, true);
 }
 
+uint16_t
+iavf_recv_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+					   struct rte_mbuf **rx_pkts,
+					   uint16_t nb_pkts)
+{
+	return _iavf_recv_raw_pkts_vec_avx512_flex_rxd(rx_queue,
+						       rx_pkts,
+						       nb_pkts,
+						       NULL,
+						       true);
+}
+
+uint16_t
+iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+						     struct rte_mbuf **rx_pkts,
+						     uint16_t nb_pkts)
+{
+	return iavf_recv_scattered_pkts_vec_avx512_flex_rxd_cmn(rx_queue,
+								rx_pkts,
+								nb_pkts,
+								true);
+}
+
 static __rte_always_inline int
 iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 {
-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/4] add Rx/Tx offload paths for IAVF AVX512
       [not found]   ` <DM5PR11MB1787A1E056D50128FDDDDA5D8C749@DM5PR11MB1787.namprd11.prod.outlook.com>
@ 2021-04-08  8:44     ` Rong, Leyi
  0 siblings, 0 replies; 30+ messages in thread
From: Rong, Leyi @ 2021-04-08  8:44 UTC (permalink / raw)
  To: Lu, Wenzhuo, Zhang, Qi Z; +Cc: dev


> -----Original Message-----
> From: Lu, Wenzhuo <wenzhuo.lu@intel.com>
> Sent: Thursday, April 8, 2021 4:39 PM
> To: Rong, Leyi <leyi.rong@intel.com>
> Subject: FW: [PATCH v3 0/4] add Rx/Tx offload paths for IAVF AVX512
> 
> 
> 
> 
> Best regards
> Wenzhuo Lu
> 
> -----Original Message-----
> From: Lu, Wenzhuo <wenzhuo.lu@intel.com>
> Sent: Friday, March 26, 2021 9:32 AM
> To: dev@dpdk.org
> Cc: Lu, Wenzhuo <wenzhuo.lu@intel.com>
> Subject: [PATCH v3 0/4] add Rx/Tx offload paths for IAVF AVX512
> 
> Add specific paths for RX/TX AVX512, called offload paths.
> In these paths, support the HW offload features, like, checksum, VLAN, RSS
> offload.
> These paths are chosen automatically according to the configuration.
> 
> v2:
>  - Fixed compile error.
> 
> v3:
>  - Used 'inline' to drop the duplicate code.
>  - some minor change.
> 
> Wenzhuo Lu (4):
>   net/iavf: store offload flag of Rx queue
>   net/iavf: add offload path for Tx AVX512
>   net/iavf: add offload path for Rx AVX512
>   net/iavf: add offload path for Rx AVX512 flex desc
> 
>  doc/guides/rel_notes/release_21_05.rst  |   7 +
>  drivers/net/iavf/iavf_rxtx.c            | 187 +++++--
>  drivers/net/iavf/iavf_rxtx.h            |  33 +-
>  drivers/net/iavf/iavf_rxtx_vec_avx512.c | 901 +++++++++++++++++++------------
> -  drivers/net/iavf/iavf_rxtx_vec_common.h | 115 +++-
>  5 files changed, 820 insertions(+), 423 deletions(-)
> 
> --
> 1.9.3


Acked-by: Leyi Rong <leyi.rong@intel.com>


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v4 0/4] add Rx/Tx offload paths for IAVF AVX512
  2021-03-26  1:31 ` [dpdk-dev] [PATCH v3 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
                     ` (4 preceding siblings ...)
       [not found]   ` <DM5PR11MB1787A1E056D50128FDDDDA5D8C749@DM5PR11MB1787.namprd11.prod.outlook.com>
@ 2021-04-09  5:59   ` Wenzhuo Lu
  2021-04-09  5:59     ` [dpdk-dev] [PATCH v4 1/4] net/iavf: store offload flag of Rx queue Wenzhuo Lu
                       ` (5 more replies)
  5 siblings, 6 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-04-09  5:59 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add specific paths for RX/TX AVX512, called offload paths.
In these paths, support the HW offload features, like, checksum, VLAN, RSS offload.
These paths are chosen automatically according to the configuration.

v2:
 - Fixed compile error.

v3:
 - Used 'inline' to drop the duplicate code.
 - some minor change.

v4:
 - Rebased on next-net-intel.

Wenzhuo Lu (4):
  net/iavf: store offload flag of Rx queue
  net/iavf: add offload path for Tx AVX512
  net/iavf: add offload path for Rx AVX512
  net/iavf: add offload path for Rx AVX512 flex desc

 doc/guides/rel_notes/release_21_05.rst  |   7 +
 drivers/net/iavf/iavf_rxtx.c            | 187 +++++--
 drivers/net/iavf/iavf_rxtx.h            |  33 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 899 +++++++++++++++++++-------------
 drivers/net/iavf/iavf_rxtx_vec_common.h | 115 +++-
 5 files changed, 819 insertions(+), 422 deletions(-)

-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v4 1/4] net/iavf: store offload flag of Rx queue
  2021-04-09  5:59   ` [dpdk-dev] [PATCH v4 " Wenzhuo Lu
@ 2021-04-09  5:59     ` Wenzhuo Lu
  2021-04-09  5:59     ` [dpdk-dev] [PATCH v4 2/4] net/iavf: add offload path for Tx AVX512 Wenzhuo Lu
                       ` (4 subsequent siblings)
  5 siblings, 0 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-04-09  5:59 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add the offload flag for RX queues to know which offload
features are set.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 drivers/net/iavf/iavf_rxtx.c | 4 ++++
 drivers/net/iavf/iavf_rxtx.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 541b444..bd0b7ee 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -523,9 +523,12 @@
 	uint8_t proto_xtr;
 	uint16_t len;
 	uint16_t rx_free_thresh;
+	uint64_t offloads;
 
 	PMD_INIT_FUNC_TRACE();
 
+	offloads = rx_conf->offloads | dev->data->dev_conf.rxmode.offloads;
+
 	if (nb_desc % IAVF_ALIGN_RING_DESC != 0 ||
 	    nb_desc > IAVF_MAX_RING_DESC ||
 	    nb_desc < IAVF_MIN_RING_DESC) {
@@ -596,6 +599,7 @@
 	rxq->rx_deferred_start = rx_conf->rx_deferred_start;
 	rxq->rx_hdr_len = 0;
 	rxq->vsi = vsi;
+	rxq->offloads = offloads;
 
 	if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)
 		rxq->crc_len = RTE_ETHER_CRC_LEN;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 4fbd847..f56dd74 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -198,6 +198,7 @@ struct iavf_rx_queue {
 		/* flexible descriptor metadata extraction offload flag */
 	iavf_rxd_to_pkt_fields_t rxd_to_pkt_fields;
 				/* handle flexible descriptor by RXDID */
+	uint64_t offloads;
 };
 
 struct iavf_tx_entry {
-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v4 2/4] net/iavf: add offload path for Tx AVX512
  2021-04-09  5:59   ` [dpdk-dev] [PATCH v4 " Wenzhuo Lu
  2021-04-09  5:59     ` [dpdk-dev] [PATCH v4 1/4] net/iavf: store offload flag of Rx queue Wenzhuo Lu
@ 2021-04-09  5:59     ` Wenzhuo Lu
  2021-04-09  5:59     ` [dpdk-dev] [PATCH v4 3/4] net/iavf: add offload path for Rx AVX512 Wenzhuo Lu
                       ` (3 subsequent siblings)
  5 siblings, 0 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-04-09  5:59 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add a specific path for TX AVX512.
In this path, support the HW offload features, like,
checksum insertion, VLAN insertion.
This path is chosen automatically according to the
configuration.

'inline' is used, then the duplicate code is generated
by the compiler.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 drivers/net/iavf/iavf_rxtx.c            |  57 +++++++++++------
 drivers/net/iavf/iavf_rxtx.h            |  14 +++-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 110 +++++++++++++++++++-------------
 drivers/net/iavf/iavf_rxtx_vec_common.h |  98 ++++++++++++++++++++++++++--
 4 files changed, 210 insertions(+), 69 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index bd0b7ee..099ede7 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -160,7 +160,7 @@
 static inline bool
 check_tx_vec_allow(struct iavf_tx_queue *txq)
 {
-	if (!(txq->offloads & IAVF_NO_VECTOR_FLAGS) &&
+	if (!(txq->offloads & IAVF_TX_NO_VECTOR_FLAGS) &&
 	    txq->rs_thresh >= IAVF_VPMD_TX_MAX_BURST &&
 	    txq->rs_thresh <= IAVF_VPMD_TX_MAX_FREE_BUF) {
 		PMD_INIT_LOG(DEBUG, "Vector tx can be enabled on this txq.");
@@ -2498,17 +2498,23 @@
 #ifdef RTE_ARCH_X86
 	struct iavf_tx_queue *txq;
 	int i;
+	int check_ret;
+	bool use_sse = false;
 	bool use_avx2 = false;
-#ifdef CC_AVX512_SUPPORT
 	bool use_avx512 = false;
-#endif
 
-	if (!iavf_tx_vec_dev_check(dev) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-		if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-		     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-				rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-			use_avx2 = true;
+	check_ret = iavf_tx_vec_dev_check(dev);
+
+	if (check_ret >= 0 &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+		/* SSE and AVX2 not support offload path yet. */
+		if (check_ret == IAVF_VECTOR_PATH) {
+			use_sse = true;
+			if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+			     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+			    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+				use_avx2 = true;
+		}
 #ifdef CC_AVX512_SUPPORT
 		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2516,15 +2522,29 @@
 			use_avx512 = true;
 #endif
 
-		PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
-			    use_avx2 ? "avx2 " : "",
-			    dev->data->port_id);
-		dev->tx_pkt_burst = use_avx2 ?
-				    iavf_xmit_pkts_vec_avx2 :
-				    iavf_xmit_pkts_vec;
+		if (!use_sse && !use_avx2 && !use_avx512)
+			goto normal;
+
+		if (!use_avx512) {
+			PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
+				    use_avx2 ? "avx2 " : "",
+				    dev->data->port_id);
+			dev->tx_pkt_burst = use_avx2 ?
+					    iavf_xmit_pkts_vec_avx2 :
+					    iavf_xmit_pkts_vec;
+		}
 #ifdef CC_AVX512_SUPPORT
-		if (use_avx512)
-			dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+		if (use_avx512) {
+			if (check_ret == IAVF_VECTOR_PATH) {
+				dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+				PMD_DRV_LOG(DEBUG, "Using AVX512 Vector Tx (port %d).",
+					    dev->data->port_id);
+			} else {
+				dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512_offload;
+				PMD_DRV_LOG(DEBUG, "Using AVX512 OFFLOAD Vector Tx (port %d).",
+					    dev->data->port_id);
+			}
+		}
 #endif
 		dev->tx_pkt_prepare = NULL;
 
@@ -2544,8 +2564,9 @@
 
 		return;
 	}
-#endif
 
+normal:
+#endif
 	PMD_DRV_LOG(DEBUG, "Using Basic Tx callback (port=%d).",
 		    dev->data->port_id);
 	dev->tx_pkt_burst = iavf_xmit_pkts;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index f56dd74..bead119 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -23,14 +23,21 @@
 #define IAVF_VPMD_DESCS_PER_LOOP  4
 #define IAVF_VPMD_TX_MAX_FREE_BUF 64
 
-#define IAVF_NO_VECTOR_FLAGS (				 \
+#define IAVF_TX_NO_VECTOR_FLAGS (				 \
 		DEV_TX_OFFLOAD_MULTI_SEGS |		 \
+		DEV_TX_OFFLOAD_TCP_TSO)
+
+#define IAVF_TX_VECTOR_OFFLOAD (				 \
 		DEV_TX_OFFLOAD_VLAN_INSERT |		 \
+		DEV_TX_OFFLOAD_QINQ_INSERT |		 \
+		DEV_TX_OFFLOAD_IPV4_CKSUM |		 \
 		DEV_TX_OFFLOAD_SCTP_CKSUM |		 \
 		DEV_TX_OFFLOAD_UDP_CKSUM |		 \
-		DEV_TX_OFFLOAD_TCP_TSO |		 \
 		DEV_TX_OFFLOAD_TCP_CKSUM)
 
+#define IAVF_VECTOR_PATH 0
+#define IAVF_VECTOR_OFFLOAD_PATH 1
+
 #define DEFAULT_TX_RS_THRESH     32
 #define DEFAULT_TX_FREE_THRESH   32
 
@@ -488,6 +495,9 @@ uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
 						      uint16_t nb_pkts);
 uint16_t iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 				   uint16_t nb_pkts);
+uint16_t iavf_xmit_pkts_vec_avx512_offload(void *tx_queue,
+					   struct rte_mbuf **tx_pkts,
+					   uint16_t nb_pkts);
 int iavf_txq_vec_setup_avx512(struct iavf_tx_queue *txq);
 
 uint8_t iavf_proto_xtr_type_to_rxdid(uint8_t xtr_type);
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 2927a7c..fbbf4b9 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -1518,14 +1518,16 @@
 		txep[i].mbuf = tx_pkts[i];
 }
 
-static inline void
+static __rte_always_inline void
 iavf_vtx1(volatile struct iavf_tx_desc *txdp,
-	  struct rte_mbuf *pkt, uint64_t flags)
+	  struct rte_mbuf *pkt, uint64_t flags, bool offload)
 {
 	uint64_t high_qw =
 		(IAVF_TX_DESC_DTYPE_DATA |
 		 ((uint64_t)flags  << IAVF_TXD_QW1_CMD_SHIFT) |
 		 ((uint64_t)pkt->data_len << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT));
+	if (offload)
+		iavf_txd_enable_offload(pkt, &high_qw);
 
 	__m128i descriptor = _mm_set_epi64x(high_qw,
 					    pkt->buf_iova + pkt->data_off);
@@ -1534,62 +1536,70 @@
 
 #define IAVF_TX_LEN_MASK 0xAA
 #define IAVF_TX_OFF_MASK 0x55
-static inline void
+static __rte_always_inline void
 iavf_vtx(volatile struct iavf_tx_desc *txdp,
-	 struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
+	 struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags,
+	 bool offload)
 {
 	const uint64_t hi_qw_tmpl = (IAVF_TX_DESC_DTYPE_DATA |
 			((uint64_t)flags  << IAVF_TXD_QW1_CMD_SHIFT));
 
 	/* if unaligned on 32-bit boundary, do one to align */
 	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
-		iavf_vtx1(txdp, *pkt, flags);
+		iavf_vtx1(txdp, *pkt, flags, offload);
 		nb_pkts--, txdp++, pkt++;
 	}
 
 	/* do 4 at a time while possible, in bursts */
 	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
-		__m512i desc4 =
-			_mm512_set_epi64
-				((uint64_t)pkt[3]->data_len,
-				 pkt[3]->buf_iova,
-				 (uint64_t)pkt[2]->data_len,
-				 pkt[2]->buf_iova,
-				 (uint64_t)pkt[1]->data_len,
-				 pkt[1]->buf_iova,
-				 (uint64_t)pkt[0]->data_len,
-				 pkt[0]->buf_iova);
-		__m512i hi_qw_tmpl_4 = _mm512_set1_epi64(hi_qw_tmpl);
-		__m512i data_off_4 =
+		uint64_t hi_qw3 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[3]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		if (offload)
+			iavf_txd_enable_offload(pkt[3], &hi_qw3);
+		uint64_t hi_qw2 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[2]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		if (offload)
+			iavf_txd_enable_offload(pkt[2], &hi_qw2);
+		uint64_t hi_qw1 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[1]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		if (offload)
+			iavf_txd_enable_offload(pkt[1], &hi_qw1);
+		uint64_t hi_qw0 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[0]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		if (offload)
+			iavf_txd_enable_offload(pkt[0], &hi_qw0);
+
+		__m512i desc0_3 =
 			_mm512_set_epi64
-				(0,
-				 pkt[3]->data_off,
-				 0,
-				 pkt[2]->data_off,
-				 0,
-				 pkt[1]->data_off,
-				 0,
-				 pkt[0]->data_off);
-
-		desc4 = _mm512_mask_slli_epi64(desc4, IAVF_TX_LEN_MASK, desc4,
-					       IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
-		desc4 = _mm512_mask_or_epi64(desc4, IAVF_TX_LEN_MASK, desc4,
-					     hi_qw_tmpl_4);
-		desc4 = _mm512_mask_add_epi64(desc4, IAVF_TX_OFF_MASK, desc4,
-					      data_off_4);
-		_mm512_storeu_si512((void *)txdp, desc4);
+				(hi_qw3,
+				 pkt[3]->buf_iova + pkt[3]->data_off,
+				 hi_qw2,
+				 pkt[2]->buf_iova + pkt[2]->data_off,
+				 hi_qw1,
+				 pkt[1]->buf_iova + pkt[1]->data_off,
+				 hi_qw0,
+				 pkt[0]->buf_iova + pkt[0]->data_off);
+		_mm512_storeu_si512((void *)txdp, desc0_3);
 	}
 
 	/* do any last ones */
 	while (nb_pkts) {
-		iavf_vtx1(txdp, *pkt, flags);
+		iavf_vtx1(txdp, *pkt, flags, offload);
 		txdp++, pkt++, nb_pkts--;
 	}
 }
 
-static inline uint16_t
+static __rte_always_inline uint16_t
 iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
-				 uint16_t nb_pkts)
+				 uint16_t nb_pkts, bool offload)
 {
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 	volatile struct iavf_tx_desc *txdp;
@@ -1620,11 +1630,11 @@
 	if (nb_commit >= n) {
 		tx_backlog_entry_avx512(txep, tx_pkts, n);
 
-		iavf_vtx(txdp, tx_pkts, n - 1, flags);
+		iavf_vtx(txdp, tx_pkts, n - 1, flags, offload);
 		tx_pkts += (n - 1);
 		txdp += (n - 1);
 
-		iavf_vtx1(txdp, *tx_pkts++, rs);
+		iavf_vtx1(txdp, *tx_pkts++, rs, offload);
 
 		nb_commit = (uint16_t)(nb_commit - n);
 
@@ -1639,7 +1649,7 @@
 
 	tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);
 
-	iavf_vtx(txdp, tx_pkts, nb_commit, flags);
+	iavf_vtx(txdp, tx_pkts, nb_commit, flags, offload);
 
 	tx_id = (uint16_t)(tx_id + nb_commit);
 	if (tx_id > txq->next_rs) {
@@ -1657,9 +1667,9 @@
 	return nb_pkts;
 }
 
-uint16_t
-iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
-			  uint16_t nb_pkts)
+static __rte_always_inline uint16_t
+iavf_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
+			      uint16_t nb_pkts, bool offload)
 {
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
@@ -1669,7 +1679,7 @@
 
 		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
 		ret = iavf_xmit_fixed_burst_vec_avx512(tx_queue, &tx_pkts[nb_tx],
-						       num);
+						       num, offload);
 		nb_tx += ret;
 		nb_pkts -= ret;
 		if (ret < num)
@@ -1679,6 +1689,13 @@
 	return nb_tx;
 }
 
+uint16_t
+iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
+			  uint16_t nb_pkts)
+{
+	return iavf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts, false);
+}
+
 static inline void
 iavf_tx_queue_release_mbufs_avx512(struct iavf_tx_queue *txq)
 {
@@ -1709,3 +1726,10 @@
 	txq->ops = &avx512_vec_txq_ops;
 	return 0;
 }
+
+uint16_t
+iavf_xmit_pkts_vec_avx512_offload(void *tx_queue, struct rte_mbuf **tx_pkts,
+				  uint16_t nb_pkts)
+{
+	return iavf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts, true);
+}
diff --git a/drivers/net/iavf/iavf_rxtx_vec_common.h b/drivers/net/iavf/iavf_rxtx_vec_common.h
index 57b4381..8e96cb5 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_common.h
+++ b/drivers/net/iavf/iavf_rxtx_vec_common.h
@@ -240,14 +240,17 @@
 	if (!txq)
 		return -1;
 
-	if (txq->offloads & IAVF_NO_VECTOR_FLAGS)
-		return -1;
-
 	if (txq->rs_thresh < IAVF_VPMD_TX_MAX_BURST ||
 	    txq->rs_thresh > IAVF_VPMD_TX_MAX_FREE_BUF)
 		return -1;
 
-	return 0;
+	if (txq->offloads & IAVF_TX_NO_VECTOR_FLAGS)
+		return -1;
+
+	if (txq->offloads & IAVF_TX_VECTOR_OFFLOAD)
+		return IAVF_VECTOR_OFFLOAD_PATH;
+
+	return IAVF_VECTOR_PATH;
 }
 
 static inline int
@@ -270,14 +273,97 @@
 {
 	int i;
 	struct iavf_tx_queue *txq;
+	int ret;
+	int result = 0;
 
 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
 		txq = dev->data->tx_queues[i];
-		if (iavf_tx_vec_queue_default(txq))
+		ret = iavf_tx_vec_queue_default(txq);
+
+		if (ret < 0)
 			return -1;
+		if (ret > result)
+			result = ret;
 	}
 
-	return 0;
+	return result;
+}
+
+/******************************************************************************
+ * If user knows a specific offload is not enabled by APP,
+ * the macro can be commented to save the effort of fast path.
+ * Currently below 2 features are supported in TX path,
+ * 1, checksum offload
+ * 2, VLAN/QINQ insertion
+ ******************************************************************************/
+#define IAVF_TX_CSUM_OFFLOAD
+#define IAVF_TX_VLAN_QINQ_OFFLOAD
+
+static __rte_always_inline void
+iavf_txd_enable_offload(__rte_unused struct rte_mbuf *tx_pkt,
+			uint64_t *txd_hi)
+{
+#if defined(IAVF_TX_CSUM_OFFLOAD) || defined(IAVF_TX_VLAN_QINQ_OFFLOAD)
+	uint64_t ol_flags = tx_pkt->ol_flags;
+#endif
+	uint32_t td_cmd = 0;
+#ifdef IAVF_TX_CSUM_OFFLOAD
+	uint32_t td_offset = 0;
+#endif
+
+#ifdef IAVF_TX_CSUM_OFFLOAD
+	/* Set MACLEN */
+	td_offset |= (tx_pkt->l2_len >> 1) <<
+		     IAVF_TX_DESC_LENGTH_MACLEN_SHIFT;
+
+	/* Enable L3 checksum offloads */
+	if (ol_flags & PKT_TX_IP_CKSUM) {
+		td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV4_CSUM;
+		td_offset |= (tx_pkt->l3_len >> 2) <<
+			     IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
+	} else if (ol_flags & PKT_TX_IPV4) {
+		td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV4;
+		td_offset |= (tx_pkt->l3_len >> 2) <<
+			     IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
+	} else if (ol_flags & PKT_TX_IPV6) {
+		td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV6;
+		td_offset |= (tx_pkt->l3_len >> 2) <<
+			     IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
+	}
+
+	/* Enable L4 checksum offloads */
+	switch (ol_flags & PKT_TX_L4_MASK) {
+	case PKT_TX_TCP_CKSUM:
+		td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_TCP;
+		td_offset |= (sizeof(struct rte_tcp_hdr) >> 2) <<
+			     IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+		break;
+	case PKT_TX_SCTP_CKSUM:
+		td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_SCTP;
+		td_offset |= (sizeof(struct rte_sctp_hdr) >> 2) <<
+			     IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+		break;
+	case PKT_TX_UDP_CKSUM:
+		td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_UDP;
+		td_offset |= (sizeof(struct rte_udp_hdr) >> 2) <<
+			     IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+		break;
+	default:
+		break;
+	}
+
+	*txd_hi |= ((uint64_t)td_offset) << IAVF_TXD_QW1_OFFSET_SHIFT;
+#endif
+
+#ifdef IAVF_TX_VLAN_QINQ_OFFLOAD
+	if (ol_flags & (PKT_TX_VLAN | PKT_TX_QINQ)) {
+		td_cmd |= IAVF_TX_DESC_CMD_IL2TAG1;
+		*txd_hi |= ((uint64_t)tx_pkt->vlan_tci <<
+			    IAVF_TXD_QW1_L2TAG1_SHIFT);
+	}
+#endif
+
+	*txd_hi |= ((uint64_t)td_cmd) << IAVF_TXD_QW1_CMD_SHIFT;
 }
 
 #ifdef RTE_ARCH_X86
-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v4 3/4] net/iavf: add offload path for Rx AVX512
  2021-04-09  5:59   ` [dpdk-dev] [PATCH v4 " Wenzhuo Lu
  2021-04-09  5:59     ` [dpdk-dev] [PATCH v4 1/4] net/iavf: store offload flag of Rx queue Wenzhuo Lu
  2021-04-09  5:59     ` [dpdk-dev] [PATCH v4 2/4] net/iavf: add offload path for Tx AVX512 Wenzhuo Lu
@ 2021-04-09  5:59     ` Wenzhuo Lu
  2021-04-09  5:59     ` [dpdk-dev] [PATCH v4 4/4] net/iavf: add offload path for Rx AVX512 flex desc Wenzhuo Lu
                       ` (2 subsequent siblings)
  5 siblings, 0 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-04-09  5:59 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add a specific path for RX AVX512 (traditional).
In this path, support the HW offload features, like,
checksum, VLAN stripping, RSS hash.
This path is chosen automatically according to the
configuration.

'inline' is used, then the duplicate code is generated
by the compiler.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 drivers/net/iavf/iavf_rxtx.c            | 105 +++++++---
 drivers/net/iavf/iavf_rxtx.h            |  12 ++
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 353 ++++++++++++++++++++------------
 drivers/net/iavf/iavf_rxtx_vec_common.h |  17 +-
 4 files changed, 324 insertions(+), 163 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 099ede7..ca01ed9 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2392,22 +2392,23 @@
 #ifdef RTE_ARCH_X86
 	struct iavf_rx_queue *rxq;
 	int i;
+	int check_ret;
+	bool use_sse = false;
 	bool use_avx2 = false;
-#ifdef CC_AVX512_SUPPORT
 	bool use_avx512 = false;
-#endif
+	bool use_flex = false;
 
-	if (!iavf_rx_vec_dev_check(dev) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-		for (i = 0; i < dev->data->nb_rx_queues; i++) {
-			rxq = dev->data->rx_queues[i];
-			(void)iavf_rxq_vec_setup(rxq);
+	check_ret = iavf_rx_vec_dev_check(dev);
+	if (check_ret >= 0 &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+		if (check_ret == IAVF_VECTOR_PATH) {
+			use_sse = true;
+			if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+			     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+			    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+				use_avx2 = true;
 		}
 
-		if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-		     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-				rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-			use_avx2 = true;
 #ifdef CC_AVX512_SUPPORT
 		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2415,13 +2416,38 @@
 			use_avx512 = true;
 #endif
 
+		if (!use_sse && !use_avx2 && !use_avx512)
+			goto normal;
+
+		if (vf->vf_res->vf_cap_flags &
+			VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			use_flex = true;
+			if (use_avx512 && check_ret == IAVF_VECTOR_OFFLOAD_PATH)
+				use_flex = false;
+		}
+
+		for (i = 0; i < dev->data->nb_rx_queues; i++) {
+			rxq = dev->data->rx_queues[i];
+			(void)iavf_rxq_vec_setup(rxq);
+		}
+
 		if (dev->data->scattered_rx) {
-			PMD_DRV_LOG(DEBUG,
-				    "Using %sVector Scattered Rx (port %d).",
-				    use_avx2 ? "avx2 " : "",
-				    dev->data->port_id);
-			if (vf->vf_res->vf_cap_flags &
-				VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			if (!use_avx512) {
+				PMD_DRV_LOG(DEBUG,
+					    "Using %sVector Scattered Rx (port %d).",
+					    use_avx2 ? "avx2 " : "",
+					    dev->data->port_id);
+			} else {
+				if (check_ret == IAVF_VECTOR_PATH)
+					PMD_DRV_LOG(DEBUG,
+						    "Using AVX512 Vector Scattered Rx (port %d).",
+						    dev->data->port_id);
+				else
+					PMD_DRV_LOG(DEBUG,
+						    "Using AVX512 OFFLOAD Vector Scattered Rx (port %d).",
+						    dev->data->port_id);
+			}
+			if (use_flex) {
 				dev->rx_pkt_burst = use_avx2 ?
 					iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
 					iavf_recv_scattered_pkts_vec_flex_rxd;
@@ -2435,17 +2461,32 @@
 					iavf_recv_scattered_pkts_vec_avx2 :
 					iavf_recv_scattered_pkts_vec;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_scattered_pkts_vec_avx512;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512_offload;
+				}
 #endif
 			}
 		} else {
-			PMD_DRV_LOG(DEBUG, "Using %sVector Rx (port %d).",
-				    use_avx2 ? "avx2 " : "",
-				    dev->data->port_id);
-			if (vf->vf_res->vf_cap_flags &
-				VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			if (!use_avx512) {
+				PMD_DRV_LOG(DEBUG, "Using %sVector Rx (port %d).",
+					    use_avx2 ? "avx2 " : "",
+					    dev->data->port_id);
+			} else {
+				if (check_ret == IAVF_VECTOR_PATH)
+					PMD_DRV_LOG(DEBUG,
+						    "Using AVX512 Vector Rx (port %d).",
+						    dev->data->port_id);
+				else
+					PMD_DRV_LOG(DEBUG,
+						    "Using AVX512 OFFLOAD Vector Rx (port %d).",
+						    dev->data->port_id);
+			}
+			if (use_flex) {
 				dev->rx_pkt_burst = use_avx2 ?
 					iavf_recv_pkts_vec_avx2_flex_rxd :
 					iavf_recv_pkts_vec_flex_rxd;
@@ -2459,17 +2500,23 @@
 					iavf_recv_pkts_vec_avx2 :
 					iavf_recv_pkts_vec;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_pkts_vec_avx512;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512_offload;
+				}
 #endif
 			}
 		}
 
 		return;
 	}
-#endif
 
+normal:
+#endif
 	if (dev->data->scattered_rx) {
 		PMD_DRV_LOG(DEBUG, "Using a Scattered Rx callback (port=%d).",
 			    dev->data->port_id);
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index bead119..a8e5664 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -35,6 +35,12 @@
 		DEV_TX_OFFLOAD_UDP_CKSUM |		 \
 		DEV_TX_OFFLOAD_TCP_CKSUM)
 
+#define IAVF_RX_VECTOR_OFFLOAD (				 \
+		DEV_RX_OFFLOAD_CHECKSUM |		 \
+		DEV_RX_OFFLOAD_SCTP_CKSUM |		 \
+		DEV_RX_OFFLOAD_VLAN |		 \
+		DEV_RX_OFFLOAD_RSS_HASH)
+
 #define IAVF_VECTOR_PATH 0
 #define IAVF_VECTOR_OFFLOAD_PATH 1
 
@@ -484,12 +490,18 @@ uint16_t iavf_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 int iavf_txq_vec_setup(struct iavf_tx_queue *txq);
 uint16_t iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
 				   uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx512_offload(void *rx_queue,
+					   struct rte_mbuf **rx_pkts,
+					   uint16_t nb_pkts);
 uint16_t iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue,
 					    struct rte_mbuf **rx_pkts,
 					    uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 					     struct rte_mbuf **rx_pkts,
 					     uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
+						     struct rte_mbuf **rx_pkts,
+						     uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
 						      struct rte_mbuf **rx_pkts,
 						      uint16_t nb_pkts);
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index fbbf4b9..9030ca5 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -13,6 +13,22 @@
 #define IAVF_DESCS_PER_LOOP_AVX 8
 #define PKTLEN_SHIFT 10
 
+/******************************************************************************
+ * If user knows a specific offload is not enabled by APP,
+ * the macro can be commented to save the effort of fast path.
+ * Currently below 2 features are supported in RX path,
+ * 1, checksum offload
+ * 2, VLAN/QINQ stripping
+ * 3, RSS hash
+ * 4, packet type analysis
+ * 5, flow director ID report
+ ******************************************************************************/
+#define IAVF_RX_CSUM_OFFLOAD
+#define IAVF_RX_VLAN_OFFLOAD
+#define IAVF_RX_RSS_OFFLOAD
+#define IAVF_RX_PTYPE_OFFLOAD
+#define IAVF_RX_FDIR_OFFLOAD
+
 static __rte_always_inline void
 iavf_rxq_rearm(struct iavf_rx_queue *rxq)
 {
@@ -144,12 +160,15 @@
 }
 
 #define IAVF_RX_LEN_MASK 0x80808080
-static inline uint16_t
+static __rte_always_inline uint16_t
 _iavf_recv_raw_pkts_vec_avx512(struct iavf_rx_queue *rxq,
 			       struct rte_mbuf **rx_pkts,
-			       uint16_t nb_pkts, uint8_t *split_packet)
+			       uint16_t nb_pkts, uint8_t *split_packet,
+			       bool offload)
 {
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+#endif
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
 						    rxq->mbuf_initializer);
@@ -252,71 +271,6 @@
 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
 
-	/* Status/Error flag masks */
-	/**
-	 * mask everything except RSS, flow director and VLAN flags
-	 * bit2 is for VLAN tag, bit11 for flow director indication
-	 * bit13:12 for RSS indication. Bits 3-5 of error
-	 * field (bits 22-24) are for IP/L4 checksum errors
-	 */
-	const __m256i flags_mask =
-		_mm256_set1_epi32((1 << 2) | (1 << 11) |
-				  (3 << 12) | (7 << 22));
-	/**
-	 * data to be shuffled by result of flag mask. If VLAN bit is set,
-	 * (bit 2), then position 4 in this array will be used in the
-	 * destination
-	 */
-	const __m256i vlan_flags_shuf =
-		_mm256_set_epi32(0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
-				 0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0);
-	/**
-	 * data to be shuffled by result of flag mask, shifted down 11.
-	 * If RSS/FDIR bits are set, shuffle moves appropriate flags in
-	 * place.
-	 */
-	const __m256i rss_flags_shuf =
-		_mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-				PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
-				0, 0, 0, 0, PKT_RX_FDIR, 0,/* end up 128-bits */
-				0, 0, 0, 0, 0, 0, 0, 0,
-				PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
-				0, 0, 0, 0, PKT_RX_FDIR, 0);
-
-	/**
-	 * data to be shuffled by the result of the flags mask shifted by 22
-	 * bits.  This gives use the l3_l4 flags.
-	 */
-	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-			/* shift right 1 bit to make sure it not exceed 255 */
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
-			 PKT_RX_L4_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
-			PKT_RX_IP_CKSUM_BAD >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
-			/* second 128-bits */
-			0, 0, 0, 0, 0, 0, 0, 0,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
-			 PKT_RX_L4_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
-			PKT_RX_IP_CKSUM_BAD >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);
-
-	const __m256i cksum_mask =
-		_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
-				  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
-				  PKT_RX_OUTER_IP_CKSUM_BAD);
-
 	uint16_t i, received;
 
 	for (i = 0, received = 0; i < nb_pkts;
@@ -384,6 +338,7 @@
 		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);
 
 		mb4_7 = _mm512_add_epi32(mb4_7, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/**
 		 * to get packet types, shift 64-bit values down 30 bits
 		 * and so ptype is in lower 8-bits in each
@@ -402,6 +357,7 @@
 			 0, 0, 0, type_table[ptype5],
 			 0, 0, 0, type_table[ptype4]);
 		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
+#endif
 
 		/**
 		 * convert descriptors 0-3 into mbufs, adjusting length and
@@ -415,6 +371,7 @@
 		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);
 
 		mb0_3 = _mm512_add_epi32(mb0_3, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/* get the packet types */
 		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 30);
 		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
@@ -430,6 +387,7 @@
 			 0, 0, 0, type_table[ptype1],
 			 0, 0, 0, type_table[ptype0]);
 		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
+#endif
 
 		/**
 		 * use permute/extract to get status content
@@ -449,27 +407,122 @@
 
 		/* now do flag manipulation */
 
-		/* get only flag/error bits we want */
-		const __m256i flag_bits =
-			_mm256_and_si256(status0_7, flags_mask);
-		/* set vlan and rss flags */
-		const __m256i vlan_flags =
-			_mm256_shuffle_epi8(vlan_flags_shuf, flag_bits);
-		const __m256i rss_flags =
-			_mm256_shuffle_epi8(rss_flags_shuf,
-					    _mm256_srli_epi32(flag_bits, 11));
-		/**
-		 * l3_l4_error flags, shuffle, then shift to correct adjustment
-		 * of flags in flags_shuf, and finally mask out extra bits
-		 */
-		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
-						_mm256_srli_epi32(flag_bits, 22));
-		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
-		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
-
 		/* merge flags */
-		const __m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
-				_mm256_or_si256(rss_flags, vlan_flags));
+		__m256i mbuf_flags = _mm256_set1_epi32(0);
+
+		if (offload) {
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			/* Status/Error flag masks */
+			/**
+			 * mask everything except RSS, flow director and VLAN flags
+			 * bit2 is for VLAN tag, bit11 for flow director indication
+			 * bit13:12 for RSS indication. Bits 3-5 of error
+			 * field (bits 22-24) are for IP/L4 checksum errors
+			 */
+			const __m256i flags_mask =
+				_mm256_set1_epi32((1 << 2) | (1 << 11) |
+						  (3 << 12) | (7 << 22));
+#endif
+
+#ifdef IAVF_RX_VLAN_OFFLOAD
+			/**
+			 * data to be shuffled by result of flag mask. If VLAN bit is set,
+			 * (bit 2), then position 4 in this array will be used in the
+			 * destination
+			 */
+			const __m256i vlan_flags_shuf =
+				_mm256_set_epi32(0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
+						 0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0);
+#endif
+
+#ifdef IAVF_RX_RSS_OFFLOAD
+			/**
+			 * data to be shuffled by result of flag mask, shifted down 11.
+			 * If RSS/FDIR bits are set, shuffle moves appropriate flags in
+			 * place.
+			 */
+			const __m256i rss_flags_shuf =
+				_mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+						PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
+						0, 0, 0, 0, PKT_RX_FDIR, 0,/* end up 128-bits */
+						0, 0, 0, 0, 0, 0, 0, 0,
+						PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
+						0, 0, 0, 0, PKT_RX_FDIR, 0);
+#endif
+
+#ifdef IAVF_RX_CSUM_OFFLOAD
+			/**
+			 * data to be shuffled by the result of the flags mask shifted by 22
+			 * bits.  This gives use the l3_l4 flags.
+			 */
+			const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+					/* shift right 1 bit to make sure it not exceed 255 */
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+					 PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
+					 PKT_RX_L4_CKSUM_BAD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
+					PKT_RX_IP_CKSUM_BAD >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
+					/* second 128-bits */
+					0, 0, 0, 0, 0, 0, 0, 0,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+					 PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
+					 PKT_RX_L4_CKSUM_BAD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
+					PKT_RX_IP_CKSUM_BAD >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);
+
+			const __m256i cksum_mask =
+				_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
+						  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
+						  PKT_RX_OUTER_IP_CKSUM_BAD);
+#endif
+
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			/* get only flag/error bits we want */
+			const __m256i flag_bits =
+				_mm256_and_si256(status0_7, flags_mask);
+#endif
+			/* set vlan and rss flags */
+#ifdef IAVF_RX_VLAN_OFFLOAD
+			const __m256i vlan_flags =
+				_mm256_shuffle_epi8(vlan_flags_shuf, flag_bits);
+#endif
+#ifdef IAVF_RX_RSS_OFFLOAD
+			const __m256i rss_flags =
+				_mm256_shuffle_epi8(rss_flags_shuf,
+						    _mm256_srli_epi32(flag_bits, 11));
+#endif
+#ifdef IAVF_RX_CSUM_OFFLOAD
+			/**
+			 * l3_l4_error flags, shuffle, then shift to correct adjustment
+			 * of flags in flags_shuf, and finally mask out extra bits
+			 */
+			__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
+							_mm256_srli_epi32(flag_bits, 22));
+			l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
+			l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
+#endif
+
+#ifdef IAVF_RX_CSUM_OFFLOAD
+			mbuf_flags = _mm256_or_si256(mbuf_flags, l3_l4_flags);
+#endif
+#ifdef IAVF_RX_RSS_OFFLOAD
+			mbuf_flags = _mm256_or_si256(mbuf_flags, rss_flags);
+#endif
+#ifdef IAVF_RX_VLAN_OFFLOAD
+			mbuf_flags = _mm256_or_si256(mbuf_flags, vlan_flags);
+#endif
+		}
+
 		/**
 		 * At this point, we have the 8 sets of flags in the low 16-bits
 		 * of each 32-bit value in vlan0.
@@ -487,7 +540,7 @@
 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
 				 RTE_ALIGN(offsetof(struct rte_mbuf,
 						    rearm_data),
-					   16));
+						    16));
 		/* build up data and do writes */
 		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
 			rearm6, rearm7;
@@ -496,21 +549,28 @@
 		const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
 		const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
 
-		rearm6 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(mbuf_flags, 8),
-					    0x04);
-		rearm4 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(mbuf_flags, 4),
-					    0x04);
-		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
-		rearm0 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_srli_si256(mbuf_flags, 4),
-					    0x04);
-		/* permute to add in the rx_descriptor e.g. rss fields */
-		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
-		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
-		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
-		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
+		if (offload) {
+			rearm6 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_slli_si256(mbuf_flags, 8),
+						    0x04);
+			rearm4 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_slli_si256(mbuf_flags, 4),
+						    0x04);
+			rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
+			rearm0 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_srli_si256(mbuf_flags, 4),
+						    0x04);
+			/* permute to add in the rx_descriptor e.g. rss fields */
+			rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
+			rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
+			rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
+			rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
+		} else {
+			rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20);
+			rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20);
+			rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20);
+			rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20);
+		}
 		/* write to mbuf */
 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
 				    rearm6);
@@ -522,24 +582,31 @@
 				    rearm0);
 
 		/* repeat for the odd mbufs */
-		const __m256i odd_flags =
-			_mm256_castsi128_si256
-				(_mm256_extracti128_si256(mbuf_flags, 1));
-		rearm7 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(odd_flags, 8),
-					    0x04);
-		rearm5 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(odd_flags, 4),
-					    0x04);
-		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
-		rearm1 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_srli_si256(odd_flags, 4),
-					    0x04);
-		/* since odd mbufs are already in hi 128-bits use blend */
-		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
-		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
-		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
-		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
+		if (offload) {
+			const __m256i odd_flags =
+				_mm256_castsi128_si256
+					(_mm256_extracti128_si256(mbuf_flags, 1));
+			rearm7 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_slli_si256(odd_flags, 8),
+						    0x04);
+			rearm5 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_slli_si256(odd_flags, 4),
+						    0x04);
+			rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
+			rearm1 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_srli_si256(odd_flags, 4),
+						    0x04);
+			/* since odd mbufs are already in hi 128-bits use blend */
+			rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
+			rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
+			rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
+			rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
+		} else {
+			rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0);
+			rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0);
+			rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0);
+			rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0);
+		}
 		/* again write to mbufs */
 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
 				    rearm7);
@@ -1250,7 +1317,8 @@
 iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
 			  uint16_t nb_pkts)
 {
-	return _iavf_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts, nb_pkts, NULL);
+	return _iavf_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts, nb_pkts,
+					      NULL, false);
 }
 
 /**
@@ -1270,16 +1338,16 @@
  * Notice:
  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  */
-static uint16_t
+static __rte_always_inline uint16_t
 iavf_recv_scattered_burst_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
-				     uint16_t nb_pkts)
+				     uint16_t nb_pkts, bool offload)
 {
 	struct iavf_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 
 	/* get some new buffers */
 	uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx512(rxq, rx_pkts, nb_pkts,
-							  split_flags);
+							  split_flags, offload);
 	if (nb_bufs == 0)
 		return 0;
 
@@ -1312,22 +1380,30 @@
  * Notice:
  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  */
-uint16_t
-iavf_recv_scattered_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
-				    uint16_t nb_pkts)
+static __rte_always_inline uint16_t
+iavf_recv_scattered_pkts_vec_avx512_cmn(void *rx_queue, struct rte_mbuf **rx_pkts,
+					uint16_t nb_pkts, bool offload)
 {
 	uint16_t retval = 0;
 
 	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
 		uint16_t burst = iavf_recv_scattered_burst_vec_avx512(rx_queue,
-				rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST);
+				rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST, offload);
 		retval += burst;
 		nb_pkts -= burst;
 		if (burst < IAVF_VPMD_RX_MAX_BURST)
 			return retval;
 	}
 	return retval + iavf_recv_scattered_burst_vec_avx512(rx_queue,
-				rx_pkts + retval, nb_pkts);
+				rx_pkts + retval, nb_pkts, offload);
+}
+
+uint16_t
+iavf_recv_scattered_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
+				    uint16_t nb_pkts)
+{
+	return iavf_recv_scattered_pkts_vec_avx512_cmn(rx_queue, rx_pkts,
+						       nb_pkts, false);
 }
 
 /**
@@ -1400,6 +1476,23 @@
 				rx_pkts + retval, nb_pkts);
 }
 
+uint16_t
+iavf_recv_pkts_vec_avx512_offload(void *rx_queue, struct rte_mbuf **rx_pkts,
+				  uint16_t nb_pkts)
+{
+	return _iavf_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts,
+					      nb_pkts, NULL, true);
+}
+
+uint16_t
+iavf_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
+					    struct rte_mbuf **rx_pkts,
+					    uint16_t nb_pkts)
+{
+	return iavf_recv_scattered_pkts_vec_avx512_cmn(rx_queue, rx_pkts,
+						       nb_pkts, true);
+}
+
 static __rte_always_inline int
 iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 {
diff --git a/drivers/net/iavf/iavf_rxtx_vec_common.h b/drivers/net/iavf/iavf_rxtx_vec_common.h
index 8e96cb5..d156d79 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_common.h
+++ b/drivers/net/iavf/iavf_rxtx_vec_common.h
@@ -15,7 +15,7 @@
 #pragma GCC diagnostic ignored "-Wcast-qual"
 #endif
 
-static inline uint16_t
+static __rte_always_inline uint16_t
 reassemble_packets(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_bufs,
 		   uint16_t nb_bufs, uint8_t *split_flags)
 {
@@ -231,7 +231,10 @@
 	if (rxq->proto_xtr != IAVF_PROTO_XTR_NONE)
 		return -1;
 
-	return 0;
+	if (rxq->offloads & IAVF_RX_VECTOR_OFFLOAD)
+		return IAVF_VECTOR_OFFLOAD_PATH;
+
+	return IAVF_VECTOR_PATH;
 }
 
 static inline int
@@ -258,14 +261,20 @@
 {
 	int i;
 	struct iavf_rx_queue *rxq;
+	int ret;
+	int result = 0;
 
 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
 		rxq = dev->data->rx_queues[i];
-		if (iavf_rx_vec_queue_default(rxq))
+		ret = iavf_rx_vec_queue_default(rxq);
+
+		if (ret < 0)
 			return -1;
+		if (ret > result)
+			result = ret;
 	}
 
-	return 0;
+	return result;
 }
 
 static inline int
-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v4 4/4] net/iavf: add offload path for Rx AVX512 flex desc
  2021-04-09  5:59   ` [dpdk-dev] [PATCH v4 " Wenzhuo Lu
                       ` (2 preceding siblings ...)
  2021-04-09  5:59     ` [dpdk-dev] [PATCH v4 3/4] net/iavf: add offload path for Rx AVX512 Wenzhuo Lu
@ 2021-04-09  5:59     ` Wenzhuo Lu
  2021-04-13 12:45       ` Ferruh Yigit
  2021-04-09  7:44     ` [dpdk-dev] [PATCH v4 0/4] add Rx/Tx offload paths for IAVF AVX512 Zhang, Qi Z
  2021-04-14  7:34     ` [dpdk-dev] [PATCH v5 " Wenzhuo Lu
  5 siblings, 1 reply; 30+ messages in thread
From: Wenzhuo Lu @ 2021-04-09  5:59 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add a specific path for RX AVX512 (flexible descriptor).
In this path, support the HW offload features, like,
checksum, VLAN stripping, RSS hash.
This path is chosen automatically according to the
configuration.

'inline' is used, then the duplicate code is generated
by the compiler.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 doc/guides/rel_notes/release_21_05.rst  |   7 +
 drivers/net/iavf/iavf_rxtx.c            |  27 +-
 drivers/net/iavf/iavf_rxtx.h            |   6 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 436 ++++++++++++++++++--------------
 4 files changed, 283 insertions(+), 193 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_05.rst b/doc/guides/rel_notes/release_21_05.rst
index 95713f2..253917a 100644
--- a/doc/guides/rel_notes/release_21_05.rst
+++ b/doc/guides/rel_notes/release_21_05.rst
@@ -137,6 +137,13 @@ New Features
   * Added command to display Rx queue used descriptor count.
     ``show port (port_id) rxq (queue_id) desc used count``
 
+* **Added the offload paths for IAVF AVX512.**
+
+  * Added the new RX and TX paths to use the HW offload features. When the HW
+    offload features are configured to be used, the offload paths are chosen
+    automatically.
+  * The code of HW offload features is removed from the legacy paths.
+
 
 Removed Items
 -------------
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index ca01ed9..3f3cf63 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2420,11 +2420,8 @@
 			goto normal;
 
 		if (vf->vf_res->vf_cap_flags &
-			VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
 			use_flex = true;
-			if (use_avx512 && check_ret == IAVF_VECTOR_OFFLOAD_PATH)
-				use_flex = false;
-		}
 
 		for (i = 0; i < dev->data->nb_rx_queues; i++) {
 			rxq = dev->data->rx_queues[i];
@@ -2452,9 +2449,14 @@
 					iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
 					iavf_recv_scattered_pkts_vec_flex_rxd;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload;
+				}
 #endif
 			} else {
 				dev->rx_pkt_burst = use_avx2 ?
@@ -2491,9 +2493,14 @@
 					iavf_recv_pkts_vec_avx2_flex_rxd :
 					iavf_recv_pkts_vec_flex_rxd;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_pkts_vec_avx512_flex_rxd;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512_flex_rxd;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512_flex_rxd_offload;
+				}
 #endif
 			} else {
 				dev->rx_pkt_burst = use_avx2 ?
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index a8e5664..19b6028 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -496,6 +496,9 @@ uint16_t iavf_recv_pkts_vec_avx512_offload(void *rx_queue,
 uint16_t iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue,
 					    struct rte_mbuf **rx_pkts,
 					    uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+						    struct rte_mbuf **rx_pkts,
+						    uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 					     struct rte_mbuf **rx_pkts,
 					     uint16_t nb_pkts);
@@ -505,6 +508,9 @@ uint16_t iavf_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
 uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
 						      struct rte_mbuf **rx_pkts,
 						      uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+							      struct rte_mbuf **rx_pkts,
+							      uint16_t nb_pkts);
 uint16_t iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 				   uint16_t nb_pkts);
 uint16_t iavf_xmit_pkts_vec_avx512_offload(void *tx_queue,
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 9030ca5..cea0163 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -684,7 +684,7 @@
 	return received;
 }
 
-static inline __m256i
+static __rte_always_inline __m256i
 flex_rxd_to_fdir_flags_vec_avx512(const __m256i fdir_id0_7)
 {
 #define FDID_MIS_MAGIC 0xFFFFFFFF
@@ -703,12 +703,16 @@
 	return fdir_flags;
 }
 
-static inline uint16_t
+static __rte_always_inline uint16_t
 _iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq,
 					struct rte_mbuf **rx_pkts,
-					uint16_t nb_pkts, uint8_t *split_packet)
+					uint16_t nb_pkts,
+					uint8_t *split_packet,
+					bool offload)
 {
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+#endif
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
 						    rxq->mbuf_initializer);
@@ -812,71 +816,6 @@
 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
 
-	/* Status/Error flag masks */
-	/**
-	 * mask everything except Checksum Reports, RSS indication
-	 * and VLAN indication.
-	 * bit6:4 for IP/L4 checksum errors.
-	 * bit12 is for RSS indication.
-	 * bit13 is for VLAN indication.
-	 */
-	const __m256i flags_mask =
-		_mm256_set1_epi32((7 << 4) | (1 << 12) | (1 << 13));
-	/**
-	 * data to be shuffled by the result of the flags mask shifted by 4
-	 * bits.  This gives use the l3_l4 flags.
-	 */
-	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-			/* shift right 1 bit to make sure it not exceed 255 */
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
-			 PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1,
-			/* second 128-bits */
-			0, 0, 0, 0, 0, 0, 0, 0,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
-			 PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1);
-	const __m256i cksum_mask =
-		_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
-				  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
-				  PKT_RX_OUTER_IP_CKSUM_BAD);
-	/**
-	 * data to be shuffled by result of flag mask, shifted down 12.
-	 * If RSS(bit12)/VLAN(bit13) are set,
-	 * shuffle moves appropriate flags in place.
-	 */
-	const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
-			0, 0, 0, 0,
-			0, 0, 0, 0,
-			PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			PKT_RX_RSS_HASH, 0,
-			/* end up 128-bits */
-			0, 0, 0, 0,
-			0, 0, 0, 0,
-			0, 0, 0, 0,
-			PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			PKT_RX_RSS_HASH, 0);
-
 	uint16_t i, received;
 
 	for (i = 0, received = 0; i < nb_pkts;
@@ -940,6 +879,7 @@
 		__m512i mb4_7 = _mm512_shuffle_epi8(raw_desc4_7, shuf_msk);
 
 		mb4_7 = _mm512_add_epi32(mb4_7, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/**
 		 * to get packet types, ptype is located in bit16-25
 		 * of each 128bits
@@ -961,6 +901,7 @@
 			 0, 0, 0, type_table[ptype5],
 			 0, 0, 0, type_table[ptype4]);
 		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
+#endif
 
 		/**
 		 * convert descriptors 0-3 into mbufs, re-arrange fields.
@@ -969,6 +910,7 @@
 		__m512i mb0_3 = _mm512_shuffle_epi8(raw_desc0_3, shuf_msk);
 
 		mb0_3 = _mm512_add_epi32(mb0_3, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/**
 		 * to get packet types, ptype is located in bit16-25
 		 * of each 128bits
@@ -988,6 +930,7 @@
 			 0, 0, 0, type_table[ptype1],
 			 0, 0, 0, type_table[ptype0]);
 		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
+#endif
 
 		/**
 		 * use permute/extract to get status content
@@ -1007,28 +950,114 @@
 
 		/* now do flag manipulation */
 
-		/* get only flag/error bits we want */
-		const __m256i flag_bits =
-			_mm256_and_si256(status0_7, flags_mask);
-		/**
-		 * l3_l4_error flags, shuffle, then shift to correct adjustment
-		 * of flags in flags_shuf, and finally mask out extra bits
-		 */
-		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
-				_mm256_srli_epi32(flag_bits, 4));
-		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
-		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
-		/* set rss and vlan flags */
-		const __m256i rss_vlan_flag_bits =
-			_mm256_srli_epi32(flag_bits, 12);
-		const __m256i rss_vlan_flags =
-			_mm256_shuffle_epi8(rss_vlan_flags_shuf,
-					    rss_vlan_flag_bits);
-
 		/* merge flags */
-		__m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
-						     rss_vlan_flags);
+		__m256i mbuf_flags = _mm256_set1_epi32(0);
+
+		if (offload) {
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			/* Status/Error flag masks */
+			/**
+			 * mask everything except Checksum Reports, RSS indication
+			 * and VLAN indication.
+			 * bit6:4 for IP/L4 checksum errors.
+			 * bit12 is for RSS indication.
+			 * bit13 is for VLAN indication.
+			 */
+			const __m256i flags_mask =
+				_mm256_set1_epi32((7 << 4) | (1 << 12) | (1 << 13));
+#endif
+#ifdef IAVF_RX_CSUM_OFFLOAD
+			/**
+			 * data to be shuffled by the result of the flags mask shifted by 4
+			 * bits.  This gives use the l3_l4 flags.
+			 */
+			const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+					/* shift right 1 bit to make sure it not exceed 255 */
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+					 PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+					 PKT_RX_IP_CKSUM_GOOD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+					 PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+					 PKT_RX_IP_CKSUM_GOOD) >> 1,
+					(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
+					(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1,
+					/* second 128-bits */
+					0, 0, 0, 0, 0, 0, 0, 0,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+					 PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+					 PKT_RX_IP_CKSUM_GOOD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+					 PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+					 PKT_RX_IP_CKSUM_GOOD) >> 1,
+					(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
+					(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1);
+			const __m256i cksum_mask =
+				_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
+						  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
+						  PKT_RX_OUTER_IP_CKSUM_BAD);
+#endif
+#if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			/**
+			 * data to be shuffled by result of flag mask, shifted down 12.
+			 * If RSS(bit12)/VLAN(bit13) are set,
+			 * shuffle moves appropriate flags in place.
+			 */
+			const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
+					0, 0, 0, 0,
+					0, 0, 0, 0,
+					PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+					PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+					PKT_RX_RSS_HASH, 0,
+					/* end up 128-bits */
+					0, 0, 0, 0,
+					0, 0, 0, 0,
+					0, 0, 0, 0,
+					PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+					PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+					PKT_RX_RSS_HASH, 0);
+#endif
 
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			/* get only flag/error bits we want */
+			const __m256i flag_bits =
+				_mm256_and_si256(status0_7, flags_mask);
+#endif
+#ifdef IAVF_RX_CSUM_OFFLOAD
+			/**
+			 * l3_l4_error flags, shuffle, then shift to correct adjustment
+			 * of flags in flags_shuf, and finally mask out extra bits
+			 */
+			__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
+					_mm256_srli_epi32(flag_bits, 4));
+			l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
+			l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
+#endif
+#if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			/* set rss and vlan flags */
+			const __m256i rss_vlan_flag_bits =
+				_mm256_srli_epi32(flag_bits, 12);
+			const __m256i rss_vlan_flags =
+				_mm256_shuffle_epi8(rss_vlan_flags_shuf,
+						    rss_vlan_flag_bits);
+#endif
+
+#ifdef IAVF_RX_CSUM_OFFLOAD
+			mbuf_flags = _mm256_or_si256(mbuf_flags, l3_l4_flags);
+#endif
+#if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			mbuf_flags = _mm256_or_si256(mbuf_flags, rss_vlan_flags);
+#endif
+		}
+
+#ifdef IAVF_RX_FDIR_OFFLOAD
 		if (rxq->fdir_enabled) {
 			const __m512i fdir_permute_mask = _mm512_set_epi32
 				(0, 0, 0, 0,
@@ -1070,6 +1099,7 @@
 			rx_pkts[i + 7]->hash.fdir.hi =
 				_mm256_extract_epi32(fdir_id0_7, 4);
 		} /* if() on fdir_enabled */
+#endif
 
 		__m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
 		__m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
@@ -1077,93 +1107,97 @@
 		__m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
 
 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
-		/**
-		 * needs to load 2nd 16B of each desc for RSS hash parsing,
-		 * will cause performance drop to get into this context.
-		 */
-		if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads &
-		    DEV_RX_OFFLOAD_RSS_HASH) {
-			/* load bottom half of every 32B desc */
-			const __m128i raw_desc_bh7 =
-				_mm_load_si128
-					((void *)(&rxdp[7].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh6 =
-				_mm_load_si128
-					((void *)(&rxdp[6].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh5 =
-				_mm_load_si128
-					((void *)(&rxdp[5].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh4 =
-				_mm_load_si128
-					((void *)(&rxdp[4].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh3 =
-				_mm_load_si128
-					((void *)(&rxdp[3].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh2 =
-				_mm_load_si128
-					((void *)(&rxdp[2].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh1 =
-				_mm_load_si128
-					((void *)(&rxdp[1].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh0 =
-				_mm_load_si128
-					((void *)(&rxdp[0].wb.status_error1));
-
-			__m256i raw_desc_bh6_7 =
-				_mm256_inserti128_si256
-					(_mm256_castsi128_si256(raw_desc_bh6),
-					 raw_desc_bh7, 1);
-			__m256i raw_desc_bh4_5 =
-				_mm256_inserti128_si256
-					(_mm256_castsi128_si256(raw_desc_bh4),
-					 raw_desc_bh5, 1);
-			__m256i raw_desc_bh2_3 =
-				_mm256_inserti128_si256
-					(_mm256_castsi128_si256(raw_desc_bh2),
-					 raw_desc_bh3, 1);
-			__m256i raw_desc_bh0_1 =
-				_mm256_inserti128_si256
-					(_mm256_castsi128_si256(raw_desc_bh0),
-					 raw_desc_bh1, 1);
-
+		if (offload) {
+#ifdef IAVF_RX_RSS_OFFLOAD
 			/**
-			 * to shift the 32b RSS hash value to the
-			 * highest 32b of each 128b before mask
+			 * needs to load 2nd 16B of each desc for RSS hash parsing,
+			 * will cause performance drop to get into this context.
 			 */
-			__m256i rss_hash6_7 =
-				_mm256_slli_epi64(raw_desc_bh6_7, 32);
-			__m256i rss_hash4_5 =
-				_mm256_slli_epi64(raw_desc_bh4_5, 32);
-			__m256i rss_hash2_3 =
-				_mm256_slli_epi64(raw_desc_bh2_3, 32);
-			__m256i rss_hash0_1 =
-				_mm256_slli_epi64(raw_desc_bh0_1, 32);
-
-			__m256i rss_hash_msk =
-				_mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
-						 0xFFFFFFFF, 0, 0, 0);
-
-			rss_hash6_7 = _mm256_and_si256
-					(rss_hash6_7, rss_hash_msk);
-			rss_hash4_5 = _mm256_and_si256
-					(rss_hash4_5, rss_hash_msk);
-			rss_hash2_3 = _mm256_and_si256
-					(rss_hash2_3, rss_hash_msk);
-			rss_hash0_1 = _mm256_and_si256
-					(rss_hash0_1, rss_hash_msk);
-
-			mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
-			mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
-			mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
-			mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
-		} /* if() on RSS hash parsing */
+			if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads &
+			    DEV_RX_OFFLOAD_RSS_HASH) {
+				/* load bottom half of every 32B desc */
+				const __m128i raw_desc_bh7 =
+					_mm_load_si128
+						((void *)(&rxdp[7].wb.status_error1));
+				rte_compiler_barrier();
+				const __m128i raw_desc_bh6 =
+					_mm_load_si128
+						((void *)(&rxdp[6].wb.status_error1));
+				rte_compiler_barrier();
+				const __m128i raw_desc_bh5 =
+					_mm_load_si128
+						((void *)(&rxdp[5].wb.status_error1));
+				rte_compiler_barrier();
+				const __m128i raw_desc_bh4 =
+					_mm_load_si128
+						((void *)(&rxdp[4].wb.status_error1));
+				rte_compiler_barrier();
+				const __m128i raw_desc_bh3 =
+					_mm_load_si128
+						((void *)(&rxdp[3].wb.status_error1));
+				rte_compiler_barrier();
+				const __m128i raw_desc_bh2 =
+					_mm_load_si128
+						((void *)(&rxdp[2].wb.status_error1));
+				rte_compiler_barrier();
+				const __m128i raw_desc_bh1 =
+					_mm_load_si128
+						((void *)(&rxdp[1].wb.status_error1));
+				rte_compiler_barrier();
+				const __m128i raw_desc_bh0 =
+					_mm_load_si128
+						((void *)(&rxdp[0].wb.status_error1));
+
+				__m256i raw_desc_bh6_7 =
+					_mm256_inserti128_si256
+						(_mm256_castsi128_si256(raw_desc_bh6),
+						 raw_desc_bh7, 1);
+				__m256i raw_desc_bh4_5 =
+					_mm256_inserti128_si256
+						(_mm256_castsi128_si256(raw_desc_bh4),
+						 raw_desc_bh5, 1);
+				__m256i raw_desc_bh2_3 =
+					_mm256_inserti128_si256
+						(_mm256_castsi128_si256(raw_desc_bh2),
+						 raw_desc_bh3, 1);
+				__m256i raw_desc_bh0_1 =
+					_mm256_inserti128_si256
+						(_mm256_castsi128_si256(raw_desc_bh0),
+						 raw_desc_bh1, 1);
+
+				/**
+				 * to shift the 32b RSS hash value to the
+				 * highest 32b of each 128b before mask
+				 */
+				__m256i rss_hash6_7 =
+					_mm256_slli_epi64(raw_desc_bh6_7, 32);
+				__m256i rss_hash4_5 =
+					_mm256_slli_epi64(raw_desc_bh4_5, 32);
+				__m256i rss_hash2_3 =
+					_mm256_slli_epi64(raw_desc_bh2_3, 32);
+				__m256i rss_hash0_1 =
+					_mm256_slli_epi64(raw_desc_bh0_1, 32);
+
+				__m256i rss_hash_msk =
+					_mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
+							 0xFFFFFFFF, 0, 0, 0);
+
+				rss_hash6_7 = _mm256_and_si256
+						(rss_hash6_7, rss_hash_msk);
+				rss_hash4_5 = _mm256_and_si256
+						(rss_hash4_5, rss_hash_msk);
+				rss_hash2_3 = _mm256_and_si256
+						(rss_hash2_3, rss_hash_msk);
+				rss_hash0_1 = _mm256_and_si256
+						(rss_hash0_1, rss_hash_msk);
+
+				mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
+				mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
+				mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
+				mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
+			} /* if() on RSS hash parsing */
+#endif
+		}
 #endif
 
 		/**
@@ -1330,7 +1364,7 @@
 				   uint16_t nb_pkts)
 {
 	return _iavf_recv_raw_pkts_vec_avx512_flex_rxd(rx_queue, rx_pkts,
-						       nb_pkts, NULL);
+						       nb_pkts, NULL, false);
 }
 
 /**
@@ -1412,17 +1446,18 @@
  * Notice:
  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  */
-static uint16_t
+static __rte_always_inline uint16_t
 iavf_recv_scattered_burst_vec_avx512_flex_rxd(void *rx_queue,
 					      struct rte_mbuf **rx_pkts,
-					      uint16_t nb_pkts)
+					      uint16_t nb_pkts,
+					      bool offload)
 {
 	struct iavf_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 
 	/* get some new buffers */
 	uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx512_flex_rxd(rxq,
-					rx_pkts, nb_pkts, split_flags);
+					rx_pkts, nb_pkts, split_flags, offload);
 	if (nb_bufs == 0)
 		return 0;
 
@@ -1455,10 +1490,11 @@
  * Notice:
  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  */
-uint16_t
-iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
-					     struct rte_mbuf **rx_pkts,
-					     uint16_t nb_pkts)
+static __rte_always_inline uint16_t
+iavf_recv_scattered_pkts_vec_avx512_flex_rxd_cmn(void *rx_queue,
+						 struct rte_mbuf **rx_pkts,
+						 uint16_t nb_pkts,
+						 bool offload)
 {
 	uint16_t retval = 0;
 
@@ -1466,14 +1502,25 @@
 		uint16_t burst =
 			iavf_recv_scattered_burst_vec_avx512_flex_rxd
 				(rx_queue, rx_pkts + retval,
-				 IAVF_VPMD_RX_MAX_BURST);
+				 IAVF_VPMD_RX_MAX_BURST, offload);
 		retval += burst;
 		nb_pkts -= burst;
 		if (burst < IAVF_VPMD_RX_MAX_BURST)
 			return retval;
 	}
 	return retval + iavf_recv_scattered_burst_vec_avx512_flex_rxd(rx_queue,
-				rx_pkts + retval, nb_pkts);
+				rx_pkts + retval, nb_pkts, offload);
+}
+
+uint16_t
+iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
+					     struct rte_mbuf **rx_pkts,
+					     uint16_t nb_pkts)
+{
+	return iavf_recv_scattered_pkts_vec_avx512_flex_rxd_cmn(rx_queue,
+								rx_pkts,
+								nb_pkts,
+								false);
 }
 
 uint16_t
@@ -1493,6 +1540,29 @@
 						       nb_pkts, true);
 }
 
+uint16_t
+iavf_recv_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+					   struct rte_mbuf **rx_pkts,
+					   uint16_t nb_pkts)
+{
+	return _iavf_recv_raw_pkts_vec_avx512_flex_rxd(rx_queue,
+						       rx_pkts,
+						       nb_pkts,
+						       NULL,
+						       true);
+}
+
+uint16_t
+iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+						     struct rte_mbuf **rx_pkts,
+						     uint16_t nb_pkts)
+{
+	return iavf_recv_scattered_pkts_vec_avx512_flex_rxd_cmn(rx_queue,
+								rx_pkts,
+								nb_pkts,
+								true);
+}
+
 static __rte_always_inline int
 iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 {
-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v4 0/4] add Rx/Tx offload paths for IAVF AVX512
  2021-04-09  5:59   ` [dpdk-dev] [PATCH v4 " Wenzhuo Lu
                       ` (3 preceding siblings ...)
  2021-04-09  5:59     ` [dpdk-dev] [PATCH v4 4/4] net/iavf: add offload path for Rx AVX512 flex desc Wenzhuo Lu
@ 2021-04-09  7:44     ` Zhang, Qi Z
  2021-04-14  7:34     ` [dpdk-dev] [PATCH v5 " Wenzhuo Lu
  5 siblings, 0 replies; 30+ messages in thread
From: Zhang, Qi Z @ 2021-04-09  7:44 UTC (permalink / raw)
  To: Lu, Wenzhuo, dev; +Cc: Lu, Wenzhuo



> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Wenzhuo Lu
> Sent: Friday, April 9, 2021 1:59 PM
> To: dev@dpdk.org
> Cc: Lu, Wenzhuo <wenzhuo.lu@intel.com>
> Subject: [dpdk-dev] [PATCH v4 0/4] add Rx/Tx offload paths for IAVF AVX512
> 
> Add specific paths for RX/TX AVX512, called offload paths.
> In these paths, support the HW offload features, like, checksum, VLAN, RSS
> offload.
> These paths are chosen automatically according to the configuration.
> 
> v2:
>  - Fixed compile error.
> 
> v3:
>  - Used 'inline' to drop the duplicate code.
>  - some minor change.
> 
> v4:
>  - Rebased on next-net-intel.
> 
> Wenzhuo Lu (4):
>   net/iavf: store offload flag of Rx queue
>   net/iavf: add offload path for Tx AVX512
>   net/iavf: add offload path for Rx AVX512
>   net/iavf: add offload path for Rx AVX512 flex desc
> 
>  doc/guides/rel_notes/release_21_05.rst  |   7 +
>  drivers/net/iavf/iavf_rxtx.c            | 187 +++++--
>  drivers/net/iavf/iavf_rxtx.h            |  33 +-
>  drivers/net/iavf/iavf_rxtx_vec_avx512.c | 899
> +++++++++++++++++++-------------  drivers/net/iavf/iavf_rxtx_vec_common.h |
> 115 +++-
>  5 files changed, 819 insertions(+), 422 deletions(-)
> 
> --
> 1.9.3

Applied to dpdk-next-net-intel.

Thanks
Qi

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v4 4/4] net/iavf: add offload path for Rx AVX512 flex desc
  2021-04-09  5:59     ` [dpdk-dev] [PATCH v4 4/4] net/iavf: add offload path for Rx AVX512 flex desc Wenzhuo Lu
@ 2021-04-13 12:45       ` Ferruh Yigit
  2021-04-14  1:22         ` Lu, Wenzhuo
  0 siblings, 1 reply; 30+ messages in thread
From: Ferruh Yigit @ 2021-04-13 12:45 UTC (permalink / raw)
  To: Wenzhuo Lu, dev

On 4/9/2021 6:59 AM, Wenzhuo Lu wrote:
> Add a specific path for RX AVX512 (flexible descriptor).
> In this path, support the HW offload features, like,
> checksum, VLAN stripping, RSS hash.
> This path is chosen automatically according to the
> configuration.
> 
> 'inline' is used, then the duplicate code is generated
> by the compiler.
> 
> Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
> ---
>   doc/guides/rel_notes/release_21_05.rst  |   7 +
>   drivers/net/iavf/iavf_rxtx.c            |  27 +-
>   drivers/net/iavf/iavf_rxtx.h            |   6 +
>   drivers/net/iavf/iavf_rxtx_vec_avx512.c | 436 ++++++++++++++++++--------------
>   4 files changed, 283 insertions(+), 193 deletions(-)
> 
> diff --git a/doc/guides/rel_notes/release_21_05.rst b/doc/guides/rel_notes/release_21_05.rst
> index 95713f2..253917a 100644
> --- a/doc/guides/rel_notes/release_21_05.rst
> +++ b/doc/guides/rel_notes/release_21_05.rst
> @@ -137,6 +137,13 @@ New Features
>     * Added command to display Rx queue used descriptor count.
>       ``show port (port_id) rxq (queue_id) desc used count``
>   
> +* **Added the offload paths for IAVF AVX512.**
> +
> +  * Added the new RX and TX paths to use the HW offload features. When the HW
> +    offload features are configured to be used, the offload paths are chosen
> +    automatically.
> +  * The code of HW offload features is removed from the legacy paths.
> +
>   

There is already a iavf bullet in the section, "Updated Intel iavf driver", can 
you please add updates under it?

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v4 4/4] net/iavf: add offload path for Rx AVX512 flex desc
  2021-04-13 12:45       ` Ferruh Yigit
@ 2021-04-14  1:22         ` Lu, Wenzhuo
  0 siblings, 0 replies; 30+ messages in thread
From: Lu, Wenzhuo @ 2021-04-14  1:22 UTC (permalink / raw)
  To: Yigit, Ferruh, dev

Hi Ferruh,

> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@intel.com>
> Sent: Tuesday, April 13, 2021 8:45 PM
> To: Lu, Wenzhuo <wenzhuo.lu@intel.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v4 4/4] net/iavf: add offload path for Rx
> AVX512 flex desc
> 
> On 4/9/2021 6:59 AM, Wenzhuo Lu wrote:
> > Add a specific path for RX AVX512 (flexible descriptor).
> > In this path, support the HW offload features, like, checksum, VLAN
> > stripping, RSS hash.
> > This path is chosen automatically according to the configuration.
> >
> > 'inline' is used, then the duplicate code is generated by the
> > compiler.
> >
> > Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
> > ---
> >   doc/guides/rel_notes/release_21_05.rst  |   7 +
> >   drivers/net/iavf/iavf_rxtx.c            |  27 +-
> >   drivers/net/iavf/iavf_rxtx.h            |   6 +
> >   drivers/net/iavf/iavf_rxtx_vec_avx512.c | 436 ++++++++++++++++++--------
> ------
> >   4 files changed, 283 insertions(+), 193 deletions(-)
> >
> > diff --git a/doc/guides/rel_notes/release_21_05.rst
> > b/doc/guides/rel_notes/release_21_05.rst
> > index 95713f2..253917a 100644
> > --- a/doc/guides/rel_notes/release_21_05.rst
> > +++ b/doc/guides/rel_notes/release_21_05.rst
> > @@ -137,6 +137,13 @@ New Features
> >     * Added command to display Rx queue used descriptor count.
> >       ``show port (port_id) rxq (queue_id) desc used count``
> >
> > +* **Added the offload paths for IAVF AVX512.**
> > +
> > +  * Added the new RX and TX paths to use the HW offload features. When
> the HW
> > +    offload features are configured to be used, the offload paths are
> chosen
> > +    automatically.
> > +  * The code of HW offload features is removed from the legacy paths.
> > +
> >
> 
> There is already a iavf bullet in the section, "Updated Intel iavf driver", can
> you please add updates under it?
Will move this to there. Thanks.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v5 0/4] add Rx/Tx offload paths for IAVF AVX512
  2021-04-09  5:59   ` [dpdk-dev] [PATCH v4 " Wenzhuo Lu
                       ` (4 preceding siblings ...)
  2021-04-09  7:44     ` [dpdk-dev] [PATCH v4 0/4] add Rx/Tx offload paths for IAVF AVX512 Zhang, Qi Z
@ 2021-04-14  7:34     ` Wenzhuo Lu
  2021-04-14  7:34       ` [dpdk-dev] [PATCH v5 1/4] net/iavf: store offload flag of Rx queue Wenzhuo Lu
                         ` (4 more replies)
  5 siblings, 5 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-04-14  7:34 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add specific paths for RX/TX AVX512, called offload paths.
In these paths, support the HW offload features, like, checksum, VLAN, RSS offload.
These paths are chosen automatically according to the configuration.

v2:
 - Fixed compile error.

v3:
 - Used 'inline' to drop the duplicate code.
 - some minor change.

v4:
 - Rebased on next-net-intel.

v5:
 - Minor change of release note.

Wenzhuo Lu (4):
  net/iavf: store offload flag of Rx queue
  net/iavf: add offload path for Tx AVX512
  net/iavf: add offload path for Rx AVX512
  net/iavf: add offload path for Rx AVX512 flex desc

 doc/guides/rel_notes/release_21_05.rst  |   5 +
 drivers/net/iavf/iavf_rxtx.c            | 187 +++++--
 drivers/net/iavf/iavf_rxtx.h            |  33 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 899 +++++++++++++++++++-------------
 drivers/net/iavf/iavf_rxtx_vec_common.h | 115 +++-
 5 files changed, 817 insertions(+), 422 deletions(-)

-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v5 1/4] net/iavf: store offload flag of Rx queue
  2021-04-14  7:34     ` [dpdk-dev] [PATCH v5 " Wenzhuo Lu
@ 2021-04-14  7:34       ` Wenzhuo Lu
  2021-04-14  7:34       ` [dpdk-dev] [PATCH v5 2/4] net/iavf: add offload path for Tx AVX512 Wenzhuo Lu
                         ` (3 subsequent siblings)
  4 siblings, 0 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-04-14  7:34 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add the offload flag for RX queues to know which offload
features are set.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 drivers/net/iavf/iavf_rxtx.c | 4 ++++
 drivers/net/iavf/iavf_rxtx.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 541b444..bd0b7ee 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -523,9 +523,12 @@
 	uint8_t proto_xtr;
 	uint16_t len;
 	uint16_t rx_free_thresh;
+	uint64_t offloads;
 
 	PMD_INIT_FUNC_TRACE();
 
+	offloads = rx_conf->offloads | dev->data->dev_conf.rxmode.offloads;
+
 	if (nb_desc % IAVF_ALIGN_RING_DESC != 0 ||
 	    nb_desc > IAVF_MAX_RING_DESC ||
 	    nb_desc < IAVF_MIN_RING_DESC) {
@@ -596,6 +599,7 @@
 	rxq->rx_deferred_start = rx_conf->rx_deferred_start;
 	rxq->rx_hdr_len = 0;
 	rxq->vsi = vsi;
+	rxq->offloads = offloads;
 
 	if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)
 		rxq->crc_len = RTE_ETHER_CRC_LEN;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 4fbd847..f56dd74 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -198,6 +198,7 @@ struct iavf_rx_queue {
 		/* flexible descriptor metadata extraction offload flag */
 	iavf_rxd_to_pkt_fields_t rxd_to_pkt_fields;
 				/* handle flexible descriptor by RXDID */
+	uint64_t offloads;
 };
 
 struct iavf_tx_entry {
-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v5 2/4] net/iavf: add offload path for Tx AVX512
  2021-04-14  7:34     ` [dpdk-dev] [PATCH v5 " Wenzhuo Lu
  2021-04-14  7:34       ` [dpdk-dev] [PATCH v5 1/4] net/iavf: store offload flag of Rx queue Wenzhuo Lu
@ 2021-04-14  7:34       ` Wenzhuo Lu
  2021-04-14  7:34       ` [dpdk-dev] [PATCH v5 3/4] net/iavf: add offload path for Rx AVX512 Wenzhuo Lu
                         ` (2 subsequent siblings)
  4 siblings, 0 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-04-14  7:34 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add a specific path for TX AVX512.
In this path, support the HW offload features, like,
checksum insertion, VLAN insertion.
This path is chosen automatically according to the
configuration.

'inline' is used, then the duplicate code is generated
by the compiler.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 drivers/net/iavf/iavf_rxtx.c            |  57 +++++++++++------
 drivers/net/iavf/iavf_rxtx.h            |  14 +++-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 110 +++++++++++++++++++-------------
 drivers/net/iavf/iavf_rxtx_vec_common.h |  98 ++++++++++++++++++++++++++--
 4 files changed, 210 insertions(+), 69 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index bd0b7ee..099ede7 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -160,7 +160,7 @@
 static inline bool
 check_tx_vec_allow(struct iavf_tx_queue *txq)
 {
-	if (!(txq->offloads & IAVF_NO_VECTOR_FLAGS) &&
+	if (!(txq->offloads & IAVF_TX_NO_VECTOR_FLAGS) &&
 	    txq->rs_thresh >= IAVF_VPMD_TX_MAX_BURST &&
 	    txq->rs_thresh <= IAVF_VPMD_TX_MAX_FREE_BUF) {
 		PMD_INIT_LOG(DEBUG, "Vector tx can be enabled on this txq.");
@@ -2498,17 +2498,23 @@
 #ifdef RTE_ARCH_X86
 	struct iavf_tx_queue *txq;
 	int i;
+	int check_ret;
+	bool use_sse = false;
 	bool use_avx2 = false;
-#ifdef CC_AVX512_SUPPORT
 	bool use_avx512 = false;
-#endif
 
-	if (!iavf_tx_vec_dev_check(dev) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-		if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-		     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-				rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-			use_avx2 = true;
+	check_ret = iavf_tx_vec_dev_check(dev);
+
+	if (check_ret >= 0 &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+		/* SSE and AVX2 not support offload path yet. */
+		if (check_ret == IAVF_VECTOR_PATH) {
+			use_sse = true;
+			if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+			     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+			    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+				use_avx2 = true;
+		}
 #ifdef CC_AVX512_SUPPORT
 		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2516,15 +2522,29 @@
 			use_avx512 = true;
 #endif
 
-		PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
-			    use_avx2 ? "avx2 " : "",
-			    dev->data->port_id);
-		dev->tx_pkt_burst = use_avx2 ?
-				    iavf_xmit_pkts_vec_avx2 :
-				    iavf_xmit_pkts_vec;
+		if (!use_sse && !use_avx2 && !use_avx512)
+			goto normal;
+
+		if (!use_avx512) {
+			PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
+				    use_avx2 ? "avx2 " : "",
+				    dev->data->port_id);
+			dev->tx_pkt_burst = use_avx2 ?
+					    iavf_xmit_pkts_vec_avx2 :
+					    iavf_xmit_pkts_vec;
+		}
 #ifdef CC_AVX512_SUPPORT
-		if (use_avx512)
-			dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+		if (use_avx512) {
+			if (check_ret == IAVF_VECTOR_PATH) {
+				dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+				PMD_DRV_LOG(DEBUG, "Using AVX512 Vector Tx (port %d).",
+					    dev->data->port_id);
+			} else {
+				dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512_offload;
+				PMD_DRV_LOG(DEBUG, "Using AVX512 OFFLOAD Vector Tx (port %d).",
+					    dev->data->port_id);
+			}
+		}
 #endif
 		dev->tx_pkt_prepare = NULL;
 
@@ -2544,8 +2564,9 @@
 
 		return;
 	}
-#endif
 
+normal:
+#endif
 	PMD_DRV_LOG(DEBUG, "Using Basic Tx callback (port=%d).",
 		    dev->data->port_id);
 	dev->tx_pkt_burst = iavf_xmit_pkts;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index f56dd74..bead119 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -23,14 +23,21 @@
 #define IAVF_VPMD_DESCS_PER_LOOP  4
 #define IAVF_VPMD_TX_MAX_FREE_BUF 64
 
-#define IAVF_NO_VECTOR_FLAGS (				 \
+#define IAVF_TX_NO_VECTOR_FLAGS (				 \
 		DEV_TX_OFFLOAD_MULTI_SEGS |		 \
+		DEV_TX_OFFLOAD_TCP_TSO)
+
+#define IAVF_TX_VECTOR_OFFLOAD (				 \
 		DEV_TX_OFFLOAD_VLAN_INSERT |		 \
+		DEV_TX_OFFLOAD_QINQ_INSERT |		 \
+		DEV_TX_OFFLOAD_IPV4_CKSUM |		 \
 		DEV_TX_OFFLOAD_SCTP_CKSUM |		 \
 		DEV_TX_OFFLOAD_UDP_CKSUM |		 \
-		DEV_TX_OFFLOAD_TCP_TSO |		 \
 		DEV_TX_OFFLOAD_TCP_CKSUM)
 
+#define IAVF_VECTOR_PATH 0
+#define IAVF_VECTOR_OFFLOAD_PATH 1
+
 #define DEFAULT_TX_RS_THRESH     32
 #define DEFAULT_TX_FREE_THRESH   32
 
@@ -488,6 +495,9 @@ uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
 						      uint16_t nb_pkts);
 uint16_t iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 				   uint16_t nb_pkts);
+uint16_t iavf_xmit_pkts_vec_avx512_offload(void *tx_queue,
+					   struct rte_mbuf **tx_pkts,
+					   uint16_t nb_pkts);
 int iavf_txq_vec_setup_avx512(struct iavf_tx_queue *txq);
 
 uint8_t iavf_proto_xtr_type_to_rxdid(uint8_t xtr_type);
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 385f44e..f4dd222 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -1518,14 +1518,16 @@
 		txep[i].mbuf = tx_pkts[i];
 }
 
-static inline void
+static __rte_always_inline void
 iavf_vtx1(volatile struct iavf_tx_desc *txdp,
-	  struct rte_mbuf *pkt, uint64_t flags)
+	  struct rte_mbuf *pkt, uint64_t flags, bool offload)
 {
 	uint64_t high_qw =
 		(IAVF_TX_DESC_DTYPE_DATA |
 		 ((uint64_t)flags  << IAVF_TXD_QW1_CMD_SHIFT) |
 		 ((uint64_t)pkt->data_len << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT));
+	if (offload)
+		iavf_txd_enable_offload(pkt, &high_qw);
 
 	__m128i descriptor = _mm_set_epi64x(high_qw,
 					    pkt->buf_iova + pkt->data_off);
@@ -1534,62 +1536,70 @@
 
 #define IAVF_TX_LEN_MASK 0xAA
 #define IAVF_TX_OFF_MASK 0x55
-static inline void
+static __rte_always_inline void
 iavf_vtx(volatile struct iavf_tx_desc *txdp,
-	 struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
+	 struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags,
+	 bool offload)
 {
 	const uint64_t hi_qw_tmpl = (IAVF_TX_DESC_DTYPE_DATA |
 			((uint64_t)flags  << IAVF_TXD_QW1_CMD_SHIFT));
 
 	/* if unaligned on 32-bit boundary, do one to align */
 	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
-		iavf_vtx1(txdp, *pkt, flags);
+		iavf_vtx1(txdp, *pkt, flags, offload);
 		nb_pkts--, txdp++, pkt++;
 	}
 
 	/* do 4 at a time while possible, in bursts */
 	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
-		__m512i desc4 =
-			_mm512_set_epi64
-				((uint64_t)pkt[3]->data_len,
-				 pkt[3]->buf_iova,
-				 (uint64_t)pkt[2]->data_len,
-				 pkt[2]->buf_iova,
-				 (uint64_t)pkt[1]->data_len,
-				 pkt[1]->buf_iova,
-				 (uint64_t)pkt[0]->data_len,
-				 pkt[0]->buf_iova);
-		__m512i hi_qw_tmpl_4 = _mm512_set1_epi64(hi_qw_tmpl);
-		__m512i data_off_4 =
+		uint64_t hi_qw3 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[3]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		if (offload)
+			iavf_txd_enable_offload(pkt[3], &hi_qw3);
+		uint64_t hi_qw2 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[2]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		if (offload)
+			iavf_txd_enable_offload(pkt[2], &hi_qw2);
+		uint64_t hi_qw1 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[1]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		if (offload)
+			iavf_txd_enable_offload(pkt[1], &hi_qw1);
+		uint64_t hi_qw0 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[0]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		if (offload)
+			iavf_txd_enable_offload(pkt[0], &hi_qw0);
+
+		__m512i desc0_3 =
 			_mm512_set_epi64
-				(0,
-				 pkt[3]->data_off,
-				 0,
-				 pkt[2]->data_off,
-				 0,
-				 pkt[1]->data_off,
-				 0,
-				 pkt[0]->data_off);
-
-		desc4 = _mm512_mask_slli_epi64(desc4, IAVF_TX_LEN_MASK, desc4,
-					       IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
-		desc4 = _mm512_mask_or_epi64(desc4, IAVF_TX_LEN_MASK, desc4,
-					     hi_qw_tmpl_4);
-		desc4 = _mm512_mask_add_epi64(desc4, IAVF_TX_OFF_MASK, desc4,
-					      data_off_4);
-		_mm512_storeu_si512((void *)txdp, desc4);
+				(hi_qw3,
+				 pkt[3]->buf_iova + pkt[3]->data_off,
+				 hi_qw2,
+				 pkt[2]->buf_iova + pkt[2]->data_off,
+				 hi_qw1,
+				 pkt[1]->buf_iova + pkt[1]->data_off,
+				 hi_qw0,
+				 pkt[0]->buf_iova + pkt[0]->data_off);
+		_mm512_storeu_si512((void *)txdp, desc0_3);
 	}
 
 	/* do any last ones */
 	while (nb_pkts) {
-		iavf_vtx1(txdp, *pkt, flags);
+		iavf_vtx1(txdp, *pkt, flags, offload);
 		txdp++, pkt++, nb_pkts--;
 	}
 }
 
-static inline uint16_t
+static __rte_always_inline uint16_t
 iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
-				 uint16_t nb_pkts)
+				 uint16_t nb_pkts, bool offload)
 {
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 	volatile struct iavf_tx_desc *txdp;
@@ -1620,11 +1630,11 @@
 	if (nb_commit >= n) {
 		tx_backlog_entry_avx512(txep, tx_pkts, n);
 
-		iavf_vtx(txdp, tx_pkts, n - 1, flags);
+		iavf_vtx(txdp, tx_pkts, n - 1, flags, offload);
 		tx_pkts += (n - 1);
 		txdp += (n - 1);
 
-		iavf_vtx1(txdp, *tx_pkts++, rs);
+		iavf_vtx1(txdp, *tx_pkts++, rs, offload);
 
 		nb_commit = (uint16_t)(nb_commit - n);
 
@@ -1639,7 +1649,7 @@
 
 	tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);
 
-	iavf_vtx(txdp, tx_pkts, nb_commit, flags);
+	iavf_vtx(txdp, tx_pkts, nb_commit, flags, offload);
 
 	tx_id = (uint16_t)(tx_id + nb_commit);
 	if (tx_id > txq->next_rs) {
@@ -1657,9 +1667,9 @@
 	return nb_pkts;
 }
 
-uint16_t
-iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
-			  uint16_t nb_pkts)
+static __rte_always_inline uint16_t
+iavf_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
+			      uint16_t nb_pkts, bool offload)
 {
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
@@ -1669,7 +1679,7 @@
 
 		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
 		ret = iavf_xmit_fixed_burst_vec_avx512(tx_queue, &tx_pkts[nb_tx],
-						       num);
+						       num, offload);
 		nb_tx += ret;
 		nb_pkts -= ret;
 		if (ret < num)
@@ -1679,6 +1689,13 @@
 	return nb_tx;
 }
 
+uint16_t
+iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
+			  uint16_t nb_pkts)
+{
+	return iavf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts, false);
+}
+
 static inline void
 iavf_tx_queue_release_mbufs_avx512(struct iavf_tx_queue *txq)
 {
@@ -1709,3 +1726,10 @@
 	txq->ops = &avx512_vec_txq_ops;
 	return 0;
 }
+
+uint16_t
+iavf_xmit_pkts_vec_avx512_offload(void *tx_queue, struct rte_mbuf **tx_pkts,
+				  uint16_t nb_pkts)
+{
+	return iavf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts, true);
+}
diff --git a/drivers/net/iavf/iavf_rxtx_vec_common.h b/drivers/net/iavf/iavf_rxtx_vec_common.h
index 816e16a..62a333f 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_common.h
+++ b/drivers/net/iavf/iavf_rxtx_vec_common.h
@@ -240,14 +240,17 @@
 	if (!txq)
 		return -1;
 
-	if (txq->offloads & IAVF_NO_VECTOR_FLAGS)
-		return -1;
-
 	if (txq->rs_thresh < IAVF_VPMD_TX_MAX_BURST ||
 	    txq->rs_thresh > IAVF_VPMD_TX_MAX_FREE_BUF)
 		return -1;
 
-	return 0;
+	if (txq->offloads & IAVF_TX_NO_VECTOR_FLAGS)
+		return -1;
+
+	if (txq->offloads & IAVF_TX_VECTOR_OFFLOAD)
+		return IAVF_VECTOR_OFFLOAD_PATH;
+
+	return IAVF_VECTOR_PATH;
 }
 
 static inline int
@@ -270,14 +273,97 @@
 {
 	int i;
 	struct iavf_tx_queue *txq;
+	int ret;
+	int result = 0;
 
 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
 		txq = dev->data->tx_queues[i];
-		if (iavf_tx_vec_queue_default(txq))
+		ret = iavf_tx_vec_queue_default(txq);
+
+		if (ret < 0)
 			return -1;
+		if (ret > result)
+			result = ret;
 	}
 
-	return 0;
+	return result;
+}
+
+/******************************************************************************
+ * If user knows a specific offload is not enabled by APP,
+ * the macro can be commented to save the effort of fast path.
+ * Currently below 2 features are supported in TX path,
+ * 1, checksum offload
+ * 2, VLAN/QINQ insertion
+ ******************************************************************************/
+#define IAVF_TX_CSUM_OFFLOAD
+#define IAVF_TX_VLAN_QINQ_OFFLOAD
+
+static __rte_always_inline void
+iavf_txd_enable_offload(__rte_unused struct rte_mbuf *tx_pkt,
+			uint64_t *txd_hi)
+{
+#if defined(IAVF_TX_CSUM_OFFLOAD) || defined(IAVF_TX_VLAN_QINQ_OFFLOAD)
+	uint64_t ol_flags = tx_pkt->ol_flags;
+#endif
+	uint32_t td_cmd = 0;
+#ifdef IAVF_TX_CSUM_OFFLOAD
+	uint32_t td_offset = 0;
+#endif
+
+#ifdef IAVF_TX_CSUM_OFFLOAD
+	/* Set MACLEN */
+	td_offset |= (tx_pkt->l2_len >> 1) <<
+		     IAVF_TX_DESC_LENGTH_MACLEN_SHIFT;
+
+	/* Enable L3 checksum offloads */
+	if (ol_flags & PKT_TX_IP_CKSUM) {
+		td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV4_CSUM;
+		td_offset |= (tx_pkt->l3_len >> 2) <<
+			     IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
+	} else if (ol_flags & PKT_TX_IPV4) {
+		td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV4;
+		td_offset |= (tx_pkt->l3_len >> 2) <<
+			     IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
+	} else if (ol_flags & PKT_TX_IPV6) {
+		td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV6;
+		td_offset |= (tx_pkt->l3_len >> 2) <<
+			     IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
+	}
+
+	/* Enable L4 checksum offloads */
+	switch (ol_flags & PKT_TX_L4_MASK) {
+	case PKT_TX_TCP_CKSUM:
+		td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_TCP;
+		td_offset |= (sizeof(struct rte_tcp_hdr) >> 2) <<
+			     IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+		break;
+	case PKT_TX_SCTP_CKSUM:
+		td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_SCTP;
+		td_offset |= (sizeof(struct rte_sctp_hdr) >> 2) <<
+			     IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+		break;
+	case PKT_TX_UDP_CKSUM:
+		td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_UDP;
+		td_offset |= (sizeof(struct rte_udp_hdr) >> 2) <<
+			     IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+		break;
+	default:
+		break;
+	}
+
+	*txd_hi |= ((uint64_t)td_offset) << IAVF_TXD_QW1_OFFSET_SHIFT;
+#endif
+
+#ifdef IAVF_TX_VLAN_QINQ_OFFLOAD
+	if (ol_flags & (PKT_TX_VLAN | PKT_TX_QINQ)) {
+		td_cmd |= IAVF_TX_DESC_CMD_IL2TAG1;
+		*txd_hi |= ((uint64_t)tx_pkt->vlan_tci <<
+			    IAVF_TXD_QW1_L2TAG1_SHIFT);
+	}
+#endif
+
+	*txd_hi |= ((uint64_t)td_cmd) << IAVF_TXD_QW1_CMD_SHIFT;
 }
 
 #ifdef CC_AVX2_SUPPORT
-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v5 3/4] net/iavf: add offload path for Rx AVX512
  2021-04-14  7:34     ` [dpdk-dev] [PATCH v5 " Wenzhuo Lu
  2021-04-14  7:34       ` [dpdk-dev] [PATCH v5 1/4] net/iavf: store offload flag of Rx queue Wenzhuo Lu
  2021-04-14  7:34       ` [dpdk-dev] [PATCH v5 2/4] net/iavf: add offload path for Tx AVX512 Wenzhuo Lu
@ 2021-04-14  7:34       ` Wenzhuo Lu
  2021-04-14  7:34       ` [dpdk-dev] [PATCH v5 4/4] net/iavf: add offload path for Rx AVX512 flex desc Wenzhuo Lu
  2021-04-14 12:53       ` [dpdk-dev] [PATCH v5 0/4] add Rx/Tx offload paths for IAVF AVX512 Zhang, Qi Z
  4 siblings, 0 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-04-14  7:34 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add a specific path for RX AVX512 (traditional).
In this path, support the HW offload features, like,
checksum, VLAN stripping, RSS hash.
This path is chosen automatically according to the
configuration.

'inline' is used, then the duplicate code is generated
by the compiler.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 drivers/net/iavf/iavf_rxtx.c            | 105 +++++++---
 drivers/net/iavf/iavf_rxtx.h            |  12 ++
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 353 ++++++++++++++++++++------------
 drivers/net/iavf/iavf_rxtx_vec_common.h |  17 +-
 4 files changed, 324 insertions(+), 163 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 099ede7..ca01ed9 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2392,22 +2392,23 @@
 #ifdef RTE_ARCH_X86
 	struct iavf_rx_queue *rxq;
 	int i;
+	int check_ret;
+	bool use_sse = false;
 	bool use_avx2 = false;
-#ifdef CC_AVX512_SUPPORT
 	bool use_avx512 = false;
-#endif
+	bool use_flex = false;
 
-	if (!iavf_rx_vec_dev_check(dev) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-		for (i = 0; i < dev->data->nb_rx_queues; i++) {
-			rxq = dev->data->rx_queues[i];
-			(void)iavf_rxq_vec_setup(rxq);
+	check_ret = iavf_rx_vec_dev_check(dev);
+	if (check_ret >= 0 &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+		if (check_ret == IAVF_VECTOR_PATH) {
+			use_sse = true;
+			if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+			     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+			    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+				use_avx2 = true;
 		}
 
-		if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-		     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-				rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-			use_avx2 = true;
 #ifdef CC_AVX512_SUPPORT
 		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2415,13 +2416,38 @@
 			use_avx512 = true;
 #endif
 
+		if (!use_sse && !use_avx2 && !use_avx512)
+			goto normal;
+
+		if (vf->vf_res->vf_cap_flags &
+			VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			use_flex = true;
+			if (use_avx512 && check_ret == IAVF_VECTOR_OFFLOAD_PATH)
+				use_flex = false;
+		}
+
+		for (i = 0; i < dev->data->nb_rx_queues; i++) {
+			rxq = dev->data->rx_queues[i];
+			(void)iavf_rxq_vec_setup(rxq);
+		}
+
 		if (dev->data->scattered_rx) {
-			PMD_DRV_LOG(DEBUG,
-				    "Using %sVector Scattered Rx (port %d).",
-				    use_avx2 ? "avx2 " : "",
-				    dev->data->port_id);
-			if (vf->vf_res->vf_cap_flags &
-				VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			if (!use_avx512) {
+				PMD_DRV_LOG(DEBUG,
+					    "Using %sVector Scattered Rx (port %d).",
+					    use_avx2 ? "avx2 " : "",
+					    dev->data->port_id);
+			} else {
+				if (check_ret == IAVF_VECTOR_PATH)
+					PMD_DRV_LOG(DEBUG,
+						    "Using AVX512 Vector Scattered Rx (port %d).",
+						    dev->data->port_id);
+				else
+					PMD_DRV_LOG(DEBUG,
+						    "Using AVX512 OFFLOAD Vector Scattered Rx (port %d).",
+						    dev->data->port_id);
+			}
+			if (use_flex) {
 				dev->rx_pkt_burst = use_avx2 ?
 					iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
 					iavf_recv_scattered_pkts_vec_flex_rxd;
@@ -2435,17 +2461,32 @@
 					iavf_recv_scattered_pkts_vec_avx2 :
 					iavf_recv_scattered_pkts_vec;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_scattered_pkts_vec_avx512;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512_offload;
+				}
 #endif
 			}
 		} else {
-			PMD_DRV_LOG(DEBUG, "Using %sVector Rx (port %d).",
-				    use_avx2 ? "avx2 " : "",
-				    dev->data->port_id);
-			if (vf->vf_res->vf_cap_flags &
-				VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			if (!use_avx512) {
+				PMD_DRV_LOG(DEBUG, "Using %sVector Rx (port %d).",
+					    use_avx2 ? "avx2 " : "",
+					    dev->data->port_id);
+			} else {
+				if (check_ret == IAVF_VECTOR_PATH)
+					PMD_DRV_LOG(DEBUG,
+						    "Using AVX512 Vector Rx (port %d).",
+						    dev->data->port_id);
+				else
+					PMD_DRV_LOG(DEBUG,
+						    "Using AVX512 OFFLOAD Vector Rx (port %d).",
+						    dev->data->port_id);
+			}
+			if (use_flex) {
 				dev->rx_pkt_burst = use_avx2 ?
 					iavf_recv_pkts_vec_avx2_flex_rxd :
 					iavf_recv_pkts_vec_flex_rxd;
@@ -2459,17 +2500,23 @@
 					iavf_recv_pkts_vec_avx2 :
 					iavf_recv_pkts_vec;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_pkts_vec_avx512;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512_offload;
+				}
 #endif
 			}
 		}
 
 		return;
 	}
-#endif
 
+normal:
+#endif
 	if (dev->data->scattered_rx) {
 		PMD_DRV_LOG(DEBUG, "Using a Scattered Rx callback (port=%d).",
 			    dev->data->port_id);
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index bead119..a8e5664 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -35,6 +35,12 @@
 		DEV_TX_OFFLOAD_UDP_CKSUM |		 \
 		DEV_TX_OFFLOAD_TCP_CKSUM)
 
+#define IAVF_RX_VECTOR_OFFLOAD (				 \
+		DEV_RX_OFFLOAD_CHECKSUM |		 \
+		DEV_RX_OFFLOAD_SCTP_CKSUM |		 \
+		DEV_RX_OFFLOAD_VLAN |		 \
+		DEV_RX_OFFLOAD_RSS_HASH)
+
 #define IAVF_VECTOR_PATH 0
 #define IAVF_VECTOR_OFFLOAD_PATH 1
 
@@ -484,12 +490,18 @@ uint16_t iavf_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 int iavf_txq_vec_setup(struct iavf_tx_queue *txq);
 uint16_t iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
 				   uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx512_offload(void *rx_queue,
+					   struct rte_mbuf **rx_pkts,
+					   uint16_t nb_pkts);
 uint16_t iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue,
 					    struct rte_mbuf **rx_pkts,
 					    uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 					     struct rte_mbuf **rx_pkts,
 					     uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
+						     struct rte_mbuf **rx_pkts,
+						     uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
 						      struct rte_mbuf **rx_pkts,
 						      uint16_t nb_pkts);
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index f4dd222..d1ca667 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -13,6 +13,22 @@
 #define IAVF_DESCS_PER_LOOP_AVX 8
 #define PKTLEN_SHIFT 10
 
+/******************************************************************************
+ * If user knows a specific offload is not enabled by APP,
+ * the macro can be commented to save the effort of fast path.
+ * Currently below 2 features are supported in RX path,
+ * 1, checksum offload
+ * 2, VLAN/QINQ stripping
+ * 3, RSS hash
+ * 4, packet type analysis
+ * 5, flow director ID report
+ ******************************************************************************/
+#define IAVF_RX_CSUM_OFFLOAD
+#define IAVF_RX_VLAN_OFFLOAD
+#define IAVF_RX_RSS_OFFLOAD
+#define IAVF_RX_PTYPE_OFFLOAD
+#define IAVF_RX_FDIR_OFFLOAD
+
 static __rte_always_inline void
 iavf_rxq_rearm(struct iavf_rx_queue *rxq)
 {
@@ -144,12 +160,15 @@
 }
 
 #define IAVF_RX_LEN_MASK 0x80808080
-static inline uint16_t
+static __rte_always_inline uint16_t
 _iavf_recv_raw_pkts_vec_avx512(struct iavf_rx_queue *rxq,
 			       struct rte_mbuf **rx_pkts,
-			       uint16_t nb_pkts, uint8_t *split_packet)
+			       uint16_t nb_pkts, uint8_t *split_packet,
+			       bool offload)
 {
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+#endif
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
 						    rxq->mbuf_initializer);
@@ -252,71 +271,6 @@
 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
 
-	/* Status/Error flag masks */
-	/**
-	 * mask everything except RSS, flow director and VLAN flags
-	 * bit2 is for VLAN tag, bit11 for flow director indication
-	 * bit13:12 for RSS indication. Bits 3-5 of error
-	 * field (bits 22-24) are for IP/L4 checksum errors
-	 */
-	const __m256i flags_mask =
-		_mm256_set1_epi32((1 << 2) | (1 << 11) |
-				  (3 << 12) | (7 << 22));
-	/**
-	 * data to be shuffled by result of flag mask. If VLAN bit is set,
-	 * (bit 2), then position 4 in this array will be used in the
-	 * destination
-	 */
-	const __m256i vlan_flags_shuf =
-		_mm256_set_epi32(0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
-				 0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0);
-	/**
-	 * data to be shuffled by result of flag mask, shifted down 11.
-	 * If RSS/FDIR bits are set, shuffle moves appropriate flags in
-	 * place.
-	 */
-	const __m256i rss_flags_shuf =
-		_mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-				PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
-				0, 0, 0, 0, PKT_RX_FDIR, 0,/* end up 128-bits */
-				0, 0, 0, 0, 0, 0, 0, 0,
-				PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
-				0, 0, 0, 0, PKT_RX_FDIR, 0);
-
-	/**
-	 * data to be shuffled by the result of the flags mask shifted by 22
-	 * bits.  This gives use the l3_l4 flags.
-	 */
-	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-			/* shift right 1 bit to make sure it not exceed 255 */
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
-			 PKT_RX_L4_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
-			PKT_RX_IP_CKSUM_BAD >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
-			/* second 128-bits */
-			0, 0, 0, 0, 0, 0, 0, 0,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
-			 PKT_RX_L4_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
-			PKT_RX_IP_CKSUM_BAD >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);
-
-	const __m256i cksum_mask =
-		_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
-				  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
-				  PKT_RX_OUTER_IP_CKSUM_BAD);
-
 	uint16_t i, received;
 
 	for (i = 0, received = 0; i < nb_pkts;
@@ -384,6 +338,7 @@
 		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);
 
 		mb4_7 = _mm512_add_epi32(mb4_7, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/**
 		 * to get packet types, shift 64-bit values down 30 bits
 		 * and so ptype is in lower 8-bits in each
@@ -402,6 +357,7 @@
 			 0, 0, 0, type_table[ptype5],
 			 0, 0, 0, type_table[ptype4]);
 		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
+#endif
 
 		/**
 		 * convert descriptors 0-3 into mbufs, adjusting length and
@@ -415,6 +371,7 @@
 		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);
 
 		mb0_3 = _mm512_add_epi32(mb0_3, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/* get the packet types */
 		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 30);
 		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
@@ -430,6 +387,7 @@
 			 0, 0, 0, type_table[ptype1],
 			 0, 0, 0, type_table[ptype0]);
 		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
+#endif
 
 		/**
 		 * use permute/extract to get status content
@@ -449,27 +407,122 @@
 
 		/* now do flag manipulation */
 
-		/* get only flag/error bits we want */
-		const __m256i flag_bits =
-			_mm256_and_si256(status0_7, flags_mask);
-		/* set vlan and rss flags */
-		const __m256i vlan_flags =
-			_mm256_shuffle_epi8(vlan_flags_shuf, flag_bits);
-		const __m256i rss_flags =
-			_mm256_shuffle_epi8(rss_flags_shuf,
-					    _mm256_srli_epi32(flag_bits, 11));
-		/**
-		 * l3_l4_error flags, shuffle, then shift to correct adjustment
-		 * of flags in flags_shuf, and finally mask out extra bits
-		 */
-		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
-						_mm256_srli_epi32(flag_bits, 22));
-		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
-		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
-
 		/* merge flags */
-		const __m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
-				_mm256_or_si256(rss_flags, vlan_flags));
+		__m256i mbuf_flags = _mm256_set1_epi32(0);
+
+		if (offload) {
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			/* Status/Error flag masks */
+			/**
+			 * mask everything except RSS, flow director and VLAN flags
+			 * bit2 is for VLAN tag, bit11 for flow director indication
+			 * bit13:12 for RSS indication. Bits 3-5 of error
+			 * field (bits 22-24) are for IP/L4 checksum errors
+			 */
+			const __m256i flags_mask =
+				_mm256_set1_epi32((1 << 2) | (1 << 11) |
+						  (3 << 12) | (7 << 22));
+#endif
+
+#ifdef IAVF_RX_VLAN_OFFLOAD
+			/**
+			 * data to be shuffled by result of flag mask. If VLAN bit is set,
+			 * (bit 2), then position 4 in this array will be used in the
+			 * destination
+			 */
+			const __m256i vlan_flags_shuf =
+				_mm256_set_epi32(0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
+						 0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0);
+#endif
+
+#ifdef IAVF_RX_RSS_OFFLOAD
+			/**
+			 * data to be shuffled by result of flag mask, shifted down 11.
+			 * If RSS/FDIR bits are set, shuffle moves appropriate flags in
+			 * place.
+			 */
+			const __m256i rss_flags_shuf =
+				_mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+						PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
+						0, 0, 0, 0, PKT_RX_FDIR, 0,/* end up 128-bits */
+						0, 0, 0, 0, 0, 0, 0, 0,
+						PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
+						0, 0, 0, 0, PKT_RX_FDIR, 0);
+#endif
+
+#ifdef IAVF_RX_CSUM_OFFLOAD
+			/**
+			 * data to be shuffled by the result of the flags mask shifted by 22
+			 * bits.  This gives use the l3_l4 flags.
+			 */
+			const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+					/* shift right 1 bit to make sure it not exceed 255 */
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+					 PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
+					 PKT_RX_L4_CKSUM_BAD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
+					PKT_RX_IP_CKSUM_BAD >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
+					/* second 128-bits */
+					0, 0, 0, 0, 0, 0, 0, 0,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+					 PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
+					 PKT_RX_L4_CKSUM_BAD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
+					PKT_RX_IP_CKSUM_BAD >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);
+
+			const __m256i cksum_mask =
+				_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
+						  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
+						  PKT_RX_OUTER_IP_CKSUM_BAD);
+#endif
+
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			/* get only flag/error bits we want */
+			const __m256i flag_bits =
+				_mm256_and_si256(status0_7, flags_mask);
+#endif
+			/* set vlan and rss flags */
+#ifdef IAVF_RX_VLAN_OFFLOAD
+			const __m256i vlan_flags =
+				_mm256_shuffle_epi8(vlan_flags_shuf, flag_bits);
+#endif
+#ifdef IAVF_RX_RSS_OFFLOAD
+			const __m256i rss_flags =
+				_mm256_shuffle_epi8(rss_flags_shuf,
+						    _mm256_srli_epi32(flag_bits, 11));
+#endif
+#ifdef IAVF_RX_CSUM_OFFLOAD
+			/**
+			 * l3_l4_error flags, shuffle, then shift to correct adjustment
+			 * of flags in flags_shuf, and finally mask out extra bits
+			 */
+			__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
+							_mm256_srli_epi32(flag_bits, 22));
+			l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
+			l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
+#endif
+
+#ifdef IAVF_RX_CSUM_OFFLOAD
+			mbuf_flags = _mm256_or_si256(mbuf_flags, l3_l4_flags);
+#endif
+#ifdef IAVF_RX_RSS_OFFLOAD
+			mbuf_flags = _mm256_or_si256(mbuf_flags, rss_flags);
+#endif
+#ifdef IAVF_RX_VLAN_OFFLOAD
+			mbuf_flags = _mm256_or_si256(mbuf_flags, vlan_flags);
+#endif
+		}
+
 		/**
 		 * At this point, we have the 8 sets of flags in the low 16-bits
 		 * of each 32-bit value in vlan0.
@@ -487,7 +540,7 @@
 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
 				 RTE_ALIGN(offsetof(struct rte_mbuf,
 						    rearm_data),
-					   16));
+						    16));
 		/* build up data and do writes */
 		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
 			rearm6, rearm7;
@@ -496,21 +549,28 @@
 		const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
 		const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
 
-		rearm6 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(mbuf_flags, 8),
-					    0x04);
-		rearm4 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(mbuf_flags, 4),
-					    0x04);
-		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
-		rearm0 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_srli_si256(mbuf_flags, 4),
-					    0x04);
-		/* permute to add in the rx_descriptor e.g. rss fields */
-		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
-		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
-		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
-		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
+		if (offload) {
+			rearm6 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_slli_si256(mbuf_flags, 8),
+						    0x04);
+			rearm4 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_slli_si256(mbuf_flags, 4),
+						    0x04);
+			rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
+			rearm0 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_srli_si256(mbuf_flags, 4),
+						    0x04);
+			/* permute to add in the rx_descriptor e.g. rss fields */
+			rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
+			rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
+			rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
+			rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
+		} else {
+			rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20);
+			rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20);
+			rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20);
+			rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20);
+		}
 		/* write to mbuf */
 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
 				    rearm6);
@@ -522,24 +582,31 @@
 				    rearm0);
 
 		/* repeat for the odd mbufs */
-		const __m256i odd_flags =
-			_mm256_castsi128_si256
-				(_mm256_extracti128_si256(mbuf_flags, 1));
-		rearm7 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(odd_flags, 8),
-					    0x04);
-		rearm5 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(odd_flags, 4),
-					    0x04);
-		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
-		rearm1 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_srli_si256(odd_flags, 4),
-					    0x04);
-		/* since odd mbufs are already in hi 128-bits use blend */
-		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
-		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
-		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
-		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
+		if (offload) {
+			const __m256i odd_flags =
+				_mm256_castsi128_si256
+					(_mm256_extracti128_si256(mbuf_flags, 1));
+			rearm7 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_slli_si256(odd_flags, 8),
+						    0x04);
+			rearm5 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_slli_si256(odd_flags, 4),
+						    0x04);
+			rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
+			rearm1 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_srli_si256(odd_flags, 4),
+						    0x04);
+			/* since odd mbufs are already in hi 128-bits use blend */
+			rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
+			rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
+			rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
+			rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
+		} else {
+			rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0);
+			rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0);
+			rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0);
+			rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0);
+		}
 		/* again write to mbufs */
 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
 				    rearm7);
@@ -1250,7 +1317,8 @@
 iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
 			  uint16_t nb_pkts)
 {
-	return _iavf_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts, nb_pkts, NULL);
+	return _iavf_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts, nb_pkts,
+					      NULL, false);
 }
 
 /**
@@ -1270,16 +1338,16 @@
  * Notice:
  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  */
-static uint16_t
+static __rte_always_inline uint16_t
 iavf_recv_scattered_burst_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
-				     uint16_t nb_pkts)
+				     uint16_t nb_pkts, bool offload)
 {
 	struct iavf_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 
 	/* get some new buffers */
 	uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx512(rxq, rx_pkts, nb_pkts,
-							  split_flags);
+							  split_flags, offload);
 	if (nb_bufs == 0)
 		return 0;
 
@@ -1312,22 +1380,30 @@
  * Notice:
  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  */
-uint16_t
-iavf_recv_scattered_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
-				    uint16_t nb_pkts)
+static __rte_always_inline uint16_t
+iavf_recv_scattered_pkts_vec_avx512_cmn(void *rx_queue, struct rte_mbuf **rx_pkts,
+					uint16_t nb_pkts, bool offload)
 {
 	uint16_t retval = 0;
 
 	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
 		uint16_t burst = iavf_recv_scattered_burst_vec_avx512(rx_queue,
-				rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST);
+				rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST, offload);
 		retval += burst;
 		nb_pkts -= burst;
 		if (burst < IAVF_VPMD_RX_MAX_BURST)
 			return retval;
 	}
 	return retval + iavf_recv_scattered_burst_vec_avx512(rx_queue,
-				rx_pkts + retval, nb_pkts);
+				rx_pkts + retval, nb_pkts, offload);
+}
+
+uint16_t
+iavf_recv_scattered_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
+				    uint16_t nb_pkts)
+{
+	return iavf_recv_scattered_pkts_vec_avx512_cmn(rx_queue, rx_pkts,
+						       nb_pkts, false);
 }
 
 /**
@@ -1400,6 +1476,23 @@
 				rx_pkts + retval, nb_pkts);
 }
 
+uint16_t
+iavf_recv_pkts_vec_avx512_offload(void *rx_queue, struct rte_mbuf **rx_pkts,
+				  uint16_t nb_pkts)
+{
+	return _iavf_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts,
+					      nb_pkts, NULL, true);
+}
+
+uint16_t
+iavf_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
+					    struct rte_mbuf **rx_pkts,
+					    uint16_t nb_pkts)
+{
+	return iavf_recv_scattered_pkts_vec_avx512_cmn(rx_queue, rx_pkts,
+						       nb_pkts, true);
+}
+
 static __rte_always_inline int
 iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 {
diff --git a/drivers/net/iavf/iavf_rxtx_vec_common.h b/drivers/net/iavf/iavf_rxtx_vec_common.h
index 62a333f..6bb6903 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_common.h
+++ b/drivers/net/iavf/iavf_rxtx_vec_common.h
@@ -15,7 +15,7 @@
 #pragma GCC diagnostic ignored "-Wcast-qual"
 #endif
 
-static inline uint16_t
+static __rte_always_inline uint16_t
 reassemble_packets(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_bufs,
 		   uint16_t nb_bufs, uint8_t *split_flags)
 {
@@ -231,7 +231,10 @@
 	if (rxq->proto_xtr != IAVF_PROTO_XTR_NONE)
 		return -1;
 
-	return 0;
+	if (rxq->offloads & IAVF_RX_VECTOR_OFFLOAD)
+		return IAVF_VECTOR_OFFLOAD_PATH;
+
+	return IAVF_VECTOR_PATH;
 }
 
 static inline int
@@ -258,14 +261,20 @@
 {
 	int i;
 	struct iavf_rx_queue *rxq;
+	int ret;
+	int result = 0;
 
 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
 		rxq = dev->data->rx_queues[i];
-		if (iavf_rx_vec_queue_default(rxq))
+		ret = iavf_rx_vec_queue_default(rxq);
+
+		if (ret < 0)
 			return -1;
+		if (ret > result)
+			result = ret;
 	}
 
-	return 0;
+	return result;
 }
 
 static inline int
-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v5 4/4] net/iavf: add offload path for Rx AVX512 flex desc
  2021-04-14  7:34     ` [dpdk-dev] [PATCH v5 " Wenzhuo Lu
                         ` (2 preceding siblings ...)
  2021-04-14  7:34       ` [dpdk-dev] [PATCH v5 3/4] net/iavf: add offload path for Rx AVX512 Wenzhuo Lu
@ 2021-04-14  7:34       ` Wenzhuo Lu
  2021-04-14 12:53       ` [dpdk-dev] [PATCH v5 0/4] add Rx/Tx offload paths for IAVF AVX512 Zhang, Qi Z
  4 siblings, 0 replies; 30+ messages in thread
From: Wenzhuo Lu @ 2021-04-14  7:34 UTC (permalink / raw)
  To: dev; +Cc: Wenzhuo Lu

Add a specific path for RX AVX512 (flexible descriptor).
In this path, support the HW offload features, like,
checksum, VLAN stripping, RSS hash.
This path is chosen automatically according to the
configuration.

'inline' is used, then the duplicate code is generated
by the compiler.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 doc/guides/rel_notes/release_21_05.rst  |   5 +
 drivers/net/iavf/iavf_rxtx.c            |  27 +-
 drivers/net/iavf/iavf_rxtx.h            |   6 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 436 ++++++++++++++++++--------------
 4 files changed, 281 insertions(+), 193 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_05.rst b/doc/guides/rel_notes/release_21_05.rst
index 9a666b6..a6aa5c9 100644
--- a/doc/guides/rel_notes/release_21_05.rst
+++ b/doc/guides/rel_notes/release_21_05.rst
@@ -109,6 +109,11 @@ New Features
 
   * Added flow filter to support GTPU inner L3/L4 fields matching.
 
+  * In AVX512 code, added the new RX and TX paths to use the HW offload
+    features. When the HW offload features are configured to be used, the
+    offload paths are chosen automatically. In parallel the support of HW
+    offload features is removed from the legacy AVX512 paths.
+
 * **Updated Intel ice driver.**
 
   * Added Intel ice support on Windows.
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index ca01ed9..3f3cf63 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2420,11 +2420,8 @@
 			goto normal;
 
 		if (vf->vf_res->vf_cap_flags &
-			VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
 			use_flex = true;
-			if (use_avx512 && check_ret == IAVF_VECTOR_OFFLOAD_PATH)
-				use_flex = false;
-		}
 
 		for (i = 0; i < dev->data->nb_rx_queues; i++) {
 			rxq = dev->data->rx_queues[i];
@@ -2452,9 +2449,14 @@
 					iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
 					iavf_recv_scattered_pkts_vec_flex_rxd;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload;
+				}
 #endif
 			} else {
 				dev->rx_pkt_burst = use_avx2 ?
@@ -2491,9 +2493,14 @@
 					iavf_recv_pkts_vec_avx2_flex_rxd :
 					iavf_recv_pkts_vec_flex_rxd;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_pkts_vec_avx512_flex_rxd;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512_flex_rxd;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512_flex_rxd_offload;
+				}
 #endif
 			} else {
 				dev->rx_pkt_burst = use_avx2 ?
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index a8e5664..19b6028 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -496,6 +496,9 @@ uint16_t iavf_recv_pkts_vec_avx512_offload(void *rx_queue,
 uint16_t iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue,
 					    struct rte_mbuf **rx_pkts,
 					    uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+						    struct rte_mbuf **rx_pkts,
+						    uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 					     struct rte_mbuf **rx_pkts,
 					     uint16_t nb_pkts);
@@ -505,6 +508,9 @@ uint16_t iavf_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
 uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
 						      struct rte_mbuf **rx_pkts,
 						      uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+							      struct rte_mbuf **rx_pkts,
+							      uint16_t nb_pkts);
 uint16_t iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 				   uint16_t nb_pkts);
 uint16_t iavf_xmit_pkts_vec_avx512_offload(void *tx_queue,
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index d1ca667..7faa1f4 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -684,7 +684,7 @@
 	return received;
 }
 
-static inline __m256i
+static __rte_always_inline __m256i
 flex_rxd_to_fdir_flags_vec_avx512(const __m256i fdir_id0_7)
 {
 #define FDID_MIS_MAGIC 0xFFFFFFFF
@@ -703,12 +703,16 @@
 	return fdir_flags;
 }
 
-static inline uint16_t
+static __rte_always_inline uint16_t
 _iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq,
 					struct rte_mbuf **rx_pkts,
-					uint16_t nb_pkts, uint8_t *split_packet)
+					uint16_t nb_pkts,
+					uint8_t *split_packet,
+					bool offload)
 {
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+#endif
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
 						    rxq->mbuf_initializer);
@@ -812,71 +816,6 @@
 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
 
-	/* Status/Error flag masks */
-	/**
-	 * mask everything except Checksum Reports, RSS indication
-	 * and VLAN indication.
-	 * bit6:4 for IP/L4 checksum errors.
-	 * bit12 is for RSS indication.
-	 * bit13 is for VLAN indication.
-	 */
-	const __m256i flags_mask =
-		_mm256_set1_epi32((7 << 4) | (1 << 12) | (1 << 13));
-	/**
-	 * data to be shuffled by the result of the flags mask shifted by 4
-	 * bits.  This gives use the l3_l4 flags.
-	 */
-	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-			/* shift right 1 bit to make sure it not exceed 255 */
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
-			 PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1,
-			/* second 128-bits */
-			0, 0, 0, 0, 0, 0, 0, 0,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
-			 PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
-			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1);
-	const __m256i cksum_mask =
-		_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
-				  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
-				  PKT_RX_OUTER_IP_CKSUM_BAD);
-	/**
-	 * data to be shuffled by result of flag mask, shifted down 12.
-	 * If RSS(bit12)/VLAN(bit13) are set,
-	 * shuffle moves appropriate flags in place.
-	 */
-	const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
-			0, 0, 0, 0,
-			0, 0, 0, 0,
-			PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			PKT_RX_RSS_HASH, 0,
-			/* end up 128-bits */
-			0, 0, 0, 0,
-			0, 0, 0, 0,
-			0, 0, 0, 0,
-			PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			PKT_RX_RSS_HASH, 0);
-
 	uint16_t i, received;
 
 	for (i = 0, received = 0; i < nb_pkts;
@@ -940,6 +879,7 @@
 		__m512i mb4_7 = _mm512_shuffle_epi8(raw_desc4_7, shuf_msk);
 
 		mb4_7 = _mm512_add_epi32(mb4_7, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/**
 		 * to get packet types, ptype is located in bit16-25
 		 * of each 128bits
@@ -961,6 +901,7 @@
 			 0, 0, 0, type_table[ptype5],
 			 0, 0, 0, type_table[ptype4]);
 		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
+#endif
 
 		/**
 		 * convert descriptors 0-3 into mbufs, re-arrange fields.
@@ -969,6 +910,7 @@
 		__m512i mb0_3 = _mm512_shuffle_epi8(raw_desc0_3, shuf_msk);
 
 		mb0_3 = _mm512_add_epi32(mb0_3, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/**
 		 * to get packet types, ptype is located in bit16-25
 		 * of each 128bits
@@ -988,6 +930,7 @@
 			 0, 0, 0, type_table[ptype1],
 			 0, 0, 0, type_table[ptype0]);
 		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
+#endif
 
 		/**
 		 * use permute/extract to get status content
@@ -1007,28 +950,114 @@
 
 		/* now do flag manipulation */
 
-		/* get only flag/error bits we want */
-		const __m256i flag_bits =
-			_mm256_and_si256(status0_7, flags_mask);
-		/**
-		 * l3_l4_error flags, shuffle, then shift to correct adjustment
-		 * of flags in flags_shuf, and finally mask out extra bits
-		 */
-		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
-				_mm256_srli_epi32(flag_bits, 4));
-		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
-		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
-		/* set rss and vlan flags */
-		const __m256i rss_vlan_flag_bits =
-			_mm256_srli_epi32(flag_bits, 12);
-		const __m256i rss_vlan_flags =
-			_mm256_shuffle_epi8(rss_vlan_flags_shuf,
-					    rss_vlan_flag_bits);
-
 		/* merge flags */
-		__m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
-						     rss_vlan_flags);
+		__m256i mbuf_flags = _mm256_set1_epi32(0);
+
+		if (offload) {
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			/* Status/Error flag masks */
+			/**
+			 * mask everything except Checksum Reports, RSS indication
+			 * and VLAN indication.
+			 * bit6:4 for IP/L4 checksum errors.
+			 * bit12 is for RSS indication.
+			 * bit13 is for VLAN indication.
+			 */
+			const __m256i flags_mask =
+				_mm256_set1_epi32((7 << 4) | (1 << 12) | (1 << 13));
+#endif
+#ifdef IAVF_RX_CSUM_OFFLOAD
+			/**
+			 * data to be shuffled by the result of the flags mask shifted by 4
+			 * bits.  This gives use the l3_l4 flags.
+			 */
+			const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+					/* shift right 1 bit to make sure it not exceed 255 */
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+					 PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+					 PKT_RX_IP_CKSUM_GOOD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+					 PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+					 PKT_RX_IP_CKSUM_GOOD) >> 1,
+					(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
+					(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1,
+					/* second 128-bits */
+					0, 0, 0, 0, 0, 0, 0, 0,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+					 PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+					 PKT_RX_IP_CKSUM_GOOD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+					 PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+					 PKT_RX_IP_CKSUM_GOOD) >> 1,
+					(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
+					(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1);
+			const __m256i cksum_mask =
+				_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
+						  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
+						  PKT_RX_OUTER_IP_CKSUM_BAD);
+#endif
+#if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			/**
+			 * data to be shuffled by result of flag mask, shifted down 12.
+			 * If RSS(bit12)/VLAN(bit13) are set,
+			 * shuffle moves appropriate flags in place.
+			 */
+			const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
+					0, 0, 0, 0,
+					0, 0, 0, 0,
+					PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+					PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+					PKT_RX_RSS_HASH, 0,
+					/* end up 128-bits */
+					0, 0, 0, 0,
+					0, 0, 0, 0,
+					0, 0, 0, 0,
+					PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+					PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+					PKT_RX_RSS_HASH, 0);
+#endif
 
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			/* get only flag/error bits we want */
+			const __m256i flag_bits =
+				_mm256_and_si256(status0_7, flags_mask);
+#endif
+#ifdef IAVF_RX_CSUM_OFFLOAD
+			/**
+			 * l3_l4_error flags, shuffle, then shift to correct adjustment
+			 * of flags in flags_shuf, and finally mask out extra bits
+			 */
+			__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
+					_mm256_srli_epi32(flag_bits, 4));
+			l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
+			l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
+#endif
+#if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			/* set rss and vlan flags */
+			const __m256i rss_vlan_flag_bits =
+				_mm256_srli_epi32(flag_bits, 12);
+			const __m256i rss_vlan_flags =
+				_mm256_shuffle_epi8(rss_vlan_flags_shuf,
+						    rss_vlan_flag_bits);
+#endif
+
+#ifdef IAVF_RX_CSUM_OFFLOAD
+			mbuf_flags = _mm256_or_si256(mbuf_flags, l3_l4_flags);
+#endif
+#if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			mbuf_flags = _mm256_or_si256(mbuf_flags, rss_vlan_flags);
+#endif
+		}
+
+#ifdef IAVF_RX_FDIR_OFFLOAD
 		if (rxq->fdir_enabled) {
 			const __m512i fdir_permute_mask = _mm512_set_epi32
 				(0, 0, 0, 0,
@@ -1070,6 +1099,7 @@
 			rx_pkts[i + 7]->hash.fdir.hi =
 				_mm256_extract_epi32(fdir_id0_7, 4);
 		} /* if() on fdir_enabled */
+#endif
 
 		__m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
 		__m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
@@ -1077,93 +1107,97 @@
 		__m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
 
 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
-		/**
-		 * needs to load 2nd 16B of each desc for RSS hash parsing,
-		 * will cause performance drop to get into this context.
-		 */
-		if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads &
-		    DEV_RX_OFFLOAD_RSS_HASH) {
-			/* load bottom half of every 32B desc */
-			const __m128i raw_desc_bh7 =
-				_mm_load_si128
-					((void *)(&rxdp[7].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh6 =
-				_mm_load_si128
-					((void *)(&rxdp[6].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh5 =
-				_mm_load_si128
-					((void *)(&rxdp[5].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh4 =
-				_mm_load_si128
-					((void *)(&rxdp[4].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh3 =
-				_mm_load_si128
-					((void *)(&rxdp[3].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh2 =
-				_mm_load_si128
-					((void *)(&rxdp[2].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh1 =
-				_mm_load_si128
-					((void *)(&rxdp[1].wb.status_error1));
-			rte_compiler_barrier();
-			const __m128i raw_desc_bh0 =
-				_mm_load_si128
-					((void *)(&rxdp[0].wb.status_error1));
-
-			__m256i raw_desc_bh6_7 =
-				_mm256_inserti128_si256
-					(_mm256_castsi128_si256(raw_desc_bh6),
-					 raw_desc_bh7, 1);
-			__m256i raw_desc_bh4_5 =
-				_mm256_inserti128_si256
-					(_mm256_castsi128_si256(raw_desc_bh4),
-					 raw_desc_bh5, 1);
-			__m256i raw_desc_bh2_3 =
-				_mm256_inserti128_si256
-					(_mm256_castsi128_si256(raw_desc_bh2),
-					 raw_desc_bh3, 1);
-			__m256i raw_desc_bh0_1 =
-				_mm256_inserti128_si256
-					(_mm256_castsi128_si256(raw_desc_bh0),
-					 raw_desc_bh1, 1);
-
+		if (offload) {
+#ifdef IAVF_RX_RSS_OFFLOAD
 			/**
-			 * to shift the 32b RSS hash value to the
-			 * highest 32b of each 128b before mask
+			 * needs to load 2nd 16B of each desc for RSS hash parsing,
+			 * will cause performance drop to get into this context.
 			 */
-			__m256i rss_hash6_7 =
-				_mm256_slli_epi64(raw_desc_bh6_7, 32);
-			__m256i rss_hash4_5 =
-				_mm256_slli_epi64(raw_desc_bh4_5, 32);
-			__m256i rss_hash2_3 =
-				_mm256_slli_epi64(raw_desc_bh2_3, 32);
-			__m256i rss_hash0_1 =
-				_mm256_slli_epi64(raw_desc_bh0_1, 32);
-
-			__m256i rss_hash_msk =
-				_mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
-						 0xFFFFFFFF, 0, 0, 0);
-
-			rss_hash6_7 = _mm256_and_si256
-					(rss_hash6_7, rss_hash_msk);
-			rss_hash4_5 = _mm256_and_si256
-					(rss_hash4_5, rss_hash_msk);
-			rss_hash2_3 = _mm256_and_si256
-					(rss_hash2_3, rss_hash_msk);
-			rss_hash0_1 = _mm256_and_si256
-					(rss_hash0_1, rss_hash_msk);
-
-			mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
-			mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
-			mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
-			mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
-		} /* if() on RSS hash parsing */
+			if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads &
+			    DEV_RX_OFFLOAD_RSS_HASH) {
+				/* load bottom half of every 32B desc */
+				const __m128i raw_desc_bh7 =
+					_mm_load_si128
+						((void *)(&rxdp[7].wb.status_error1));
+				rte_compiler_barrier();
+				const __m128i raw_desc_bh6 =
+					_mm_load_si128
+						((void *)(&rxdp[6].wb.status_error1));
+				rte_compiler_barrier();
+				const __m128i raw_desc_bh5 =
+					_mm_load_si128
+						((void *)(&rxdp[5].wb.status_error1));
+				rte_compiler_barrier();
+				const __m128i raw_desc_bh4 =
+					_mm_load_si128
+						((void *)(&rxdp[4].wb.status_error1));
+				rte_compiler_barrier();
+				const __m128i raw_desc_bh3 =
+					_mm_load_si128
+						((void *)(&rxdp[3].wb.status_error1));
+				rte_compiler_barrier();
+				const __m128i raw_desc_bh2 =
+					_mm_load_si128
+						((void *)(&rxdp[2].wb.status_error1));
+				rte_compiler_barrier();
+				const __m128i raw_desc_bh1 =
+					_mm_load_si128
+						((void *)(&rxdp[1].wb.status_error1));
+				rte_compiler_barrier();
+				const __m128i raw_desc_bh0 =
+					_mm_load_si128
+						((void *)(&rxdp[0].wb.status_error1));
+
+				__m256i raw_desc_bh6_7 =
+					_mm256_inserti128_si256
+						(_mm256_castsi128_si256(raw_desc_bh6),
+						 raw_desc_bh7, 1);
+				__m256i raw_desc_bh4_5 =
+					_mm256_inserti128_si256
+						(_mm256_castsi128_si256(raw_desc_bh4),
+						 raw_desc_bh5, 1);
+				__m256i raw_desc_bh2_3 =
+					_mm256_inserti128_si256
+						(_mm256_castsi128_si256(raw_desc_bh2),
+						 raw_desc_bh3, 1);
+				__m256i raw_desc_bh0_1 =
+					_mm256_inserti128_si256
+						(_mm256_castsi128_si256(raw_desc_bh0),
+						 raw_desc_bh1, 1);
+
+				/**
+				 * to shift the 32b RSS hash value to the
+				 * highest 32b of each 128b before mask
+				 */
+				__m256i rss_hash6_7 =
+					_mm256_slli_epi64(raw_desc_bh6_7, 32);
+				__m256i rss_hash4_5 =
+					_mm256_slli_epi64(raw_desc_bh4_5, 32);
+				__m256i rss_hash2_3 =
+					_mm256_slli_epi64(raw_desc_bh2_3, 32);
+				__m256i rss_hash0_1 =
+					_mm256_slli_epi64(raw_desc_bh0_1, 32);
+
+				__m256i rss_hash_msk =
+					_mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
+							 0xFFFFFFFF, 0, 0, 0);
+
+				rss_hash6_7 = _mm256_and_si256
+						(rss_hash6_7, rss_hash_msk);
+				rss_hash4_5 = _mm256_and_si256
+						(rss_hash4_5, rss_hash_msk);
+				rss_hash2_3 = _mm256_and_si256
+						(rss_hash2_3, rss_hash_msk);
+				rss_hash0_1 = _mm256_and_si256
+						(rss_hash0_1, rss_hash_msk);
+
+				mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
+				mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
+				mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
+				mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
+			} /* if() on RSS hash parsing */
+#endif
+		}
 #endif
 
 		/**
@@ -1330,7 +1364,7 @@
 				   uint16_t nb_pkts)
 {
 	return _iavf_recv_raw_pkts_vec_avx512_flex_rxd(rx_queue, rx_pkts,
-						       nb_pkts, NULL);
+						       nb_pkts, NULL, false);
 }
 
 /**
@@ -1412,17 +1446,18 @@
  * Notice:
  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  */
-static uint16_t
+static __rte_always_inline uint16_t
 iavf_recv_scattered_burst_vec_avx512_flex_rxd(void *rx_queue,
 					      struct rte_mbuf **rx_pkts,
-					      uint16_t nb_pkts)
+					      uint16_t nb_pkts,
+					      bool offload)
 {
 	struct iavf_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 
 	/* get some new buffers */
 	uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx512_flex_rxd(rxq,
-					rx_pkts, nb_pkts, split_flags);
+					rx_pkts, nb_pkts, split_flags, offload);
 	if (nb_bufs == 0)
 		return 0;
 
@@ -1455,10 +1490,11 @@
  * Notice:
  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  */
-uint16_t
-iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
-					     struct rte_mbuf **rx_pkts,
-					     uint16_t nb_pkts)
+static __rte_always_inline uint16_t
+iavf_recv_scattered_pkts_vec_avx512_flex_rxd_cmn(void *rx_queue,
+						 struct rte_mbuf **rx_pkts,
+						 uint16_t nb_pkts,
+						 bool offload)
 {
 	uint16_t retval = 0;
 
@@ -1466,14 +1502,25 @@
 		uint16_t burst =
 			iavf_recv_scattered_burst_vec_avx512_flex_rxd
 				(rx_queue, rx_pkts + retval,
-				 IAVF_VPMD_RX_MAX_BURST);
+				 IAVF_VPMD_RX_MAX_BURST, offload);
 		retval += burst;
 		nb_pkts -= burst;
 		if (burst < IAVF_VPMD_RX_MAX_BURST)
 			return retval;
 	}
 	return retval + iavf_recv_scattered_burst_vec_avx512_flex_rxd(rx_queue,
-				rx_pkts + retval, nb_pkts);
+				rx_pkts + retval, nb_pkts, offload);
+}
+
+uint16_t
+iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
+					     struct rte_mbuf **rx_pkts,
+					     uint16_t nb_pkts)
+{
+	return iavf_recv_scattered_pkts_vec_avx512_flex_rxd_cmn(rx_queue,
+								rx_pkts,
+								nb_pkts,
+								false);
 }
 
 uint16_t
@@ -1493,6 +1540,29 @@
 						       nb_pkts, true);
 }
 
+uint16_t
+iavf_recv_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+					   struct rte_mbuf **rx_pkts,
+					   uint16_t nb_pkts)
+{
+	return _iavf_recv_raw_pkts_vec_avx512_flex_rxd(rx_queue,
+						       rx_pkts,
+						       nb_pkts,
+						       NULL,
+						       true);
+}
+
+uint16_t
+iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+						     struct rte_mbuf **rx_pkts,
+						     uint16_t nb_pkts)
+{
+	return iavf_recv_scattered_pkts_vec_avx512_flex_rxd_cmn(rx_queue,
+								rx_pkts,
+								nb_pkts,
+								true);
+}
+
 static __rte_always_inline int
 iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 {
-- 
1.9.3


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v5 0/4] add Rx/Tx offload paths for IAVF AVX512
  2021-04-14  7:34     ` [dpdk-dev] [PATCH v5 " Wenzhuo Lu
                         ` (3 preceding siblings ...)
  2021-04-14  7:34       ` [dpdk-dev] [PATCH v5 4/4] net/iavf: add offload path for Rx AVX512 flex desc Wenzhuo Lu
@ 2021-04-14 12:53       ` Zhang, Qi Z
  4 siblings, 0 replies; 30+ messages in thread
From: Zhang, Qi Z @ 2021-04-14 12:53 UTC (permalink / raw)
  To: Lu, Wenzhuo, dev; +Cc: Lu, Wenzhuo



> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Wenzhuo Lu
> Sent: Wednesday, April 14, 2021 3:34 PM
> To: dev@dpdk.org
> Cc: Lu, Wenzhuo <wenzhuo.lu@intel.com>
> Subject: [dpdk-dev] [PATCH v5 0/4] add Rx/Tx offload paths for IAVF AVX512
> 
> Add specific paths for RX/TX AVX512, called offload paths.
> In these paths, support the HW offload features, like, checksum, VLAN, RSS
> offload.
> These paths are chosen automatically according to the configuration.
> 
> v2:
>  - Fixed compile error.
> 
> v3:
>  - Used 'inline' to drop the duplicate code.
>  - some minor change.
> 
> v4:
>  - Rebased on next-net-intel.
> 
> v5:
>  - Minor change of release note.
> 
> Wenzhuo Lu (4):
>   net/iavf: store offload flag of Rx queue
>   net/iavf: add offload path for Tx AVX512
>   net/iavf: add offload path for Rx AVX512
>   net/iavf: add offload path for Rx AVX512 flex desc
> 
>  doc/guides/rel_notes/release_21_05.rst  |   5 +
>  drivers/net/iavf/iavf_rxtx.c            | 187 +++++--
>  drivers/net/iavf/iavf_rxtx.h            |  33 +-
>  drivers/net/iavf/iavf_rxtx_vec_avx512.c | 899
> +++++++++++++++++++-------------  drivers/net/iavf/iavf_rxtx_vec_common.h |
> 115 +++-
>  5 files changed, 817 insertions(+), 422 deletions(-)
> 
> --
> 1.9.3

Applied to dpdk-next-net-intel.

Thanks
Qi

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2021-04-14 12:53 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-17  6:48 [dpdk-dev] [PATCH 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
2021-03-17  6:48 ` [dpdk-dev] [PATCH 1/4] net/iavf: store offload flag of Rx queue Wenzhuo Lu
2021-03-17  6:48 ` [dpdk-dev] [PATCH 2/4] net/iavf: add offload path for Tx AVX512 Wenzhuo Lu
2021-03-17  6:48 ` [dpdk-dev] [PATCH 3/4] net/iavf: add offload path for Rx AVX512 Wenzhuo Lu
2021-03-17  6:48 ` [dpdk-dev] [PATCH 4/4] net/iavf: add offload path for Rx AVX512 flex desc Wenzhuo Lu
2021-03-18  5:24 ` [dpdk-dev] [PATCH v2 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
2021-03-18  5:24   ` [dpdk-dev] [PATCH v2 1/4] net/iavf: store offload flag of Rx queue Wenzhuo Lu
2021-03-18  5:24   ` [dpdk-dev] [PATCH v2 2/4] net/iavf: add offload path for Tx AVX512 Wenzhuo Lu
2021-03-18  5:24   ` [dpdk-dev] [PATCH v2 3/4] net/iavf: add offload path for Rx AVX512 Wenzhuo Lu
2021-03-18  5:24   ` [dpdk-dev] [PATCH v2 4/4] net/iavf: add offload path for Rx AVX512 flex desc Wenzhuo Lu
2021-03-26  1:31 ` [dpdk-dev] [PATCH v3 0/4] add Rx/Tx offload paths for IAVF AVX512 Wenzhuo Lu
2021-03-26  1:31   ` [dpdk-dev] [PATCH v3 1/4] net/iavf: store offload flag of Rx queue Wenzhuo Lu
2021-03-26  1:32   ` [dpdk-dev] [PATCH v3 2/4] net/iavf: add offload path for Tx AVX512 Wenzhuo Lu
2021-03-26  1:32   ` [dpdk-dev] [PATCH v3 3/4] net/iavf: add offload path for Rx AVX512 Wenzhuo Lu
2021-03-26  1:32   ` [dpdk-dev] [PATCH v3 4/4] net/iavf: add offload path for Rx AVX512 flex desc Wenzhuo Lu
     [not found]   ` <DM5PR11MB1787A1E056D50128FDDDDA5D8C749@DM5PR11MB1787.namprd11.prod.outlook.com>
2021-04-08  8:44     ` [dpdk-dev] [PATCH v3 0/4] add Rx/Tx offload paths for IAVF AVX512 Rong, Leyi
2021-04-09  5:59   ` [dpdk-dev] [PATCH v4 " Wenzhuo Lu
2021-04-09  5:59     ` [dpdk-dev] [PATCH v4 1/4] net/iavf: store offload flag of Rx queue Wenzhuo Lu
2021-04-09  5:59     ` [dpdk-dev] [PATCH v4 2/4] net/iavf: add offload path for Tx AVX512 Wenzhuo Lu
2021-04-09  5:59     ` [dpdk-dev] [PATCH v4 3/4] net/iavf: add offload path for Rx AVX512 Wenzhuo Lu
2021-04-09  5:59     ` [dpdk-dev] [PATCH v4 4/4] net/iavf: add offload path for Rx AVX512 flex desc Wenzhuo Lu
2021-04-13 12:45       ` Ferruh Yigit
2021-04-14  1:22         ` Lu, Wenzhuo
2021-04-09  7:44     ` [dpdk-dev] [PATCH v4 0/4] add Rx/Tx offload paths for IAVF AVX512 Zhang, Qi Z
2021-04-14  7:34     ` [dpdk-dev] [PATCH v5 " Wenzhuo Lu
2021-04-14  7:34       ` [dpdk-dev] [PATCH v5 1/4] net/iavf: store offload flag of Rx queue Wenzhuo Lu
2021-04-14  7:34       ` [dpdk-dev] [PATCH v5 2/4] net/iavf: add offload path for Tx AVX512 Wenzhuo Lu
2021-04-14  7:34       ` [dpdk-dev] [PATCH v5 3/4] net/iavf: add offload path for Rx AVX512 Wenzhuo Lu
2021-04-14  7:34       ` [dpdk-dev] [PATCH v5 4/4] net/iavf: add offload path for Rx AVX512 flex desc Wenzhuo Lu
2021-04-14 12:53       ` [dpdk-dev] [PATCH v5 0/4] add Rx/Tx offload paths for IAVF AVX512 Zhang, Qi Z

DPDK patches and discussions

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://inbox.dpdk.org/dev/0 dev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 dev dev/ https://inbox.dpdk.org/dev \
		dev@dpdk.org
	public-inbox-index dev

Example config snippet for mirrors.
Newsgroup available over NNTP:
	nntp://inbox.dpdk.org/inbox.dpdk.dev


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git