DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH 0/3] AVX512 vPMD on i40e
@ 2020-12-15  2:19 Leyi Rong
  2020-12-15  2:19 ` [dpdk-dev] [PATCH 1/3] net/i40e: remove devarg use-latest-supported-vec Leyi Rong
                   ` (4 more replies)
  0 siblings, 5 replies; 42+ messages in thread
From: Leyi Rong @ 2020-12-15  2:19 UTC (permalink / raw)
  To: qi.z.zhang, bruce.richardson, beilei.xing; +Cc: dev, Leyi Rong

This patchset aims to support the AVX512 vPMD on i40e.
The changes only target the AVX512 vector path.

Leyi Rong (3):
  net/i40e: remove devarg use-latest-supported-vec
  net/i40e: add AVX512 vector path
  net/i40e: optimize Tx by using AVX512

 doc/guides/nics/i40e.rst                |    9 -
 drivers/net/i40e/i40e_ethdev.c          |   63 +-
 drivers/net/i40e/i40e_ethdev.h          |    3 -
 drivers/net/i40e/i40e_rxtx.c            |  193 ++--
 drivers/net/i40e/i40e_rxtx.h            |   13 +
 drivers/net/i40e/i40e_rxtx_vec_avx512.c | 1130 +++++++++++++++++++++++
 drivers/net/i40e/meson.build            |   24 +
 7 files changed, 1287 insertions(+), 148 deletions(-)
 create mode 100644 drivers/net/i40e/i40e_rxtx_vec_avx512.c

-- 
2.17.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [PATCH 1/3] net/i40e: remove devarg use-latest-supported-vec
  2020-12-15  2:19 [dpdk-dev] [PATCH 0/3] AVX512 vPMD on i40e Leyi Rong
@ 2020-12-15  2:19 ` Leyi Rong
  2020-12-15  2:19 ` [dpdk-dev] [PATCH 2/3] net/i40e: add AVX512 vector path Leyi Rong
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 42+ messages in thread
From: Leyi Rong @ 2020-12-15  2:19 UTC (permalink / raw)
  To: qi.z.zhang, bruce.richardson, beilei.xing; +Cc: dev, Leyi Rong

As the EAL parameter --force-max-simd-bitwidth has already been introduced,
remove support for the devarg use-latest-supported-vec to make the
selection of the Rx/Tx functions clearer.

Signed-off-by: Leyi Rong <leyi.rong@intel.com>
---
 doc/guides/nics/i40e.rst       |   9 ---
 drivers/net/i40e/i40e_ethdev.c |  63 +------------------
 drivers/net/i40e/i40e_ethdev.h |   3 -
 drivers/net/i40e/i40e_rxtx.c   | 107 +++++++++++----------------------
 4 files changed, 35 insertions(+), 147 deletions(-)

diff --git a/doc/guides/nics/i40e.rst b/doc/guides/nics/i40e.rst
index 4e5c4679b8..90fb8a4d6f 100644
--- a/doc/guides/nics/i40e.rst
+++ b/doc/guides/nics/i40e.rst
@@ -209,15 +209,6 @@ Runtime Config Options
   Currently hot-plugging of representor ports is not supported so all required
   representors must be specified on the creation of the PF.
 
-- ``Use latest supported vector`` (default ``disable``)
-
-  Latest supported vector path may not always get the best perf so vector path was
-  recommended to use only on later platform. But users may want the latest vector path
-  since it can get better perf in some real work loading cases. So ``devargs`` param
-  ``use-latest-supported-vec`` is introduced, for example::
-
-  -a 84:00.0,use-latest-supported-vec=1
-
 - ``Enable validation for VF message`` (default ``not enabled``)
 
   The PF counts messages from each VF. If in any period of seconds the message
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index f54769c29d..223eb9950b 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -44,7 +44,6 @@
 #define ETH_I40E_FLOATING_VEB_LIST_ARG	"floating_veb_list"
 #define ETH_I40E_SUPPORT_MULTI_DRIVER	"support-multi-driver"
 #define ETH_I40E_QUEUE_NUM_PER_VF_ARG	"queue-num-per-vf"
-#define ETH_I40E_USE_LATEST_VEC	"use-latest-supported-vec"
 #define ETH_I40E_VF_MSG_CFG		"vf_msg_cfg"
 
 #define I40E_CLEAR_PXE_WAIT_MS     200
@@ -403,7 +402,6 @@ static const char *const valid_keys[] = {
 	ETH_I40E_FLOATING_VEB_LIST_ARG,
 	ETH_I40E_SUPPORT_MULTI_DRIVER,
 	ETH_I40E_QUEUE_NUM_PER_VF_ARG,
-	ETH_I40E_USE_LATEST_VEC,
 	ETH_I40E_VF_MSG_CFG,
 	NULL};
 
@@ -1301,62 +1299,6 @@ i40e_aq_debug_write_global_register(struct i40e_hw *hw,
 	return i40e_aq_debug_write_register(hw, reg_addr, reg_val, cmd_details);
 }
 
-static int
-i40e_parse_latest_vec_handler(__rte_unused const char *key,
-				const char *value,
-				void *opaque)
-{
-	struct i40e_adapter *ad = opaque;
-	int use_latest_vec;
-
-	use_latest_vec = atoi(value);
-
-	if (use_latest_vec != 0 && use_latest_vec != 1)
-		PMD_DRV_LOG(WARNING, "Value should be 0 or 1, set it as 1!");
-
-	ad->use_latest_vec = (uint8_t)use_latest_vec;
-
-	return 0;
-}
-
-static int
-i40e_use_latest_vec(struct rte_eth_dev *dev)
-{
-	struct i40e_adapter *ad =
-		I40E_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
-	struct rte_kvargs *kvlist;
-	int kvargs_count;
-
-	ad->use_latest_vec = false;
-
-	if (!dev->device->devargs)
-		return 0;
-
-	kvlist = rte_kvargs_parse(dev->device->devargs->args, valid_keys);
-	if (!kvlist)
-		return -EINVAL;
-
-	kvargs_count = rte_kvargs_count(kvlist, ETH_I40E_USE_LATEST_VEC);
-	if (!kvargs_count) {
-		rte_kvargs_free(kvlist);
-		return 0;
-	}
-
-	if (kvargs_count > 1)
-		PMD_DRV_LOG(WARNING, "More than one argument \"%s\" and only "
-			    "the first invalid or last valid one is used !",
-			    ETH_I40E_USE_LATEST_VEC);
-
-	if (rte_kvargs_process(kvlist, ETH_I40E_USE_LATEST_VEC,
-				i40e_parse_latest_vec_handler, ad) < 0) {
-		rte_kvargs_free(kvlist);
-		return -EINVAL;
-	}
-
-	rte_kvargs_free(kvlist);
-	return 0;
-}
-
 static int
 read_vf_msg_config(__rte_unused const char *key,
 			       const char *value,
@@ -1507,8 +1449,6 @@ eth_i40e_dev_init(struct rte_eth_dev *dev, void *init_params __rte_unused)
 	i40e_parse_vf_msg_config(dev, &pf->vf_msg_cfg);
 	/* Check if need to support multi-driver */
 	i40e_support_multi_driver(dev);
-	/* Check if users want the latest supported vec path */
-	i40e_use_latest_vec(dev);
 
 	/* Make sure all is clean before doing PF reset */
 	i40e_clear_hw(hw);
@@ -13010,5 +12950,4 @@ RTE_PMD_REGISTER_PARAM_STRING(net_i40e,
 			      ETH_I40E_FLOATING_VEB_ARG "=1"
 			      ETH_I40E_FLOATING_VEB_LIST_ARG "=<string>"
 			      ETH_I40E_QUEUE_NUM_PER_VF_ARG "=1|2|4|8|16"
-			      ETH_I40E_SUPPORT_MULTI_DRIVER "=1"
-			      ETH_I40E_USE_LATEST_VEC "=0|1");
+			      ETH_I40E_SUPPORT_MULTI_DRIVER "=1");
diff --git a/drivers/net/i40e/i40e_ethdev.h b/drivers/net/i40e/i40e_ethdev.h
index 696c5aaf7e..70e6ba610b 100644
--- a/drivers/net/i40e/i40e_ethdev.h
+++ b/drivers/net/i40e/i40e_ethdev.h
@@ -1285,9 +1285,6 @@ struct i40e_adapter {
 	uint64_t flow_types_mask;
 	uint64_t pctypes_mask;
 
-	/* For devargs */
-	uint8_t use_latest_vec;
-
 	/* For RSS reta table update */
 	uint8_t rss_reta_updated;
 };
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 5df9a9df56..2910619fa5 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -3095,43 +3095,13 @@ i40e_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
 	qinfo->conf.offloads = txq->offloads;
 }
 
-static eth_rx_burst_t
-i40e_get_latest_rx_vec(bool scatter)
-{
-#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-		return scatter ? i40e_recv_scattered_pkts_vec_avx2 :
-				 i40e_recv_pkts_vec_avx2;
-#endif
-	return scatter ? i40e_recv_scattered_pkts_vec :
-			 i40e_recv_pkts_vec;
-}
-
-static eth_rx_burst_t
-i40e_get_recommend_rx_vec(bool scatter)
-{
-#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT)
-	/*
-	 * since AVX frequency can be different to base frequency, limit
-	 * use of AVX2 version to later plaforms, not all those that could
-	 * theoretically run it.
-	 */
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-		return scatter ? i40e_recv_scattered_pkts_vec_avx2 :
-				 i40e_recv_pkts_vec_avx2;
-#endif
-	return scatter ? i40e_recv_scattered_pkts_vec :
-			 i40e_recv_pkts_vec;
-}
-
 void __rte_cold
 i40e_set_rx_function(struct rte_eth_dev *dev)
 {
 	struct i40e_adapter *ad =
 		I40E_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	uint16_t rx_using_sse, i;
+	bool use_avx2 = false;
 	/* In order to allow Vector Rx there are a few configuration
 	 * conditions to be met and Rx Bulk Allocation should be allowed.
 	 */
@@ -3154,20 +3124,33 @@ i40e_set_rx_function(struct rte_eth_dev *dev)
 					break;
 				}
 			}
+
+			if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+			     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+					rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+				use_avx2 = true;
 		}
 	}
 
 	if (ad->rx_vec_allowed  &&
 			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-		/* Vec Rx path */
-		PMD_INIT_LOG(DEBUG, "Vector Rx path will be used on port=%d.",
+		if (dev->data->scattered_rx) {
+			PMD_INIT_LOG(DEBUG,
+				"Using %sVector Scattered Rx (port %d).",
+				use_avx2 ? "avx2 " : "",
 				dev->data->port_id);
-		if (ad->use_latest_vec)
-			dev->rx_pkt_burst =
-			i40e_get_latest_rx_vec(dev->data->scattered_rx);
-		else
-			dev->rx_pkt_burst =
-			i40e_get_recommend_rx_vec(dev->data->scattered_rx);
+			dev->rx_pkt_burst = use_avx2 ?
+				i40e_recv_scattered_pkts_vec_avx2 :
+				i40e_recv_scattered_pkts_vec;
+		} else {
+			PMD_INIT_LOG(DEBUG,
+				"Using %sVector Rx (port %d).",
+				use_avx2 ? "avx2 " : "",
+				dev->data->port_id);
+			dev->rx_pkt_burst = use_avx2 ?
+				i40e_recv_pkts_vec_avx2 :
+				i40e_recv_pkts_vec;
+		}
 	} else if (!dev->data->scattered_rx && ad->rx_bulk_alloc_allowed) {
 		PMD_INIT_LOG(DEBUG, "Rx Burst Bulk Alloc Preconditions are "
 				    "satisfied. Rx Burst Bulk Alloc function "
@@ -3268,39 +3251,13 @@ i40e_set_tx_function_flag(struct rte_eth_dev *dev, struct i40e_tx_queue *txq)
 				txq->queue_id);
 }
 
-static eth_tx_burst_t
-i40e_get_latest_tx_vec(void)
-{
-#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-		return i40e_xmit_pkts_vec_avx2;
-#endif
-	return i40e_xmit_pkts_vec;
-}
-
-static eth_tx_burst_t
-i40e_get_recommend_tx_vec(void)
-{
-#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT)
-	/*
-	 * since AVX frequency can be different to base frequency, limit
-	 * use of AVX2 version to later plaforms, not all those that could
-	 * theoretically run it.
-	 */
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-		return i40e_xmit_pkts_vec_avx2;
-#endif
-	return i40e_xmit_pkts_vec;
-}
-
 void __rte_cold
 i40e_set_tx_function(struct rte_eth_dev *dev)
 {
 	struct i40e_adapter *ad =
 		I40E_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	int i;
+	bool use_avx2 = false;
 
 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
 		if (ad->tx_vec_allowed) {
@@ -3313,19 +3270,23 @@ i40e_set_tx_function(struct rte_eth_dev *dev)
 					break;
 				}
 			}
+
+			if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+			     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+					rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+				use_avx2 = true;
 		}
 	}
 
 	if (ad->tx_simple_allowed) {
 		if (ad->tx_vec_allowed &&
 				rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-			PMD_INIT_LOG(DEBUG, "Vector tx finally be used.");
-			if (ad->use_latest_vec)
-				dev->tx_pkt_burst =
-					i40e_get_latest_tx_vec();
-			else
-				dev->tx_pkt_burst =
-					i40e_get_recommend_tx_vec();
+			PMD_INIT_LOG(DEBUG, "Using %sVector Tx (port %d).",
+				     use_avx2 ? "avx2 " : "",
+				     dev->data->port_id);
+			dev->tx_pkt_burst = use_avx2 ?
+					    i40e_xmit_pkts_vec_avx2 :
+					    i40e_xmit_pkts_vec;
 		} else {
 			PMD_INIT_LOG(DEBUG, "Simple tx finally be used.");
 			dev->tx_pkt_burst = i40e_xmit_pkts_simple;
-- 
2.17.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [PATCH 2/3] net/i40e: add AVX512 vector path
  2020-12-15  2:19 [dpdk-dev] [PATCH 0/3] AVX512 vPMD on i40e Leyi Rong
  2020-12-15  2:19 ` [dpdk-dev] [PATCH 1/3] net/i40e: remove devarg use-latest-supported-vec Leyi Rong
@ 2020-12-15  2:19 ` Leyi Rong
  2020-12-15  2:19 ` [dpdk-dev] [PATCH 3/3] net/i40e: optimize Tx by using AVX512 Leyi Rong
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 42+ messages in thread
From: Leyi Rong @ 2020-12-15  2:19 UTC (permalink / raw)
  To: qi.z.zhang, bruce.richardson, beilei.xing; +Cc: dev, Leyi Rong

Add AVX512 support for i40e PMD. This patch adds i40e_rxtx_vec_avx512.c
to support i40e AVX512 vPMD.

This patch aims to enable AVX512 on the i40e vPMD. Compared with the
AVX2 vPMD, the main changes are focused on the Rx path.

Signed-off-by: Leyi Rong <leyi.rong@intel.com>
Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/net/i40e/i40e_rxtx.c            |  117 ++-
 drivers/net/i40e/i40e_rxtx.h            |    9 +
 drivers/net/i40e/i40e_rxtx_vec_avx512.c | 1024 +++++++++++++++++++++++
 drivers/net/i40e/meson.build            |   24 +
 4 files changed, 1148 insertions(+), 26 deletions(-)
 create mode 100644 drivers/net/i40e/i40e_rxtx_vec_avx512.c

diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 2910619fa5..8357fb3ef8 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -1742,6 +1742,10 @@ i40e_dev_supported_ptypes_get(struct rte_eth_dev *dev)
 	    dev->rx_pkt_burst == i40e_recv_scattered_pkts ||
 	    dev->rx_pkt_burst == i40e_recv_scattered_pkts_vec ||
 	    dev->rx_pkt_burst == i40e_recv_pkts_vec ||
+#ifdef CC_AVX512_SUPPORT
+	    dev->rx_pkt_burst == i40e_recv_scattered_pkts_vec_avx512 ||
+	    dev->rx_pkt_burst == i40e_recv_pkts_vec_avx512 ||
+#endif
 	    dev->rx_pkt_burst == i40e_recv_scattered_pkts_vec_avx2 ||
 	    dev->rx_pkt_burst == i40e_recv_pkts_vec_avx2)
 		return ptypes;
@@ -3102,6 +3106,7 @@ i40e_set_rx_function(struct rte_eth_dev *dev)
 		I40E_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	uint16_t rx_using_sse, i;
 	bool use_avx2 = false;
+	bool use_avx512 = false;
 	/* In order to allow Vector Rx there are a few configuration
 	 * conditions to be met and Rx Bulk Allocation should be allowed.
 	 */
@@ -3125,9 +3130,19 @@ i40e_set_rx_function(struct rte_eth_dev *dev)
 				}
 			}
 
-			if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-			     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-					rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+			if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512 &&
+			rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
+			rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1)
+#ifdef CC_AVX512_SUPPORT
+				use_avx512 = true;
+#else
+				PMD_DRV_LOG(NOTICE,
+					"AVX512 is not supported in build env");
+#endif
+			if (!use_avx512 &&
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+			rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
 				use_avx2 = true;
 		}
 	}
@@ -3135,21 +3150,41 @@ i40e_set_rx_function(struct rte_eth_dev *dev)
 	if (ad->rx_vec_allowed  &&
 			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
 		if (dev->data->scattered_rx) {
-			PMD_INIT_LOG(DEBUG,
-				"Using %sVector Scattered Rx (port %d).",
-				use_avx2 ? "avx2 " : "",
-				dev->data->port_id);
-			dev->rx_pkt_burst = use_avx2 ?
-				i40e_recv_scattered_pkts_vec_avx2 :
-				i40e_recv_scattered_pkts_vec;
+			if (use_avx512) {
+#ifdef CC_AVX512_SUPPORT
+				PMD_DRV_LOG(NOTICE,
+					"Using AVX512 Vector Scattered Rx (port %d).",
+					dev->data->port_id);
+				dev->rx_pkt_burst =
+					i40e_recv_scattered_pkts_vec_avx512;
+#endif
+			} else {
+				PMD_INIT_LOG(DEBUG,
+					"Using %sVector Scattered Rx (port %d).",
+					use_avx2 ? "avx2 " : "",
+					dev->data->port_id);
+				dev->rx_pkt_burst = use_avx2 ?
+					i40e_recv_scattered_pkts_vec_avx2 :
+					i40e_recv_scattered_pkts_vec;
+			}
 		} else {
-			PMD_INIT_LOG(DEBUG,
-				"Using %sVector Rx (port %d).",
-				use_avx2 ? "avx2 " : "",
-				dev->data->port_id);
-			dev->rx_pkt_burst = use_avx2 ?
-				i40e_recv_pkts_vec_avx2 :
-				i40e_recv_pkts_vec;
+			if (use_avx512) {
+#ifdef CC_AVX512_SUPPORT
+				PMD_DRV_LOG(NOTICE,
+					"Using AVX512 Vector Rx (port %d).",
+					dev->data->port_id);
+				dev->rx_pkt_burst =
+					i40e_recv_pkts_vec_avx512;
+#endif
+			} else {
+				PMD_INIT_LOG(DEBUG,
+					"Using %sVector Rx (port %d).",
+					use_avx2 ? "avx2 " : "",
+					dev->data->port_id);
+				dev->rx_pkt_burst = use_avx2 ?
+					i40e_recv_pkts_vec_avx2 :
+					i40e_recv_pkts_vec;
+			}
 		}
 	} else if (!dev->data->scattered_rx && ad->rx_bulk_alloc_allowed) {
 		PMD_INIT_LOG(DEBUG, "Rx Burst Bulk Alloc Preconditions are "
@@ -3172,6 +3207,10 @@ i40e_set_rx_function(struct rte_eth_dev *dev)
 		rx_using_sse =
 			(dev->rx_pkt_burst == i40e_recv_scattered_pkts_vec ||
 			 dev->rx_pkt_burst == i40e_recv_pkts_vec ||
+#ifdef CC_AVX512_SUPPORT
+			 dev->rx_pkt_burst == i40e_recv_scattered_pkts_vec_avx512 ||
+			 dev->rx_pkt_burst == i40e_recv_pkts_vec_avx512 ||
+#endif
 			 dev->rx_pkt_burst == i40e_recv_scattered_pkts_vec_avx2 ||
 			 dev->rx_pkt_burst == i40e_recv_pkts_vec_avx2);
 
@@ -3192,6 +3231,10 @@ static const struct {
 	{ i40e_recv_pkts_bulk_alloc,         "Scalar Bulk Alloc" },
 	{ i40e_recv_pkts,                    "Scalar" },
 #ifdef RTE_ARCH_X86
+#ifdef CC_AVX512_SUPPORT
+	{ i40e_recv_scattered_pkts_vec_avx512, "Vector AVX512 Scattered" },
+	{ i40e_recv_pkts_vec_avx512,           "Vector AVX512" },
+#endif
 	{ i40e_recv_scattered_pkts_vec_avx2, "Vector AVX2 Scattered" },
 	{ i40e_recv_pkts_vec_avx2,           "Vector AVX2" },
 	{ i40e_recv_scattered_pkts_vec,      "Vector SSE Scattered" },
@@ -3258,6 +3301,7 @@ i40e_set_tx_function(struct rte_eth_dev *dev)
 		I40E_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	int i;
 	bool use_avx2 = false;
+	bool use_avx512 = false;
 
 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
 		if (ad->tx_vec_allowed) {
@@ -3271,9 +3315,19 @@ i40e_set_tx_function(struct rte_eth_dev *dev)
 				}
 			}
 
-			if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-			     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-					rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+			if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512 &&
+			rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
+			rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1)
+#ifdef CC_AVX512_SUPPORT
+				use_avx512 = true;
+#else
+			PMD_DRV_LOG(NOTICE,
+				"AVX512 is not supported in build env");
+#endif
+			if (!use_avx512 &&
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+			rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
 				use_avx2 = true;
 		}
 	}
@@ -3281,12 +3335,20 @@ i40e_set_tx_function(struct rte_eth_dev *dev)
 	if (ad->tx_simple_allowed) {
 		if (ad->tx_vec_allowed &&
 				rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-			PMD_INIT_LOG(DEBUG, "Using %sVector Tx (port %d).",
-				     use_avx2 ? "avx2 " : "",
-				     dev->data->port_id);
-			dev->tx_pkt_burst = use_avx2 ?
-					    i40e_xmit_pkts_vec_avx2 :
-					    i40e_xmit_pkts_vec;
+			if (use_avx512) {
+#ifdef CC_AVX512_SUPPORT
+				PMD_DRV_LOG(NOTICE, "Using AVX512 Vector Tx (port %d).",
+					    dev->data->port_id);
+				dev->tx_pkt_burst = i40e_xmit_pkts_vec_avx512;
+#endif
+			} else {
+				PMD_INIT_LOG(DEBUG, "Using %sVector Tx (port %d).",
+					     use_avx2 ? "avx2 " : "",
+					     dev->data->port_id);
+				dev->tx_pkt_burst = use_avx2 ?
+						    i40e_xmit_pkts_vec_avx2 :
+						    i40e_xmit_pkts_vec;
+			}
 		} else {
 			PMD_INIT_LOG(DEBUG, "Simple tx finally be used.");
 			dev->tx_pkt_burst = i40e_xmit_pkts_simple;
@@ -3306,6 +3368,9 @@ static const struct {
 	{ i40e_xmit_pkts_simple,   "Scalar Simple" },
 	{ i40e_xmit_pkts,          "Scalar" },
 #ifdef RTE_ARCH_X86
+#ifdef CC_AVX512_SUPPORT
+	{ i40e_xmit_pkts_vec_avx512, "Vector AVX512" },
+#endif
 	{ i40e_xmit_pkts_vec_avx2, "Vector AVX2" },
 	{ i40e_xmit_pkts_vec,      "Vector SSE" },
 #elif defined(RTE_ARCH_ARM64)
diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h
index 57d7b4160b..2e3e50eb79 100644
--- a/drivers/net/i40e/i40e_rxtx.h
+++ b/drivers/net/i40e/i40e_rxtx.h
@@ -248,6 +248,15 @@ uint16_t i40e_recv_scattered_pkts_vec_avx2(void *rx_queue,
 	struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
 uint16_t i40e_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_pkts);
+uint16_t i40e_recv_pkts_vec_avx512(void *rx_queue,
+				   struct rte_mbuf **rx_pkts,
+				   uint16_t nb_pkts);
+uint16_t i40e_recv_scattered_pkts_vec_avx512(void *rx_queue,
+					     struct rte_mbuf **rx_pkts,
+					     uint16_t nb_pkts);
+uint16_t i40e_xmit_pkts_vec_avx512(void *tx_queue,
+				   struct rte_mbuf **tx_pkts,
+				   uint16_t nb_pkts);
 
 /* For each value it means, datasheet of hardware can tell more details
  *
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
new file mode 100644
index 0000000000..ccddc3e2d4
--- /dev/null
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -0,0 +1,1024 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <rte_ethdev_driver.h>
+#include <rte_malloc.h>
+
+#include "base/i40e_prototype.h"
+#include "base/i40e_type.h"
+#include "i40e_ethdev.h"
+#include "i40e_rxtx.h"
+#include "i40e_rxtx_vec_common.h"
+
+#include <x86intrin.h>
+
+#ifndef __INTEL_COMPILER
+#pragma GCC diagnostic ignored "-Wcast-qual"
+#endif
+
+#define RTE_I40E_DESCS_PER_LOOP_AVX 8
+
+static inline void
+i40e_rxq_rearm(struct i40e_rx_queue *rxq)
+{
+	int i;
+	uint16_t rx_id;
+	volatile union i40e_rx_desc *rxdp;
+	struct i40e_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
+	struct rte_mempool_cache *cache = rte_mempool_default_cache(rxq->mp,
+			rte_lcore_id());
+
+	rxdp = rxq->rx_ring + rxq->rxrearm_start;
+
+	/* We need to pull 'n' more MBUFs into the software ring from mempool
+	 * We inline the mempool function here, so we can vectorize the copy
+	 * from the cache into the shadow ring.
+	 */
+
+	if (cache->len < RTE_I40E_RXQ_REARM_THRESH) {
+		/* No. Backfill the cache first, and then fill from it */
+		uint32_t req = RTE_I40E_RXQ_REARM_THRESH + (cache->size -
+				cache->len);
+
+		/* How many do we require
+		 * i.e. number to fill the cache + the request
+		 */
+		int ret = rte_mempool_ops_dequeue_bulk(rxq->mp,
+				&cache->objs[cache->len], req);
+		if (ret == 0) {
+			cache->len += req;
+		} else {
+			if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
+					rxq->nb_rx_desc) {
+				__m128i dma_addr0;
+
+				dma_addr0 = _mm_setzero_si128();
+				for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+					rxep[i].mbuf = &rxq->fake_mbuf;
+					_mm_store_si128
+						((__m128i *)&rxdp[i].read,
+							dma_addr0);
+				}
+			}
+			rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+					RTE_I40E_RXQ_REARM_THRESH;
+			return;
+		}
+	}
+
+	const __m512i iova_offsets =  _mm512_set1_epi64
+		(offsetof(struct rte_mbuf, buf_iova));
+	const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
+
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+	/* to shuffle the addresses to correct slots. Values 4-7 will contain
+	 * zeros, so use 7 for a zero-value.
+	 */
+	const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);
+#else
+	const __m512i permute_idx = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
+#endif
+
+	/* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
+	 * from mempool cache and populating both shadow and HW rings
+	 */
+	for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH / 8; i++) {
+		const __m512i mbuf_ptrs = _mm512_loadu_si512
+			(&cache->objs[cache->len - 8]);
+		_mm512_store_si512(rxep, mbuf_ptrs);
+
+		/* gather iova of mbuf0-7 into one zmm reg */
+		const __m512i iova_base_addrs = _mm512_i64gather_epi64
+			(_mm512_add_epi64(mbuf_ptrs, iova_offsets),
+				0, /* base */
+				1 /* scale */);
+		const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
+				headroom);
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+		const __m512i iovas0 = _mm512_castsi256_si512
+			(_mm512_extracti64x4_epi64(iova_addrs, 0));
+		const __m512i iovas1 = _mm512_castsi256_si512
+			(_mm512_extracti64x4_epi64(iova_addrs, 1));
+
+		/* permute leaves desc 2-3 addresses in header address slots 0-1
+		 * but these are ignored by driver since header split not
+		 * enabled. Similarly for desc 4 & 5.
+		 */
+		const __m512i desc_rd_0_1 = _mm512_permutexvar_epi64
+			(permute_idx, iovas0);
+		const __m512i desc_rd_2_3 = _mm512_bsrli_epi128(desc_rd_0_1, 8);
+
+		const __m512i desc_rd_4_5 = _mm512_permutexvar_epi64
+			(permute_idx, iovas1);
+		const __m512i desc_rd_6_7 = _mm512_bsrli_epi128(desc_rd_4_5, 8);
+
+		_mm512_store_si512((void *)rxdp, desc_rd_0_1);
+		_mm512_store_si512((void *)(rxdp + 2), desc_rd_2_3);
+		_mm512_store_si512((void *)(rxdp + 4), desc_rd_4_5);
+		_mm512_store_si512((void *)(rxdp + 6), desc_rd_6_7);
+#else
+		/* permute leaves desc 4-7 addresses in header address slots 0-3
+		 * but these are ignored by driver since header split not
+		 * enabled.
+		 */
+		const __m512i desc_rd_0_3 = _mm512_permutexvar_epi64
+			(permute_idx, iova_addrs);
+		const __m512i desc_rd_4_7 = _mm512_bsrli_epi128(desc_rd_0_3, 8);
+
+		_mm512_store_si512((void *)rxdp, desc_rd_0_3);
+		_mm512_store_si512((void *)(rxdp + 4), desc_rd_4_7);
+#endif
+		rxep += 8, rxdp += 8, cache->len -= 8;
+	}
+
+	rxq->rxrearm_start += RTE_I40E_RXQ_REARM_THRESH;
+	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
+		rxq->rxrearm_start = 0;
+
+	rxq->rxrearm_nb -= RTE_I40E_RXQ_REARM_THRESH;
+
+	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
+			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
+
+	/* Update the tail pointer on the NIC */
+	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
+}
+
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+/* Handles 32B descriptor FDIR ID processing:
+ * rxdp: receive descriptor ring, required to load 2nd 16B half of each desc
+ * rx_pkts: required to store metadata back to mbufs
+ * pkt_idx: offset into the burst, increments in vector widths
+ * desc_idx: required to select the correct shift at compile time
+ */
+static inline __m256i
+desc_fdir_processing_32b(volatile union i40e_rx_desc *rxdp,
+			 struct rte_mbuf **rx_pkts,
+			 const uint32_t pkt_idx,
+			 const uint32_t desc_idx)
+{
+	/* 32B desc path: load rxdp.wb.qword2 for EXT_STATUS and FLEXBH_STAT */
+	__m128i *rxdp_desc_0 = (void *)(&rxdp[desc_idx + 0].wb.qword2);
+	__m128i *rxdp_desc_1 = (void *)(&rxdp[desc_idx + 1].wb.qword2);
+	const __m128i desc_qw2_0 = _mm_load_si128(rxdp_desc_0);
+	const __m128i desc_qw2_1 = _mm_load_si128(rxdp_desc_1);
+
+	/* Mask for FLEXBH_STAT, and the FDIR_ID value to compare against. The
+	 * remaining data is set to all 1's to pass through data.
+	 */
+	const __m256i flexbh_mask = _mm256_set_epi32(-1, -1, -1, 3 << 4,
+						     -1, -1, -1, 3 << 4);
+	const __m256i flexbh_id   = _mm256_set_epi32(-1, -1, -1, 1 << 4,
+						     -1, -1, -1, 1 << 4);
+
+	/* Load descriptor, check for FLEXBH bits, generate a mask for both
+	 * packets in the register.
+	 */
+	__m256i desc_qw2_0_1 =
+		_mm256_inserti128_si256(_mm256_castsi128_si256(desc_qw2_0),
+					desc_qw2_1, 1);
+	__m256i desc_tmp_msk = _mm256_and_si256(flexbh_mask, desc_qw2_0_1);
+	__m256i fdir_mask = _mm256_cmpeq_epi32(flexbh_id, desc_tmp_msk);
+	__m256i fdir_data = _mm256_alignr_epi8(desc_qw2_0_1, desc_qw2_0_1, 12);
+	__m256i desc_fdir_data = _mm256_and_si256(fdir_mask, fdir_data);
+
+	/* Write data out to the mbuf. There is no store to this area of the
+	 * mbuf today, so we cannot combine it with another store.
+	 */
+	const uint32_t idx_0 = pkt_idx + desc_idx;
+	const uint32_t idx_1 = pkt_idx + desc_idx + 1;
+
+	rx_pkts[idx_0]->hash.fdir.hi = _mm256_extract_epi32(desc_fdir_data, 0);
+	rx_pkts[idx_1]->hash.fdir.hi = _mm256_extract_epi32(desc_fdir_data, 4);
+
+	/* Create mbuf flags as required for mbuf_flags layout
+	 *  (That's high lane [1,3,5,7, 0,2,4,6] as u32 lanes).
+	 * Approach:
+	 * - Mask away bits not required from the fdir_mask
+	 * - Leave the PKT_FDIR_ID bit (1 << 13)
+	 * - Position that bit correctly based on packet number
+	 * - OR in the resulting bit to mbuf_flags
+	 */
+	RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << 13));
+	__m256i mbuf_flag_mask = _mm256_set_epi32(0, 0, 0, 1 << 13,
+						  0, 0, 0, 1 << 13);
+	__m256i desc_flag_bit =  _mm256_and_si256(mbuf_flag_mask, fdir_mask);
+
+	/* For static-inline function, this will be stripped out
+	 * as the desc_idx is a hard-coded constant.
+	 */
+	switch (desc_idx) {
+	case 0:
+		return _mm256_alignr_epi8(desc_flag_bit, desc_flag_bit,  4);
+	case 2:
+		return _mm256_alignr_epi8(desc_flag_bit, desc_flag_bit,  8);
+	case 4:
+		return _mm256_alignr_epi8(desc_flag_bit, desc_flag_bit, 12);
+	case 6:
+		return desc_flag_bit;
+	default:
+		break;
+	}
+
+	/* NOT REACHED, see above switch returns */
+	return _mm256_setzero_si256();
+}
+#endif /* RTE_LIBRTE_I40E_16BYTE_RX_DESC */
+
+#define PKTLEN_SHIFT     10
+
+/* Force inline as some compilers will not inline by default. */
+static __rte_always_inline uint16_t
+_recv_raw_pkts_vec_avx512(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
+			  uint16_t nb_pkts, uint8_t *split_packet)
+{
+	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
+	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
+			0, rxq->mbuf_initializer);
+	struct i40e_rx_entry *sw_ring = &rxq->sw_ring[rxq->rx_tail];
+	volatile union i40e_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
+
+	rte_prefetch0(rxdp);
+
+	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP_AVX */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP_AVX);
+
+	/* See if we need to rearm the RX queue - gives the prefetch a bit
+	 * of time to act
+	 */
+	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
+		i40e_rxq_rearm(rxq);
+
+	/* Before we start moving massive data around, check to see if
+	 * there is actually a packet available
+	 */
+	if (!(rxdp->wb.qword1.status_error_len &
+			rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))
+		return 0;
+
+	/* constants used in processing loop */
+	const __m512i crc_adjust =
+		_mm512_set4_epi32
+			(0,             /* ignore non-length fields */
+			 -rxq->crc_len, /* sub crc on data_len */
+			 -rxq->crc_len, /* sub crc on pkt_len */
+			 0              /* ignore non-length fields */
+			);
+
+	/* 8 packets DD mask, LSB in each 32-bit value */
+	const __m256i dd_check = _mm256_set1_epi32(1);
+
+	/* 8 packets EOP mask, second-LSB in each 32-bit value */
+	const __m256i eop_check = _mm256_slli_epi32(dd_check,
+			I40E_RX_DESC_STATUS_EOF_SHIFT);
+
+	/* mask to shuffle from desc. to mbuf (2 descriptors)*/
+	const __m512i shuf_msk =
+		_mm512_set4_epi32
+			(/* rss hash parsed separately */
+			 /* octet 4~7, 32bits rss */
+			 7 << 24 | 6 << 16 | 5 << 8 | 4,
+			 /* octet 2~3, low 16 bits vlan_macip */
+			 /* octet 14~15, 16 bits data_len */
+			 3 << 24 | 2 << 16 | 15 << 8 | 14,
+			 /* skip hi 16 bits pkt_len, zero out */
+			 /* octet 14~15, 16 bits pkt_len */
+			 0xFFFF << 16 | 15 << 8 | 14,
+			 /* pkt_type set as unknown */
+			 0xFFFFFFFF
+			);
+	/* compile-time check the above crc and shuffle layout is correct.
+	 * NOTE: the first field (lowest address) is given last in set_epi
+	 * calls above.
+	 */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
+
+	/* Status/Error flag masks */
+	/* mask everything except RSS, flow director and VLAN flags
+	 * bit2 is for VLAN tag, bit11 for flow director indication
+	 * bit13:12 for RSS indication. Bits 3-5 of error
+	 * field (bits 22-24) are for IP/L4 checksum errors
+	 */
+	const __m256i flags_mask = _mm256_set1_epi32
+		((1 << 2) | (1 << 11) | (3 << 12) | (7 << 22));
+
+	/* data to be shuffled by result of flag mask. If VLAN bit is set,
+	 * (bit 2), then position 4 in this array will be used in the
+	 * destination
+	 */
+	const __m256i vlan_flags_shuf = _mm256_set_epi32
+		(0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
+		0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0);
+
+	/* data to be shuffled by result of flag mask, shifted down 11.
+	 * If RSS/FDIR bits are set, shuffle moves appropriate flags in
+	 * place.
+	 */
+	const __m256i rss_flags_shuf = _mm256_set_epi8
+		(0, 0, 0, 0, 0, 0, 0, 0,
+		PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH, 0, 0,
+		0, 0, PKT_RX_FDIR, 0, /* end up 128-bits */
+		0, 0, 0, 0, 0, 0, 0, 0,
+		PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH, 0, 0,
+		0, 0, PKT_RX_FDIR, 0);
+
+	/* data to be shuffled by the result of the flags mask shifted by 22
+	 * bits.  This gives us the l3_l4 flags.
+	 */
+	const __m256i l3_l4_flags_shuf = _mm256_set_epi8
+		(0, 0, 0, 0, 0, 0, 0, 0,
+		/* shift right 1 bit to make sure it not exceed 255 */
+		(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+		 PKT_RX_IP_CKSUM_BAD) >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD |
+		 PKT_RX_L4_CKSUM_BAD) >> 1,
+		(PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1,
+		(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
+		PKT_RX_IP_CKSUM_BAD >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
+		/* second 128-bits */
+		0, 0, 0, 0, 0, 0, 0, 0,
+		(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+		 PKT_RX_IP_CKSUM_BAD) >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD |
+		 PKT_RX_L4_CKSUM_BAD) >> 1,
+		(PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1,
+		(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
+		PKT_RX_IP_CKSUM_BAD >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);
+
+	const __m256i cksum_mask = _mm256_set1_epi32
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
+		PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
+		PKT_RX_EIP_CKSUM_BAD);
+
+	uint16_t i, received;
+
+	for (i = 0, received = 0; i < nb_pkts;
+			i += RTE_I40E_DESCS_PER_LOOP_AVX,
+			rxdp += RTE_I40E_DESCS_PER_LOOP_AVX) {
+		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
+		_mm256_storeu_si256((void *)&rx_pkts[i],
+				_mm256_loadu_si256((void *)&sw_ring[i]));
+#ifdef RTE_ARCH_X86_64
+		_mm256_storeu_si256((void *)&rx_pkts[i + 4],
+				_mm256_loadu_si256((void *)&sw_ring[i + 4]));
+#endif
+
+		__m512i raw_desc0_3, raw_desc4_7;
+		__m256i raw_desc0_1, raw_desc2_3, raw_desc4_5, raw_desc6_7;
+
+		/* load in descriptors, in reverse order */
+		const __m128i raw_desc7 =
+			_mm_load_si128((void *)(rxdp + 7));
+		rte_compiler_barrier();
+		const __m128i raw_desc6 =
+			_mm_load_si128((void *)(rxdp + 6));
+		rte_compiler_barrier();
+		const __m128i raw_desc5 =
+			_mm_load_si128((void *)(rxdp + 5));
+		rte_compiler_barrier();
+		const __m128i raw_desc4 =
+			_mm_load_si128((void *)(rxdp + 4));
+		rte_compiler_barrier();
+		const __m128i raw_desc3 =
+			_mm_load_si128((void *)(rxdp + 3));
+		rte_compiler_barrier();
+		const __m128i raw_desc2 =
+			_mm_load_si128((void *)(rxdp + 2));
+		rte_compiler_barrier();
+		const __m128i raw_desc1 =
+			_mm_load_si128((void *)(rxdp + 1));
+		rte_compiler_barrier();
+		const __m128i raw_desc0 =
+			_mm_load_si128((void *)(rxdp + 0));
+
+		raw_desc6_7 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc6),
+				 raw_desc7, 1);
+		raw_desc4_5 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc4),
+				 raw_desc5, 1);
+		raw_desc2_3 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc2),
+				 raw_desc3, 1);
+		raw_desc0_1 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc0),
+				 raw_desc1, 1);
+
+		raw_desc4_7 =
+			_mm512_inserti64x4
+				(_mm512_castsi256_si512(raw_desc4_5),
+				 raw_desc6_7, 1);
+		raw_desc0_3 =
+			_mm512_inserti64x4
+				(_mm512_castsi256_si512(raw_desc0_1),
+				 raw_desc2_3, 1);
+
+		if (split_packet) {
+			int j;
+
+			for (j = 0; j < RTE_I40E_DESCS_PER_LOOP_AVX; j++)
+				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
+		}
+
+		/* convert descriptors 0-7 into mbufs, adjusting length and
+		 * re-arranging fields. Then write into the mbuf
+		 */
+		const __m512i len4_7 = _mm512_slli_epi32
+					(raw_desc4_7, PKTLEN_SHIFT);
+		const __m512i len0_3 = _mm512_slli_epi32
+					(raw_desc0_3, PKTLEN_SHIFT);
+		const __m512i desc4_7 = _mm512_mask_blend_epi16
+					(0x80808080, raw_desc4_7, len4_7);
+		const __m512i desc0_3 = _mm512_mask_blend_epi16
+					(0x80808080, raw_desc0_3, len0_3);
+		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);
+		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);
+
+		mb4_7 = _mm512_add_epi32(mb4_7, crc_adjust);
+		mb0_3 = _mm512_add_epi32(mb0_3, crc_adjust);
+
+		/* to get packet types, shift 64-bit values down 30 bits
+		 * and so ptype is in lower 8-bits in each
+		 */
+		const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 30);
+		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 30);
+		const __m256i ptypes6_7 =
+			_mm512_extracti64x4_epi64(ptypes4_7, 1);
+		const __m256i ptypes4_5 =
+			_mm512_extracti64x4_epi64(ptypes4_7, 0);
+		const __m256i ptypes2_3 =
+			_mm512_extracti64x4_epi64(ptypes0_3, 1);
+		const __m256i ptypes0_1 =
+			_mm512_extracti64x4_epi64(ptypes0_3, 0);
+		const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 24);
+		const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 8);
+		const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 24);
+		const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 8);
+		const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 24);
+		const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 8);
+		const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 24);
+		const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 8);
+
+		const __m512i ptype4_7 = _mm512_set_epi32
+			(0, 0, 0, ptype_tbl[ptype7],
+			 0, 0, 0, ptype_tbl[ptype6],
+			 0, 0, 0, ptype_tbl[ptype5],
+			 0, 0, 0, ptype_tbl[ptype4]);
+		const __m512i ptype0_3 = _mm512_set_epi32
+			(0, 0, 0, ptype_tbl[ptype3],
+			 0, 0, 0, ptype_tbl[ptype2],
+			 0, 0, 0, ptype_tbl[ptype1],
+			 0, 0, 0, ptype_tbl[ptype0]);
+
+		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
+		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
+
+		__m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
+		__m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
+		__m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
+		__m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
+
+		/**
+		 * use permute/extract to get status content
+		 * After the operations, the packets status flags are in the
+		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
+		 */
+		/* merge the status bits into one register */
+		const __m512i status_permute_msk = _mm512_set_epi32
+			(0, 0, 0, 0,
+			 0, 0, 0, 0,
+			 22, 30, 6, 14,
+			 18, 26, 2, 10);
+		const __m512i raw_status0_7 = _mm512_permutex2var_epi32
+			(desc4_7, status_permute_msk, desc0_3);
+		__m256i status0_7 = _mm512_extracti64x4_epi64
+			(raw_status0_7, 0);
+
+		/* now do flag manipulation */
+
+		/* get only flag/error bits we want */
+		const __m256i flag_bits =
+			_mm256_and_si256(status0_7, flags_mask);
+		/* set vlan and rss flags */
+		const __m256i vlan_flags =
+			_mm256_shuffle_epi8(vlan_flags_shuf, flag_bits);
+		const __m256i rss_fdir_bits = _mm256_srli_epi32(flag_bits, 11);
+		const __m256i rss_flags = _mm256_shuffle_epi8(rss_flags_shuf,
+							      rss_fdir_bits);
+
+		/* l3_l4_error flags, shuffle, then shift to correct adjustment
+		 * of flags in flags_shuf, and finally mask out extra bits
+		 */
+		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
+				_mm256_srli_epi32(flag_bits, 22));
+		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
+		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
+
+		/* merge flags */
+		__m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
+				_mm256_or_si256(rss_flags, vlan_flags));
+
+		/* If the rxq has FDIR enabled, read and process the FDIR info
+		 * from the descriptor. This can cause more loads/stores, so is
+		 * not always performed. Branch over the code when not enabled.
+		 */
+		if (rxq->fdir_enabled) {
+#ifdef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+			/* 16B descriptor code path:
+			 * RSS and FDIR ID use the same offset in the desc, so
+			 * only one can be present at a time. The code below
+			 * identifies an FDIR ID match, and zeros the RSS value
+			 * in the mbuf on FDIR match to keep mbuf data clean.
+			 */
+#define FDIR_BLEND_MASK ((1 << 3) | (1 << 7))
+
+			/* Flags:
+			 * - Take flags, shift bits to null out
+			 * - CMPEQ with known FDIR ID, to get 0xFFFF or 0 mask
+			 * - Strip bits from mask, leaving 0 or 1 for FDIR ID
+			 * - Merge with mbuf_flags
+			 */
+			/* FLM = 1, FLTSTAT = 0b01, (FLM | FLTSTAT) == 3.
+			 * Shift left by 28 to avoid having to mask.
+			 */
+			const __m256i fdir =
+				_mm256_slli_epi32(rss_fdir_bits, 28);
+			const __m256i fdir_id = _mm256_set1_epi32(3 << 28);
+
+			/* As above, the fdir_mask to packet mapping is this:
+			 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
+			 * Then OR FDIR flags to mbuf_flags on FDIR ID hit.
+			 */
+			RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << 13));
+			const __m256i pkt_fdir_bit = _mm256_set1_epi32(1 << 13);
+			const __m256i fdir_mask =
+				_mm256_cmpeq_epi32(fdir, fdir_id);
+			__m256i fdir_bits =
+				_mm256_and_si256(fdir_mask, pkt_fdir_bit);
+
+			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_bits);
+
+			/* Based on FDIR_MASK, clear the RSS or FDIR value.
+			 * The FDIR ID value is masked to zero if not a hit,
+			 * otherwise the mb0_1 register RSS field is zeroed.
+			 */
+			const __m256i fdir_zero_mask = _mm256_setzero_si256();
+			__m256i tmp0_1 = _mm256_blend_epi32(fdir_zero_mask,
+						fdir_mask, FDIR_BLEND_MASK);
+			__m256i fdir_mb0_1 = _mm256_and_si256(mb0_1, fdir_mask);
+
+			mb0_1 = _mm256_andnot_si256(tmp0_1, mb0_1);
+
+			/* Write to mbuf: no stores to combine with, so just a
+			 * scalar store to push data here.
+			 */
+			rx_pkts[i + 0]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb0_1, 3);
+			rx_pkts[i + 1]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb0_1, 7);
+
+			/* Same as above, only shift the fdir_mask to align
+			 * the packet FDIR mask with the FDIR_ID desc lane.
+			 */
+			__m256i tmp2_3 =
+				_mm256_alignr_epi8(fdir_mask, fdir_mask, 12);
+			__m256i fdir_mb2_3 = _mm256_and_si256(mb2_3, tmp2_3);
+
+			tmp2_3 = _mm256_blend_epi32(fdir_zero_mask, tmp2_3,
+						    FDIR_BLEND_MASK);
+			mb2_3 = _mm256_andnot_si256(tmp2_3, mb2_3);
+			rx_pkts[i + 2]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb2_3, 3);
+			rx_pkts[i + 3]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb2_3, 7);
+
+			__m256i tmp4_5 =
+				_mm256_alignr_epi8(fdir_mask, fdir_mask, 8);
+			__m256i fdir_mb4_5 = _mm256_and_si256(mb4_5, tmp4_5);
+
+			tmp4_5 = _mm256_blend_epi32(fdir_zero_mask, tmp4_5,
+						    FDIR_BLEND_MASK);
+			mb4_5 = _mm256_andnot_si256(tmp4_5, mb4_5);
+			rx_pkts[i + 4]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb4_5, 3);
+			rx_pkts[i + 5]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb4_5, 7);
+
+			__m256i tmp6_7 =
+				_mm256_alignr_epi8(fdir_mask, fdir_mask, 4);
+			__m256i fdir_mb6_7 = _mm256_and_si256(mb6_7, tmp6_7);
+
+			tmp6_7 = _mm256_blend_epi32(fdir_zero_mask, tmp6_7,
+						    FDIR_BLEND_MASK);
+			mb6_7 = _mm256_andnot_si256(tmp6_7, mb6_7);
+			rx_pkts[i + 6]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb6_7, 3);
+			rx_pkts[i + 7]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb6_7, 7);
+
+			/* End of 16B descriptor handling */
+#else
+			/* 32B descriptor FDIR ID mark handling. Returns bits
+			 * to be OR-ed into the mbuf olflags.
+			 */
+			__m256i fdir_add_flags;
+
+			fdir_add_flags =
+				desc_fdir_processing_32b(rxdp, rx_pkts, i, 0);
+			mbuf_flags =
+				_mm256_or_si256(mbuf_flags, fdir_add_flags);
+
+			fdir_add_flags =
+				desc_fdir_processing_32b(rxdp, rx_pkts, i, 2);
+			mbuf_flags =
+				_mm256_or_si256(mbuf_flags, fdir_add_flags);
+
+			fdir_add_flags =
+				desc_fdir_processing_32b(rxdp, rx_pkts, i, 4);
+			mbuf_flags =
+				_mm256_or_si256(mbuf_flags, fdir_add_flags);
+
+			fdir_add_flags =
+				desc_fdir_processing_32b(rxdp, rx_pkts, i, 6);
+			mbuf_flags =
+				_mm256_or_si256(mbuf_flags, fdir_add_flags);
+			/* End 32B desc handling */
+#endif /* RTE_LIBRTE_I40E_16BYTE_RX_DESC */
+
+		} /* if() on FDIR enabled */
+
+		/* At this point, we have the 8 sets of flags in the low 16-bits
+		 * of each 32-bit value in mbuf_flags.
+		 * We want to extract these, and merge them with the mbuf init data
+		 * so we can do a single write to the mbuf to set the flags
+		 * and all the other initialization fields. Extracting the
+		 * appropriate flags means that we have to do a shift and blend for
+		 * each mbuf before we do the write. However, we can also
+		 * add in the previously computed rx_descriptor fields to
+		 * make a single 256-bit write per mbuf
+		 */
+		/* check the structure matches expectations */
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
+				offsetof(struct rte_mbuf, rearm_data) + 8);
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
+				RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
+		/* build up data and do writes */
+		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
+				rearm6, rearm7;
+		rearm6 = _mm256_blend_epi32
+			(mbuf_init, _mm256_slli_si256(mbuf_flags, 8), 0x04);
+		rearm4 = _mm256_blend_epi32
+			(mbuf_init, _mm256_slli_si256(mbuf_flags, 4), 0x04);
+		rearm2 = _mm256_blend_epi32
+			(mbuf_init, mbuf_flags, 0x04);
+		rearm0 = _mm256_blend_epi32
+			(mbuf_init, _mm256_srli_si256(mbuf_flags, 4), 0x04);
+		/* permute to add in the rx_descriptor e.g. rss fields */
+		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
+		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
+		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
+		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
+		/* write to mbuf */
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 6]->rearm_data, rearm6);
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 4]->rearm_data, rearm4);
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 2]->rearm_data, rearm2);
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 0]->rearm_data, rearm0);
+
+		/* repeat for the odd mbufs */
+		const __m256i odd_flags = _mm256_castsi128_si256
+			(_mm256_extracti128_si256(mbuf_flags, 1));
+		rearm7 = _mm256_blend_epi32
+			(mbuf_init, _mm256_slli_si256(odd_flags, 8), 0x04);
+		rearm5 = _mm256_blend_epi32
+			(mbuf_init, _mm256_slli_si256(odd_flags, 4), 0x04);
+		rearm3 = _mm256_blend_epi32
+			(mbuf_init, odd_flags, 0x04);
+		rearm1 = _mm256_blend_epi32
+			(mbuf_init, _mm256_srli_si256(odd_flags, 4), 0x04);
+		/* since odd mbufs are already in hi 128-bits use blend */
+		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
+		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
+		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
+		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
+		/* again write to mbufs */
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 7]->rearm_data, rearm7);
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 5]->rearm_data, rearm5);
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 3]->rearm_data, rearm3);
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 1]->rearm_data, rearm1);
+
+		/* extract and record EOP bit */
+		if (split_packet) {
+			const __m128i eop_mask =
+				_mm_set1_epi16
+				(1 << I40E_RX_DESC_STATUS_EOF_SHIFT);
+			const __m256i eop_bits256 =
+				_mm256_and_si256(status0_7, eop_check);
+			/* pack status bits into a single 128-bit register */
+			const __m128i eop_bits =
+				_mm_packus_epi32
+				(_mm256_castsi256_si128(eop_bits256),
+				_mm256_extractf128_si256(eop_bits256, 1));
+			/* flip bits, and mask out the EOP bit, which is now
+			 * a split-packet bit i.e. !EOP, rather than EOP one.
+			 */
+			__m128i split_bits = _mm_andnot_si128(eop_bits,
+					eop_mask);
+			/* eop bits are out of order, so we need to shuffle them
+			 * back into order again. In doing so, only use low 8
+			 * bits, which acts like another pack instruction
+			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
+			 * [Since we use epi8, the 16-bit positions are
+			 * multiplied by 2 in the eop_shuffle value.]
+			 */
+			__m128i eop_shuffle = _mm_set_epi8
+				(0xFF, 0xFF, 0xFF, 0xFF, /* zero hi 64b */
+				0xFF, 0xFF, 0xFF, 0xFF,
+				8, 0, 10, 2, /* move values to lo 64b */
+				12, 4, 14, 6);
+			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
+			*(uint64_t *)split_packet =
+				_mm_cvtsi128_si64(split_bits);
+			split_packet += RTE_I40E_DESCS_PER_LOOP_AVX;
+		}
+
+		/* perform dd_check */
+		status0_7 = _mm256_and_si256(status0_7, dd_check);
+		status0_7 = _mm256_packs_epi32
+			(status0_7, _mm256_setzero_si256());
+
+		uint64_t burst = __builtin_popcountll
+				(_mm_cvtsi128_si64
+					(_mm256_extracti128_si256
+						(status0_7, 1)));
+		burst += __builtin_popcountll(_mm_cvtsi128_si64
+				(_mm256_castsi256_si128(status0_7)));
+		received += burst;
+		if (burst != RTE_I40E_DESCS_PER_LOOP_AVX)
+			break;
+	}
+
+	/* update tail pointers */
+	rxq->rx_tail += received;
+	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
+	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep avx2 aligned */
+		rxq->rx_tail--;
+		received--;
+	}
+	rxq->rxrearm_nb += received;
+	return received;
+}
+
+/**
+ * vPMD receive entry point for the non-scattered case: the burst is
+ * forwarded to the raw AVX512 receive routine with no split-flag array.
+ * Notice (behaviour of the raw routine, not checked here):
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+ */
+uint16_t
+i40e_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
+			  uint16_t nb_pkts)
+{
+	uint16_t nb_rx;
+
+	/* NULL: the caller does not want per-buffer split flags */
+	nb_rx = _recv_raw_pkts_vec_avx512(rx_queue, rx_pkts, nb_pkts, NULL);
+
+	return nb_rx;
+}
+
+/**
+ * vPMD receive routine that receives one burst and reassembles the
+ * scattered packets in it. The local split-flag array holds
+ * RTE_I40E_VPMD_RX_BURST entries, one per received buffer.
+ * Notice (behaviour of the raw routine, not checked here):
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+ */
+static uint16_t
+i40e_recv_scattered_burst_vec_avx512(void *rx_queue,
+				     struct rte_mbuf **rx_pkts,
+				     uint16_t nb_pkts)
+{
+	struct i40e_rx_queue *rxq = rx_queue;
+	uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};
+	/* the flag bytes viewed as 64-bit words for a cheap "any set?" test */
+	const uint64_t *flag_words = (uint64_t *)split_flags;
+	unsigned int first = 0;
+	uint16_t nb_bufs;
+
+	/* get some new buffers */
+	nb_bufs = _recv_raw_pkts_vec_avx512(rxq, rx_pkts, nb_pkts,
+			split_flags);
+	if (nb_bufs == 0)
+		return 0;
+
+	if (rxq->pkt_first_seg == NULL) {
+		/* no packet in progress and no split flag set: nothing to
+		 * join, the buffers are complete packets already
+		 */
+		if ((flag_words[0] | flag_words[1] |
+		     flag_words[2] | flag_words[3]) == 0)
+			return nb_bufs;
+
+		/* reassembly starts at the first buffer flagged as split */
+		for (first = 0; first < nb_bufs; first++) {
+			if (split_flags[first])
+				break;
+		}
+		if (first == nb_bufs)
+			return nb_bufs;
+		rxq->pkt_first_seg = rx_pkts[first];
+	}
+
+	/* buffers before 'first' are returned untouched */
+	return first + reassemble_packets(rxq, &rx_pkts[first],
+		nb_bufs - first, &split_flags[first]);
+}
+
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ * Accepts any nb_pkts: the request is served in bursts of at most
+ * RTE_I40E_VPMD_RX_BURST, and processing stops at the first burst that
+ * comes back short.
+ * Notice (behaviour of the raw routine, not checked here):
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+ */
+uint16_t
+i40e_recv_scattered_pkts_vec_avx512(void *rx_queue,
+				    struct rte_mbuf **rx_pkts,
+				    uint16_t nb_pkts)
+{
+	uint16_t total = 0;
+
+	for (;;) {
+		uint16_t got;
+
+		/* what is left fits in one burst: hand it over and finish */
+		if (nb_pkts <= RTE_I40E_VPMD_RX_BURST)
+			return total + i40e_recv_scattered_burst_vec_avx512
+				(rx_queue, &rx_pkts[total], nb_pkts);
+
+		got = i40e_recv_scattered_burst_vec_avx512(rx_queue,
+				&rx_pkts[total], RTE_I40E_VPMD_RX_BURST);
+		total += got;
+		nb_pkts -= got;
+
+		/* short burst: stop here */
+		if (got < RTE_I40E_VPMD_RX_BURST)
+			return total;
+	}
+}
+
+/* Build one Tx data descriptor for pkt (buffer address in the low qword;
+ * descriptor type, command flags and data_len in the high qword) and
+ * write it to *txdp with a single 128-bit store.
+ */
+static inline void
+vtx1(volatile struct i40e_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags)
+{
+	uint64_t cmd_bits = (uint64_t)flags << I40E_TXD_QW1_CMD_SHIFT;
+	uint64_t len_bits =
+		(uint64_t)pkt->data_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT;
+	uint64_t high_qw = I40E_TX_DESC_DTYPE_DATA | cmd_bits | len_bits;
+
+	_mm_store_si128((__m128i *)txdp,
+			_mm_set_epi64x(high_qw,
+				       pkt->buf_iova + pkt->data_off));
+}
+
+static inline void
+vtx(volatile struct i40e_tx_desc *txdp,
+	struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
+{
+	const uint64_t hi_qw_tmpl = (I40E_TX_DESC_DTYPE_DATA |
+			((uint64_t)flags  << I40E_TXD_QW1_CMD_SHIFT));
+
+	/* vtx() fills nb_pkts consecutive Tx data descriptors, starting at
+	 * txdp, from the mbufs in pkt[]; every descriptor gets the same
+	 * command flags (hi_qw_tmpl) plus the data_len of its own mbuf.
+	 *
+	 * If txdp is not on a 32-byte boundary (0x1F mask), write a single
+	 * descriptor first so that the aligned 256-bit stores in the loop
+	 * below start on an aligned address.
+	 */
+	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
+		vtx1(txdp, *pkt, flags);
+		nb_pkts--, txdp++, pkt++;
+	}
+
+	/* do four at a time while possible: each 256-bit store carries two
+	 * descriptors, and the upper pair (txdp + 2) is stored before the
+	 * lower pair (txdp)
+	 */
+	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
+		uint64_t hi_qw3 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[3]->data_len <<
+			 I40E_TXD_QW1_TX_BUF_SZ_SHIFT);
+		uint64_t hi_qw2 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[2]->data_len <<
+			 I40E_TXD_QW1_TX_BUF_SZ_SHIFT);
+		uint64_t hi_qw1 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[1]->data_len <<
+			 I40E_TXD_QW1_TX_BUF_SZ_SHIFT);
+		uint64_t hi_qw0 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[0]->data_len <<
+			 I40E_TXD_QW1_TX_BUF_SZ_SHIFT);
+
+		__m256i desc2_3 = _mm256_set_epi64x
+			(hi_qw3, pkt[3]->buf_iova + pkt[3]->data_off,
+			hi_qw2, pkt[2]->buf_iova + pkt[2]->data_off);
+		__m256i desc0_1 = _mm256_set_epi64x
+			(hi_qw1, pkt[1]->buf_iova + pkt[1]->data_off,
+			hi_qw0, pkt[0]->buf_iova + pkt[0]->data_off);
+		_mm256_store_si256((void *)(txdp + 2), desc2_3);
+		_mm256_store_si256((void *)txdp, desc0_1);
+	}
+
+	/* do any last ones (fewer than four remain), one descriptor each */
+	while (nb_pkts) {
+		vtx1(txdp, *pkt, flags);
+		txdp++, pkt++, nb_pkts--;
+	}
+}
+
+static inline uint16_t
+i40e_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
+				 uint16_t nb_pkts)
+{
+	struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue;
+	volatile struct i40e_tx_desc *txdp;
+	struct i40e_tx_entry *txep;
+	uint16_t n, nb_commit, tx_id;
+	uint64_t flags = I40E_TD_CMD;
+	uint64_t rs = I40E_TX_DESC_CMD_RS | I40E_TD_CMD;
+
+	/* Queue one burst of at most tx_rs_thresh packets and return how
+	 * many were queued; this is fewer than requested (possibly 0) when
+	 * the ring does not have enough free descriptors.
+	 *
+	 * If the burst reaches the end of the ring, the last descriptor
+	 * before the wrap is written with the RS bit and tx_next_rs restarts
+	 * at tx_rs_thresh - 1. Afterwards, if the new tail has passed
+	 * tx_next_rs, RS is OR-ed into that descriptor and tx_next_rs
+	 * advances by tx_rs_thresh. The tail register is written last.
+	 *
+	 * crossing the tx_rs_thresh boundary is not allowed, so clamp the
+	 * request to tx_rs_thresh first
+	 */
+	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+
+	if (txq->nb_tx_free < txq->tx_free_thresh)
+		i40e_tx_free_bufs(txq);
+
+	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+	if (unlikely(nb_pkts == 0))
+		return 0;
+
+	tx_id = txq->tx_tail;
+	txdp = &txq->tx_ring[tx_id];
+	txep = &txq->sw_ring[tx_id];
+
+	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+	n = (uint16_t)(txq->nb_tx_desc - tx_id);
+	if (nb_commit >= n) {
+		tx_backlog_entry(txep, tx_pkts, n);
+
+		vtx(txdp, tx_pkts, n - 1, flags);
+		tx_pkts += (n - 1);
+		txdp += (n - 1);
+
+		vtx1(txdp, *tx_pkts++, rs);
+
+		nb_commit = (uint16_t)(nb_commit - n);
+
+		tx_id = 0;
+		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+
+		/* the end of the ring was reached: continue from slot 0 */
+		txdp = &txq->tx_ring[tx_id];
+		txep = &txq->sw_ring[tx_id];
+	}
+
+	tx_backlog_entry(txep, tx_pkts, nb_commit);
+
+	vtx(txdp, tx_pkts, nb_commit, flags);
+
+	tx_id = (uint16_t)(tx_id + nb_commit);
+	if (tx_id > txq->tx_next_rs) {
+		txq->tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
+			rte_cpu_to_le_64(((uint64_t)I40E_TX_DESC_CMD_RS) <<
+						I40E_TXD_QW1_CMD_SHIFT);
+		txq->tx_next_rs =
+			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
+	}
+
+	txq->tx_tail = tx_id;
+
+	I40E_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
+
+	return nb_pkts;
+}
+
+/**
+ * Tx burst entry point of the AVX512 vector path.
+ * The request is cut into chunks of at most tx_rs_thresh packets, each
+ * passed to the fixed-burst routine; transmission stops as soon as a chunk
+ * is only partially accepted.
+ *
+ * @return
+ *   Number of packets from tx_pkts that were queued.
+ */
+uint16_t
+i40e_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
+			  uint16_t nb_pkts)
+{
+	struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue;
+	uint16_t sent = 0;
+
+	while (nb_pkts != 0) {
+		const uint16_t chunk =
+			(uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+		const uint16_t done = i40e_xmit_fixed_burst_vec_avx512
+				(tx_queue, tx_pkts + sent, chunk);
+
+		sent += done;
+		nb_pkts -= done;
+		/* chunk not fully accepted: stop */
+		if (done < chunk)
+			break;
+	}
+
+	return sent;
+}
diff --git a/drivers/net/i40e/meson.build b/drivers/net/i40e/meson.build
index bb0c542a30..7e84410a08 100644
--- a/drivers/net/i40e/meson.build
+++ b/drivers/net/i40e/meson.build
@@ -44,6 +44,30 @@ if arch_subdir == 'x86'
 				c_args: [cflags, '-mavx2'])
 		objs += i40e_avx2_lib.extract_objects('i40e_rxtx_vec_avx2.c')
 	endif
+	# true when the configured machine_args already define both AVX512F and AVX512BW
+	i40e_avx512_cpu_support = (
+		cc.get_define('__AVX512F__', args: machine_args) != '' and
+		cc.get_define('__AVX512BW__', args: machine_args) != '')
+	# true when AVX512F is not explicitly disabled and the compiler accepts both flags
+	i40e_avx512_cc_support = (
+		not machine_args.contains('-mno-avx512f') and
+		cc.has_argument('-mavx512f') and
+		cc.has_argument('-mavx512bw'))
+	# either way: build the AVX512 file as its own static library with the AVX512 flags
+	if i40e_avx512_cpu_support == true or i40e_avx512_cc_support == true
+		cflags += ['-DCC_AVX512_SUPPORT']
+		avx512_args = [cflags, '-mavx512f', '-mavx512bw']
+		if cc.has_argument('-march=skylake-avx512')
+			avx512_args += '-march=skylake-avx512'
+		endif
+		i40e_avx512_lib = static_library('i40e_avx512_lib',
+				'i40e_rxtx_vec_avx512.c',
+				dependencies: [static_rte_ethdev,
+					static_rte_kvargs, static_rte_hash],
+				include_directories: includes,
+				c_args: avx512_args)
+		objs += i40e_avx512_lib.extract_objects('i40e_rxtx_vec_avx512.c')
+	endif
 elif arch_subdir == 'ppc'
        dpdk_conf.set('RTE_LIBRTE_I40E_INC_VECTOR', 1)
        sources += files('i40e_rxtx_vec_altivec.c')
-- 
2.17.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [PATCH 3/3] net/i40e: optimize Tx by using AVX512
  2020-12-15  2:19 [dpdk-dev] [PATCH 0/3] AVX512 vPMD on i40e Leyi Rong
  2020-12-15  2:19 ` [dpdk-dev] [PATCH 1/3] net/i40e: remove devarg use-latest-supported-vec Leyi Rong
  2020-12-15  2:19 ` [dpdk-dev] [PATCH 2/3] net/i40e: add AVX512 vector path Leyi Rong
@ 2020-12-15  2:19 ` Leyi Rong
  2021-01-07  7:44 ` [dpdk-dev] [PATCH v2 0/3] AVX512 vPMD on i40e Leyi Rong
  2021-01-14  6:39 ` [dpdk-dev] [PATCH v3 " Leyi Rong
  4 siblings, 0 replies; 42+ messages in thread
From: Leyi Rong @ 2020-12-15  2:19 UTC (permalink / raw)
  To: qi.z.zhang, bruce.richardson, beilei.xing; +Cc: dev, Leyi Rong

Optimize Tx path by using AVX512 instructions and vectorize the
tx free bufs process.

Signed-off-by: Leyi Rong <leyi.rong@intel.com>
Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/net/i40e/i40e_rxtx.c            |  19 +++
 drivers/net/i40e/i40e_rxtx.h            |   4 +
 drivers/net/i40e/i40e_rxtx_vec_avx512.c | 146 ++++++++++++++++++++----
 3 files changed, 149 insertions(+), 20 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 8357fb3ef8..96071c55fd 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -2508,6 +2508,25 @@ i40e_tx_queue_release_mbufs(struct i40e_tx_queue *txq)
 	 *  vPMD tx will not set sw_ring's mbuf to NULL after free,
 	 *  so need to free remains more carefully.
 	 */
+#ifdef CC_AVX512_SUPPORT
+	if (dev->tx_pkt_burst == i40e_xmit_pkts_vec_avx512) {
+		struct i40e_vec_tx_entry *swr = (void *)txq->sw_ring;
+
+		i = txq->tx_next_dd - txq->tx_rs_thresh + 1;
+		if (txq->tx_tail < i) {
+			for (; i < txq->nb_tx_desc; i++) {
+				rte_pktmbuf_free_seg(swr[i].mbuf);
+				swr[i].mbuf = NULL;
+			}
+			i = 0;
+		}
+		for (; i < txq->tx_tail; i++) {
+			rte_pktmbuf_free_seg(swr[i].mbuf);
+			swr[i].mbuf = NULL;
+		}
+		return;
+	}
+#endif
 	if (dev->tx_pkt_burst == i40e_xmit_pkts_vec_avx2 ||
 			dev->tx_pkt_burst == i40e_xmit_pkts_vec) {
 		i = txq->tx_next_dd - txq->tx_rs_thresh + 1;
diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h
index 2e3e50eb79..2f55073c97 100644
--- a/drivers/net/i40e/i40e_rxtx.h
+++ b/drivers/net/i40e/i40e_rxtx.h
@@ -129,6 +129,10 @@ struct i40e_tx_entry {
 	uint16_t last_id;
 };
 
+struct i40e_vec_tx_entry { /* sw_ring entry as seen by the AVX512 Tx path */
+	struct rte_mbuf *mbuf; /* mbuf held in this ring slot */
+};
+
 /*
  * Structure associated with each TX queue.
  */
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index ccddc3e2d4..cb4083fa46 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -873,6 +873,109 @@ i40e_recv_scattered_pkts_vec_avx512(void *rx_queue,
 				rx_pkts + retval, nb_pkts);
 }
 
+/**
+ * Release the mbufs of the oldest tx_rs_thresh descriptors once the
+ * hardware reports them done, and give those ring slots back to the queue.
+ *
+ * Fast path (DEV_TX_OFFLOAD_MBUF_FAST_FREE set and tx_rs_thresh a multiple
+ * of 32): the mbuf pointers are copied 32 at a time into the calling
+ * lcore's mempool cache with 512-bit loads/stores. If that lcore has no
+ * cache for the pool, or the batch is larger than
+ * RTE_MEMPOOL_CACHE_MAX_SIZE, the pointers are enqueued to the mempool
+ * backend directly instead.
+ * Generic path: every mbuf goes through rte_pktmbuf_prefree_seg() and is
+ * returned to its own pool, in bulk while consecutive mbufs share a pool.
+ *
+ * @param txq
+ *   Tx queue; its sw_ring is accessed as an array of i40e_vec_tx_entry.
+ * @return
+ *   tx_rs_thresh if a batch was freed, 0 if the threshold descriptor is
+ *   not done yet.
+ */
+static __rte_always_inline int
+i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
+{
+	struct i40e_vec_tx_entry *txep;
+	uint32_t n;
+	uint32_t i;
+	int nb_free = 0;
+	struct rte_mbuf *m, *to_free[RTE_I40E_TX_MAX_FREE_BUF_SZ];
+
+	/* check DD bits on threshold descriptor */
+	if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
+			rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
+			rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE))
+		return 0;
+
+	n = txq->tx_rs_thresh;
+
+	/* first buffer to free from S/W ring is at index
+	 * tx_next_dd - (tx_rs_thresh-1)
+	 */
+	txep = (void *)txq->sw_ring;
+	txep += txq->tx_next_dd - (n - 1);
+
+	if (txq->offloads & DEV_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
+		struct rte_mempool *mp = txep[0].mbuf->pool;
+		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
+				rte_lcore_id());
+		void **cache_objs;
+		uint32_t copied;
+
+		/* rte_mempool_default_cache() returns NULL when the pool has
+		 * no per-lcore cache or the caller is not a registered lcore,
+		 * so the cache must not be dereferenced before this check.
+		 * In that case, or when the batch is too large for a cache,
+		 * hand the pointers to the mempool backend directly.
+		 */
+		if (cache == NULL || n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+			goto done;
+		}
+
+		/* The cache follows the following algorithm
+		 *   1. Add the objects to the cache
+		 *   2. Anything greater than the cache min value (if it
+		 *   crosses the cache flush threshold) is flushed to the ring.
+		 */
+		cache_objs = &cache->objs[cache->len];
+
+		/* Add elements back into the cache; n is a multiple of 32.
+		 * Unaligned loads are used because nothing in this function
+		 * establishes that the sw_ring slots are 64-byte aligned.
+		 */
+		for (copied = 0; copied < n; copied += 32) {
+			const __m512i a = _mm512_loadu_si512(&txep[copied]);
+			const __m512i b =
+				_mm512_loadu_si512(&txep[copied + 8]);
+			const __m512i c =
+				_mm512_loadu_si512(&txep[copied + 16]);
+			const __m512i d =
+				_mm512_loadu_si512(&txep[copied + 24]);
+
+			_mm512_storeu_si512(&cache_objs[copied], a);
+			_mm512_storeu_si512(&cache_objs[copied + 8], b);
+			_mm512_storeu_si512(&cache_objs[copied + 16], c);
+			_mm512_storeu_si512(&cache_objs[copied + 24], d);
+		}
+		cache->len += n;
+
+		/* flush everything above the nominal cache size */
+		if (cache->len >= cache->flushthresh) {
+			rte_mempool_ops_enqueue_bulk
+				(mp, &cache->objs[cache->size],
+				cache->len - cache->size);
+			cache->len = cache->size;
+		}
+		goto done;
+	}
+
+	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
+	if (likely(m)) {
+		/* collect mbufs of the same pool and return them in bulk */
+		to_free[0] = m;
+		nb_free = 1;
+		for (i = 1; i < n; i++) {
+			rte_prefetch0(&txep[i + 3].mbuf->cacheline1);
+			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+			if (likely(m)) {
+				if (likely(m->pool == to_free[0]->pool)) {
+					to_free[nb_free++] = m;
+				} else {
+					/* pool changed: flush what we have */
+					rte_mempool_put_bulk(to_free[0]->pool,
+							     (void **)to_free,
+							     nb_free);
+					to_free[0] = m;
+					nb_free = 1;
+				}
+			}
+		}
+		rte_mempool_put_bulk(to_free[0]->pool, (void **)to_free,
+				     nb_free);
+	} else {
+		/* first mbuf is not freeable yet: return the others singly */
+		for (i = 1; i < n; i++) {
+			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+			if (m)
+				rte_mempool_put(m->pool, m);
+		}
+	}
+
+done:
+	/* buffers were freed, update counters */
+	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+	txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+	if (txq->tx_next_dd >= txq->nb_tx_desc)
+		txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+
+	return txq->tx_rs_thresh;
+}
+
 static inline void
 vtx1(volatile struct i40e_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags)
 {
@@ -892,13 +995,6 @@ vtx(volatile struct i40e_tx_desc *txdp,
 	const uint64_t hi_qw_tmpl = (I40E_TX_DESC_DTYPE_DATA |
 			((uint64_t)flags  << I40E_TXD_QW1_CMD_SHIFT));
 
-	/* if unaligned on 32-bit boundary, do one to align */
-	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
-		vtx1(txdp, *pkt, flags);
-		nb_pkts--, txdp++, pkt++;
-	}
-
-	/* do two at a time while possible, in bursts */
 	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
 		uint64_t hi_qw3 =
 			hi_qw_tmpl |
@@ -917,14 +1013,13 @@ vtx(volatile struct i40e_tx_desc *txdp,
 			((uint64_t)pkt[0]->data_len <<
 			 I40E_TXD_QW1_TX_BUF_SZ_SHIFT);
 
-		__m256i desc2_3 = _mm256_set_epi64x
+		__m512i desc0_3 =
+			_mm512_set_epi64
 			(hi_qw3, pkt[3]->buf_iova + pkt[3]->data_off,
-			hi_qw2, pkt[2]->buf_iova + pkt[2]->data_off);
-		__m256i desc0_1 = _mm256_set_epi64x
-			(hi_qw1, pkt[1]->buf_iova + pkt[1]->data_off,
+			hi_qw2, pkt[2]->buf_iova + pkt[2]->data_off,
+			hi_qw1, pkt[1]->buf_iova + pkt[1]->data_off,
 			hi_qw0, pkt[0]->buf_iova + pkt[0]->data_off);
-		_mm256_store_si256((void *)(txdp + 2), desc2_3);
-		_mm256_store_si256((void *)txdp, desc0_1);
+		_mm512_storeu_si512((void *)txdp, desc0_3);
 	}
 
 	/* do any last ones */
@@ -934,13 +1029,23 @@ vtx(volatile struct i40e_tx_desc *txdp,
 	}
 }
 
+static __rte_always_inline void
+tx_backlog_entry_avx512(struct i40e_vec_tx_entry *txep,
+			struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	int i;
+
+	for (i = 0; i < (int)nb_pkts; ++i)
+		txep[i].mbuf = tx_pkts[i];
+}
+
 static inline uint16_t
 i40e_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 				 uint16_t nb_pkts)
 {
 	struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue;
 	volatile struct i40e_tx_desc *txdp;
-	struct i40e_tx_entry *txep;
+	struct i40e_vec_tx_entry *txep;
 	uint16_t n, nb_commit, tx_id;
 	uint64_t flags = I40E_TD_CMD;
 	uint64_t rs = I40E_TX_DESC_CMD_RS | I40E_TD_CMD;
@@ -949,7 +1054,7 @@ i40e_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
 
 	if (txq->nb_tx_free < txq->tx_free_thresh)
-		i40e_tx_free_bufs(txq);
+		i40e_tx_free_bufs_avx512(txq);
 
 	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
 	if (unlikely(nb_pkts == 0))
@@ -957,13 +1062,14 @@ i40e_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	tx_id = txq->tx_tail;
 	txdp = &txq->tx_ring[tx_id];
-	txep = &txq->sw_ring[tx_id];
+	txep = (void *)txq->sw_ring;
+	txep += tx_id;
 
 	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
 
 	n = (uint16_t)(txq->nb_tx_desc - tx_id);
 	if (nb_commit >= n) {
-		tx_backlog_entry(txep, tx_pkts, n);
+		tx_backlog_entry_avx512(txep, tx_pkts, n);
 
 		vtx(txdp, tx_pkts, n - 1, flags);
 		tx_pkts += (n - 1);
@@ -977,11 +1083,11 @@ i40e_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
 
 		/* avoid reach the end of ring */
-		txdp = &txq->tx_ring[tx_id];
-		txep = &txq->sw_ring[tx_id];
+		txdp = txq->tx_ring;
+		txep = (void *)txq->sw_ring;
 	}
 
-	tx_backlog_entry(txep, tx_pkts, nb_commit);
+	tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);
 
 	vtx(txdp, tx_pkts, nb_commit, flags);
 
-- 
2.17.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [PATCH v2 0/3] AVX512 vPMD on i40e
  2020-12-15  2:19 [dpdk-dev] [PATCH 0/3] AVX512 vPMD on i40e Leyi Rong
                   ` (2 preceding siblings ...)
  2020-12-15  2:19 ` [dpdk-dev] [PATCH 3/3] net/i40e: optimize Tx by using AVX512 Leyi Rong
@ 2021-01-07  7:44 ` Leyi Rong
  2021-01-07  7:44   ` [dpdk-dev] [PATCH v2 1/3] net/i40e: remove devarg use-latest-supported-vec Leyi Rong
                     ` (3 more replies)
  2021-01-14  6:39 ` [dpdk-dev] [PATCH v3 " Leyi Rong
  4 siblings, 4 replies; 42+ messages in thread
From: Leyi Rong @ 2021-01-07  7:44 UTC (permalink / raw)
  To: qi.z.zhang, bruce.richardson, beilei.xing; +Cc: dev, Leyi Rong

This patchset aims to support AVX512 vPMD on i40e.
The changes only target the AVX512 vector path.

---
v2:
- Add return value check on rte_mempool_default_cache().

Leyi Rong (3):
  net/i40e: remove devarg use-latest-supported-vec
  net/i40e: add AVX512 vector path
  net/i40e: optimize Tx by using AVX512

 doc/guides/nics/i40e.rst                |    9 -
 drivers/net/i40e/i40e_ethdev.c          |   63 +-
 drivers/net/i40e/i40e_ethdev.h          |    3 -
 drivers/net/i40e/i40e_rxtx.c            |  193 ++--
 drivers/net/i40e/i40e_rxtx.h            |   13 +
 drivers/net/i40e/i40e_rxtx_vec_avx512.c | 1136 +++++++++++++++++++++++
 drivers/net/i40e/meson.build            |   24 +
 7 files changed, 1293 insertions(+), 148 deletions(-)
 create mode 100644 drivers/net/i40e/i40e_rxtx_vec_avx512.c

-- 
2.17.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [PATCH v2 1/3] net/i40e: remove devarg use-latest-supported-vec
  2021-01-07  7:44 ` [dpdk-dev] [PATCH v2 0/3] AVX512 vPMD on i40e Leyi Rong
@ 2021-01-07  7:44   ` Leyi Rong
  2021-01-13  6:12     ` Lu, Wenzhuo
  2021-01-13 13:40     ` Ferruh Yigit
  2021-01-07  7:44   ` [dpdk-dev] [PATCH v2 2/3] net/i40e: add AVX512 vector path Leyi Rong
                     ` (2 subsequent siblings)
  3 siblings, 2 replies; 42+ messages in thread
From: Leyi Rong @ 2021-01-07  7:44 UTC (permalink / raw)
  To: qi.z.zhang, bruce.richardson, beilei.xing; +Cc: dev, Leyi Rong

As the EAL parameter --force-max-simd-bitwidth has already been
introduced, remove support for the use-latest-supported-vec devarg
to make the selection of the Rx/Tx function clearer.

Signed-off-by: Leyi Rong <leyi.rong@intel.com>
---
 doc/guides/nics/i40e.rst       |   9 ---
 drivers/net/i40e/i40e_ethdev.c |  63 +------------------
 drivers/net/i40e/i40e_ethdev.h |   3 -
 drivers/net/i40e/i40e_rxtx.c   | 107 +++++++++++----------------------
 4 files changed, 35 insertions(+), 147 deletions(-)

diff --git a/doc/guides/nics/i40e.rst b/doc/guides/nics/i40e.rst
index 4e5c4679b8..90fb8a4d6f 100644
--- a/doc/guides/nics/i40e.rst
+++ b/doc/guides/nics/i40e.rst
@@ -209,15 +209,6 @@ Runtime Config Options
   Currently hot-plugging of representor ports is not supported so all required
   representors must be specified on the creation of the PF.
 
-- ``Use latest supported vector`` (default ``disable``)
-
-  Latest supported vector path may not always get the best perf so vector path was
-  recommended to use only on later platform. But users may want the latest vector path
-  since it can get better perf in some real work loading cases. So ``devargs`` param
-  ``use-latest-supported-vec`` is introduced, for example::
-
-  -a 84:00.0,use-latest-supported-vec=1
-
 - ``Enable validation for VF message`` (default ``not enabled``)
 
   The PF counts messages from each VF. If in any period of seconds the message
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index f54769c29d..223eb9950b 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -44,7 +44,6 @@
 #define ETH_I40E_FLOATING_VEB_LIST_ARG	"floating_veb_list"
 #define ETH_I40E_SUPPORT_MULTI_DRIVER	"support-multi-driver"
 #define ETH_I40E_QUEUE_NUM_PER_VF_ARG	"queue-num-per-vf"
-#define ETH_I40E_USE_LATEST_VEC	"use-latest-supported-vec"
 #define ETH_I40E_VF_MSG_CFG		"vf_msg_cfg"
 
 #define I40E_CLEAR_PXE_WAIT_MS     200
@@ -403,7 +402,6 @@ static const char *const valid_keys[] = {
 	ETH_I40E_FLOATING_VEB_LIST_ARG,
 	ETH_I40E_SUPPORT_MULTI_DRIVER,
 	ETH_I40E_QUEUE_NUM_PER_VF_ARG,
-	ETH_I40E_USE_LATEST_VEC,
 	ETH_I40E_VF_MSG_CFG,
 	NULL};
 
@@ -1301,62 +1299,6 @@ i40e_aq_debug_write_global_register(struct i40e_hw *hw,
 	return i40e_aq_debug_write_register(hw, reg_addr, reg_val, cmd_details);
 }
 
-static int
-i40e_parse_latest_vec_handler(__rte_unused const char *key,
-				const char *value,
-				void *opaque)
-{
-	struct i40e_adapter *ad = opaque;
-	int use_latest_vec;
-
-	use_latest_vec = atoi(value);
-
-	if (use_latest_vec != 0 && use_latest_vec != 1)
-		PMD_DRV_LOG(WARNING, "Value should be 0 or 1, set it as 1!");
-
-	ad->use_latest_vec = (uint8_t)use_latest_vec;
-
-	return 0;
-}
-
-static int
-i40e_use_latest_vec(struct rte_eth_dev *dev)
-{
-	struct i40e_adapter *ad =
-		I40E_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
-	struct rte_kvargs *kvlist;
-	int kvargs_count;
-
-	ad->use_latest_vec = false;
-
-	if (!dev->device->devargs)
-		return 0;
-
-	kvlist = rte_kvargs_parse(dev->device->devargs->args, valid_keys);
-	if (!kvlist)
-		return -EINVAL;
-
-	kvargs_count = rte_kvargs_count(kvlist, ETH_I40E_USE_LATEST_VEC);
-	if (!kvargs_count) {
-		rte_kvargs_free(kvlist);
-		return 0;
-	}
-
-	if (kvargs_count > 1)
-		PMD_DRV_LOG(WARNING, "More than one argument \"%s\" and only "
-			    "the first invalid or last valid one is used !",
-			    ETH_I40E_USE_LATEST_VEC);
-
-	if (rte_kvargs_process(kvlist, ETH_I40E_USE_LATEST_VEC,
-				i40e_parse_latest_vec_handler, ad) < 0) {
-		rte_kvargs_free(kvlist);
-		return -EINVAL;
-	}
-
-	rte_kvargs_free(kvlist);
-	return 0;
-}
-
 static int
 read_vf_msg_config(__rte_unused const char *key,
 			       const char *value,
@@ -1507,8 +1449,6 @@ eth_i40e_dev_init(struct rte_eth_dev *dev, void *init_params __rte_unused)
 	i40e_parse_vf_msg_config(dev, &pf->vf_msg_cfg);
 	/* Check if need to support multi-driver */
 	i40e_support_multi_driver(dev);
-	/* Check if users want the latest supported vec path */
-	i40e_use_latest_vec(dev);
 
 	/* Make sure all is clean before doing PF reset */
 	i40e_clear_hw(hw);
@@ -13010,5 +12950,4 @@ RTE_PMD_REGISTER_PARAM_STRING(net_i40e,
 			      ETH_I40E_FLOATING_VEB_ARG "=1"
 			      ETH_I40E_FLOATING_VEB_LIST_ARG "=<string>"
 			      ETH_I40E_QUEUE_NUM_PER_VF_ARG "=1|2|4|8|16"
-			      ETH_I40E_SUPPORT_MULTI_DRIVER "=1"
-			      ETH_I40E_USE_LATEST_VEC "=0|1");
+			      ETH_I40E_SUPPORT_MULTI_DRIVER "=1");
diff --git a/drivers/net/i40e/i40e_ethdev.h b/drivers/net/i40e/i40e_ethdev.h
index 696c5aaf7e..70e6ba610b 100644
--- a/drivers/net/i40e/i40e_ethdev.h
+++ b/drivers/net/i40e/i40e_ethdev.h
@@ -1285,9 +1285,6 @@ struct i40e_adapter {
 	uint64_t flow_types_mask;
 	uint64_t pctypes_mask;
 
-	/* For devargs */
-	uint8_t use_latest_vec;
-
 	/* For RSS reta table update */
 	uint8_t rss_reta_updated;
 };
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 5df9a9df56..2910619fa5 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -3095,43 +3095,13 @@ i40e_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
 	qinfo->conf.offloads = txq->offloads;
 }
 
-static eth_rx_burst_t
-i40e_get_latest_rx_vec(bool scatter)
-{
-#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-		return scatter ? i40e_recv_scattered_pkts_vec_avx2 :
-				 i40e_recv_pkts_vec_avx2;
-#endif
-	return scatter ? i40e_recv_scattered_pkts_vec :
-			 i40e_recv_pkts_vec;
-}
-
-static eth_rx_burst_t
-i40e_get_recommend_rx_vec(bool scatter)
-{
-#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT)
-	/*
-	 * since AVX frequency can be different to base frequency, limit
-	 * use of AVX2 version to later plaforms, not all those that could
-	 * theoretically run it.
-	 */
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-		return scatter ? i40e_recv_scattered_pkts_vec_avx2 :
-				 i40e_recv_pkts_vec_avx2;
-#endif
-	return scatter ? i40e_recv_scattered_pkts_vec :
-			 i40e_recv_pkts_vec;
-}
-
 void __rte_cold
 i40e_set_rx_function(struct rte_eth_dev *dev)
 {
 	struct i40e_adapter *ad =
 		I40E_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	uint16_t rx_using_sse, i;
+	bool use_avx2 = false;
 	/* In order to allow Vector Rx there are a few configuration
 	 * conditions to be met and Rx Bulk Allocation should be allowed.
 	 */
@@ -3154,20 +3124,33 @@ i40e_set_rx_function(struct rte_eth_dev *dev)
 					break;
 				}
 			}
+
+			if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+			     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+					rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+				use_avx2 = true;
 		}
 	}
 
 	if (ad->rx_vec_allowed  &&
 			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-		/* Vec Rx path */
-		PMD_INIT_LOG(DEBUG, "Vector Rx path will be used on port=%d.",
+		if (dev->data->scattered_rx) {
+			PMD_INIT_LOG(DEBUG,
+				"Using %sVector Scattered Rx (port %d).",
+				use_avx2 ? "avx2 " : "",
 				dev->data->port_id);
-		if (ad->use_latest_vec)
-			dev->rx_pkt_burst =
-			i40e_get_latest_rx_vec(dev->data->scattered_rx);
-		else
-			dev->rx_pkt_burst =
-			i40e_get_recommend_rx_vec(dev->data->scattered_rx);
+			dev->rx_pkt_burst = use_avx2 ?
+				i40e_recv_scattered_pkts_vec_avx2 :
+				i40e_recv_scattered_pkts_vec;
+		} else {
+			PMD_INIT_LOG(DEBUG,
+				"Using %sVector Rx (port %d).",
+				use_avx2 ? "avx2 " : "",
+				dev->data->port_id);
+			dev->rx_pkt_burst = use_avx2 ?
+				i40e_recv_pkts_vec_avx2 :
+				i40e_recv_pkts_vec;
+		}
 	} else if (!dev->data->scattered_rx && ad->rx_bulk_alloc_allowed) {
 		PMD_INIT_LOG(DEBUG, "Rx Burst Bulk Alloc Preconditions are "
 				    "satisfied. Rx Burst Bulk Alloc function "
@@ -3268,39 +3251,13 @@ i40e_set_tx_function_flag(struct rte_eth_dev *dev, struct i40e_tx_queue *txq)
 				txq->queue_id);
 }
 
-static eth_tx_burst_t
-i40e_get_latest_tx_vec(void)
-{
-#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-		return i40e_xmit_pkts_vec_avx2;
-#endif
-	return i40e_xmit_pkts_vec;
-}
-
-static eth_tx_burst_t
-i40e_get_recommend_tx_vec(void)
-{
-#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT)
-	/*
-	 * since AVX frequency can be different to base frequency, limit
-	 * use of AVX2 version to later plaforms, not all those that could
-	 * theoretically run it.
-	 */
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-		return i40e_xmit_pkts_vec_avx2;
-#endif
-	return i40e_xmit_pkts_vec;
-}
-
 void __rte_cold
 i40e_set_tx_function(struct rte_eth_dev *dev)
 {
 	struct i40e_adapter *ad =
 		I40E_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	int i;
+	bool use_avx2 = false;
 
 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
 		if (ad->tx_vec_allowed) {
@@ -3313,19 +3270,23 @@ i40e_set_tx_function(struct rte_eth_dev *dev)
 					break;
 				}
 			}
+
+			if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+			     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+					rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+				use_avx2 = true;
 		}
 	}
 
 	if (ad->tx_simple_allowed) {
 		if (ad->tx_vec_allowed &&
 				rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-			PMD_INIT_LOG(DEBUG, "Vector tx finally be used.");
-			if (ad->use_latest_vec)
-				dev->tx_pkt_burst =
-					i40e_get_latest_tx_vec();
-			else
-				dev->tx_pkt_burst =
-					i40e_get_recommend_tx_vec();
+			PMD_INIT_LOG(DEBUG, "Using %sVector Tx (port %d).",
+				     use_avx2 ? "avx2 " : "",
+				     dev->data->port_id);
+			dev->tx_pkt_burst = use_avx2 ?
+					    i40e_xmit_pkts_vec_avx2 :
+					    i40e_xmit_pkts_vec;
 		} else {
 			PMD_INIT_LOG(DEBUG, "Simple tx finally be used.");
 			dev->tx_pkt_burst = i40e_xmit_pkts_simple;
-- 
2.17.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [PATCH v2 2/3] net/i40e: add AVX512 vector path
  2021-01-07  7:44 ` [dpdk-dev] [PATCH v2 0/3] AVX512 vPMD on i40e Leyi Rong
  2021-01-07  7:44   ` [dpdk-dev] [PATCH v2 1/3] net/i40e: remove devarg use-latest-supported-vec Leyi Rong
@ 2021-01-07  7:44   ` Leyi Rong
  2021-01-13  6:13     ` Lu, Wenzhuo
  2021-01-07  7:44   ` [dpdk-dev] [PATCH v2 3/3] net/i40e: optimize Tx by using AVX512 Leyi Rong
  2021-01-13  9:53   ` [dpdk-dev] [PATCH v2 0/3] AVX512 vPMD on i40e Zhang, Qi Z
  3 siblings, 1 reply; 42+ messages in thread
From: Leyi Rong @ 2021-01-07  7:44 UTC (permalink / raw)
  To: qi.z.zhang, bruce.richardson, beilei.xing; +Cc: dev, Leyi Rong

Add AVX512 support for i40e PMD. This patch adds i40e_rxtx_vec_avx512.c
to support i40e AVX512 vPMD.

This patch aims to enable AVX512 on i40e vPMD. Compared with the AVX2
vPMD, the main changes are focused on the Rx path.

Signed-off-by: Leyi Rong <leyi.rong@intel.com>
Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/net/i40e/i40e_rxtx.c            |  117 ++-
 drivers/net/i40e/i40e_rxtx.h            |    9 +
 drivers/net/i40e/i40e_rxtx_vec_avx512.c | 1024 +++++++++++++++++++++++
 drivers/net/i40e/meson.build            |   24 +
 4 files changed, 1148 insertions(+), 26 deletions(-)
 create mode 100644 drivers/net/i40e/i40e_rxtx_vec_avx512.c

diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 2910619fa5..8357fb3ef8 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -1742,6 +1742,10 @@ i40e_dev_supported_ptypes_get(struct rte_eth_dev *dev)
 	    dev->rx_pkt_burst == i40e_recv_scattered_pkts ||
 	    dev->rx_pkt_burst == i40e_recv_scattered_pkts_vec ||
 	    dev->rx_pkt_burst == i40e_recv_pkts_vec ||
+#ifdef CC_AVX512_SUPPORT
+	    dev->rx_pkt_burst == i40e_recv_scattered_pkts_vec_avx512 ||
+	    dev->rx_pkt_burst == i40e_recv_pkts_vec_avx512 ||
+#endif
 	    dev->rx_pkt_burst == i40e_recv_scattered_pkts_vec_avx2 ||
 	    dev->rx_pkt_burst == i40e_recv_pkts_vec_avx2)
 		return ptypes;
@@ -3102,6 +3106,7 @@ i40e_set_rx_function(struct rte_eth_dev *dev)
 		I40E_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	uint16_t rx_using_sse, i;
 	bool use_avx2 = false;
+	bool use_avx512 = false;
 	/* In order to allow Vector Rx there are a few configuration
 	 * conditions to be met and Rx Bulk Allocation should be allowed.
 	 */
@@ -3125,9 +3130,19 @@ i40e_set_rx_function(struct rte_eth_dev *dev)
 				}
 			}
 
-			if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-			     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-					rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+			if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512 &&
+			rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
+			rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1)
+#ifdef CC_AVX512_SUPPORT
+				use_avx512 = true;
+#else
+				PMD_DRV_LOG(NOTICE,
+					"AVX512 is not supported in build env");
+#endif
+			if (!use_avx512 &&
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+			rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
 				use_avx2 = true;
 		}
 	}
@@ -3135,21 +3150,41 @@ i40e_set_rx_function(struct rte_eth_dev *dev)
 	if (ad->rx_vec_allowed  &&
 			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
 		if (dev->data->scattered_rx) {
-			PMD_INIT_LOG(DEBUG,
-				"Using %sVector Scattered Rx (port %d).",
-				use_avx2 ? "avx2 " : "",
-				dev->data->port_id);
-			dev->rx_pkt_burst = use_avx2 ?
-				i40e_recv_scattered_pkts_vec_avx2 :
-				i40e_recv_scattered_pkts_vec;
+			if (use_avx512) {
+#ifdef CC_AVX512_SUPPORT
+				PMD_DRV_LOG(NOTICE,
+					"Using AVX512 Vector Scattered Rx (port %d).",
+					dev->data->port_id);
+				dev->rx_pkt_burst =
+					i40e_recv_scattered_pkts_vec_avx512;
+#endif
+			} else {
+				PMD_INIT_LOG(DEBUG,
+					"Using %sVector Scattered Rx (port %d).",
+					use_avx2 ? "avx2 " : "",
+					dev->data->port_id);
+				dev->rx_pkt_burst = use_avx2 ?
+					i40e_recv_scattered_pkts_vec_avx2 :
+					i40e_recv_scattered_pkts_vec;
+			}
 		} else {
-			PMD_INIT_LOG(DEBUG,
-				"Using %sVector Rx (port %d).",
-				use_avx2 ? "avx2 " : "",
-				dev->data->port_id);
-			dev->rx_pkt_burst = use_avx2 ?
-				i40e_recv_pkts_vec_avx2 :
-				i40e_recv_pkts_vec;
+			if (use_avx512) {
+#ifdef CC_AVX512_SUPPORT
+				PMD_DRV_LOG(NOTICE,
+					"Using AVX512 Vector Rx (port %d).",
+					dev->data->port_id);
+				dev->rx_pkt_burst =
+					i40e_recv_pkts_vec_avx512;
+#endif
+			} else {
+				PMD_INIT_LOG(DEBUG,
+					"Using %sVector Rx (port %d).",
+					use_avx2 ? "avx2 " : "",
+					dev->data->port_id);
+				dev->rx_pkt_burst = use_avx2 ?
+					i40e_recv_pkts_vec_avx2 :
+					i40e_recv_pkts_vec;
+			}
 		}
 	} else if (!dev->data->scattered_rx && ad->rx_bulk_alloc_allowed) {
 		PMD_INIT_LOG(DEBUG, "Rx Burst Bulk Alloc Preconditions are "
@@ -3172,6 +3207,10 @@ i40e_set_rx_function(struct rte_eth_dev *dev)
 		rx_using_sse =
 			(dev->rx_pkt_burst == i40e_recv_scattered_pkts_vec ||
 			 dev->rx_pkt_burst == i40e_recv_pkts_vec ||
+#ifdef CC_AVX512_SUPPORT
+			 dev->rx_pkt_burst == i40e_recv_scattered_pkts_vec_avx512 ||
+			 dev->rx_pkt_burst == i40e_recv_pkts_vec_avx512 ||
+#endif
 			 dev->rx_pkt_burst == i40e_recv_scattered_pkts_vec_avx2 ||
 			 dev->rx_pkt_burst == i40e_recv_pkts_vec_avx2);
 
@@ -3192,6 +3231,10 @@ static const struct {
 	{ i40e_recv_pkts_bulk_alloc,         "Scalar Bulk Alloc" },
 	{ i40e_recv_pkts,                    "Scalar" },
 #ifdef RTE_ARCH_X86
+#ifdef CC_AVX512_SUPPORT
+	{ i40e_recv_scattered_pkts_vec_avx512, "Vector AVX512 Scattered" },
+	{ i40e_recv_pkts_vec_avx512,           "Vector AVX512" },
+#endif
 	{ i40e_recv_scattered_pkts_vec_avx2, "Vector AVX2 Scattered" },
 	{ i40e_recv_pkts_vec_avx2,           "Vector AVX2" },
 	{ i40e_recv_scattered_pkts_vec,      "Vector SSE Scattered" },
@@ -3258,6 +3301,7 @@ i40e_set_tx_function(struct rte_eth_dev *dev)
 		I40E_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	int i;
 	bool use_avx2 = false;
+	bool use_avx512 = false;
 
 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
 		if (ad->tx_vec_allowed) {
@@ -3271,9 +3315,19 @@ i40e_set_tx_function(struct rte_eth_dev *dev)
 				}
 			}
 
-			if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-			     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-					rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+			if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512 &&
+			rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
+			rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1)
+#ifdef CC_AVX512_SUPPORT
+				use_avx512 = true;
+#else
+			PMD_DRV_LOG(NOTICE,
+				"AVX512 is not supported in build env");
+#endif
+			if (!use_avx512 &&
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+			rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
 				use_avx2 = true;
 		}
 	}
@@ -3281,12 +3335,20 @@ i40e_set_tx_function(struct rte_eth_dev *dev)
 	if (ad->tx_simple_allowed) {
 		if (ad->tx_vec_allowed &&
 				rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-			PMD_INIT_LOG(DEBUG, "Using %sVector Tx (port %d).",
-				     use_avx2 ? "avx2 " : "",
-				     dev->data->port_id);
-			dev->tx_pkt_burst = use_avx2 ?
-					    i40e_xmit_pkts_vec_avx2 :
-					    i40e_xmit_pkts_vec;
+			if (use_avx512) {
+#ifdef CC_AVX512_SUPPORT
+				PMD_DRV_LOG(NOTICE, "Using AVX512 Vector Tx (port %d).",
+					    dev->data->port_id);
+				dev->tx_pkt_burst = i40e_xmit_pkts_vec_avx512;
+#endif
+			} else {
+				PMD_INIT_LOG(DEBUG, "Using %sVector Tx (port %d).",
+					     use_avx2 ? "avx2 " : "",
+					     dev->data->port_id);
+				dev->tx_pkt_burst = use_avx2 ?
+						    i40e_xmit_pkts_vec_avx2 :
+						    i40e_xmit_pkts_vec;
+			}
 		} else {
 			PMD_INIT_LOG(DEBUG, "Simple tx finally be used.");
 			dev->tx_pkt_burst = i40e_xmit_pkts_simple;
@@ -3306,6 +3368,9 @@ static const struct {
 	{ i40e_xmit_pkts_simple,   "Scalar Simple" },
 	{ i40e_xmit_pkts,          "Scalar" },
 #ifdef RTE_ARCH_X86
+#ifdef CC_AVX512_SUPPORT
+	{ i40e_xmit_pkts_vec_avx512, "Vector AVX512" },
+#endif
 	{ i40e_xmit_pkts_vec_avx2, "Vector AVX2" },
 	{ i40e_xmit_pkts_vec,      "Vector SSE" },
 #elif defined(RTE_ARCH_ARM64)
diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h
index 57d7b4160b..2e3e50eb79 100644
--- a/drivers/net/i40e/i40e_rxtx.h
+++ b/drivers/net/i40e/i40e_rxtx.h
@@ -248,6 +248,15 @@ uint16_t i40e_recv_scattered_pkts_vec_avx2(void *rx_queue,
 	struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
 uint16_t i40e_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_pkts);
+uint16_t i40e_recv_pkts_vec_avx512(void *rx_queue,
+				   struct rte_mbuf **rx_pkts,
+				   uint16_t nb_pkts);
+uint16_t i40e_recv_scattered_pkts_vec_avx512(void *rx_queue,
+					     struct rte_mbuf **rx_pkts,
+					     uint16_t nb_pkts);
+uint16_t i40e_xmit_pkts_vec_avx512(void *tx_queue,
+				   struct rte_mbuf **tx_pkts,
+				   uint16_t nb_pkts);
 
 /* For each value it means, datasheet of hardware can tell more details
  *
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
new file mode 100644
index 0000000000..ccddc3e2d4
--- /dev/null
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -0,0 +1,1024 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <rte_ethdev_driver.h>
+#include <rte_malloc.h>
+
+#include "base/i40e_prototype.h"
+#include "base/i40e_type.h"
+#include "i40e_ethdev.h"
+#include "i40e_rxtx.h"
+#include "i40e_rxtx_vec_common.h"
+
+#include <x86intrin.h>
+
+#ifndef __INTEL_COMPILER
+#pragma GCC diagnostic ignored "-Wcast-qual"
+#endif
+
+#define RTE_I40E_DESCS_PER_LOOP_AVX 8
+
+static inline void
+i40e_rxq_rearm(struct i40e_rx_queue *rxq)
+{
+	int i;
+	uint16_t rx_id;
+	volatile union i40e_rx_desc *rxdp;
+	struct i40e_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
+	struct rte_mempool_cache *cache = rte_mempool_default_cache(rxq->mp,
+			rte_lcore_id());
+
+	rxdp = rxq->rx_ring + rxq->rxrearm_start;
+
+	/* We need to pull 'n' more MBUFs into the software ring from mempool
+	 * We inline the mempool function here, so we can vectorize the copy
+	 * from the cache into the shadow ring.
+	 */
+
+	if (cache->len < RTE_I40E_RXQ_REARM_THRESH) {
+		/* No. Backfill the cache first, and then fill from it */
+		uint32_t req = RTE_I40E_RXQ_REARM_THRESH + (cache->size -
+				cache->len);
+
+		/* How many do we require
+		 * i.e. number to fill the cache + the request
+		 */
+		int ret = rte_mempool_ops_dequeue_bulk(rxq->mp,
+				&cache->objs[cache->len], req);
+		if (ret == 0) {
+			cache->len += req;
+		} else {
+			if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
+					rxq->nb_rx_desc) {
+				__m128i dma_addr0;
+
+				dma_addr0 = _mm_setzero_si128();
+				for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+					rxep[i].mbuf = &rxq->fake_mbuf;
+					_mm_store_si128
+						((__m128i *)&rxdp[i].read,
+							dma_addr0);
+				}
+			}
+			rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+					RTE_I40E_RXQ_REARM_THRESH;
+			return;
+		}
+	}
+
+	const __m512i iova_offsets =  _mm512_set1_epi64
+		(offsetof(struct rte_mbuf, buf_iova));
+	const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
+
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+	/* to shuffle the addresses to correct slots. Values 4-7 will contain
+	 * zeros, so use 7 for a zero-value.
+	 */
+	const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);
+#else
+	const __m512i permute_idx = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
+#endif
+
+	/* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
+	 * from mempool cache and populating both shadow and HW rings
+	 */
+	for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH / 8; i++) {
+		const __m512i mbuf_ptrs = _mm512_loadu_si512
+			(&cache->objs[cache->len - 8]);
+		_mm512_store_si512(rxep, mbuf_ptrs);
+
+		/* gather iova of mbuf0-7 into one zmm reg */
+		const __m512i iova_base_addrs = _mm512_i64gather_epi64
+			(_mm512_add_epi64(mbuf_ptrs, iova_offsets),
+				0, /* base */
+				1 /* scale */);
+		const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
+				headroom);
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+		const __m512i iovas0 = _mm512_castsi256_si512
+			(_mm512_extracti64x4_epi64(iova_addrs, 0));
+		const __m512i iovas1 = _mm512_castsi256_si512
+			(_mm512_extracti64x4_epi64(iova_addrs, 1));
+
+		/* permute leaves desc 2-3 addresses in header address slots 0-1
+		 * but these are ignored by driver since header split not
+		 * enabled. Similarly for desc 4 & 5.
+		 */
+		const __m512i desc_rd_0_1 = _mm512_permutexvar_epi64
+			(permute_idx, iovas0);
+		const __m512i desc_rd_2_3 = _mm512_bsrli_epi128(desc_rd_0_1, 8);
+
+		const __m512i desc_rd_4_5 = _mm512_permutexvar_epi64
+			(permute_idx, iovas1);
+		const __m512i desc_rd_6_7 = _mm512_bsrli_epi128(desc_rd_4_5, 8);
+
+		_mm512_store_si512((void *)rxdp, desc_rd_0_1);
+		_mm512_store_si512((void *)(rxdp + 2), desc_rd_2_3);
+		_mm512_store_si512((void *)(rxdp + 4), desc_rd_4_5);
+		_mm512_store_si512((void *)(rxdp + 6), desc_rd_6_7);
+#else
+		/* permute leaves desc 4-7 addresses in header address slots 0-3
+		 * but these are ignored by driver since header split not
+		 * enabled.
+		 */
+		const __m512i desc_rd_0_3 = _mm512_permutexvar_epi64
+			(permute_idx, iova_addrs);
+		const __m512i desc_rd_4_7 = _mm512_bsrli_epi128(desc_rd_0_3, 8);
+
+		_mm512_store_si512((void *)rxdp, desc_rd_0_3);
+		_mm512_store_si512((void *)(rxdp + 4), desc_rd_4_7);
+#endif
+		rxep += 8, rxdp += 8, cache->len -= 8;
+	}
+
+	rxq->rxrearm_start += RTE_I40E_RXQ_REARM_THRESH;
+	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
+		rxq->rxrearm_start = 0;
+
+	rxq->rxrearm_nb -= RTE_I40E_RXQ_REARM_THRESH;
+
+	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
+			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
+
+	/* Update the tail pointer on the NIC */
+	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
+}
+
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+/* Handles 32B descriptor FDIR ID processing:
+ * rxdp: receive descriptor ring, required to load 2nd 16B half of each desc
+ * rx_pkts: required to store metadata back to mbufs
+ * pkt_idx: offset into the burst, increments in vector widths
+ * desc_idx: required to select the correct shift at compile time
+ */
+static inline __m256i
+desc_fdir_processing_32b(volatile union i40e_rx_desc *rxdp,
+			 struct rte_mbuf **rx_pkts,
+			 const uint32_t pkt_idx,
+			 const uint32_t desc_idx)
+{
+	/* 32B desc path: load rxdp.wb.qword2 for EXT_STATUS and FLEXBH_STAT */
+	__m128i *rxdp_desc_0 = (void *)(&rxdp[desc_idx + 0].wb.qword2);
+	__m128i *rxdp_desc_1 = (void *)(&rxdp[desc_idx + 1].wb.qword2);
+	const __m128i desc_qw2_0 = _mm_load_si128(rxdp_desc_0);
+	const __m128i desc_qw2_1 = _mm_load_si128(rxdp_desc_1);
+
+	/* Mask for FLEXBH_STAT, and the FDIR_ID value to compare against. The
+	 * remaining data is set to all 1's to pass through data.
+	 */
+	const __m256i flexbh_mask = _mm256_set_epi32(-1, -1, -1, 3 << 4,
+						     -1, -1, -1, 3 << 4);
+	const __m256i flexbh_id   = _mm256_set_epi32(-1, -1, -1, 1 << 4,
+						     -1, -1, -1, 1 << 4);
+
+	/* Load descriptor, check for FLEXBH bits, generate a mask for both
+	 * packets in the register.
+	 */
+	__m256i desc_qw2_0_1 =
+		_mm256_inserti128_si256(_mm256_castsi128_si256(desc_qw2_0),
+					desc_qw2_1, 1);
+	__m256i desc_tmp_msk = _mm256_and_si256(flexbh_mask, desc_qw2_0_1);
+	__m256i fdir_mask = _mm256_cmpeq_epi32(flexbh_id, desc_tmp_msk);
+	__m256i fdir_data = _mm256_alignr_epi8(desc_qw2_0_1, desc_qw2_0_1, 12);
+	__m256i desc_fdir_data = _mm256_and_si256(fdir_mask, fdir_data);
+
+	/* Write data out to the mbuf. There is no store to this area of the
+	 * mbuf today, so we cannot combine it with another store.
+	 */
+	const uint32_t idx_0 = pkt_idx + desc_idx;
+	const uint32_t idx_1 = pkt_idx + desc_idx + 1;
+
+	rx_pkts[idx_0]->hash.fdir.hi = _mm256_extract_epi32(desc_fdir_data, 0);
+	rx_pkts[idx_1]->hash.fdir.hi = _mm256_extract_epi32(desc_fdir_data, 4);
+
+	/* Create mbuf flags as required for mbuf_flags layout
+	 *  (That's high lane [1,3,5,7, 0,2,4,6] as u32 lanes).
+	 * Approach:
+	 * - Mask away bits not required from the fdir_mask
+	 * - Leave the PKT_FDIR_ID bit (1 << 13)
+	 * - Position that bit correctly based on packet number
+	 * - OR in the resulting bit to mbuf_flags
+	 */
+	RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << 13));
+	__m256i mbuf_flag_mask = _mm256_set_epi32(0, 0, 0, 1 << 13,
+						  0, 0, 0, 1 << 13);
+	__m256i desc_flag_bit =  _mm256_and_si256(mbuf_flag_mask, fdir_mask);
+
+	/* For static-inline function, this will be stripped out
+	 * as the desc_idx is a hard-coded constant.
+	 */
+	switch (desc_idx) {
+	case 0:
+		return _mm256_alignr_epi8(desc_flag_bit, desc_flag_bit,  4);
+	case 2:
+		return _mm256_alignr_epi8(desc_flag_bit, desc_flag_bit,  8);
+	case 4:
+		return _mm256_alignr_epi8(desc_flag_bit, desc_flag_bit, 12);
+	case 6:
+		return desc_flag_bit;
+	default:
+		break;
+	}
+
+	/* NOT REACHED, see above switch returns */
+	return _mm256_setzero_si256();
+}
+#endif /* RTE_LIBRTE_I40E_16BYTE_RX_DESC */
+
+#define PKTLEN_SHIFT     10
+
+/* Force inline as some compilers will not inline by default. */
+static __rte_always_inline uint16_t
+_recv_raw_pkts_vec_avx512(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
+			  uint16_t nb_pkts, uint8_t *split_packet)
+{
+	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
+	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
+			0, rxq->mbuf_initializer);
+	struct i40e_rx_entry *sw_ring = &rxq->sw_ring[rxq->rx_tail];
+	volatile union i40e_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
+
+	rte_prefetch0(rxdp);
+
+	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP_AVX */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP_AVX);
+
+	/* See if we need to rearm the RX queue - gives the prefetch a bit
+	 * of time to act
+	 */
+	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
+		i40e_rxq_rearm(rxq);
+
+	/* Before we start moving massive data around, check to see if
+	 * there is actually a packet available
+	 */
+	if (!(rxdp->wb.qword1.status_error_len &
+			rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))
+		return 0;
+
+	/* constants used in processing loop */
+	const __m512i crc_adjust =
+		_mm512_set4_epi32
+			(0,             /* ignore non-length fields */
+			 -rxq->crc_len, /* sub crc on data_len */
+			 -rxq->crc_len, /* sub crc on pkt_len */
+			 0              /* ignore non-length fields */
+			);
+
+	/* 8 packets DD mask, LSB in each 32-bit value */
+	const __m256i dd_check = _mm256_set1_epi32(1);
+
+	/* 8 packets EOP mask, second-LSB in each 32-bit value */
+	const __m256i eop_check = _mm256_slli_epi32(dd_check,
+			I40E_RX_DESC_STATUS_EOF_SHIFT);
+
+	/* mask to shuffle from desc. to mbuf (4 descriptors per register) */
+	const __m512i shuf_msk =
+		_mm512_set4_epi32
+			(/* rss hash parsed separately */
+			 /* octet 4~7, 32bits rss */
+			 7 << 24 | 6 << 16 | 5 << 8 | 4,
+			 /* octet 2~3, low 16 bits vlan_macip */
+			 /* octet 14~15, 16 bits data_len */
+			 3 << 24 | 2 << 16 | 15 << 8 | 14,
+			 /* skip hi 16 bits pkt_len, zero out */
+			 /* octet 14~15, 16 bits pkt_len */
+			 0xFFFF << 16 | 15 << 8 | 14,
+			 /* pkt_type set as unknown */
+			 0xFFFFFFFF
+			);
+	/* compile-time check the above crc and shuffle layout is correct.
+	 * NOTE: the first field (lowest address) is given last in set_epi
+	 * calls above.
+	 */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
+
+	/* Status/Error flag masks */
+	/* mask everything except RSS, flow director and VLAN flags
+	 * bit2 is for VLAN tag, bit11 for flow director indication
+	 * bit13:12 for RSS indication. Bits 3-5 of error
+	 * field (bits 22-24) are for IP/L4 checksum errors
+	 */
+	const __m256i flags_mask = _mm256_set1_epi32
+		((1 << 2) | (1 << 11) | (3 << 12) | (7 << 22));
+
+	/* data to be shuffled by result of flag mask. If VLAN bit is set,
+	 * (bit 2), then position 4 in this array will be used in the
+	 * destination
+	 */
+	const __m256i vlan_flags_shuf = _mm256_set_epi32
+		(0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
+		0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0);
+
+	/* data to be shuffled by result of flag mask, shifted down 11.
+	 * If RSS/FDIR bits are set, shuffle moves appropriate flags in
+	 * place.
+	 */
+	const __m256i rss_flags_shuf = _mm256_set_epi8
+		(0, 0, 0, 0, 0, 0, 0, 0,
+		PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH, 0, 0,
+		0, 0, PKT_RX_FDIR, 0, /* end up 128-bits */
+		0, 0, 0, 0, 0, 0, 0, 0,
+		PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH, 0, 0,
+		0, 0, PKT_RX_FDIR, 0);
+
+	/* data to be shuffled by the result of the flags mask shifted by 22
+	 * bits.  This gives us the l3_l4 flags.
+	 */
+	const __m256i l3_l4_flags_shuf = _mm256_set_epi8
+		(0, 0, 0, 0, 0, 0, 0, 0,
+		/* shift right 1 bit so each value does not exceed 255 */
+		(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+		 PKT_RX_IP_CKSUM_BAD) >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD |
+		 PKT_RX_L4_CKSUM_BAD) >> 1,
+		(PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1,
+		(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
+		PKT_RX_IP_CKSUM_BAD >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
+		/* second 128-bits */
+		0, 0, 0, 0, 0, 0, 0, 0,
+		(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+		 PKT_RX_IP_CKSUM_BAD) >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD |
+		 PKT_RX_L4_CKSUM_BAD) >> 1,
+		(PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1,
+		(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
+		PKT_RX_IP_CKSUM_BAD >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);
+
+	const __m256i cksum_mask = _mm256_set1_epi32
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
+		PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
+		PKT_RX_EIP_CKSUM_BAD);
+
+	uint16_t i, received;
+
+	for (i = 0, received = 0; i < nb_pkts;
+			i += RTE_I40E_DESCS_PER_LOOP_AVX,
+			rxdp += RTE_I40E_DESCS_PER_LOOP_AVX) {
+		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
+		_mm256_storeu_si256((void *)&rx_pkts[i],
+				_mm256_loadu_si256((void *)&sw_ring[i]));
+#ifdef RTE_ARCH_X86_64
+		_mm256_storeu_si256((void *)&rx_pkts[i + 4],
+				_mm256_loadu_si256((void *)&sw_ring[i + 4]));
+#endif
+
+		__m512i raw_desc0_3, raw_desc4_7;
+		__m256i raw_desc0_1, raw_desc2_3, raw_desc4_5, raw_desc6_7;
+
+		/* load in descriptors, in reverse order */
+		const __m128i raw_desc7 =
+			_mm_load_si128((void *)(rxdp + 7));
+		rte_compiler_barrier();
+		const __m128i raw_desc6 =
+			_mm_load_si128((void *)(rxdp + 6));
+		rte_compiler_barrier();
+		const __m128i raw_desc5 =
+			_mm_load_si128((void *)(rxdp + 5));
+		rte_compiler_barrier();
+		const __m128i raw_desc4 =
+			_mm_load_si128((void *)(rxdp + 4));
+		rte_compiler_barrier();
+		const __m128i raw_desc3 =
+			_mm_load_si128((void *)(rxdp + 3));
+		rte_compiler_barrier();
+		const __m128i raw_desc2 =
+			_mm_load_si128((void *)(rxdp + 2));
+		rte_compiler_barrier();
+		const __m128i raw_desc1 =
+			_mm_load_si128((void *)(rxdp + 1));
+		rte_compiler_barrier();
+		const __m128i raw_desc0 =
+			_mm_load_si128((void *)(rxdp + 0));
+
+		raw_desc6_7 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc6),
+				 raw_desc7, 1);
+		raw_desc4_5 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc4),
+				 raw_desc5, 1);
+		raw_desc2_3 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc2),
+				 raw_desc3, 1);
+		raw_desc0_1 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc0),
+				 raw_desc1, 1);
+
+		raw_desc4_7 =
+			_mm512_inserti64x4
+				(_mm512_castsi256_si512(raw_desc4_5),
+				 raw_desc6_7, 1);
+		raw_desc0_3 =
+			_mm512_inserti64x4
+				(_mm512_castsi256_si512(raw_desc0_1),
+				 raw_desc2_3, 1);
+
+		if (split_packet) {
+			int j;
+
+			for (j = 0; j < RTE_I40E_DESCS_PER_LOOP_AVX; j++)
+				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
+		}
+
+		/* convert descriptors 0-7 into mbufs, adjusting length and
+		 * re-arranging fields. Then write into the mbuf
+		 */
+		const __m512i len4_7 = _mm512_slli_epi32
+					(raw_desc4_7, PKTLEN_SHIFT);
+		const __m512i len0_3 = _mm512_slli_epi32
+					(raw_desc0_3, PKTLEN_SHIFT);
+		const __m512i desc4_7 = _mm512_mask_blend_epi16
+					(0x80808080, raw_desc4_7, len4_7);
+		const __m512i desc0_3 = _mm512_mask_blend_epi16
+					(0x80808080, raw_desc0_3, len0_3);
+		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);
+		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);
+
+		mb4_7 = _mm512_add_epi32(mb4_7, crc_adjust);
+		mb0_3 = _mm512_add_epi32(mb0_3, crc_adjust);
+
+		/* to get packet types, shift 64-bit values down 30 bits
+		 * and so ptype is in lower 8-bits in each
+		 */
+		const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 30);
+		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 30);
+		const __m256i ptypes6_7 =
+			_mm512_extracti64x4_epi64(ptypes4_7, 1);
+		const __m256i ptypes4_5 =
+			_mm512_extracti64x4_epi64(ptypes4_7, 0);
+		const __m256i ptypes2_3 =
+			_mm512_extracti64x4_epi64(ptypes0_3, 1);
+		const __m256i ptypes0_1 =
+			_mm512_extracti64x4_epi64(ptypes0_3, 0);
+		const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 24);
+		const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 8);
+		const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 24);
+		const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 8);
+		const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 24);
+		const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 8);
+		const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 24);
+		const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 8);
+
+		const __m512i ptype4_7 = _mm512_set_epi32
+			(0, 0, 0, ptype_tbl[ptype7],
+			 0, 0, 0, ptype_tbl[ptype6],
+			 0, 0, 0, ptype_tbl[ptype5],
+			 0, 0, 0, ptype_tbl[ptype4]);
+		const __m512i ptype0_3 = _mm512_set_epi32
+			(0, 0, 0, ptype_tbl[ptype3],
+			 0, 0, 0, ptype_tbl[ptype2],
+			 0, 0, 0, ptype_tbl[ptype1],
+			 0, 0, 0, ptype_tbl[ptype0]);
+
+		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
+		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
+
+		__m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
+		__m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
+		__m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
+		__m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
+
+		/**
+		 * use permute/extract to get status content
+		 * After the operations, the packets status flags are in the
+		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
+		 */
+		/* merge the status bits into one register */
+		const __m512i status_permute_msk = _mm512_set_epi32
+			(0, 0, 0, 0,
+			 0, 0, 0, 0,
+			 22, 30, 6, 14,
+			 18, 26, 2, 10);
+		const __m512i raw_status0_7 = _mm512_permutex2var_epi32
+			(desc4_7, status_permute_msk, desc0_3);
+		__m256i status0_7 = _mm512_extracti64x4_epi64
+			(raw_status0_7, 0);
+
+		/* now do flag manipulation */
+
+		/* get only flag/error bits we want */
+		const __m256i flag_bits =
+			_mm256_and_si256(status0_7, flags_mask);
+		/* set vlan and rss flags */
+		const __m256i vlan_flags =
+			_mm256_shuffle_epi8(vlan_flags_shuf, flag_bits);
+		const __m256i rss_fdir_bits = _mm256_srli_epi32(flag_bits, 11);
+		const __m256i rss_flags = _mm256_shuffle_epi8(rss_flags_shuf,
+							      rss_fdir_bits);
+
+		/* l3_l4_error flags, shuffle, then shift to correct adjustment
+		 * of flags in flags_shuf, and finally mask out extra bits
+		 */
+		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
+				_mm256_srli_epi32(flag_bits, 22));
+		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
+		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
+
+		/* merge flags */
+		__m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
+				_mm256_or_si256(rss_flags, vlan_flags));
+
+		/* If the rxq has FDIR enabled, read and process the FDIR info
+		 * from the descriptor. This can cause more loads/stores, so is
+		 * not always performed. Branch over the code when not enabled.
+		 */
+		if (rxq->fdir_enabled) {
+#ifdef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+			/* 16B descriptor code path:
+			 * RSS and FDIR ID use the same offset in the desc, so
+			 * only one can be present at a time. The code below
+			 * identifies an FDIR ID match, and zeros the RSS value
+			 * in the mbuf on FDIR match to keep mbuf data clean.
+			 */
+#define FDIR_BLEND_MASK ((1 << 3) | (1 << 7))
+
+			/* Flags:
+			 * - Take flags, shift bits to null out
+			 * - CMPEQ with known FDIR ID, to get 0xFFFFFFFF or 0 mask
+			 * - Strip bits from mask, leaving 0 or 1 for FDIR ID
+			 * - Merge with mbuf_flags
+			 */
+			/* FLM = 1, FLTSTAT = 0b01, (FLM | FLTSTAT) == 3.
+			 * Shift left by 28 to avoid having to mask.
+			 */
+			const __m256i fdir =
+				_mm256_slli_epi32(rss_fdir_bits, 28);
+			const __m256i fdir_id = _mm256_set1_epi32(3 << 28);
+
+			/* As above, the fdir_mask to packet mapping is this:
+			 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
+			 * Then OR FDIR flags to mbuf_flags on FDIR ID hit.
+			 */
+			RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << 13));
+			const __m256i pkt_fdir_bit = _mm256_set1_epi32(1 << 13);
+			const __m256i fdir_mask =
+				_mm256_cmpeq_epi32(fdir, fdir_id);
+			__m256i fdir_bits =
+				_mm256_and_si256(fdir_mask, pkt_fdir_bit);
+
+			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_bits);
+
+			/* Based on FDIR_MASK, clear the RSS or FDIR value.
+			 * The FDIR ID value is masked to zero if not a hit,
+			 * otherwise the mb0_1 register RSS field is zeroed.
+			 */
+			const __m256i fdir_zero_mask = _mm256_setzero_si256();
+			__m256i tmp0_1 = _mm256_blend_epi32(fdir_zero_mask,
+						fdir_mask, FDIR_BLEND_MASK);
+			__m256i fdir_mb0_1 = _mm256_and_si256(mb0_1, fdir_mask);
+
+			mb0_1 = _mm256_andnot_si256(tmp0_1, mb0_1);
+
+			/* Write to mbuf: no stores to combine with, so just a
+			 * scalar store to push data here.
+			 */
+			rx_pkts[i + 0]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb0_1, 3);
+			rx_pkts[i + 1]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb0_1, 7);
+
+			/* Same as above, only shift the fdir_mask to align
+			 * the packet FDIR mask with the FDIR_ID desc lane.
+			 */
+			__m256i tmp2_3 =
+				_mm256_alignr_epi8(fdir_mask, fdir_mask, 12);
+			__m256i fdir_mb2_3 = _mm256_and_si256(mb2_3, tmp2_3);
+
+			tmp2_3 = _mm256_blend_epi32(fdir_zero_mask, tmp2_3,
+						    FDIR_BLEND_MASK);
+			mb2_3 = _mm256_andnot_si256(tmp2_3, mb2_3);
+			rx_pkts[i + 2]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb2_3, 3);
+			rx_pkts[i + 3]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb2_3, 7);
+
+			__m256i tmp4_5 =
+				_mm256_alignr_epi8(fdir_mask, fdir_mask, 8);
+			__m256i fdir_mb4_5 = _mm256_and_si256(mb4_5, tmp4_5);
+
+			tmp4_5 = _mm256_blend_epi32(fdir_zero_mask, tmp4_5,
+						    FDIR_BLEND_MASK);
+			mb4_5 = _mm256_andnot_si256(tmp4_5, mb4_5);
+			rx_pkts[i + 4]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb4_5, 3);
+			rx_pkts[i + 5]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb4_5, 7);
+
+			__m256i tmp6_7 =
+				_mm256_alignr_epi8(fdir_mask, fdir_mask, 4);
+			__m256i fdir_mb6_7 = _mm256_and_si256(mb6_7, tmp6_7);
+
+			tmp6_7 = _mm256_blend_epi32(fdir_zero_mask, tmp6_7,
+						    FDIR_BLEND_MASK);
+			mb6_7 = _mm256_andnot_si256(tmp6_7, mb6_7);
+			rx_pkts[i + 6]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb6_7, 3);
+			rx_pkts[i + 7]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb6_7, 7);
+
+			/* End of 16B descriptor handling */
+#else
+			/* 32B descriptor FDIR ID mark handling. Returns bits
+			 * to be OR-ed into the mbuf olflags.
+			 */
+			__m256i fdir_add_flags;
+
+			fdir_add_flags =
+				desc_fdir_processing_32b(rxdp, rx_pkts, i, 0);
+			mbuf_flags =
+				_mm256_or_si256(mbuf_flags, fdir_add_flags);
+
+			fdir_add_flags =
+				desc_fdir_processing_32b(rxdp, rx_pkts, i, 2);
+			mbuf_flags =
+				_mm256_or_si256(mbuf_flags, fdir_add_flags);
+
+			fdir_add_flags =
+				desc_fdir_processing_32b(rxdp, rx_pkts, i, 4);
+			mbuf_flags =
+				_mm256_or_si256(mbuf_flags, fdir_add_flags);
+
+			fdir_add_flags =
+				desc_fdir_processing_32b(rxdp, rx_pkts, i, 6);
+			mbuf_flags =
+				_mm256_or_si256(mbuf_flags, fdir_add_flags);
+			/* End 32B desc handling */
+#endif /* RTE_LIBRTE_I40E_16BYTE_RX_DESC */
+
+		} /* if() on FDIR enabled */
+
+		/* At this point, we have the 8 sets of flags in the low 16-bits
+		 * of each 32-bit value in mbuf_flags.
+		 * We want to extract these, and merge them with the mbuf init data
+		 * so we can do a single write to the mbuf to set the flags
+		 * and all the other initialization fields. Extracting the
+		 * appropriate flags means that we have to do a shift and blend for
+		 * each mbuf before we do the write. However, we can also
+		 * add in the previously computed rx_descriptor fields to
+		 * make a single 256-bit write per mbuf
+		 */
+		/* check the structure matches expectations */
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
+				offsetof(struct rte_mbuf, rearm_data) + 8);
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
+				RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
+		/* build up data and do writes */
+		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
+				rearm6, rearm7;
+		rearm6 = _mm256_blend_epi32
+			(mbuf_init, _mm256_slli_si256(mbuf_flags, 8), 0x04);
+		rearm4 = _mm256_blend_epi32
+			(mbuf_init, _mm256_slli_si256(mbuf_flags, 4), 0x04);
+		rearm2 = _mm256_blend_epi32
+			(mbuf_init, mbuf_flags, 0x04);
+		rearm0 = _mm256_blend_epi32
+			(mbuf_init, _mm256_srli_si256(mbuf_flags, 4), 0x04);
+		/* permute to add in the rx_descriptor e.g. rss fields */
+		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
+		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
+		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
+		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
+		/* write to mbuf */
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 6]->rearm_data, rearm6);
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 4]->rearm_data, rearm4);
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 2]->rearm_data, rearm2);
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 0]->rearm_data, rearm0);
+
+		/* repeat for the odd mbufs */
+		const __m256i odd_flags = _mm256_castsi128_si256
+			(_mm256_extracti128_si256(mbuf_flags, 1));
+		rearm7 = _mm256_blend_epi32
+			(mbuf_init, _mm256_slli_si256(odd_flags, 8), 0x04);
+		rearm5 = _mm256_blend_epi32
+			(mbuf_init, _mm256_slli_si256(odd_flags, 4), 0x04);
+		rearm3 = _mm256_blend_epi32
+			(mbuf_init, odd_flags, 0x04);
+		rearm1 = _mm256_blend_epi32
+			(mbuf_init, _mm256_srli_si256(odd_flags, 4), 0x04);
+		/* since odd mbufs are already in hi 128-bits use blend */
+		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
+		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
+		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
+		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
+		/* again write to mbufs */
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 7]->rearm_data, rearm7);
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 5]->rearm_data, rearm5);
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 3]->rearm_data, rearm3);
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 1]->rearm_data, rearm1);
+
+		/* extract and record EOP bit */
+		if (split_packet) {
+			const __m128i eop_mask =
+				_mm_set1_epi16
+				(1 << I40E_RX_DESC_STATUS_EOF_SHIFT);
+			const __m256i eop_bits256 =
+				_mm256_and_si256(status0_7, eop_check);
+			/* pack status bits into a single 128-bit register */
+			const __m128i eop_bits =
+				_mm_packus_epi32
+				(_mm256_castsi256_si128(eop_bits256),
+				_mm256_extractf128_si256(eop_bits256, 1));
+			/* flip bits, and mask out the EOP bit, which is now
+			 * a split-packet bit i.e. !EOP, rather than EOP one.
+			 */
+			__m128i split_bits = _mm_andnot_si128(eop_bits,
+					eop_mask);
+			/* eop bits are out of order, so we need to shuffle them
+			 * back into order again. In doing so, only use low 8
+			 * bits, which acts like another pack instruction
+			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
+			 * [Since we use epi8, the 16-bit positions are
+			 * multiplied by 2 in the eop_shuffle value.]
+			 */
+			__m128i eop_shuffle = _mm_set_epi8
+				(0xFF, 0xFF, 0xFF, 0xFF, /* zero hi 64b */
+				0xFF, 0xFF, 0xFF, 0xFF,
+				8, 0, 10, 2, /* move values to lo 64b */
+				12, 4, 14, 6);
+			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
+			*(uint64_t *)split_packet =
+				_mm_cvtsi128_si64(split_bits);
+			split_packet += RTE_I40E_DESCS_PER_LOOP_AVX;
+		}
+
+		/* perform dd_check */
+		status0_7 = _mm256_and_si256(status0_7, dd_check);
+		status0_7 = _mm256_packs_epi32
+			(status0_7, _mm256_setzero_si256());
+
+		uint64_t burst = __builtin_popcountll
+				(_mm_cvtsi128_si64
+					(_mm256_extracti128_si256
+						(status0_7, 1)));
+		burst += __builtin_popcountll(_mm_cvtsi128_si64
+				(_mm256_castsi256_si128(status0_7)));
+		received += burst;
+		if (burst != RTE_I40E_DESCS_PER_LOOP_AVX)
+			break;
+	}
+
+	/* update tail pointers */
+	rxq->rx_tail += received;
+	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
+	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep rx_tail even */
+		rxq->rx_tail--;
+		received--;
+	}
+	rxq->rxrearm_nb += received;
+	return received;
+}
+
+/**
+ * Notice:
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP_AVX, just return no packet
+ */
+uint16_t
+i40e_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
+			  uint16_t nb_pkts)
+{
+	return _recv_raw_pkts_vec_avx512(rx_queue, rx_pkts, nb_pkts, NULL);
+}
+
+/**
+ * vPMD receive routine that reassembles a single burst of 32 scattered packets
+ * Notice:
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP_AVX, just return no packet
+ */
+static uint16_t
+i40e_recv_scattered_burst_vec_avx512(void *rx_queue,
+				     struct rte_mbuf **rx_pkts,
+				     uint16_t nb_pkts)
+{
+	struct i40e_rx_queue *rxq = rx_queue;
+	uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};
+
+	/* get some new buffers */
+	uint16_t nb_bufs = _recv_raw_pkts_vec_avx512(rxq, rx_pkts, nb_pkts,
+			split_flags);
+	if (nb_bufs == 0)
+		return 0;
+
+	/* happy day case, full burst + no packets to be joined */
+	const uint64_t *split_fl64 = (uint64_t *)split_flags;
+
+	if (!rxq->pkt_first_seg &&
+	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
+	    split_fl64[2] == 0 && split_fl64[3] == 0)
+		return nb_bufs;
+
+	/* reassemble any packets that need reassembly */
+	unsigned int i = 0;
+
+	if (!rxq->pkt_first_seg) {
+		/* find the first split flag, and only reassemble from there */
+		while (i < nb_bufs && !split_flags[i])
+			i++;
+		if (i == nb_bufs)
+			return nb_bufs;
+		rxq->pkt_first_seg = rx_pkts[i];
+	}
+	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
+		&split_flags[i]);
+}
+
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ * Main receive routine that can handle arbitrary burst sizes
+ * Notice:
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP_AVX, just return no packet
+ */
+uint16_t
+i40e_recv_scattered_pkts_vec_avx512(void *rx_queue,
+				    struct rte_mbuf **rx_pkts,
+				    uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
+		uint16_t burst = i40e_recv_scattered_burst_vec_avx512(rx_queue,
+				rx_pkts + retval, RTE_I40E_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_I40E_VPMD_RX_BURST)
+			return retval;
+	}
+	return retval + i40e_recv_scattered_burst_vec_avx512(rx_queue,
+				rx_pkts + retval, nb_pkts);
+}
+
+static inline void
+vtx1(volatile struct i40e_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags)
+{
+	uint64_t high_qw = (I40E_TX_DESC_DTYPE_DATA |
+		((uint64_t)flags  << I40E_TXD_QW1_CMD_SHIFT) |
+		((uint64_t)pkt->data_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT));
+
+	__m128i descriptor = _mm_set_epi64x(high_qw, /* hi: dtype|cmd|size */
+				pkt->buf_iova + pkt->data_off); /* lo: addr */
+	_mm_store_si128((__m128i *)txdp, descriptor);
+}
+
+static inline void
+vtx(volatile struct i40e_tx_desc *txdp,
+	struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
+{
+	const uint64_t hi_qw_tmpl = (I40E_TX_DESC_DTYPE_DATA |
+			((uint64_t)flags  << I40E_TXD_QW1_CMD_SHIFT));
+
+	/* if unaligned on 32-byte boundary, do one to align */
+	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
+		vtx1(txdp, *pkt, flags);
+		nb_pkts--, txdp++, pkt++;
+	}
+
+	/* do four at a time while possible, as two 256-bit stores */
+	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
+		uint64_t hi_qw3 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[3]->data_len <<
+			 I40E_TXD_QW1_TX_BUF_SZ_SHIFT);
+		uint64_t hi_qw2 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[2]->data_len <<
+			 I40E_TXD_QW1_TX_BUF_SZ_SHIFT);
+		uint64_t hi_qw1 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[1]->data_len <<
+			 I40E_TXD_QW1_TX_BUF_SZ_SHIFT);
+		uint64_t hi_qw0 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[0]->data_len <<
+			 I40E_TXD_QW1_TX_BUF_SZ_SHIFT);
+
+		__m256i desc2_3 = _mm256_set_epi64x
+			(hi_qw3, pkt[3]->buf_iova + pkt[3]->data_off,
+			hi_qw2, pkt[2]->buf_iova + pkt[2]->data_off);
+		__m256i desc0_1 = _mm256_set_epi64x
+			(hi_qw1, pkt[1]->buf_iova + pkt[1]->data_off,
+			hi_qw0, pkt[0]->buf_iova + pkt[0]->data_off);
+		_mm256_store_si256((void *)(txdp + 2), desc2_3);
+		_mm256_store_si256((void *)txdp, desc0_1);
+	}
+
+	/* do any last ones */
+	while (nb_pkts) {
+		vtx1(txdp, *pkt, flags);
+		txdp++, pkt++, nb_pkts--;
+	}
+}
+
+static inline uint16_t
+i40e_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
+				 uint16_t nb_pkts)
+{
+	struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue;
+	volatile struct i40e_tx_desc *txdp;
+	struct i40e_tx_entry *txep;
+	uint16_t n, nb_commit, tx_id;
+	uint64_t flags = I40E_TD_CMD;
+	uint64_t rs = I40E_TX_DESC_CMD_RS | I40E_TD_CMD;
+
+	/* crossing the tx_rs_thresh boundary is not allowed */
+	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+
+	if (txq->nb_tx_free < txq->tx_free_thresh)
+		i40e_tx_free_bufs(txq);
+
+	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+	if (unlikely(nb_pkts == 0))
+		return 0;
+
+	tx_id = txq->tx_tail;
+	txdp = &txq->tx_ring[tx_id];
+	txep = &txq->sw_ring[tx_id];
+
+	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+	n = (uint16_t)(txq->nb_tx_desc - tx_id);
+	if (nb_commit >= n) {
+		tx_backlog_entry(txep, tx_pkts, n);
+
+		vtx(txdp, tx_pkts, n - 1, flags);
+		tx_pkts += (n - 1);
+		txdp += (n - 1);
+
+		vtx1(txdp, *tx_pkts++, rs);
+
+		nb_commit = (uint16_t)(nb_commit - n);
+
+		tx_id = 0;
+		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+
+		/* avoid reaching the end of the ring: wrap to its start */
+		txdp = &txq->tx_ring[tx_id];
+		txep = &txq->sw_ring[tx_id];
+	}
+
+	tx_backlog_entry(txep, tx_pkts, nb_commit);
+
+	vtx(txdp, tx_pkts, nb_commit, flags);
+
+	tx_id = (uint16_t)(tx_id + nb_commit);
+	if (tx_id > txq->tx_next_rs) {
+		txq->tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
+			rte_cpu_to_le_64(((uint64_t)I40E_TX_DESC_CMD_RS) <<
+						I40E_TXD_QW1_CMD_SHIFT);
+		txq->tx_next_rs =
+			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
+	}
+
+	txq->tx_tail = tx_id;
+
+	I40E_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
+
+	return nb_pkts;
+}
+
+uint16_t
+i40e_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
+			  uint16_t nb_pkts)
+{
+	uint16_t nb_tx = 0;
+	struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue;
+
+	while (nb_pkts) { /* hand over at most tx_rs_thresh pkts per call */
+		uint16_t ret, num;
+
+		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+		ret = i40e_xmit_fixed_burst_vec_avx512
+				(tx_queue, &tx_pkts[nb_tx], num);
+		nb_tx += ret;
+		nb_pkts -= ret;
+		if (ret < num) /* fewer sent than requested: stop */
+			break;
+	}
+
+	return nb_tx;
+}
diff --git a/drivers/net/i40e/meson.build b/drivers/net/i40e/meson.build
index bb0c542a30..7e84410a08 100644
--- a/drivers/net/i40e/meson.build
+++ b/drivers/net/i40e/meson.build
@@ -44,6 +44,30 @@ if arch_subdir == 'x86'
 				c_args: [cflags, '-mavx2'])
 		objs += i40e_avx2_lib.extract_objects('i40e_rxtx_vec_avx2.c')
 	endif
+
+	i40e_avx512_cpu_support = (
+		cc.get_define('__AVX512F__', args: machine_args) != '' and
+		cc.get_define('__AVX512BW__', args: machine_args) != '')
+
+	i40e_avx512_cc_support = (
+		not machine_args.contains('-mno-avx512f') and
+		cc.has_argument('-mavx512f') and
+		cc.has_argument('-mavx512bw'))
+
+	if i40e_avx512_cpu_support == true or i40e_avx512_cc_support == true
+		cflags += ['-DCC_AVX512_SUPPORT']
+		avx512_args = [cflags, '-mavx512f', '-mavx512bw']
+		if cc.has_argument('-march=skylake-avx512')
+			avx512_args += '-march=skylake-avx512'
+		endif
+		i40e_avx512_lib = static_library('i40e_avx512_lib',
+				'i40e_rxtx_vec_avx512.c',
+				dependencies: [static_rte_ethdev,
+					static_rte_kvargs, static_rte_hash],
+				include_directories: includes,
+				c_args: avx512_args)
+		objs += i40e_avx512_lib.extract_objects('i40e_rxtx_vec_avx512.c')
+	endif
 elif arch_subdir == 'ppc'
        dpdk_conf.set('RTE_LIBRTE_I40E_INC_VECTOR', 1)
        sources += files('i40e_rxtx_vec_altivec.c')
-- 
2.17.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [PATCH v2 3/3] net/i40e: optimize Tx by using AVX512
  2021-01-07  7:44 ` [dpdk-dev] [PATCH v2 0/3] AVX512 vPMD on i40e Leyi Rong
  2021-01-07  7:44   ` [dpdk-dev] [PATCH v2 1/3] net/i40e: remove devarg use-latest-supported-vec Leyi Rong
  2021-01-07  7:44   ` [dpdk-dev] [PATCH v2 2/3] net/i40e: add AVX512 vector path Leyi Rong
@ 2021-01-07  7:44   ` Leyi Rong
  2021-01-13  6:12     ` Lu, Wenzhuo
  2021-01-13  9:53   ` [dpdk-dev] [PATCH v2 0/3] AVX512 vPMD on i40e Zhang, Qi Z
  3 siblings, 1 reply; 42+ messages in thread
From: Leyi Rong @ 2021-01-07  7:44 UTC (permalink / raw)
  To: qi.z.zhang, bruce.richardson, beilei.xing; +Cc: dev, Leyi Rong

Optimize Tx path by using AVX512 instructions and vectorize the
tx free bufs process.

Signed-off-by: Leyi Rong <leyi.rong@intel.com>
Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/net/i40e/i40e_rxtx.c            |  19 +++
 drivers/net/i40e/i40e_rxtx.h            |   4 +
 drivers/net/i40e/i40e_rxtx_vec_avx512.c | 152 ++++++++++++++++++++----
 3 files changed, 155 insertions(+), 20 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 8357fb3ef8..96071c55fd 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -2508,6 +2508,25 @@ i40e_tx_queue_release_mbufs(struct i40e_tx_queue *txq)
 	 *  vPMD tx will not set sw_ring's mbuf to NULL after free,
 	 *  so need to free remains more carefully.
 	 */
+#ifdef CC_AVX512_SUPPORT
+	if (dev->tx_pkt_burst == i40e_xmit_pkts_vec_avx512) {
+		struct i40e_vec_tx_entry *swr = (void *)txq->sw_ring;
+
+		i = txq->tx_next_dd - txq->tx_rs_thresh + 1;
+		if (txq->tx_tail < i) {
+			for (; i < txq->nb_tx_desc; i++) {
+				rte_pktmbuf_free_seg(swr[i].mbuf);
+				swr[i].mbuf = NULL;
+			}
+			i = 0;
+		}
+		for (; i < txq->tx_tail; i++) {
+			rte_pktmbuf_free_seg(swr[i].mbuf);
+			swr[i].mbuf = NULL;
+		}
+		return;
+	}
+#endif
 	if (dev->tx_pkt_burst == i40e_xmit_pkts_vec_avx2 ||
 			dev->tx_pkt_burst == i40e_xmit_pkts_vec) {
 		i = txq->tx_next_dd - txq->tx_rs_thresh + 1;
diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h
index 2e3e50eb79..2f55073c97 100644
--- a/drivers/net/i40e/i40e_rxtx.h
+++ b/drivers/net/i40e/i40e_rxtx.h
@@ -129,6 +129,10 @@ struct i40e_tx_entry {
 	uint16_t last_id;
 };
 
+struct i40e_vec_tx_entry { /* sw_ring entry as seen by the AVX512 Tx path */
+	struct rte_mbuf *mbuf;
+};
+
 /*
  * Structure associated with each TX queue.
  */
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index ccddc3e2d4..43e939c605 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -873,6 +873,115 @@ i40e_recv_scattered_pkts_vec_avx512(void *rx_queue,
 				rx_pkts + retval, nb_pkts);
 }
 
+static __rte_always_inline int
+i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
+{
+	struct i40e_vec_tx_entry *txep;
+	uint32_t n;
+	uint32_t i;
+	int nb_free = 0;
+	struct rte_mbuf *m, *free[RTE_I40E_TX_MAX_FREE_BUF_SZ];
+
+	/* check DD bits on threshold descriptor */
+	if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
+			rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
+			rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE))
+		return 0;
+
+	n = txq->tx_rs_thresh;
+
+	 /* first buffer to free from S/W ring is at index
+	  * tx_next_dd - (tx_rs_thresh-1)
+	  */
+	txep = (void *)txq->sw_ring;
+	txep += txq->tx_next_dd - (n - 1);
+
+	if (txq->offloads & DEV_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
+		struct rte_mempool *mp = txep[0].mbuf->pool;
+		void **cache_objs;
+		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
+				rte_lcore_id());
+
+		if (!cache || cache->len == 0)
+			goto normal;
+
+		cache_objs = &cache->objs[cache->len];
+
+		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+			goto done;
+		}
+
+		/* The cache follows the following algorithm
+		 *   1. Add the objects to the cache
+		 *   2. Anything greater than the cache min value (if it
+		 *   crosses the cache flush threshold) is flushed to the ring.
+		 */
+		/* Add elements back into the cache */
+		uint32_t copied = 0;
+		/* n is multiple of 32 */
+		while (copied < n) {
+			const __m512i a = _mm512_load_si512(&txep[copied]);
+			const __m512i b = _mm512_load_si512(&txep[copied + 8]);
+			const __m512i c = _mm512_load_si512(&txep[copied + 16]);
+			const __m512i d = _mm512_load_si512(&txep[copied + 24]);
+
+			_mm512_storeu_si512(&cache_objs[copied], a);
+			_mm512_storeu_si512(&cache_objs[copied + 8], b);
+			_mm512_storeu_si512(&cache_objs[copied + 16], c);
+			_mm512_storeu_si512(&cache_objs[copied + 24], d);
+			copied += 32;
+		}
+		cache->len += n;
+
+		if (cache->len >= cache->flushthresh) {
+			rte_mempool_ops_enqueue_bulk
+				(mp, &cache->objs[cache->size],
+				cache->len - cache->size);
+			cache->len = cache->size;
+		}
+		goto done;
+	}
+
+normal:
+	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
+	if (likely(m)) {
+		free[0] = m;
+		nb_free = 1;
+		for (i = 1; i < n; i++) {
+			rte_prefetch0(&txep[i + 3].mbuf->cacheline1);
+			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+			if (likely(m)) {
+				if (likely(m->pool == free[0]->pool)) {
+					free[nb_free++] = m;
+				} else {
+					rte_mempool_put_bulk(free[0]->pool,
+							     (void *)free,
+							     nb_free);
+					free[0] = m;
+					nb_free = 1;
+				}
+			}
+		}
+		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
+	} else {
+		for (i = 1; i < n; i++) {
+			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+			if (m)
+				rte_mempool_put(m->pool, m);
+		}
+	}
+
+done:
+	/* buffers were freed, update counters */
+	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+	txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+	if (txq->tx_next_dd >= txq->nb_tx_desc)
+		txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+
+	return txq->tx_rs_thresh;
+}
+
 static inline void
 vtx1(volatile struct i40e_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags)
 {
@@ -892,13 +1001,6 @@ vtx(volatile struct i40e_tx_desc *txdp,
 	const uint64_t hi_qw_tmpl = (I40E_TX_DESC_DTYPE_DATA |
 			((uint64_t)flags  << I40E_TXD_QW1_CMD_SHIFT));
 
-	/* if unaligned on 32-bit boundary, do one to align */
-	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
-		vtx1(txdp, *pkt, flags);
-		nb_pkts--, txdp++, pkt++;
-	}
-
-	/* do two at a time while possible, in bursts */
 	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
 		uint64_t hi_qw3 =
 			hi_qw_tmpl |
@@ -917,14 +1019,13 @@ vtx(volatile struct i40e_tx_desc *txdp,
 			((uint64_t)pkt[0]->data_len <<
 			 I40E_TXD_QW1_TX_BUF_SZ_SHIFT);
 
-		__m256i desc2_3 = _mm256_set_epi64x
+		__m512i desc0_3 =
+			_mm512_set_epi64
 			(hi_qw3, pkt[3]->buf_iova + pkt[3]->data_off,
-			hi_qw2, pkt[2]->buf_iova + pkt[2]->data_off);
-		__m256i desc0_1 = _mm256_set_epi64x
-			(hi_qw1, pkt[1]->buf_iova + pkt[1]->data_off,
+			hi_qw2, pkt[2]->buf_iova + pkt[2]->data_off,
+			hi_qw1, pkt[1]->buf_iova + pkt[1]->data_off,
 			hi_qw0, pkt[0]->buf_iova + pkt[0]->data_off);
-		_mm256_store_si256((void *)(txdp + 2), desc2_3);
-		_mm256_store_si256((void *)txdp, desc0_1);
+		_mm512_storeu_si512((void *)txdp, desc0_3);
 	}
 
 	/* do any last ones */
@@ -934,13 +1035,23 @@ vtx(volatile struct i40e_tx_desc *txdp,
 	}
 }
 
+static __rte_always_inline void
+tx_backlog_entry_avx512(struct i40e_vec_tx_entry *txep,
+			struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	int i;
+
+	for (i = 0; i < (int)nb_pkts; ++i)
+		txep[i].mbuf = tx_pkts[i];
+}
+
 static inline uint16_t
 i40e_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 				 uint16_t nb_pkts)
 {
 	struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue;
 	volatile struct i40e_tx_desc *txdp;
-	struct i40e_tx_entry *txep;
+	struct i40e_vec_tx_entry *txep;
 	uint16_t n, nb_commit, tx_id;
 	uint64_t flags = I40E_TD_CMD;
 	uint64_t rs = I40E_TX_DESC_CMD_RS | I40E_TD_CMD;
@@ -949,7 +1060,7 @@ i40e_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
 
 	if (txq->nb_tx_free < txq->tx_free_thresh)
-		i40e_tx_free_bufs(txq);
+		i40e_tx_free_bufs_avx512(txq);
 
 	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
 	if (unlikely(nb_pkts == 0))
@@ -957,13 +1068,14 @@ i40e_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	tx_id = txq->tx_tail;
 	txdp = &txq->tx_ring[tx_id];
-	txep = &txq->sw_ring[tx_id];
+	txep = (void *)txq->sw_ring;
+	txep += tx_id;
 
 	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
 
 	n = (uint16_t)(txq->nb_tx_desc - tx_id);
 	if (nb_commit >= n) {
-		tx_backlog_entry(txep, tx_pkts, n);
+		tx_backlog_entry_avx512(txep, tx_pkts, n);
 
 		vtx(txdp, tx_pkts, n - 1, flags);
 		tx_pkts += (n - 1);
@@ -977,11 +1089,11 @@ i40e_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
 
 		/* avoid reach the end of ring */
-		txdp = &txq->tx_ring[tx_id];
-		txep = &txq->sw_ring[tx_id];
+		txdp = txq->tx_ring;
+		txep = (void *)txq->sw_ring;
 	}
 
-	tx_backlog_entry(txep, tx_pkts, nb_commit);
+	tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);
 
 	vtx(txdp, tx_pkts, nb_commit, flags);
 
-- 
2.17.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/3] net/i40e: remove devarg use-latest-supported-vec
  2021-01-07  7:44   ` [dpdk-dev] [PATCH v2 1/3] net/i40e: remove devarg use-latest-supported-vec Leyi Rong
@ 2021-01-13  6:12     ` Lu, Wenzhuo
  2021-01-13 13:40     ` Ferruh Yigit
  1 sibling, 0 replies; 42+ messages in thread
From: Lu, Wenzhuo @ 2021-01-13  6:12 UTC (permalink / raw)
  To: Rong, Leyi, Zhang, Qi Z, Richardson, Bruce, Xing, Beilei; +Cc: dev, Rong, Leyi


> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Leyi Rong
> Sent: Thursday, January 7, 2021 3:44 PM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Xing, Beilei <beilei.xing@intel.com>
> Cc: dev@dpdk.org; Rong, Leyi <leyi.rong@intel.com>
> Subject: [dpdk-dev] [PATCH v2 1/3] net/i40e: remove devarg use-latest-
> supported-vec
> 
> As eal parameter --force-max-simd-bitwidth is already introduced, to make it
> more clear when setting rx/tx function, remove devarg use-latest-supported-vec
> support.
> 
> Signed-off-by: Leyi Rong <leyi.rong@intel.com>
Acked-by: Wenzhuo Lu <wenzhuo.lu@intel.com>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v2 3/3] net/i40e: optimize Tx by using AVX512
  2021-01-07  7:44   ` [dpdk-dev] [PATCH v2 3/3] net/i40e: optimize Tx by using AVX512 Leyi Rong
@ 2021-01-13  6:12     ` Lu, Wenzhuo
  0 siblings, 0 replies; 42+ messages in thread
From: Lu, Wenzhuo @ 2021-01-13  6:12 UTC (permalink / raw)
  To: Rong, Leyi, Zhang, Qi Z, Richardson, Bruce, Xing, Beilei; +Cc: dev, Rong, Leyi

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Leyi Rong
> Sent: Thursday, January 7, 2021 3:44 PM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Xing, Beilei <beilei.xing@intel.com>
> Cc: dev@dpdk.org; Rong, Leyi <leyi.rong@intel.com>
> Subject: [dpdk-dev] [PATCH v2 3/3] net/i40e: optimize Tx by using AVX512
> 
> Optimize Tx path by using AVX512 instructions and vectorize the tx free bufs
> process.
> 
> Signed-off-by: Leyi Rong <leyi.rong@intel.com>
> Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
Acked-by: Wenzhuo Lu <wenzhuo.lu@intel.com>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v2 2/3] net/i40e: add AVX512 vector path
  2021-01-07  7:44   ` [dpdk-dev] [PATCH v2 2/3] net/i40e: add AVX512 vector path Leyi Rong
@ 2021-01-13  6:13     ` Lu, Wenzhuo
  0 siblings, 0 replies; 42+ messages in thread
From: Lu, Wenzhuo @ 2021-01-13  6:13 UTC (permalink / raw)
  To: Rong, Leyi, Zhang, Qi Z, Richardson, Bruce, Xing, Beilei; +Cc: dev, Rong, Leyi

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Leyi Rong
> Sent: Thursday, January 7, 2021 3:44 PM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Xing, Beilei <beilei.xing@intel.com>
> Cc: dev@dpdk.org; Rong, Leyi <leyi.rong@intel.com>
> Subject: [dpdk-dev] [PATCH v2 2/3] net/i40e: add AVX512 vector path
> 
> Add AVX512 support for i40e PMD. This patch adds i40e_rxtx_vec_avx512.c to
> support i40e AVX512 vPMD.
> 
> This patch aims to enable AVX512 on i40e vPMD. Main changes are focus on Rx
> path compared with AVX2 vPMD.
> 
> Signed-off-by: Leyi Rong <leyi.rong@intel.com>
> Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
Acked-by: Wenzhuo Lu <wenzhuo.lu@intel.com>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v2 0/3] AVX512 vPMD on i40e
  2021-01-07  7:44 ` [dpdk-dev] [PATCH v2 0/3] AVX512 vPMD on i40e Leyi Rong
                     ` (2 preceding siblings ...)
  2021-01-07  7:44   ` [dpdk-dev] [PATCH v2 3/3] net/i40e: optimize Tx by using AVX512 Leyi Rong
@ 2021-01-13  9:53   ` Zhang, Qi Z
  3 siblings, 0 replies; 42+ messages in thread
From: Zhang, Qi Z @ 2021-01-13  9:53 UTC (permalink / raw)
  To: Rong, Leyi, Richardson, Bruce, Xing, Beilei; +Cc: dev



> -----Original Message-----
> From: Rong, Leyi <leyi.rong@intel.com>
> Sent: Thursday, January 7, 2021 3:44 PM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Xing, Beilei <beilei.xing@intel.com>
> Cc: dev@dpdk.org; Rong, Leyi <leyi.rong@intel.com>
> Subject: [PATCH v2 0/3] AVX512 vPMD on i40e
> 
> This patchset aims to support AVX512 vPMD on i40e.
> And the changes are only target to AVX512 vector path.
> 
> ---
> v2:
> - Add return value check on rte_mempool_default_cache().
> 
> Leyi Rong (3):
>   net/i40e: remove devarg use-latest-supported-vec
>   net/i40e: add AVX512 vector path
>   net/i40e: optimize Tx by using AVX512
> 
>  doc/guides/nics/i40e.rst                |    9 -
>  drivers/net/i40e/i40e_ethdev.c          |   63 +-
>  drivers/net/i40e/i40e_ethdev.h          |    3 -
>  drivers/net/i40e/i40e_rxtx.c            |  193 ++--
>  drivers/net/i40e/i40e_rxtx.h            |   13 +
>  drivers/net/i40e/i40e_rxtx_vec_avx512.c | 1136
> +++++++++++++++++++++++
>  drivers/net/i40e/meson.build            |   24 +
>  7 files changed, 1293 insertions(+), 148 deletions(-)  create mode 100644
> drivers/net/i40e/i40e_rxtx_vec_avx512.c
> 
> --
> 2.17.1

Applied to dpdk-next-net-intel.

Thanks
Qi


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/3] net/i40e: remove devarg use-latest-supported-vec
  2021-01-07  7:44   ` [dpdk-dev] [PATCH v2 1/3] net/i40e: remove devarg use-latest-supported-vec Leyi Rong
  2021-01-13  6:12     ` Lu, Wenzhuo
@ 2021-01-13 13:40     ` Ferruh Yigit
  1 sibling, 0 replies; 42+ messages in thread
From: Ferruh Yigit @ 2021-01-13 13:40 UTC (permalink / raw)
  To: Leyi Rong, qi.z.zhang, bruce.richardson, beilei.xing; +Cc: dev

On 1/7/2021 7:44 AM, Leyi Rong wrote:
> As eal parameter --force-max-simd-bitwidth is already introduced,
> to make it more clear when setting rx/tx function, remove
> devarg use-latest-supported-vec support.
> 
> Signed-off-by: Leyi Rong <leyi.rong@intel.com>

<...>

> @@ -3154,20 +3124,33 @@ i40e_set_rx_function(struct rte_eth_dev *dev)
>   					break;
>   				}
>   			}
> +
> +			if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
> +			     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
> +					rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
> +				use_avx2 = true;
>   		}

Hi Leyi,

The CPU flags 'RTE_CPUFLAG_AVX2' and 'RTE_CPUFLAG_AVX512F' are only defined for
x86, so using them here causes build errors on other architectures.

What about extracting that logic into a static inline function? That would make
the code cleaner: on other architectures the function can return false for
'use_avx2' straight away, and the checks are only done on x86.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2020-12-15  2:19 [dpdk-dev] [PATCH 0/3] AVX512 vPMD on i40e Leyi Rong
                   ` (3 preceding siblings ...)
  2021-01-07  7:44 ` [dpdk-dev] [PATCH v2 0/3] AVX512 vPMD on i40e Leyi Rong
@ 2021-01-14  6:39 ` Leyi Rong
  2021-01-14  6:39   ` [dpdk-dev] [PATCH v3 1/3] net/i40e: remove devarg use-latest-supported-vec Leyi Rong
                     ` (3 more replies)
  4 siblings, 4 replies; 42+ messages in thread
From: Leyi Rong @ 2021-01-14  6:39 UTC (permalink / raw)
  To: qi.z.zhang, wenzhuo.lu, ferruh.yigit, bruce.richardson, beilei.xing
  Cc: dev, Leyi Rong

This patchset aims to support the AVX512 vPMD on i40e.
The changes only target the AVX512 vector path.

---
v3:
- Extract the AVX capability checks into get_avx_supported(), which is used to
  choose the proper vector data path.

v2:
- Add return value check on rte_mempool_default_cache().


Leyi Rong (3):
  net/i40e: remove devarg use-latest-supported-vec
  net/i40e: add AVX512 vector path
  net/i40e: optimize Tx by using AVX512

 doc/guides/nics/i40e.rst                |    9 -
 drivers/net/i40e/i40e_ethdev.c          |   63 +-
 drivers/net/i40e/i40e_ethdev.h          |    3 -
 drivers/net/i40e/i40e_rxtx.c            |  196 ++--
 drivers/net/i40e/i40e_rxtx.h            |   13 +
 drivers/net/i40e/i40e_rxtx_vec_avx512.c | 1136 +++++++++++++++++++++++
 drivers/net/i40e/meson.build            |   24 +
 7 files changed, 1301 insertions(+), 143 deletions(-)
 create mode 100644 drivers/net/i40e/i40e_rxtx_vec_avx512.c

-- 
2.17.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [PATCH v3 1/3] net/i40e: remove devarg use-latest-supported-vec
  2021-01-14  6:39 ` [dpdk-dev] [PATCH v3 " Leyi Rong
@ 2021-01-14  6:39   ` Leyi Rong
  2021-01-15 13:36     ` Ferruh Yigit
  2021-01-14  6:39   ` [dpdk-dev] [PATCH v3 2/3] net/i40e: add AVX512 vector path Leyi Rong
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 42+ messages in thread
From: Leyi Rong @ 2021-01-14  6:39 UTC (permalink / raw)
  To: qi.z.zhang, wenzhuo.lu, ferruh.yigit, bruce.richardson, beilei.xing
  Cc: dev, Leyi Rong

The EAL parameter --force-max-simd-bitwidth has already been introduced,
so remove support for the use-latest-supported-vec devarg to make the
selection of the Rx/Tx functions clearer.

Signed-off-by: Leyi Rong <leyi.rong@intel.com>
Acked-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 doc/guides/nics/i40e.rst       |   9 ---
 drivers/net/i40e/i40e_ethdev.c |  63 +----------------
 drivers/net/i40e/i40e_ethdev.h |   3 -
 drivers/net/i40e/i40e_rxtx.c   | 126 +++++++++++++++------------------
 4 files changed, 58 insertions(+), 143 deletions(-)

diff --git a/doc/guides/nics/i40e.rst b/doc/guides/nics/i40e.rst
index 64f20e7dab..20c998398c 100644
--- a/doc/guides/nics/i40e.rst
+++ b/doc/guides/nics/i40e.rst
@@ -209,15 +209,6 @@ Runtime Config Options
   Currently hot-plugging of representor ports is not supported so all required
   representors must be specified on the creation of the PF.
 
-- ``Use latest supported vector`` (default ``disable``)
-
-  Latest supported vector path may not always get the best perf so vector path was
-  recommended to use only on later platform. But users may want the latest vector path
-  since it can get better perf in some real work loading cases. So ``devargs`` param
-  ``use-latest-supported-vec`` is introduced, for example::
-
-  -a 84:00.0,use-latest-supported-vec=1
-
 - ``Enable validation for VF message`` (default ``not enabled``)
 
   The PF counts messages from each VF. If in any period of seconds the message
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 14622484a0..2854383fe9 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -45,7 +45,6 @@
 #define ETH_I40E_FLOATING_VEB_LIST_ARG	"floating_veb_list"
 #define ETH_I40E_SUPPORT_MULTI_DRIVER	"support-multi-driver"
 #define ETH_I40E_QUEUE_NUM_PER_VF_ARG	"queue-num-per-vf"
-#define ETH_I40E_USE_LATEST_VEC	"use-latest-supported-vec"
 #define ETH_I40E_VF_MSG_CFG		"vf_msg_cfg"
 
 #define I40E_CLEAR_PXE_WAIT_MS     200
@@ -403,7 +402,6 @@ static const char *const valid_keys[] = {
 	ETH_I40E_FLOATING_VEB_LIST_ARG,
 	ETH_I40E_SUPPORT_MULTI_DRIVER,
 	ETH_I40E_QUEUE_NUM_PER_VF_ARG,
-	ETH_I40E_USE_LATEST_VEC,
 	ETH_I40E_VF_MSG_CFG,
 	NULL};
 
@@ -1316,62 +1314,6 @@ i40e_aq_debug_write_global_register(struct i40e_hw *hw,
 	return i40e_aq_debug_write_register(hw, reg_addr, reg_val, cmd_details);
 }
 
-static int
-i40e_parse_latest_vec_handler(__rte_unused const char *key,
-				const char *value,
-				void *opaque)
-{
-	struct i40e_adapter *ad = opaque;
-	int use_latest_vec;
-
-	use_latest_vec = atoi(value);
-
-	if (use_latest_vec != 0 && use_latest_vec != 1)
-		PMD_DRV_LOG(WARNING, "Value should be 0 or 1, set it as 1!");
-
-	ad->use_latest_vec = (uint8_t)use_latest_vec;
-
-	return 0;
-}
-
-static int
-i40e_use_latest_vec(struct rte_eth_dev *dev)
-{
-	struct i40e_adapter *ad =
-		I40E_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
-	struct rte_kvargs *kvlist;
-	int kvargs_count;
-
-	ad->use_latest_vec = false;
-
-	if (!dev->device->devargs)
-		return 0;
-
-	kvlist = rte_kvargs_parse(dev->device->devargs->args, valid_keys);
-	if (!kvlist)
-		return -EINVAL;
-
-	kvargs_count = rte_kvargs_count(kvlist, ETH_I40E_USE_LATEST_VEC);
-	if (!kvargs_count) {
-		rte_kvargs_free(kvlist);
-		return 0;
-	}
-
-	if (kvargs_count > 1)
-		PMD_DRV_LOG(WARNING, "More than one argument \"%s\" and only "
-			    "the first invalid or last valid one is used !",
-			    ETH_I40E_USE_LATEST_VEC);
-
-	if (rte_kvargs_process(kvlist, ETH_I40E_USE_LATEST_VEC,
-				i40e_parse_latest_vec_handler, ad) < 0) {
-		rte_kvargs_free(kvlist);
-		return -EINVAL;
-	}
-
-	rte_kvargs_free(kvlist);
-	return 0;
-}
-
 static int
 read_vf_msg_config(__rte_unused const char *key,
 			       const char *value,
@@ -1522,8 +1464,6 @@ eth_i40e_dev_init(struct rte_eth_dev *dev, void *init_params __rte_unused)
 	i40e_parse_vf_msg_config(dev, &pf->vf_msg_cfg);
 	/* Check if need to support multi-driver */
 	i40e_support_multi_driver(dev);
-	/* Check if users want the latest supported vec path */
-	i40e_use_latest_vec(dev);
 
 	/* Make sure all is clean before doing PF reset */
 	i40e_clear_hw(hw);
@@ -12445,5 +12385,4 @@ RTE_PMD_REGISTER_PARAM_STRING(net_i40e,
 			      ETH_I40E_FLOATING_VEB_ARG "=1"
 			      ETH_I40E_FLOATING_VEB_LIST_ARG "=<string>"
 			      ETH_I40E_QUEUE_NUM_PER_VF_ARG "=1|2|4|8|16"
-			      ETH_I40E_SUPPORT_MULTI_DRIVER "=1"
-			      ETH_I40E_USE_LATEST_VEC "=0|1");
+			      ETH_I40E_SUPPORT_MULTI_DRIVER "=1");
diff --git a/drivers/net/i40e/i40e_ethdev.h b/drivers/net/i40e/i40e_ethdev.h
index 0617fe5e65..cd484710b0 100644
--- a/drivers/net/i40e/i40e_ethdev.h
+++ b/drivers/net/i40e/i40e_ethdev.h
@@ -1309,9 +1309,6 @@ struct i40e_adapter {
 	uint64_t flow_types_mask;
 	uint64_t pctypes_mask;
 
-	/* For devargs */
-	uint8_t use_latest_vec;
-
 	/* For RSS reta table update */
 	uint8_t rss_reta_updated;
 };
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 5df9a9df56..a4661cdd83 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -3095,43 +3095,46 @@ i40e_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
 	qinfo->conf.offloads = txq->offloads;
 }
 
-static eth_rx_burst_t
-i40e_get_latest_rx_vec(bool scatter)
-{
-#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-		return scatter ? i40e_recv_scattered_pkts_vec_avx2 :
-				 i40e_recv_pkts_vec_avx2;
-#endif
-	return scatter ? i40e_recv_scattered_pkts_vec :
-			 i40e_recv_pkts_vec;
-}
-
-static eth_rx_burst_t
-i40e_get_recommend_rx_vec(bool scatter)
+static inline bool
+get_avx_supported(bool request_avx512)
 {
-#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT)
-	/*
-	 * since AVX frequency can be different to base frequency, limit
-	 * use of AVX2 version to later plaforms, not all those that could
-	 * theoretically run it.
-	 */
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-		return scatter ? i40e_recv_scattered_pkts_vec_avx2 :
-				 i40e_recv_pkts_vec_avx2;
+#ifdef RTE_ARCH_X86
+	if (request_avx512) {
+		if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512 &&
+		rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
+		rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1)
+#ifdef CC_AVX512_SUPPORT
+			return true;
+#else
+		PMD_DRV_LOG(NOTICE,
+			"AVX512 is not supported in build env");
+		return false;
+#endif
+	} else {
+		if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 &&
+		rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 &&
+		rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1)
+#ifdef CC_AVX2_SUPPORT
+			return true;
+#else
+		PMD_DRV_LOG(NOTICE,
+			"AVX2 is not supported in build env");
+		return false;
 #endif
-	return scatter ? i40e_recv_scattered_pkts_vec :
-			 i40e_recv_pkts_vec;
+	}
+#endif /* RTE_ARCH_X86 */
+
+	return false;
 }
 
+
 void __rte_cold
 i40e_set_rx_function(struct rte_eth_dev *dev)
 {
 	struct i40e_adapter *ad =
 		I40E_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	uint16_t rx_using_sse, i;
+	bool use_avx2 = false;
 	/* In order to allow Vector Rx there are a few configuration
 	 * conditions to be met and Rx Bulk Allocation should be allowed.
 	 */
@@ -3154,20 +3157,30 @@ i40e_set_rx_function(struct rte_eth_dev *dev)
 					break;
 				}
 			}
+
+			use_avx2 = get_avx_supported(0);
 		}
 	}
 
 	if (ad->rx_vec_allowed  &&
 			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-		/* Vec Rx path */
-		PMD_INIT_LOG(DEBUG, "Vector Rx path will be used on port=%d.",
+		if (dev->data->scattered_rx) {
+			PMD_INIT_LOG(DEBUG,
+				"Using %sVector Scattered Rx (port %d).",
+				use_avx2 ? "avx2 " : "",
 				dev->data->port_id);
-		if (ad->use_latest_vec)
-			dev->rx_pkt_burst =
-			i40e_get_latest_rx_vec(dev->data->scattered_rx);
-		else
-			dev->rx_pkt_burst =
-			i40e_get_recommend_rx_vec(dev->data->scattered_rx);
+			dev->rx_pkt_burst = use_avx2 ?
+				i40e_recv_scattered_pkts_vec_avx2 :
+				i40e_recv_scattered_pkts_vec;
+		} else {
+			PMD_INIT_LOG(DEBUG,
+				"Using %sVector Rx (port %d).",
+				use_avx2 ? "avx2 " : "",
+				dev->data->port_id);
+			dev->rx_pkt_burst = use_avx2 ?
+				i40e_recv_pkts_vec_avx2 :
+				i40e_recv_pkts_vec;
+		}
 	} else if (!dev->data->scattered_rx && ad->rx_bulk_alloc_allowed) {
 		PMD_INIT_LOG(DEBUG, "Rx Burst Bulk Alloc Preconditions are "
 				    "satisfied. Rx Burst Bulk Alloc function "
@@ -3268,39 +3281,13 @@ i40e_set_tx_function_flag(struct rte_eth_dev *dev, struct i40e_tx_queue *txq)
 				txq->queue_id);
 }
 
-static eth_tx_burst_t
-i40e_get_latest_tx_vec(void)
-{
-#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-		return i40e_xmit_pkts_vec_avx2;
-#endif
-	return i40e_xmit_pkts_vec;
-}
-
-static eth_tx_burst_t
-i40e_get_recommend_tx_vec(void)
-{
-#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT)
-	/*
-	 * since AVX frequency can be different to base frequency, limit
-	 * use of AVX2 version to later plaforms, not all those that could
-	 * theoretically run it.
-	 */
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-		return i40e_xmit_pkts_vec_avx2;
-#endif
-	return i40e_xmit_pkts_vec;
-}
-
 void __rte_cold
 i40e_set_tx_function(struct rte_eth_dev *dev)
 {
 	struct i40e_adapter *ad =
 		I40E_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	int i;
+	bool use_avx2 = false;
 
 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
 		if (ad->tx_vec_allowed) {
@@ -3313,19 +3300,20 @@ i40e_set_tx_function(struct rte_eth_dev *dev)
 					break;
 				}
 			}
+
+			use_avx2 = get_avx_supported(0);
 		}
 	}
 
 	if (ad->tx_simple_allowed) {
 		if (ad->tx_vec_allowed &&
 				rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-			PMD_INIT_LOG(DEBUG, "Vector tx finally be used.");
-			if (ad->use_latest_vec)
-				dev->tx_pkt_burst =
-					i40e_get_latest_tx_vec();
-			else
-				dev->tx_pkt_burst =
-					i40e_get_recommend_tx_vec();
+			PMD_INIT_LOG(DEBUG, "Using %sVector Tx (port %d).",
+				     use_avx2 ? "avx2 " : "",
+				     dev->data->port_id);
+			dev->tx_pkt_burst = use_avx2 ?
+					    i40e_xmit_pkts_vec_avx2 :
+					    i40e_xmit_pkts_vec;
 		} else {
 			PMD_INIT_LOG(DEBUG, "Simple tx finally be used.");
 			dev->tx_pkt_burst = i40e_xmit_pkts_simple;
-- 
2.17.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [PATCH v3 2/3] net/i40e: add AVX512 vector path
  2021-01-14  6:39 ` [dpdk-dev] [PATCH v3 " Leyi Rong
  2021-01-14  6:39   ` [dpdk-dev] [PATCH v3 1/3] net/i40e: remove devarg use-latest-supported-vec Leyi Rong
@ 2021-01-14  6:39   ` Leyi Rong
  2021-01-14  6:39   ` [dpdk-dev] [PATCH v3 3/3] net/i40e: optimize Tx by using AVX512 Leyi Rong
  2021-01-14  7:37   ` [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e Zhang, Qi Z
  3 siblings, 0 replies; 42+ messages in thread
From: Leyi Rong @ 2021-01-14  6:39 UTC (permalink / raw)
  To: qi.z.zhang, wenzhuo.lu, ferruh.yigit, bruce.richardson, beilei.xing
  Cc: dev, Leyi Rong

Add AVX512 support for i40e PMD. This patch adds i40e_rxtx_vec_avx512.c
to support i40e AVX512 vPMD.

This patch aims to enable AVX512 on the i40e vPMD. Compared with the AVX2
vPMD, the main changes are focused on the Rx path.

Signed-off-by: Leyi Rong <leyi.rong@intel.com>
Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/net/i40e/i40e_rxtx.c            |   95 ++-
 drivers/net/i40e/i40e_rxtx.h            |    9 +
 drivers/net/i40e/i40e_rxtx_vec_avx512.c | 1024 +++++++++++++++++++++++
 drivers/net/i40e/meson.build            |   24 +
 4 files changed, 1130 insertions(+), 22 deletions(-)
 create mode 100644 drivers/net/i40e/i40e_rxtx_vec_avx512.c

diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index a4661cdd83..c99c051306 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -1742,6 +1742,10 @@ i40e_dev_supported_ptypes_get(struct rte_eth_dev *dev)
 	    dev->rx_pkt_burst == i40e_recv_scattered_pkts ||
 	    dev->rx_pkt_burst == i40e_recv_scattered_pkts_vec ||
 	    dev->rx_pkt_burst == i40e_recv_pkts_vec ||
+#ifdef CC_AVX512_SUPPORT
+	    dev->rx_pkt_burst == i40e_recv_scattered_pkts_vec_avx512 ||
+	    dev->rx_pkt_burst == i40e_recv_pkts_vec_avx512 ||
+#endif
 	    dev->rx_pkt_burst == i40e_recv_scattered_pkts_vec_avx2 ||
 	    dev->rx_pkt_burst == i40e_recv_pkts_vec_avx2)
 		return ptypes;
@@ -3135,6 +3139,7 @@ i40e_set_rx_function(struct rte_eth_dev *dev)
 		I40E_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	uint16_t rx_using_sse, i;
 	bool use_avx2 = false;
+	bool use_avx512 = false;
 	/* In order to allow Vector Rx there are a few configuration
 	 * conditions to be met and Rx Bulk Allocation should be allowed.
 	 */
@@ -3158,28 +3163,51 @@ i40e_set_rx_function(struct rte_eth_dev *dev)
 				}
 			}
 
-			use_avx2 = get_avx_supported(0);
+			use_avx512 = get_avx_supported(1);
+
+			if (!use_avx512)
+				use_avx2 = get_avx_supported(0);
 		}
 	}
 
 	if (ad->rx_vec_allowed  &&
 			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
 		if (dev->data->scattered_rx) {
-			PMD_INIT_LOG(DEBUG,
-				"Using %sVector Scattered Rx (port %d).",
-				use_avx2 ? "avx2 " : "",
-				dev->data->port_id);
-			dev->rx_pkt_burst = use_avx2 ?
-				i40e_recv_scattered_pkts_vec_avx2 :
-				i40e_recv_scattered_pkts_vec;
+			if (use_avx512) {
+#ifdef CC_AVX512_SUPPORT
+				PMD_DRV_LOG(NOTICE,
+					"Using AVX512 Vector Scattered Rx (port %d).",
+					dev->data->port_id);
+				dev->rx_pkt_burst =
+					i40e_recv_scattered_pkts_vec_avx512;
+#endif
+			} else {
+				PMD_INIT_LOG(DEBUG,
+					"Using %sVector Scattered Rx (port %d).",
+					use_avx2 ? "avx2 " : "",
+					dev->data->port_id);
+				dev->rx_pkt_burst = use_avx2 ?
+					i40e_recv_scattered_pkts_vec_avx2 :
+					i40e_recv_scattered_pkts_vec;
+			}
 		} else {
-			PMD_INIT_LOG(DEBUG,
-				"Using %sVector Rx (port %d).",
-				use_avx2 ? "avx2 " : "",
-				dev->data->port_id);
-			dev->rx_pkt_burst = use_avx2 ?
-				i40e_recv_pkts_vec_avx2 :
-				i40e_recv_pkts_vec;
+			if (use_avx512) {
+#ifdef CC_AVX512_SUPPORT
+				PMD_DRV_LOG(NOTICE,
+					"Using AVX512 Vector Rx (port %d).",
+					dev->data->port_id);
+				dev->rx_pkt_burst =
+					i40e_recv_pkts_vec_avx512;
+#endif
+			} else {
+				PMD_INIT_LOG(DEBUG,
+					"Using %sVector Rx (port %d).",
+					use_avx2 ? "avx2 " : "",
+					dev->data->port_id);
+				dev->rx_pkt_burst = use_avx2 ?
+					i40e_recv_pkts_vec_avx2 :
+					i40e_recv_pkts_vec;
+			}
 		}
 	} else if (!dev->data->scattered_rx && ad->rx_bulk_alloc_allowed) {
 		PMD_INIT_LOG(DEBUG, "Rx Burst Bulk Alloc Preconditions are "
@@ -3202,6 +3230,10 @@ i40e_set_rx_function(struct rte_eth_dev *dev)
 		rx_using_sse =
 			(dev->rx_pkt_burst == i40e_recv_scattered_pkts_vec ||
 			 dev->rx_pkt_burst == i40e_recv_pkts_vec ||
+#ifdef CC_AVX512_SUPPORT
+			 dev->rx_pkt_burst == i40e_recv_scattered_pkts_vec_avx512 ||
+			 dev->rx_pkt_burst == i40e_recv_pkts_vec_avx512 ||
+#endif
 			 dev->rx_pkt_burst == i40e_recv_scattered_pkts_vec_avx2 ||
 			 dev->rx_pkt_burst == i40e_recv_pkts_vec_avx2);
 
@@ -3222,6 +3254,10 @@ static const struct {
 	{ i40e_recv_pkts_bulk_alloc,         "Scalar Bulk Alloc" },
 	{ i40e_recv_pkts,                    "Scalar" },
 #ifdef RTE_ARCH_X86
+#ifdef CC_AVX512_SUPPORT
+	{ i40e_recv_scattered_pkts_vec_avx512, "Vector AVX512 Scattered" },
+	{ i40e_recv_pkts_vec_avx512,           "Vector AVX512" },
+#endif
 	{ i40e_recv_scattered_pkts_vec_avx2, "Vector AVX2 Scattered" },
 	{ i40e_recv_pkts_vec_avx2,           "Vector AVX2" },
 	{ i40e_recv_scattered_pkts_vec,      "Vector SSE Scattered" },
@@ -3288,6 +3324,7 @@ i40e_set_tx_function(struct rte_eth_dev *dev)
 		I40E_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	int i;
 	bool use_avx2 = false;
+	bool use_avx512 = false;
 
 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
 		if (ad->tx_vec_allowed) {
@@ -3301,19 +3338,30 @@ i40e_set_tx_function(struct rte_eth_dev *dev)
 				}
 			}
 
-			use_avx2 = get_avx_supported(0);
+			use_avx512 = get_avx_supported(1);
+
+			if (!use_avx512)
+				use_avx2 = get_avx_supported(0);
 		}
 	}
 
 	if (ad->tx_simple_allowed) {
 		if (ad->tx_vec_allowed &&
 				rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-			PMD_INIT_LOG(DEBUG, "Using %sVector Tx (port %d).",
-				     use_avx2 ? "avx2 " : "",
-				     dev->data->port_id);
-			dev->tx_pkt_burst = use_avx2 ?
-					    i40e_xmit_pkts_vec_avx2 :
-					    i40e_xmit_pkts_vec;
+			if (use_avx512) {
+#ifdef CC_AVX512_SUPPORT
+				PMD_DRV_LOG(NOTICE, "Using AVX512 Vector Tx (port %d).",
+					    dev->data->port_id);
+				dev->tx_pkt_burst = i40e_xmit_pkts_vec_avx512;
+#endif
+			} else {
+				PMD_INIT_LOG(DEBUG, "Using %sVector Tx (port %d).",
+					     use_avx2 ? "avx2 " : "",
+					     dev->data->port_id);
+				dev->tx_pkt_burst = use_avx2 ?
+						    i40e_xmit_pkts_vec_avx2 :
+						    i40e_xmit_pkts_vec;
+			}
 		} else {
 			PMD_INIT_LOG(DEBUG, "Simple tx finally be used.");
 			dev->tx_pkt_burst = i40e_xmit_pkts_simple;
@@ -3333,6 +3381,9 @@ static const struct {
 	{ i40e_xmit_pkts_simple,   "Scalar Simple" },
 	{ i40e_xmit_pkts,          "Scalar" },
 #ifdef RTE_ARCH_X86
+#ifdef CC_AVX512_SUPPORT
+	{ i40e_xmit_pkts_vec_avx512, "Vector AVX512" },
+#endif
 	{ i40e_xmit_pkts_vec_avx2, "Vector AVX2" },
 	{ i40e_xmit_pkts_vec,      "Vector SSE" },
 #elif defined(RTE_ARCH_ARM64)
diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h
index 57d7b4160b..2e3e50eb79 100644
--- a/drivers/net/i40e/i40e_rxtx.h
+++ b/drivers/net/i40e/i40e_rxtx.h
@@ -248,6 +248,15 @@ uint16_t i40e_recv_scattered_pkts_vec_avx2(void *rx_queue,
 	struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
 uint16_t i40e_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_pkts);
+uint16_t i40e_recv_pkts_vec_avx512(void *rx_queue,
+				   struct rte_mbuf **rx_pkts,
+				   uint16_t nb_pkts);
+uint16_t i40e_recv_scattered_pkts_vec_avx512(void *rx_queue,
+					     struct rte_mbuf **rx_pkts,
+					     uint16_t nb_pkts);
+uint16_t i40e_xmit_pkts_vec_avx512(void *tx_queue,
+				   struct rte_mbuf **tx_pkts,
+				   uint16_t nb_pkts);
 
 /* For each value it means, datasheet of hardware can tell more details
  *
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
new file mode 100644
index 0000000000..ccddc3e2d4
--- /dev/null
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -0,0 +1,1024 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <rte_ethdev_driver.h>
+#include <rte_malloc.h>
+
+#include "base/i40e_prototype.h"
+#include "base/i40e_type.h"
+#include "i40e_ethdev.h"
+#include "i40e_rxtx.h"
+#include "i40e_rxtx_vec_common.h"
+
+#include <x86intrin.h>
+
+#ifndef __INTEL_COMPILER
+#pragma GCC diagnostic ignored "-Wcast-qual"
+#endif
+
+#define RTE_I40E_DESCS_PER_LOOP_AVX 8
+
+static inline void
+i40e_rxq_rearm(struct i40e_rx_queue *rxq)
+{
+	int i;
+	uint16_t rx_id;
+	volatile union i40e_rx_desc *rxdp;
+	struct i40e_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
+	struct rte_mempool_cache *cache = rte_mempool_default_cache(rxq->mp,
+			rte_lcore_id());
+
+	rxdp = rxq->rx_ring + rxq->rxrearm_start;
+
+	/* We need to pull 'n' more MBUFs into the software ring from mempool
+	 * We inline the mempool function here, so we can vectorize the copy
+	 * from the cache into the shadow ring.
+	 */
+
+	if (cache->len < RTE_I40E_RXQ_REARM_THRESH) {
+		/* Cache is short: backfill it first, and then fill from it */
+		uint32_t req = RTE_I40E_RXQ_REARM_THRESH + (cache->size -
+				cache->len);
+
+		/* How many do we require
+		 * i.e. number to fill the cache + the request
+		 */
+		int ret = rte_mempool_ops_dequeue_bulk(rxq->mp,
+				&cache->objs[cache->len], req);
+		if (ret == 0) {
+			cache->len += req;
+		} else {
+			if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
+					rxq->nb_rx_desc) {
+				__m128i dma_addr0;
+
+				dma_addr0 = _mm_setzero_si128();
+				for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+					rxep[i].mbuf = &rxq->fake_mbuf;
+					_mm_store_si128
+						((__m128i *)&rxdp[i].read,
+							dma_addr0);
+				}
+			}
+			rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+					RTE_I40E_RXQ_REARM_THRESH;
+			return;
+		}
+	}
+
+	const __m512i iova_offsets =  _mm512_set1_epi64
+		(offsetof(struct rte_mbuf, buf_iova));
+	const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
+
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+	/* to shuffle the addresses to correct slots. Values 4-7 will contain
+	 * zeros, so use 7 for a zero-value.
+	 */
+	const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);
+#else
+	const __m512i permute_idx = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
+#endif
+
+	/* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
+	 * from mempool cache and populating both shadow and HW rings
+	 */
+	for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH / 8; i++) {
+		const __m512i mbuf_ptrs = _mm512_loadu_si512
+			(&cache->objs[cache->len - 8]);
+		_mm512_store_si512(rxep, mbuf_ptrs);
+
+		/* gather iova of mbuf0-7 into one zmm reg */
+		const __m512i iova_base_addrs = _mm512_i64gather_epi64
+			(_mm512_add_epi64(mbuf_ptrs, iova_offsets),
+				0, /* base */
+				1 /* scale */);
+		const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
+				headroom);
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+		const __m512i iovas0 = _mm512_castsi256_si512
+			(_mm512_extracti64x4_epi64(iova_addrs, 0));
+		const __m512i iovas1 = _mm512_castsi256_si512
+			(_mm512_extracti64x4_epi64(iova_addrs, 1));
+
+		/* permute leaves desc 2-3 addresses in header address slots 0-1
+		 * but these are ignored by driver since header split not
+		 * enabled. Similarly for desc 4 & 5.
+		 */
+		const __m512i desc_rd_0_1 = _mm512_permutexvar_epi64
+			(permute_idx, iovas0);
+		const __m512i desc_rd_2_3 = _mm512_bsrli_epi128(desc_rd_0_1, 8);
+
+		const __m512i desc_rd_4_5 = _mm512_permutexvar_epi64
+			(permute_idx, iovas1);
+		const __m512i desc_rd_6_7 = _mm512_bsrli_epi128(desc_rd_4_5, 8);
+
+		_mm512_store_si512((void *)rxdp, desc_rd_0_1);
+		_mm512_store_si512((void *)(rxdp + 2), desc_rd_2_3);
+		_mm512_store_si512((void *)(rxdp + 4), desc_rd_4_5);
+		_mm512_store_si512((void *)(rxdp + 6), desc_rd_6_7);
+#else
+		/* permute leaves desc 4-7 addresses in header address slots 0-3
+		 * but these are ignored by driver since header split not
+		 * enabled.
+		 */
+		const __m512i desc_rd_0_3 = _mm512_permutexvar_epi64
+			(permute_idx, iova_addrs);
+		const __m512i desc_rd_4_7 = _mm512_bsrli_epi128(desc_rd_0_3, 8);
+
+		_mm512_store_si512((void *)rxdp, desc_rd_0_3);
+		_mm512_store_si512((void *)(rxdp + 4), desc_rd_4_7);
+#endif
+		rxep += 8, rxdp += 8, cache->len -= 8;
+	}
+
+	rxq->rxrearm_start += RTE_I40E_RXQ_REARM_THRESH;
+	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
+		rxq->rxrearm_start = 0;
+
+	rxq->rxrearm_nb -= RTE_I40E_RXQ_REARM_THRESH;
+
+	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
+			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
+
+	/* Update the tail pointer on the NIC */
+	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
+}
+
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+/* Handles 32B descriptor FDIR ID processing:
+ * rxdp: receive descriptor ring, required to load 2nd 16B half of each desc
+ * rx_pkts: required to store metadata back to mbufs
+ * pkt_idx: offset into the burst, increments in vector widths
+ * desc_idx: required to select the correct shift at compile time
+ */
+static inline __m256i
+desc_fdir_processing_32b(volatile union i40e_rx_desc *rxdp,
+			 struct rte_mbuf **rx_pkts,
+			 const uint32_t pkt_idx,
+			 const uint32_t desc_idx)
+{
+	/* 32B desc path: load rxdp.wb.qword2 for EXT_STATUS and FLEXBH_STAT */
+	__m128i *rxdp_desc_0 = (void *)(&rxdp[desc_idx + 0].wb.qword2);
+	__m128i *rxdp_desc_1 = (void *)(&rxdp[desc_idx + 1].wb.qword2);
+	const __m128i desc_qw2_0 = _mm_load_si128(rxdp_desc_0);
+	const __m128i desc_qw2_1 = _mm_load_si128(rxdp_desc_1);
+
+	/* Mask for FLEXBH_STAT, and the FDIR_ID value to compare against. The
+	 * remaining data is set to all 1's to pass through data.
+	 */
+	const __m256i flexbh_mask = _mm256_set_epi32(-1, -1, -1, 3 << 4,
+						     -1, -1, -1, 3 << 4);
+	const __m256i flexbh_id   = _mm256_set_epi32(-1, -1, -1, 1 << 4,
+						     -1, -1, -1, 1 << 4);
+
+	/* Load descriptor, check for FLEXBH bits, generate a mask for both
+	 * packets in the register.
+	 */
+	__m256i desc_qw2_0_1 =
+		_mm256_inserti128_si256(_mm256_castsi128_si256(desc_qw2_0),
+					desc_qw2_1, 1);
+	__m256i desc_tmp_msk = _mm256_and_si256(flexbh_mask, desc_qw2_0_1);
+	__m256i fdir_mask = _mm256_cmpeq_epi32(flexbh_id, desc_tmp_msk);
+	__m256i fdir_data = _mm256_alignr_epi8(desc_qw2_0_1, desc_qw2_0_1, 12);
+	__m256i desc_fdir_data = _mm256_and_si256(fdir_mask, fdir_data);
+
+	/* Write data out to the mbuf. There is no store to this area of the
+	 * mbuf today, so we cannot combine it with another store.
+	 */
+	const uint32_t idx_0 = pkt_idx + desc_idx;
+	const uint32_t idx_1 = pkt_idx + desc_idx + 1;
+
+	rx_pkts[idx_0]->hash.fdir.hi = _mm256_extract_epi32(desc_fdir_data, 0);
+	rx_pkts[idx_1]->hash.fdir.hi = _mm256_extract_epi32(desc_fdir_data, 4);
+
+	/* Create mbuf flags as required for mbuf_flags layout
+	 *  (That's high lane [1,3,5,7, 0,2,4,6] as u32 lanes).
+	 * Approach:
+	 * - Mask away bits not required from the fdir_mask
+	 * - Leave the PKT_RX_FDIR_ID bit (1 << 13)
+	 * - Position that bit correctly based on packet number
+	 * - OR in the resulting bit to mbuf_flags
+	 */
+	RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << 13));
+	__m256i mbuf_flag_mask = _mm256_set_epi32(0, 0, 0, 1 << 13,
+						  0, 0, 0, 1 << 13);
+	__m256i desc_flag_bit =  _mm256_and_si256(mbuf_flag_mask, fdir_mask);
+
+	/* For static-inline function, this will be stripped out
+	 * as the desc_idx is a hard-coded constant.
+	 */
+	switch (desc_idx) {
+	case 0:
+		return _mm256_alignr_epi8(desc_flag_bit, desc_flag_bit,  4);
+	case 2:
+		return _mm256_alignr_epi8(desc_flag_bit, desc_flag_bit,  8);
+	case 4:
+		return _mm256_alignr_epi8(desc_flag_bit, desc_flag_bit, 12);
+	case 6:
+		return desc_flag_bit;
+	default:
+		break;
+	}
+
+	/* NOT REACHED, see above switch returns */
+	return _mm256_setzero_si256();
+}
+#endif /* RTE_LIBRTE_I40E_16BYTE_RX_DESC */
+
+#define PKTLEN_SHIFT     10
+
+/* Force inline as some compilers will not inline by default. */
+static __rte_always_inline uint16_t
+_recv_raw_pkts_vec_avx512(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
+			  uint16_t nb_pkts, uint8_t *split_packet)
+{
+	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
+	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
+			0, rxq->mbuf_initializer);
+	struct i40e_rx_entry *sw_ring = &rxq->sw_ring[rxq->rx_tail];
+	volatile union i40e_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
+
+	rte_prefetch0(rxdp);
+
+	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP_AVX */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP_AVX);
+
+	/* See if we need to rearm the RX queue - gives the prefetch a bit
+	 * of time to act
+	 */
+	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
+		i40e_rxq_rearm(rxq);
+
+	/* Before we start moving massive data around, check to see if
+	 * there is actually a packet available
+	 */
+	if (!(rxdp->wb.qword1.status_error_len &
+			rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))
+		return 0;
+
+	/* constants used in processing loop */
+	const __m512i crc_adjust =
+		_mm512_set4_epi32
+			(0,             /* ignore non-length fields */
+			 -rxq->crc_len, /* sub crc on data_len */
+			 -rxq->crc_len, /* sub crc on pkt_len */
+			 0              /* ignore non-length fields */
+			);
+
+	/* 8 packets DD mask, LSB in each 32-bit value */
+	const __m256i dd_check = _mm256_set1_epi32(1);
+
+	/* 8 packets EOP mask, second-LSB in each 32-bit value */
+	const __m256i eop_check = _mm256_slli_epi32(dd_check,
+			I40E_RX_DESC_STATUS_EOF_SHIFT);
+
+	/* mask to shuffle from desc. to mbuf (4 descriptors) */
+	const __m512i shuf_msk =
+		_mm512_set4_epi32
+			(/* rss hash parsed separately */
+			 /* octet 4~7, 32bits rss */
+			 7 << 24 | 6 << 16 | 5 << 8 | 4,
+			 /* octet 2~3, low 16 bits vlan_macip */
+			 /* octet 14~15, 16 bits data_len */
+			 3 << 24 | 2 << 16 | 15 << 8 | 14,
+			 /* skip hi 16 bits pkt_len, zero out */
+			 /* octet 14~15, 16 bits pkt_len */
+			 0xFFFF << 16 | 15 << 8 | 14,
+			 /* pkt_type set as unknown */
+			 0xFFFFFFFF
+			);
+	/* compile-time check the above crc and shuffle layout is correct.
+	 * NOTE: the first field (lowest address) is given last in set_epi
+	 * calls above.
+	 */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
+
+	/* Status/Error flag masks */
+	/* mask everything except RSS, flow director and VLAN flags
+	 * bit2 is for VLAN tag, bit11 for flow director indication
+	 * bit13:12 for RSS indication. Bits 3-5 of error
+	 * field (bits 22-24) are for IP/L4 checksum errors
+	 */
+	const __m256i flags_mask = _mm256_set1_epi32
+		((1 << 2) | (1 << 11) | (3 << 12) | (7 << 22));
+
+	/* data to be shuffled by result of flag mask. If VLAN bit is set,
+	 * (bit 2), then position 4 in this array will be used in the
+	 * destination
+	 */
+	const __m256i vlan_flags_shuf = _mm256_set_epi32
+		(0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
+		0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0);
+
+	/* data to be shuffled by result of flag mask, shifted down 11.
+	 * If RSS/FDIR bits are set, shuffle moves appropriate flags in
+	 * place.
+	 */
+	const __m256i rss_flags_shuf = _mm256_set_epi8
+		(0, 0, 0, 0, 0, 0, 0, 0,
+		PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH, 0, 0,
+		0, 0, PKT_RX_FDIR, 0, /* end up 128-bits */
+		0, 0, 0, 0, 0, 0, 0, 0,
+		PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH, 0, 0,
+		0, 0, PKT_RX_FDIR, 0);
+
+	/* data to be shuffled by the result of the flags mask shifted by 22
+	 * bits.  This gives us the l3_l4 flags.
+	 */
+	const __m256i l3_l4_flags_shuf = _mm256_set_epi8
+		(0, 0, 0, 0, 0, 0, 0, 0,
+		/* shift right 1 bit to make sure it does not exceed 255 */
+		(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+		 PKT_RX_IP_CKSUM_BAD) >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD |
+		 PKT_RX_L4_CKSUM_BAD) >> 1,
+		(PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1,
+		(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
+		PKT_RX_IP_CKSUM_BAD >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
+		/* second 128-bits */
+		0, 0, 0, 0, 0, 0, 0, 0,
+		(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+		 PKT_RX_IP_CKSUM_BAD) >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD |
+		 PKT_RX_L4_CKSUM_BAD) >> 1,
+		(PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1,
+		(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
+		PKT_RX_IP_CKSUM_BAD >> 1,
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);
+
+	const __m256i cksum_mask = _mm256_set1_epi32
+		(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
+		PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
+		PKT_RX_EIP_CKSUM_BAD);
+
+	uint16_t i, received;
+
+	for (i = 0, received = 0; i < nb_pkts;
+			i += RTE_I40E_DESCS_PER_LOOP_AVX,
+			rxdp += RTE_I40E_DESCS_PER_LOOP_AVX) {
+		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
+		_mm256_storeu_si256((void *)&rx_pkts[i],
+				_mm256_loadu_si256((void *)&sw_ring[i]));
+#ifdef RTE_ARCH_X86_64
+		_mm256_storeu_si256((void *)&rx_pkts[i + 4],
+				_mm256_loadu_si256((void *)&sw_ring[i + 4]));
+#endif
+
+		__m512i raw_desc0_3, raw_desc4_7;
+		__m256i raw_desc0_1, raw_desc2_3, raw_desc4_5, raw_desc6_7;
+
+		/* load in descriptors, in reverse order */
+		const __m128i raw_desc7 =
+			_mm_load_si128((void *)(rxdp + 7));
+		rte_compiler_barrier();
+		const __m128i raw_desc6 =
+			_mm_load_si128((void *)(rxdp + 6));
+		rte_compiler_barrier();
+		const __m128i raw_desc5 =
+			_mm_load_si128((void *)(rxdp + 5));
+		rte_compiler_barrier();
+		const __m128i raw_desc4 =
+			_mm_load_si128((void *)(rxdp + 4));
+		rte_compiler_barrier();
+		const __m128i raw_desc3 =
+			_mm_load_si128((void *)(rxdp + 3));
+		rte_compiler_barrier();
+		const __m128i raw_desc2 =
+			_mm_load_si128((void *)(rxdp + 2));
+		rte_compiler_barrier();
+		const __m128i raw_desc1 =
+			_mm_load_si128((void *)(rxdp + 1));
+		rte_compiler_barrier();
+		const __m128i raw_desc0 =
+			_mm_load_si128((void *)(rxdp + 0));
+
+		raw_desc6_7 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc6),
+				 raw_desc7, 1);
+		raw_desc4_5 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc4),
+				 raw_desc5, 1);
+		raw_desc2_3 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc2),
+				 raw_desc3, 1);
+		raw_desc0_1 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc0),
+				 raw_desc1, 1);
+
+		raw_desc4_7 =
+			_mm512_inserti64x4
+				(_mm512_castsi256_si512(raw_desc4_5),
+				 raw_desc6_7, 1);
+		raw_desc0_3 =
+			_mm512_inserti64x4
+				(_mm512_castsi256_si512(raw_desc0_1),
+				 raw_desc2_3, 1);
+
+		if (split_packet) {
+			int j;
+
+			for (j = 0; j < RTE_I40E_DESCS_PER_LOOP_AVX; j++)
+				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
+		}
+
+		/* convert descriptors 0-7 into mbufs, adjusting length and
+		 * re-arranging fields. Then write into the mbuf
+		 */
+		const __m512i len4_7 = _mm512_slli_epi32
+					(raw_desc4_7, PKTLEN_SHIFT);
+		const __m512i len0_3 = _mm512_slli_epi32
+					(raw_desc0_3, PKTLEN_SHIFT);
+		const __m512i desc4_7 = _mm512_mask_blend_epi16
+					(0x80808080, raw_desc4_7, len4_7);
+		const __m512i desc0_3 = _mm512_mask_blend_epi16
+					(0x80808080, raw_desc0_3, len0_3);
+		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);
+		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);
+
+		mb4_7 = _mm512_add_epi32(mb4_7, crc_adjust);
+		mb0_3 = _mm512_add_epi32(mb0_3, crc_adjust);
+
+		/* to get packet types, shift 64-bit values down 30 bits
+		 * and so ptype is in lower 8-bits in each
+		 */
+		const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 30);
+		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 30);
+		const __m256i ptypes6_7 =
+			_mm512_extracti64x4_epi64(ptypes4_7, 1);
+		const __m256i ptypes4_5 =
+			_mm512_extracti64x4_epi64(ptypes4_7, 0);
+		const __m256i ptypes2_3 =
+			_mm512_extracti64x4_epi64(ptypes0_3, 1);
+		const __m256i ptypes0_1 =
+			_mm512_extracti64x4_epi64(ptypes0_3, 0);
+		const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 24);
+		const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 8);
+		const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 24);
+		const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 8);
+		const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 24);
+		const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 8);
+		const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 24);
+		const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 8);
+
+		const __m512i ptype4_7 = _mm512_set_epi32
+			(0, 0, 0, ptype_tbl[ptype7],
+			 0, 0, 0, ptype_tbl[ptype6],
+			 0, 0, 0, ptype_tbl[ptype5],
+			 0, 0, 0, ptype_tbl[ptype4]);
+		const __m512i ptype0_3 = _mm512_set_epi32
+			(0, 0, 0, ptype_tbl[ptype3],
+			 0, 0, 0, ptype_tbl[ptype2],
+			 0, 0, 0, ptype_tbl[ptype1],
+			 0, 0, 0, ptype_tbl[ptype0]);
+
+		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
+		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
+
+		__m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
+		__m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
+		__m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
+		__m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
+
+		/**
+		 * use permute/extract to get status content
+		 * After the operations, the packets status flags are in the
+		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
+		 */
+		/* merge the status bits into one register */
+		const __m512i status_permute_msk = _mm512_set_epi32
+			(0, 0, 0, 0,
+			 0, 0, 0, 0,
+			 22, 30, 6, 14,
+			 18, 26, 2, 10);
+		const __m512i raw_status0_7 = _mm512_permutex2var_epi32
+			(desc4_7, status_permute_msk, desc0_3);
+		__m256i status0_7 = _mm512_extracti64x4_epi64
+			(raw_status0_7, 0);
+
+		/* now do flag manipulation */
+
+		/* get only flag/error bits we want */
+		const __m256i flag_bits =
+			_mm256_and_si256(status0_7, flags_mask);
+		/* set vlan and rss flags */
+		const __m256i vlan_flags =
+			_mm256_shuffle_epi8(vlan_flags_shuf, flag_bits);
+		const __m256i rss_fdir_bits = _mm256_srli_epi32(flag_bits, 11);
+		const __m256i rss_flags = _mm256_shuffle_epi8(rss_flags_shuf,
+							      rss_fdir_bits);
+
+		/* l3_l4_error flags, shuffle, then shift to correct adjustment
+		 * of flags in flags_shuf, and finally mask out extra bits
+		 */
+		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
+				_mm256_srli_epi32(flag_bits, 22));
+		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
+		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
+
+		/* merge flags */
+		__m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
+				_mm256_or_si256(rss_flags, vlan_flags));
+
+		/* If the rxq has FDIR enabled, read and process the FDIR info
+		 * from the descriptor. This can cause more loads/stores, so is
+		 * not always performed. Branch over the code when not enabled.
+		 */
+		if (rxq->fdir_enabled) {
+#ifdef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+			/* 16B descriptor code path:
+			 * RSS and FDIR ID use the same offset in the desc, so
+			 * only one can be present at a time. The code below
+			 * identifies an FDIR ID match, and zeros the RSS value
+			 * in the mbuf on FDIR match to keep mbuf data clean.
+			 */
+#define FDIR_BLEND_MASK ((1 << 3) | (1 << 7))
+
+			/* Flags:
+			 * - Take flags, shift bits to null out
+			 * - CMPEQ with known FDIR ID, to get 0xFFFFFFFF or 0 mask
+			 * - Strip bits from mask, leaving 0 or 1 for FDIR ID
+			 * - Merge with mbuf_flags
+			 */
+			/* FLM = 1, FLTSTAT = 0b01, (FLM | FLTSTAT) == 3.
+			 * Shift left by 28 to avoid having to mask.
+			 */
+			const __m256i fdir =
+				_mm256_slli_epi32(rss_fdir_bits, 28);
+			const __m256i fdir_id = _mm256_set1_epi32(3 << 28);
+
+			/* As above, the fdir_mask to packet mapping is this:
+			 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
+			 * Then OR FDIR flags to mbuf_flags on FDIR ID hit.
+			 */
+			RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << 13));
+			const __m256i pkt_fdir_bit = _mm256_set1_epi32(1 << 13);
+			const __m256i fdir_mask =
+				_mm256_cmpeq_epi32(fdir, fdir_id);
+			__m256i fdir_bits =
+				_mm256_and_si256(fdir_mask, pkt_fdir_bit);
+
+			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_bits);
+
+			/* Based on FDIR_MASK, clear the RSS or FDIR value.
+			 * The FDIR ID value is masked to zero if not a hit,
+			 * otherwise the mb0_1 register RSS field is zeroed.
+			 */
+			const __m256i fdir_zero_mask = _mm256_setzero_si256();
+			__m256i tmp0_1 = _mm256_blend_epi32(fdir_zero_mask,
+						fdir_mask, FDIR_BLEND_MASK);
+			__m256i fdir_mb0_1 = _mm256_and_si256(mb0_1, fdir_mask);
+
+			mb0_1 = _mm256_andnot_si256(tmp0_1, mb0_1);
+
+			/* Write to mbuf: no stores to combine with, so just a
+			 * scalar store to push data here.
+			 */
+			rx_pkts[i + 0]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb0_1, 3);
+			rx_pkts[i + 1]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb0_1, 7);
+
+			/* Same as above, only shift the fdir_mask to align
+			 * the packet FDIR mask with the FDIR_ID desc lane.
+			 */
+			__m256i tmp2_3 =
+				_mm256_alignr_epi8(fdir_mask, fdir_mask, 12);
+			__m256i fdir_mb2_3 = _mm256_and_si256(mb2_3, tmp2_3);
+
+			tmp2_3 = _mm256_blend_epi32(fdir_zero_mask, tmp2_3,
+						    FDIR_BLEND_MASK);
+			mb2_3 = _mm256_andnot_si256(tmp2_3, mb2_3);
+			rx_pkts[i + 2]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb2_3, 3);
+			rx_pkts[i + 3]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb2_3, 7);
+
+			__m256i tmp4_5 =
+				_mm256_alignr_epi8(fdir_mask, fdir_mask, 8);
+			__m256i fdir_mb4_5 = _mm256_and_si256(mb4_5, tmp4_5);
+
+			tmp4_5 = _mm256_blend_epi32(fdir_zero_mask, tmp4_5,
+						    FDIR_BLEND_MASK);
+			mb4_5 = _mm256_andnot_si256(tmp4_5, mb4_5);
+			rx_pkts[i + 4]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb4_5, 3);
+			rx_pkts[i + 5]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb4_5, 7);
+
+			__m256i tmp6_7 =
+				_mm256_alignr_epi8(fdir_mask, fdir_mask, 4);
+			__m256i fdir_mb6_7 = _mm256_and_si256(mb6_7, tmp6_7);
+
+			tmp6_7 = _mm256_blend_epi32(fdir_zero_mask, tmp6_7,
+						    FDIR_BLEND_MASK);
+			mb6_7 = _mm256_andnot_si256(tmp6_7, mb6_7);
+			rx_pkts[i + 6]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb6_7, 3);
+			rx_pkts[i + 7]->hash.fdir.hi =
+				_mm256_extract_epi32(fdir_mb6_7, 7);
+
+			/* End of 16B descriptor handling */
+#else
+			/* 32B descriptor FDIR ID mark handling. Returns bits
+			 * to be OR-ed into the mbuf olflags.
+			 */
+			__m256i fdir_add_flags;
+
+			fdir_add_flags =
+				desc_fdir_processing_32b(rxdp, rx_pkts, i, 0);
+			mbuf_flags =
+				_mm256_or_si256(mbuf_flags, fdir_add_flags);
+
+			fdir_add_flags =
+				desc_fdir_processing_32b(rxdp, rx_pkts, i, 2);
+			mbuf_flags =
+				_mm256_or_si256(mbuf_flags, fdir_add_flags);
+
+			fdir_add_flags =
+				desc_fdir_processing_32b(rxdp, rx_pkts, i, 4);
+			mbuf_flags =
+				_mm256_or_si256(mbuf_flags, fdir_add_flags);
+
+			fdir_add_flags =
+				desc_fdir_processing_32b(rxdp, rx_pkts, i, 6);
+			mbuf_flags =
+				_mm256_or_si256(mbuf_flags, fdir_add_flags);
+			/* End 32B desc handling */
+#endif /* RTE_LIBRTE_I40E_16BYTE_RX_DESC */
+
+		} /* if() on FDIR enabled */
+
+		/* At this point, we have the 8 sets of flags in the low 16-bits
+		 * of each 32-bit value in mbuf_flags.
+		 * We want to extract these, and merge them with the mbuf init data
+		 * so we can do a single write to the mbuf to set the flags
+		 * and all the other initialization fields. Extracting the
+		 * appropriate flags means that we have to do a shift and blend for
+		 * each mbuf before we do the write. However, we can also
+		 * add in the previously computed rx_descriptor fields to
+		 * make a single 256-bit write per mbuf
+		 */
+		/* check the structure matches expectations */
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
+				offsetof(struct rte_mbuf, rearm_data) + 8);
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
+				RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
+		/* build up data and do writes */
+		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
+				rearm6, rearm7;
+		rearm6 = _mm256_blend_epi32
+			(mbuf_init, _mm256_slli_si256(mbuf_flags, 8), 0x04);
+		rearm4 = _mm256_blend_epi32
+			(mbuf_init, _mm256_slli_si256(mbuf_flags, 4), 0x04);
+		rearm2 = _mm256_blend_epi32
+			(mbuf_init, mbuf_flags, 0x04);
+		rearm0 = _mm256_blend_epi32
+			(mbuf_init, _mm256_srli_si256(mbuf_flags, 4), 0x04);
+		/* permute to add in the rx_descriptor e.g. rss fields */
+		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
+		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
+		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
+		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
+		/* write to mbuf */
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 6]->rearm_data, rearm6);
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 4]->rearm_data, rearm4);
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 2]->rearm_data, rearm2);
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 0]->rearm_data, rearm0);
+
+		/* repeat for the odd mbufs */
+		const __m256i odd_flags = _mm256_castsi128_si256
+			(_mm256_extracti128_si256(mbuf_flags, 1));
+		rearm7 = _mm256_blend_epi32
+			(mbuf_init, _mm256_slli_si256(odd_flags, 8), 0x04);
+		rearm5 = _mm256_blend_epi32
+			(mbuf_init, _mm256_slli_si256(odd_flags, 4), 0x04);
+		rearm3 = _mm256_blend_epi32
+			(mbuf_init, odd_flags, 0x04);
+		rearm1 = _mm256_blend_epi32
+			(mbuf_init, _mm256_srli_si256(odd_flags, 4), 0x04);
+		/* since odd mbufs are already in hi 128-bits use blend */
+		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
+		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
+		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
+		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
+		/* again write to mbufs */
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 7]->rearm_data, rearm7);
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 5]->rearm_data, rearm5);
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 3]->rearm_data, rearm3);
+		_mm256_storeu_si256
+			((__m256i *)&rx_pkts[i + 1]->rearm_data, rearm1);
+
+		/* extract and record EOP bit */
+		if (split_packet) {
+			const __m128i eop_mask =
+				_mm_set1_epi16
+				(1 << I40E_RX_DESC_STATUS_EOF_SHIFT);
+			const __m256i eop_bits256 =
+				_mm256_and_si256(status0_7, eop_check);
+			/* pack status bits into a single 128-bit register */
+			const __m128i eop_bits =
+				_mm_packus_epi32
+				(_mm256_castsi256_si128(eop_bits256),
+				_mm256_extractf128_si256(eop_bits256, 1));
+			/* flip bits, and mask out the EOP bit, which is now
+			 * a split-packet bit i.e. !EOP, rather than EOP one.
+			 */
+			__m128i split_bits = _mm_andnot_si128(eop_bits,
+					eop_mask);
+			/* eop bits are out of order, so we need to shuffle them
+			 * back into order again. In doing so, only use low 8
+			 * bits, which acts like another pack instruction
+			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
+			 * [Since we use epi8, the 16-bit positions are
+			 * multiplied by 2 in the eop_shuffle value.]
+			 */
+			__m128i eop_shuffle = _mm_set_epi8
+				(0xFF, 0xFF, 0xFF, 0xFF, /* zero hi 64b */
+				0xFF, 0xFF, 0xFF, 0xFF,
+				8, 0, 10, 2, /* move values to lo 64b */
+				12, 4, 14, 6);
+			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
+			*(uint64_t *)split_packet =
+				_mm_cvtsi128_si64(split_bits);
+			split_packet += RTE_I40E_DESCS_PER_LOOP_AVX;
+		}
+
+		/* perform dd_check */
+		status0_7 = _mm256_and_si256(status0_7, dd_check);
+		status0_7 = _mm256_packs_epi32
+			(status0_7, _mm256_setzero_si256());
+
+		uint64_t burst = __builtin_popcountll
+				(_mm_cvtsi128_si64
+					(_mm256_extracti128_si256
+						(status0_7, 1)));
+		burst += __builtin_popcountll(_mm_cvtsi128_si64
+				(_mm256_castsi256_si128(status0_7)));
+		received += burst;
+		if (burst != RTE_I40E_DESCS_PER_LOOP_AVX)
+			break;
+	}
+
+	/* update tail pointers */
+	rxq->rx_tail += received;
+	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
+	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep avx2 aligned */
+		rxq->rx_tail--;
+		received--;
+	}
+	rxq->rxrearm_nb += received;
+	return received;
+}
+
+/**
+ * Notice:
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP_AVX, just return no packet
+ */
+uint16_t
+i40e_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
+			  uint16_t nb_pkts)
+{
+	return _recv_raw_pkts_vec_avx512(rx_queue, rx_pkts, nb_pkts, NULL);
+}
+
+/**
+ * vPMD receive routine that reassembles a single burst of 32 scattered packets
+ * Notice:
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP_AVX, just return no packet
+ */
+static uint16_t
+i40e_recv_scattered_burst_vec_avx512(void *rx_queue,
+				     struct rte_mbuf **rx_pkts,
+				     uint16_t nb_pkts)
+{
+	struct i40e_rx_queue *rxq = rx_queue;
+	uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};
+
+	/* get some new buffers */
+	uint16_t nb_bufs = _recv_raw_pkts_vec_avx512(rxq, rx_pkts, nb_pkts,
+			split_flags);
+	if (nb_bufs == 0)
+		return 0;
+
+	/* happy day case, full burst + no packets to be joined */
+	const uint64_t *split_fl64 = (uint64_t *)split_flags;
+
+	if (!rxq->pkt_first_seg &&
+	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
+	    split_fl64[2] == 0 && split_fl64[3] == 0)
+		return nb_bufs;
+
+	/* reassemble any packets that need reassembly*/
+	unsigned int i = 0;
+
+	if (!rxq->pkt_first_seg) {
+		/* find the first split flag, and only reassemble then*/
+		while (i < nb_bufs && !split_flags[i])
+			i++;
+		if (i == nb_bufs)
+			return nb_bufs;
+		rxq->pkt_first_seg = rx_pkts[i];
+	}
+	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
+		&split_flags[i]);
+}
+
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ * Main receive routine that can handle arbitrary burst sizes
+ * Notice:
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+ */
+uint16_t
+i40e_recv_scattered_pkts_vec_avx512(void *rx_queue,
+				    struct rte_mbuf **rx_pkts,
+				    uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
+		uint16_t burst = i40e_recv_scattered_burst_vec_avx512(rx_queue,
+				rx_pkts + retval, RTE_I40E_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_I40E_VPMD_RX_BURST)
+			return retval;
+	}
+	return retval + i40e_recv_scattered_burst_vec_avx512(rx_queue,
+				rx_pkts + retval, nb_pkts);
+}
+
+static inline void
+vtx1(volatile struct i40e_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags)
+{
+	uint64_t high_qw = (I40E_TX_DESC_DTYPE_DATA |
+		((uint64_t)flags  << I40E_TXD_QW1_CMD_SHIFT) |
+		((uint64_t)pkt->data_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT));
+
+	__m128i descriptor = _mm_set_epi64x(high_qw,
+				pkt->buf_iova + pkt->data_off);
+	_mm_store_si128((__m128i *)txdp, descriptor);
+}
+
+static inline void
+vtx(volatile struct i40e_tx_desc *txdp,
+	struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
+{
+	const uint64_t hi_qw_tmpl = (I40E_TX_DESC_DTYPE_DATA |
+			((uint64_t)flags  << I40E_TXD_QW1_CMD_SHIFT));
+
+	/* if unaligned on 32-bit boundary, do one to align */
+	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
+		vtx1(txdp, *pkt, flags);
+		nb_pkts--, txdp++, pkt++;
+	}
+
+	/* do two at a time while possible, in bursts */
+	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
+		uint64_t hi_qw3 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[3]->data_len <<
+			 I40E_TXD_QW1_TX_BUF_SZ_SHIFT);
+		uint64_t hi_qw2 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[2]->data_len <<
+			 I40E_TXD_QW1_TX_BUF_SZ_SHIFT);
+		uint64_t hi_qw1 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[1]->data_len <<
+			 I40E_TXD_QW1_TX_BUF_SZ_SHIFT);
+		uint64_t hi_qw0 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[0]->data_len <<
+			 I40E_TXD_QW1_TX_BUF_SZ_SHIFT);
+
+		__m256i desc2_3 = _mm256_set_epi64x
+			(hi_qw3, pkt[3]->buf_iova + pkt[3]->data_off,
+			hi_qw2, pkt[2]->buf_iova + pkt[2]->data_off);
+		__m256i desc0_1 = _mm256_set_epi64x
+			(hi_qw1, pkt[1]->buf_iova + pkt[1]->data_off,
+			hi_qw0, pkt[0]->buf_iova + pkt[0]->data_off);
+		_mm256_store_si256((void *)(txdp + 2), desc2_3);
+		_mm256_store_si256((void *)txdp, desc0_1);
+	}
+
+	/* do any last ones */
+	while (nb_pkts) {
+		vtx1(txdp, *pkt, flags);
+		txdp++, pkt++, nb_pkts--;
+	}
+}
+
+static inline uint16_t
+i40e_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
+				 uint16_t nb_pkts)
+{
+	struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue;
+	volatile struct i40e_tx_desc *txdp;
+	struct i40e_tx_entry *txep;
+	uint16_t n, nb_commit, tx_id;
+	uint64_t flags = I40E_TD_CMD;
+	uint64_t rs = I40E_TX_DESC_CMD_RS | I40E_TD_CMD;
+
+	/* cross rx_thresh boundary is not allowed */
+	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+
+	if (txq->nb_tx_free < txq->tx_free_thresh)
+		i40e_tx_free_bufs(txq);
+
+	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+	if (unlikely(nb_pkts == 0))
+		return 0;
+
+	tx_id = txq->tx_tail;
+	txdp = &txq->tx_ring[tx_id];
+	txep = &txq->sw_ring[tx_id];
+
+	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+	n = (uint16_t)(txq->nb_tx_desc - tx_id);
+	if (nb_commit >= n) {
+		tx_backlog_entry(txep, tx_pkts, n);
+
+		vtx(txdp, tx_pkts, n - 1, flags);
+		tx_pkts += (n - 1);
+		txdp += (n - 1);
+
+		vtx1(txdp, *tx_pkts++, rs);
+
+		nb_commit = (uint16_t)(nb_commit - n);
+
+		tx_id = 0;
+		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+
+		/* avoid reach the end of ring */
+		txdp = &txq->tx_ring[tx_id];
+		txep = &txq->sw_ring[tx_id];
+	}
+
+	tx_backlog_entry(txep, tx_pkts, nb_commit);
+
+	vtx(txdp, tx_pkts, nb_commit, flags);
+
+	tx_id = (uint16_t)(tx_id + nb_commit);
+	if (tx_id > txq->tx_next_rs) {
+		txq->tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
+			rte_cpu_to_le_64(((uint64_t)I40E_TX_DESC_CMD_RS) <<
+						I40E_TXD_QW1_CMD_SHIFT);
+		txq->tx_next_rs =
+			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
+	}
+
+	txq->tx_tail = tx_id;
+
+	I40E_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
+
+	return nb_pkts;
+}
+
+uint16_t
+i40e_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
+			  uint16_t nb_pkts)
+{
+	uint16_t nb_tx = 0;
+	struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue;
+
+	while (nb_pkts) {
+		uint16_t ret, num;
+
+		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+		ret = i40e_xmit_fixed_burst_vec_avx512
+				(tx_queue, &tx_pkts[nb_tx], num);
+		nb_tx += ret;
+		nb_pkts -= ret;
+		if (ret < num)
+			break;
+	}
+
+	return nb_tx;
+}
diff --git a/drivers/net/i40e/meson.build b/drivers/net/i40e/meson.build
index 882168c256..e4eb925249 100644
--- a/drivers/net/i40e/meson.build
+++ b/drivers/net/i40e/meson.build
@@ -45,6 +45,30 @@ if arch_subdir == 'x86'
 				c_args: [cflags, '-mavx2'])
 		objs += i40e_avx2_lib.extract_objects('i40e_rxtx_vec_avx2.c')
 	endif
+
+	i40e_avx512_cpu_support = (
+		cc.get_define('__AVX512F__', args: machine_args) != '' and
+		cc.get_define('__AVX512BW__', args: machine_args) != '')
+
+	i40e_avx512_cc_support = (
+		not machine_args.contains('-mno-avx512f') and
+		cc.has_argument('-mavx512f') and
+		cc.has_argument('-mavx512bw'))
+
+	if i40e_avx512_cpu_support == true or i40e_avx512_cc_support == true
+		cflags += ['-DCC_AVX512_SUPPORT']
+		avx512_args = [cflags, '-mavx512f', '-mavx512bw']
+		if cc.has_argument('-march=skylake-avx512')
+			avx512_args += '-march=skylake-avx512'
+		endif
+		i40e_avx512_lib = static_library('i40e_avx512_lib',
+				'i40e_rxtx_vec_avx512.c',
+				dependencies: [static_rte_ethdev,
+					static_rte_kvargs, static_rte_hash],
+				include_directories: includes,
+				c_args: avx512_args)
+		objs += i40e_avx512_lib.extract_objects('i40e_rxtx_vec_avx512.c')
+	endif
 elif arch_subdir == 'ppc'
        dpdk_conf.set('RTE_LIBRTE_I40E_INC_VECTOR', 1)
        sources += files('i40e_rxtx_vec_altivec.c')
-- 
2.17.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [PATCH v3 3/3] net/i40e: optimize Tx by using AVX512
  2021-01-14  6:39 ` [dpdk-dev] [PATCH v3 " Leyi Rong
  2021-01-14  6:39   ` [dpdk-dev] [PATCH v3 1/3] net/i40e: remove devarg use-latest-supported-vec Leyi Rong
  2021-01-14  6:39   ` [dpdk-dev] [PATCH v3 2/3] net/i40e: add AVX512 vector path Leyi Rong
@ 2021-01-14  6:39   ` Leyi Rong
  2021-01-14  7:37   ` [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e Zhang, Qi Z
  3 siblings, 0 replies; 42+ messages in thread
From: Leyi Rong @ 2021-01-14  6:39 UTC (permalink / raw)
  To: qi.z.zhang, wenzhuo.lu, ferruh.yigit, bruce.richardson, beilei.xing
  Cc: dev, Leyi Rong

Optimize Tx path by using AVX512 instructions and vectorize the
tx free bufs process.

Signed-off-by: Leyi Rong <leyi.rong@intel.com>
Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/net/i40e/i40e_rxtx.c            |  19 +++
 drivers/net/i40e/i40e_rxtx.h            |   4 +
 drivers/net/i40e/i40e_rxtx_vec_avx512.c | 152 ++++++++++++++++++++----
 3 files changed, 155 insertions(+), 20 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index c99c051306..194bc3571f 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -2508,6 +2508,25 @@ i40e_tx_queue_release_mbufs(struct i40e_tx_queue *txq)
 	 *  vPMD tx will not set sw_ring's mbuf to NULL after free,
 	 *  so need to free remains more carefully.
 	 */
+#ifdef CC_AVX512_SUPPORT
+	if (dev->tx_pkt_burst == i40e_xmit_pkts_vec_avx512) {
+		struct i40e_vec_tx_entry *swr = (void *)txq->sw_ring;
+
+		i = txq->tx_next_dd - txq->tx_rs_thresh + 1;
+		if (txq->tx_tail < i) {
+			for (; i < txq->nb_tx_desc; i++) {
+				rte_pktmbuf_free_seg(swr[i].mbuf);
+				swr[i].mbuf = NULL;
+			}
+			i = 0;
+		}
+		for (; i < txq->tx_tail; i++) {
+			rte_pktmbuf_free_seg(swr[i].mbuf);
+			swr[i].mbuf = NULL;
+		}
+		return;
+	}
+#endif
 	if (dev->tx_pkt_burst == i40e_xmit_pkts_vec_avx2 ||
 			dev->tx_pkt_burst == i40e_xmit_pkts_vec) {
 		i = txq->tx_next_dd - txq->tx_rs_thresh + 1;
diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h
index 2e3e50eb79..2f55073c97 100644
--- a/drivers/net/i40e/i40e_rxtx.h
+++ b/drivers/net/i40e/i40e_rxtx.h
@@ -129,6 +129,10 @@ struct i40e_tx_entry {
 	uint16_t last_id;
 };
 
+struct i40e_vec_tx_entry {
+	struct rte_mbuf *mbuf;
+};
+
 /*
  * Structure associated with each TX queue.
  */
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index ccddc3e2d4..43e939c605 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -873,6 +873,115 @@ i40e_recv_scattered_pkts_vec_avx512(void *rx_queue,
 				rx_pkts + retval, nb_pkts);
 }
 
+static __rte_always_inline int
+i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
+{
+	struct i40e_vec_tx_entry *txep;
+	uint32_t n;
+	uint32_t i;
+	int nb_free = 0;
+	struct rte_mbuf *m, *free[RTE_I40E_TX_MAX_FREE_BUF_SZ];
+
+	/* check DD bits on threshold descriptor */
+	if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
+			rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
+			rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE))
+		return 0;
+
+	n = txq->tx_rs_thresh;
+
+	 /* first buffer to free from S/W ring is at index
+	  * tx_next_dd - (tx_rs_thresh-1)
+	  */
+	txep = (void *)txq->sw_ring;
+	txep += txq->tx_next_dd - (n - 1);
+
+	if (txq->offloads & DEV_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
+		struct rte_mempool *mp = txep[0].mbuf->pool;
+		void **cache_objs;
+		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
+				rte_lcore_id());
+
+		if (!cache || cache->len == 0)
+			goto normal;
+
+		cache_objs = &cache->objs[cache->len];
+
+		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+			goto done;
+		}
+
+		/* The cache follows the following algorithm
+		 *   1. Add the objects to the cache
+		 *   2. Anything greater than the cache min value (if it
+		 *   crosses the cache flush threshold) is flushed to the ring.
+		 */
+		/* Add elements back into the cache */
+		uint32_t copied = 0;
+		/* n is multiple of 32 */
+		while (copied < n) {
+			const __m512i a = _mm512_load_si512(&txep[copied]);
+			const __m512i b = _mm512_load_si512(&txep[copied + 8]);
+			const __m512i c = _mm512_load_si512(&txep[copied + 16]);
+			const __m512i d = _mm512_load_si512(&txep[copied + 24]);
+
+			_mm512_storeu_si512(&cache_objs[copied], a);
+			_mm512_storeu_si512(&cache_objs[copied + 8], b);
+			_mm512_storeu_si512(&cache_objs[copied + 16], c);
+			_mm512_storeu_si512(&cache_objs[copied + 24], d);
+			copied += 32;
+		}
+		cache->len += n;
+
+		if (cache->len >= cache->flushthresh) {
+			rte_mempool_ops_enqueue_bulk
+				(mp, &cache->objs[cache->size],
+				cache->len - cache->size);
+			cache->len = cache->size;
+		}
+		goto done;
+	}
+
+normal:
+	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
+	if (likely(m)) {
+		free[0] = m;
+		nb_free = 1;
+		for (i = 1; i < n; i++) {
+			rte_prefetch0(&txep[i + 3].mbuf->cacheline1);
+			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+			if (likely(m)) {
+				if (likely(m->pool == free[0]->pool)) {
+					free[nb_free++] = m;
+				} else {
+					rte_mempool_put_bulk(free[0]->pool,
+							     (void *)free,
+							     nb_free);
+					free[0] = m;
+					nb_free = 1;
+				}
+			}
+		}
+		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
+	} else {
+		for (i = 1; i < n; i++) {
+			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+			if (m)
+				rte_mempool_put(m->pool, m);
+		}
+	}
+
+done:
+	/* buffers were freed, update counters */
+	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+	txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+	if (txq->tx_next_dd >= txq->nb_tx_desc)
+		txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+
+	return txq->tx_rs_thresh;
+}
+
 static inline void
 vtx1(volatile struct i40e_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags)
 {
@@ -892,13 +1001,6 @@ vtx(volatile struct i40e_tx_desc *txdp,
 	const uint64_t hi_qw_tmpl = (I40E_TX_DESC_DTYPE_DATA |
 			((uint64_t)flags  << I40E_TXD_QW1_CMD_SHIFT));
 
-	/* if unaligned on 32-bit boundary, do one to align */
-	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
-		vtx1(txdp, *pkt, flags);
-		nb_pkts--, txdp++, pkt++;
-	}
-
-	/* do two at a time while possible, in bursts */
 	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
 		uint64_t hi_qw3 =
 			hi_qw_tmpl |
@@ -917,14 +1019,13 @@ vtx(volatile struct i40e_tx_desc *txdp,
 			((uint64_t)pkt[0]->data_len <<
 			 I40E_TXD_QW1_TX_BUF_SZ_SHIFT);
 
-		__m256i desc2_3 = _mm256_set_epi64x
+		__m512i desc0_3 =
+			_mm512_set_epi64
 			(hi_qw3, pkt[3]->buf_iova + pkt[3]->data_off,
-			hi_qw2, pkt[2]->buf_iova + pkt[2]->data_off);
-		__m256i desc0_1 = _mm256_set_epi64x
-			(hi_qw1, pkt[1]->buf_iova + pkt[1]->data_off,
+			hi_qw2, pkt[2]->buf_iova + pkt[2]->data_off,
+			hi_qw1, pkt[1]->buf_iova + pkt[1]->data_off,
 			hi_qw0, pkt[0]->buf_iova + pkt[0]->data_off);
-		_mm256_store_si256((void *)(txdp + 2), desc2_3);
-		_mm256_store_si256((void *)txdp, desc0_1);
+		_mm512_storeu_si512((void *)txdp, desc0_3);
 	}
 
 	/* do any last ones */
@@ -934,13 +1035,23 @@ vtx(volatile struct i40e_tx_desc *txdp,
 	}
 }
 
+static __rte_always_inline void
+tx_backlog_entry_avx512(struct i40e_vec_tx_entry *txep,
+			struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	int i;
+
+	for (i = 0; i < (int)nb_pkts; ++i)
+		txep[i].mbuf = tx_pkts[i];
+}
+
 static inline uint16_t
 i40e_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 				 uint16_t nb_pkts)
 {
 	struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue;
 	volatile struct i40e_tx_desc *txdp;
-	struct i40e_tx_entry *txep;
+	struct i40e_vec_tx_entry *txep;
 	uint16_t n, nb_commit, tx_id;
 	uint64_t flags = I40E_TD_CMD;
 	uint64_t rs = I40E_TX_DESC_CMD_RS | I40E_TD_CMD;
@@ -949,7 +1060,7 @@ i40e_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
 
 	if (txq->nb_tx_free < txq->tx_free_thresh)
-		i40e_tx_free_bufs(txq);
+		i40e_tx_free_bufs_avx512(txq);
 
 	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
 	if (unlikely(nb_pkts == 0))
@@ -957,13 +1068,14 @@ i40e_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	tx_id = txq->tx_tail;
 	txdp = &txq->tx_ring[tx_id];
-	txep = &txq->sw_ring[tx_id];
+	txep = (void *)txq->sw_ring;
+	txep += tx_id;
 
 	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
 
 	n = (uint16_t)(txq->nb_tx_desc - tx_id);
 	if (nb_commit >= n) {
-		tx_backlog_entry(txep, tx_pkts, n);
+		tx_backlog_entry_avx512(txep, tx_pkts, n);
 
 		vtx(txdp, tx_pkts, n - 1, flags);
 		tx_pkts += (n - 1);
@@ -977,11 +1089,11 @@ i40e_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
 
 		/* avoid reach the end of ring */
-		txdp = &txq->tx_ring[tx_id];
-		txep = &txq->sw_ring[tx_id];
+		txdp = txq->tx_ring;
+		txep = (void *)txq->sw_ring;
 	}
 
-	tx_backlog_entry(txep, tx_pkts, nb_commit);
+	tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);
 
 	vtx(txdp, tx_pkts, nb_commit, flags);
 
-- 
2.17.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-14  6:39 ` [dpdk-dev] [PATCH v3 " Leyi Rong
                     ` (2 preceding siblings ...)
  2021-01-14  6:39   ` [dpdk-dev] [PATCH v3 3/3] net/i40e: optimize Tx by using AVX512 Leyi Rong
@ 2021-01-14  7:37   ` Zhang, Qi Z
  2021-01-17 11:26     ` Odi Assli
  3 siblings, 1 reply; 42+ messages in thread
From: Zhang, Qi Z @ 2021-01-14  7:37 UTC (permalink / raw)
  To: Rong, Leyi, Lu, Wenzhuo, Yigit, Ferruh, Richardson, Bruce, Xing, Beilei
  Cc: dev



> -----Original Message-----
> From: Rong, Leyi <leyi.rong@intel.com>
> Sent: Thursday, January 14, 2021 2:40 PM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>; Lu, Wenzhuo <wenzhuo.lu@intel.com>;
> Yigit, Ferruh <ferruh.yigit@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Xing, Beilei <beilei.xing@intel.com>
> Cc: dev@dpdk.org; Rong, Leyi <leyi.rong@intel.com>
> Subject: [PATCH v3 0/3] AVX512 vPMD on i40e
> 
> This patchset aims to support AVX512 vPMD on i40e.
> And the changes are only target to AVX512 vector path.
> 
> ---
> v3:
> - Extract get_avx_supported() to get the proper vector data path to choose.
> 
> v2:
> - Add return value check on rte_mempool_default_cache().
> 
> 
> Leyi Rong (3):
>   net/i40e: remove devarg use-latest-supported-vec
>   net/i40e: add AVX512 vector path
>   net/i40e: optimize Tx by using AVX512
> 
>  doc/guides/nics/i40e.rst                |    9 -
>  drivers/net/i40e/i40e_ethdev.c          |   63 +-
>  drivers/net/i40e/i40e_ethdev.h          |    3 -
>  drivers/net/i40e/i40e_rxtx.c            |  196 ++--
>  drivers/net/i40e/i40e_rxtx.h            |   13 +
>  drivers/net/i40e/i40e_rxtx_vec_avx512.c | 1136 +++++++++++++++++++++++
>  drivers/net/i40e/meson.build            |   24 +
>  7 files changed, 1301 insertions(+), 143 deletions(-)  create mode 100644
> drivers/net/i40e/i40e_rxtx_vec_avx512.c
> 
> --
> 2.17.1


Applied to dpdk-next-net-intel after reverting v1.

Thanks
Qi

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/3] net/i40e: remove devarg use-latest-supported-vec
  2021-01-14  6:39   ` [dpdk-dev] [PATCH v3 1/3] net/i40e: remove devarg use-latest-supported-vec Leyi Rong
@ 2021-01-15 13:36     ` Ferruh Yigit
  0 siblings, 0 replies; 42+ messages in thread
From: Ferruh Yigit @ 2021-01-15 13:36 UTC (permalink / raw)
  To: Leyi Rong, qi.z.zhang, wenzhuo.lu, bruce.richardson, beilei.xing; +Cc: dev

On 1/14/2021 6:39 AM, Leyi Rong wrote:
> As eal parameter --force-max-simd-bitwidth is already introduced,
> to make it more clear when setting rx/tx function, remove
> devarg use-latest-supported-vec support.
> 
> Signed-off-by: Leyi Rong <leyi.rong@intel.com>
> Acked-by: Wenzhuo Lu <wenzhuo.lu@intel.com>

<...>

> -static eth_rx_burst_t
> -i40e_get_recommend_rx_vec(bool scatter)
> +static inline bool
> +get_avx_supported(bool request_avx512)
>   {
> -#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT)
> -	/*
> -	 * since AVX frequency can be different to base frequency, limit
> -	 * use of AVX2 version to later plaforms, not all those that could
> -	 * theoretically run it.
> -	 */
> -	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) &&
> -			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
> -		return scatter ? i40e_recv_scattered_pkts_vec_avx2 :
> -				 i40e_recv_pkts_vec_avx2;
> +#ifdef RTE_ARCH_X86
> +	if (request_avx512) {
> +		if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512 &&
> +		rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
> +		rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1)
> +#ifdef CC_AVX512_SUPPORT
> +			return true;
> +#else
> +		PMD_DRV_LOG(NOTICE,
> +			"AVX512 is not supported in build env");
> +		return false;
> +#endif
> +	} else {
> +		if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 &&
> +		rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 &&
> +		rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1)
> +#ifdef CC_AVX2_SUPPORT
> +			return true;
> +#else
> +		PMD_DRV_LOG(NOTICE,
> +			"AVX2 is not supported in build env");
> +		return false;
>   #endif
> -	return scatter ? i40e_recv_scattered_pkts_vec :
> -			 i40e_recv_pkts_vec;
> +	}
> +#endif /* RTE_ARCH_X86 */
> +
> +	return false;
>   }

This was still causing a build error on non-x86, because of the unused
'request_avx512' variable; updated as below in next-net:

  diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
  index a4661cdd8377..ce2b0bc12ed9 100644
  --- a/drivers/net/i40e/i40e_rxtx.c
  +++ b/drivers/net/i40e/i40e_rxtx.c
  @@ -3122,6 +3122,8 @@ get_avx_supported(bool request_avx512)
                  return false;
   #endif
          }
  +#else
  +       RTE_SET_USED(request_avx512);
   #endif /* RTE_ARCH_X86 */

          return false;

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-14  7:37   ` [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e Zhang, Qi Z
@ 2021-01-17 11:26     ` Odi Assli
  2021-01-18 13:58       ` Rong, Leyi
  0 siblings, 1 reply; 42+ messages in thread
From: Odi Assli @ 2021-01-17 11:26 UTC (permalink / raw)
  To: Zhang, Qi Z, Rong, Leyi, Lu, Wenzhuo, Yigit, Ferruh, Richardson,
	Bruce, Xing, Beilei, pallavi.kadam, Ranjit Menon
  Cc: dev, Tal Shnaiderman, NBU-Contact-Thomas Monjalon,
	Raslan Darawsheh, Ali Alnubani

> > Subject: [PATCH v3 0/3] AVX512 vPMD on i40e
> >
> > This patchset aims to support AVX512 vPMD on i40e.
> > And the changes are only target to AVX512 vector path.
> >
> > ---
> > v3:
> > - Extract get_avx_supported() to get the proper vector data path to
> choose.
> >
> > v2:
> > - Add return value check on rte_mempool_default_cache().
> >
> >
> > Leyi Rong (3):
> >   net/i40e: remove devarg use-latest-supported-vec
> >   net/i40e: add AVX512 vector path
> >   net/i40e: optimize Tx by using AVX512
> >
> >  doc/guides/nics/i40e.rst                |    9 -
> >  drivers/net/i40e/i40e_ethdev.c          |   63 +-
> >  drivers/net/i40e/i40e_ethdev.h          |    3 -
> >  drivers/net/i40e/i40e_rxtx.c            |  196 ++--
> >  drivers/net/i40e/i40e_rxtx.h            |   13 +
> >  drivers/net/i40e/i40e_rxtx_vec_avx512.c | 1136
> +++++++++++++++++++++++
> >  drivers/net/i40e/meson.build            |   24 +
> >  7 files changed, 1301 insertions(+), 143 deletions(-)  create mode
> > 100644 drivers/net/i40e/i40e_rxtx_vec_avx512.c
> >
> > --
> > 2.17.1
> 
> 
> Applied to dpdk-next-net-intel after revert v1.
> 
> Thanks
> Qi

Hi,
This patch series broke i40e compilation on Windows, failing on an undefined __m_prefetchw function.
Please see the details in bug 619 [1].

[1]: https://bugs.dpdk.org/show_bug.cgi?id=619



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-17 11:26     ` Odi Assli
@ 2021-01-18 13:58       ` Rong, Leyi
  2021-01-18 14:24         ` Ferruh Yigit
  0 siblings, 1 reply; 42+ messages in thread
From: Rong, Leyi @ 2021-01-18 13:58 UTC (permalink / raw)
  To: Odi Assli, Zhang, Qi Z, Lu, Wenzhuo, Yigit, Ferruh, Richardson,
	Bruce, Xing, Beilei, Kadam, Pallavi, Menon, Ranjit
  Cc: dev, Tal Shnaiderman, NBU-Contact-Thomas Monjalon,
	Raslan Darawsheh, Ali Alnubani


> -----Original Message-----
> From: Odi Assli <odia@nvidia.com>
> Sent: Sunday, January 17, 2021 7:26 PM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>; Rong, Leyi <leyi.rong@intel.com>; Lu,
> Wenzhuo <wenzhuo.lu@intel.com>; Yigit, Ferruh <ferruh.yigit@intel.com>;
> Richardson, Bruce <bruce.richardson@intel.com>; Xing, Beilei
> <beilei.xing@intel.com>; Kadam, Pallavi <pallavi.kadam@intel.com>; Menon,
> Ranjit <ranjit.menon@intel.com>
> Cc: dev@dpdk.org; Tal Shnaiderman <talshn@nvidia.com>; NBU-Contact-
> Thomas Monjalon <thomas@monjalon.net>; Raslan Darawsheh
> <rasland@nvidia.com>; Ali Alnubani <alialnu@nvidia.com>
> Subject: RE: [PATCH v3 0/3] AVX512 vPMD on i40e
> 
> > > Subject: [PATCH v3 0/3] AVX512 vPMD on i40e
> > >
> > > This patchset aims to support AVX512 vPMD on i40e.
> > > And the changes are only target to AVX512 vector path.
> > >
> > > ---
> > > v3:
> > > - Extract get_avx_supported() to get the proper vector data path to
> > choose.
> > >
> > > v2:
> > > - Add return value check on rte_mempool_default_cache().
> > >
> > >
> > > Leyi Rong (3):
> > >   net/i40e: remove devarg use-latest-supported-vec
> > >   net/i40e: add AVX512 vector path
> > >   net/i40e: optimize Tx by using AVX512
> > >
> > >  doc/guides/nics/i40e.rst                |    9 -
> > >  drivers/net/i40e/i40e_ethdev.c          |   63 +-
> > >  drivers/net/i40e/i40e_ethdev.h          |    3 -
> > >  drivers/net/i40e/i40e_rxtx.c            |  196 ++--
> > >  drivers/net/i40e/i40e_rxtx.h            |   13 +
> > >  drivers/net/i40e/i40e_rxtx_vec_avx512.c | 1136
> > +++++++++++++++++++++++
> > >  drivers/net/i40e/meson.build            |   24 +
> > >  7 files changed, 1301 insertions(+), 143 deletions(-)  create mode
> > > 100644 drivers/net/i40e/i40e_rxtx_vec_avx512.c
> > >
> > > --
> > > 2.17.1
> >
> >
> > Applied to dpdk-next-net-intel after revert v1.
> >
> > Thanks
> > Qi
> 
> Hi,
> This patch series broke i40e compilation on windows with failure in undefined
> __m_prefetchw function Please see details in bug: 619 [1].
> 
> [1]: https://bugs.dpdk.org/show_bug.cgi?id=619
> 

Hi Ferruh,

Could you help squash my fix patch into the patch series? I've verified that it works locally.
 
Hi Assli,

Could you help check again once Ferruh has squashed the fix patch into next-net?

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-18 13:58       ` Rong, Leyi
@ 2021-01-18 14:24         ` Ferruh Yigit
  2021-01-18 14:53           ` Odi Assli
  0 siblings, 1 reply; 42+ messages in thread
From: Ferruh Yigit @ 2021-01-18 14:24 UTC (permalink / raw)
  To: Rong, Leyi, Odi Assli, Zhang, Qi Z, Lu, Wenzhuo, Richardson,
	Bruce, Xing, Beilei, Kadam, Pallavi, Menon, Ranjit
  Cc: dev, Tal Shnaiderman, NBU-Contact-Thomas Monjalon,
	Raslan Darawsheh, Ali Alnubani

On 1/18/2021 1:58 PM, Rong, Leyi wrote:
> 
>> -----Original Message-----
>> From: Odi Assli <odia@nvidia.com>
>> Sent: Sunday, January 17, 2021 7:26 PM
>> To: Zhang, Qi Z <qi.z.zhang@intel.com>; Rong, Leyi <leyi.rong@intel.com>; Lu,
>> Wenzhuo <wenzhuo.lu@intel.com>; Yigit, Ferruh <ferruh.yigit@intel.com>;
>> Richardson, Bruce <bruce.richardson@intel.com>; Xing, Beilei
>> <beilei.xing@intel.com>; Kadam, Pallavi <pallavi.kadam@intel.com>; Menon,
>> Ranjit <ranjit.menon@intel.com>
>> Cc: dev@dpdk.org; Tal Shnaiderman <talshn@nvidia.com>; NBU-Contact-
>> Thomas Monjalon <thomas@monjalon.net>; Raslan Darawsheh
>> <rasland@nvidia.com>; Ali Alnubani <alialnu@nvidia.com>
>> Subject: RE: [PATCH v3 0/3] AVX512 vPMD on i40e
>>
>>>> Subject: [PATCH v3 0/3] AVX512 vPMD on i40e
>>>>
>>>> This patchset aims to support AVX512 vPMD on i40e.
>>>> And the changes are only target to AVX512 vector path.
>>>>
>>>> ---
>>>> v3:
>>>> - Extract get_avx_supported() to get the proper vector data path to
>>> choose.
>>>>
>>>> v2:
>>>> - Add return value check on rte_mempool_default_cache().
>>>>
>>>>
>>>> Leyi Rong (3):
>>>>    net/i40e: remove devarg use-latest-supported-vec
>>>>    net/i40e: add AVX512 vector path
>>>>    net/i40e: optimize Tx by using AVX512
>>>>
>>>>   doc/guides/nics/i40e.rst                |    9 -
>>>>   drivers/net/i40e/i40e_ethdev.c          |   63 +-
>>>>   drivers/net/i40e/i40e_ethdev.h          |    3 -
>>>>   drivers/net/i40e/i40e_rxtx.c            |  196 ++--
>>>>   drivers/net/i40e/i40e_rxtx.h            |   13 +
>>>>   drivers/net/i40e/i40e_rxtx_vec_avx512.c | 1136
>>> +++++++++++++++++++++++
>>>>   drivers/net/i40e/meson.build            |   24 +
>>>>   7 files changed, 1301 insertions(+), 143 deletions(-)  create mode
>>>> 100644 drivers/net/i40e/i40e_rxtx_vec_avx512.c
>>>>
>>>> --
>>>> 2.17.1
>>>
>>>
>>> Applied to dpdk-next-net-intel after revert v1.
>>>
>>> Thanks
>>> Qi
>>
>> Hi,
>> This patch series broke i40e compilation on windows with failure in undefined
>> __m_prefetchw function Please see details in bug: 619 [1].
>>
>> [1]: https://bugs.dpdk.org/show_bug.cgi?id=619
>>
> 
> Hi Ferruh,
> 
> Could you help to squash my fix patch into the patch series? As I've verified it works locally.
> 
> Hi Assli,
> 
> Could you help to check after Ferruh squashed the fix patch into the next-net?
> 

Hi Assli,

The other way around is perhaps safer; can you please test with the patch:
https://patches.dpdk.org/patch/86804/

If it is good, I can squash it in the next-net.

Thanks,
ferruh

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-18 14:24         ` Ferruh Yigit
@ 2021-01-18 14:53           ` Odi Assli
  2021-01-18 16:36             ` Ferruh Yigit
  0 siblings, 1 reply; 42+ messages in thread
From: Odi Assli @ 2021-01-18 14:53 UTC (permalink / raw)
  To: Ferruh Yigit, Rong, Leyi, Zhang, Qi Z, Lu, Wenzhuo, Richardson,
	Bruce, Xing, Beilei, Kadam, Pallavi, Menon, Ranjit
  Cc: dev, Tal Shnaiderman, NBU-Contact-Thomas Monjalon,
	Raslan Darawsheh, Ali Alnubani

> Subject: Re: [PATCH v3 0/3] AVX512 vPMD on i40e
> 
> External email: Use caution opening links or attachments
> 
> 
> On 1/18/2021 1:58 PM, Rong, Leyi wrote:
> >
> >> -----Original Message-----
> >> From: Odi Assli <odia@nvidia.com>
> >> Sent: Sunday, January 17, 2021 7:26 PM
> >> To: Zhang, Qi Z <qi.z.zhang@intel.com>; Rong, Leyi
> >> <leyi.rong@intel.com>; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Yigit,
> >> Ferruh <ferruh.yigit@intel.com>; Richardson, Bruce
> >> <bruce.richardson@intel.com>; Xing, Beilei <beilei.xing@intel.com>;
> >> Kadam, Pallavi <pallavi.kadam@intel.com>; Menon, Ranjit
> >> <ranjit.menon@intel.com>
> >> Cc: dev@dpdk.org; Tal Shnaiderman <talshn@nvidia.com>; NBU-Contact-
> >> Thomas Monjalon <thomas@monjalon.net>; Raslan Darawsheh
> >> <rasland@nvidia.com>; Ali Alnubani <alialnu@nvidia.com>
> >> Subject: RE: [PATCH v3 0/3] AVX512 vPMD on i40e
> >>
> >>>> Subject: [PATCH v3 0/3] AVX512 vPMD on i40e
> >>>>
> >>>> This patchset aims to support AVX512 vPMD on i40e.
> >>>> And the changes are only target to AVX512 vector path.
> >>>>
> >>>> ---
> >>>> v3:
> >>>> - Extract get_avx_supported() to get the proper vector data path to
> >>> choose.
> >>>>
> >>>> v2:
> >>>> - Add return value check on rte_mempool_default_cache().
> >>>>
> >>>>
> >>>> Leyi Rong (3):
> >>>>    net/i40e: remove devarg use-latest-supported-vec
> >>>>    net/i40e: add AVX512 vector path
> >>>>    net/i40e: optimize Tx by using AVX512
> >>>>
> >>>>   doc/guides/nics/i40e.rst                |    9 -
> >>>>   drivers/net/i40e/i40e_ethdev.c          |   63 +-
> >>>>   drivers/net/i40e/i40e_ethdev.h          |    3 -
> >>>>   drivers/net/i40e/i40e_rxtx.c            |  196 ++--
> >>>>   drivers/net/i40e/i40e_rxtx.h            |   13 +
> >>>>   drivers/net/i40e/i40e_rxtx_vec_avx512.c | 1136
> >>> +++++++++++++++++++++++
> >>>>   drivers/net/i40e/meson.build            |   24 +
> >>>>   7 files changed, 1301 insertions(+), 143 deletions(-)  create
> >>>> mode
> >>>> 100644 drivers/net/i40e/i40e_rxtx_vec_avx512.c
> >>>>
> >>>> --
> >>>> 2.17.1
> >>>
> >>>
> >>> Applied to dpdk-next-net-intel after revert v1.
> >>>
> >>> Thanks
> >>> Qi
> >>
> >> Hi,
> >> This patch series broke i40e compilation on windows with failure in
> >> undefined __m_prefetchw function Please see details in bug: 619 [1].
> >>
> >> [1]:
> >>
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fbug
> >>
> s.dpdk.org%2Fshow_bug.cgi%3Fid%3D619&amp;data=04%7C01%7Codia%40
> nvidia
> >>
> .com%7C4b212e1ae3d045b6264108d8bbbd7703%7C43083d15727340c1b7db3
> 9efd9c
> >>
> cc17a%7C0%7C0%7C637465769725929951%7CUnknown%7CTWFpbGZsb3d8e
> yJWIjoiMC
> >>
> 4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&
> amp;s
> >>
> data=oSmKjG6Za9DCSthbn72YfsQ1JsrUoaVSUCLNt7c5R4o%3D&amp;reserve
> d=0
> >>
> >
> > Hi Ferruh,
> >
> > Could you help to squash my fix patch into the patch series? As I've verified
> it works locally.
> >
> > Hi Assli,
> >
> > Could you help to check after Ferruh squashed the fix patch into the next-
> net?
> >
> 
> Hi Assli,
> 
> The other-way around is safer perhaps, can you please test with the patch:
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpatc
> hes.dpdk.org%2Fpatch%2F86804%2F&amp;data=04%7C01%7Codia%40nvidia
> .com%7C4b212e1ae3d045b6264108d8bbbd7703%7C43083d15727340c1b7db3
> 9efd9ccc17a%7C0%7C0%7C637465769725939943%7CUnknown%7CTWFpbGZs
> b3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn
> 0%3D%7C1000&amp;sdata=ZQU7m8zNXw%2BKxeG6AcveYp9Dpa4k%2BdeO
> 2hK5DiBT3lg%3D&amp;reserved=0
> 
> If it is good, I can squash it in the next-net.
> 
> Thanks,
> ferruh

Hi Guys,

I tested it on my side and it works.
Can you please update Bugzilla bug 619?


Regards,
Odi. 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-18 14:53           ` Odi Assli
@ 2021-01-18 16:36             ` Ferruh Yigit
  2021-01-19 13:46               ` Ali Alnubani
  0 siblings, 1 reply; 42+ messages in thread
From: Ferruh Yigit @ 2021-01-18 16:36 UTC (permalink / raw)
  To: Odi Assli, Rong, Leyi, Zhang, Qi Z, Lu, Wenzhuo, Richardson,
	Bruce, Xing, Beilei, Kadam, Pallavi, Menon, Ranjit
  Cc: dev, Tal Shnaiderman, NBU-Contact-Thomas Monjalon,
	Raslan Darawsheh, Ali Alnubani

On 1/18/2021 2:53 PM, Odi Assli wrote:
>> Subject: Re: [PATCH v3 0/3] AVX512 vPMD on i40e
>>
>> External email: Use caution opening links or attachments
>>
>>
>> On 1/18/2021 1:58 PM, Rong, Leyi wrote:
>>>
>>>> -----Original Message-----
>>>> From: Odi Assli <odia@nvidia.com>
>>>> Sent: Sunday, January 17, 2021 7:26 PM
>>>> To: Zhang, Qi Z <qi.z.zhang@intel.com>; Rong, Leyi
>>>> <leyi.rong@intel.com>; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Yigit,
>>>> Ferruh <ferruh.yigit@intel.com>; Richardson, Bruce
>>>> <bruce.richardson@intel.com>; Xing, Beilei <beilei.xing@intel.com>;
>>>> Kadam, Pallavi <pallavi.kadam@intel.com>; Menon, Ranjit
>>>> <ranjit.menon@intel.com>
>>>> Cc: dev@dpdk.org; Tal Shnaiderman <talshn@nvidia.com>; NBU-Contact-
>>>> Thomas Monjalon <thomas@monjalon.net>; Raslan Darawsheh
>>>> <rasland@nvidia.com>; Ali Alnubani <alialnu@nvidia.com>
>>>> Subject: RE: [PATCH v3 0/3] AVX512 vPMD on i40e
>>>>
>>>>>> Subject: [PATCH v3 0/3] AVX512 vPMD on i40e
>>>>>>
>>>>>> This patchset aims to support AVX512 vPMD on i40e.
>>>>>> And the changes are only target to AVX512 vector path.
>>>>>>
>>>>>> ---
>>>>>> v3:
>>>>>> - Extract get_avx_supported() to get the proper vector data path to
>>>>> choose.
>>>>>>
>>>>>> v2:
>>>>>> - Add return value check on rte_mempool_default_cache().
>>>>>>
>>>>>>
>>>>>> Leyi Rong (3):
>>>>>>     net/i40e: remove devarg use-latest-supported-vec
>>>>>>     net/i40e: add AVX512 vector path
>>>>>>     net/i40e: optimize Tx by using AVX512
>>>>>>
>>>>>>    doc/guides/nics/i40e.rst                |    9 -
>>>>>>    drivers/net/i40e/i40e_ethdev.c          |   63 +-
>>>>>>    drivers/net/i40e/i40e_ethdev.h          |    3 -
>>>>>>    drivers/net/i40e/i40e_rxtx.c            |  196 ++--
>>>>>>    drivers/net/i40e/i40e_rxtx.h            |   13 +
>>>>>>    drivers/net/i40e/i40e_rxtx_vec_avx512.c | 1136
>>>>> +++++++++++++++++++++++
>>>>>>    drivers/net/i40e/meson.build            |   24 +
>>>>>>    7 files changed, 1301 insertions(+), 143 deletions(-)  create
>>>>>> mode
>>>>>> 100644 drivers/net/i40e/i40e_rxtx_vec_avx512.c
>>>>>>
>>>>>> --
>>>>>> 2.17.1
>>>>>
>>>>>
>>>>> Applied to dpdk-next-net-intel after revert v1.
>>>>>
>>>>> Thanks
>>>>> Qi
>>>>
>>>> Hi,
>>>> This patch series broke i40e compilation on windows with failure in
>>>> undefined __m_prefetchw function Please see details in bug: 619 [1].
>>>>
>>>> [1]:
>>>>
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fbug
>>>>
>> s.dpdk.org%2Fshow_bug.cgi%3Fid%3D619&amp;data=04%7C01%7Codia%40
>> nvidia
>>>>
>> .com%7C4b212e1ae3d045b6264108d8bbbd7703%7C43083d15727340c1b7db3
>> 9efd9c
>>>>
>> cc17a%7C0%7C0%7C637465769725929951%7CUnknown%7CTWFpbGZsb3d8e
>> yJWIjoiMC
>>>>
>> 4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&
>> amp;s
>>>>
>> data=oSmKjG6Za9DCSthbn72YfsQ1JsrUoaVSUCLNt7c5R4o%3D&amp;reserve
>> d=0
>>>>
>>>
>>> Hi Ferruh,
>>>
>>> Could you help to squash my fix patch into the patch series? As I've verified
>> it works locally.
>>>
>>> Hi Assli,
>>>
>>> Could you help to check after Ferruh squashed the fix patch into the next-
>> net?
>>>
>>
>> Hi Assli,
>>
>> The other-way around is safer perhaps, can you please test with the patch:
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpatc
>> hes.dpdk.org%2Fpatch%2F86804%2F&amp;data=04%7C01%7Codia%40nvidia
>> .com%7C4b212e1ae3d045b6264108d8bbbd7703%7C43083d15727340c1b7db3
>> 9efd9ccc17a%7C0%7C0%7C637465769725939943%7CUnknown%7CTWFpbGZs
>> b3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn
>> 0%3D%7C1000&amp;sdata=ZQU7m8zNXw%2BKxeG6AcveYp9Dpa4k%2BdeO
>> 2hK5DiBT3lg%3D&amp;reserved=0
>>
>> If it is good, I can squash it in the next-net.
>>
>> Thanks,
>> ferruh
> 
> Hi Guys,
> 
> I tested it in my side and it works

Thanks Assli, Leyi, I will proceed with the patch.

> Can you please the Bugzilla bug 619?
> 


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-18 16:36             ` Ferruh Yigit
@ 2021-01-19 13:46               ` Ali Alnubani
  2021-01-20  6:25                 ` Tal Shnaiderman
  0 siblings, 1 reply; 42+ messages in thread
From: Ali Alnubani @ 2021-01-19 13:46 UTC (permalink / raw)
  To: Ferruh Yigit, Odi Assli, Rong,  Leyi, Zhang, Qi Z, Lu, Wenzhuo,
	Richardson, Bruce, Xing, Beilei, Kadam,  Pallavi, Menon, Ranjit
  Cc: dev, Tal Shnaiderman, NBU-Contact-Thomas Monjalon, Raslan Darawsheh

Hi,

> > Hi Guys,
> >
> > I tested it in my side and it works
> 
> Thanks Assli, Leyi, I will proceed with the patch.
> 
> > Can you please the Bugzilla bug 619?
> >

The patch "add AVX512 vector path" also caused a build failure when cross compiling on Linux using mingw, and it's still reproducing in next-net (517969c95).

```
$ meson --werror --buildtype=debugoptimized --cross-file config/x86/cross-mingw -Dexamples=helloworld build  && ninja-build -C build -j32

[221/232] Generating symbol file lib/librte_mbuf-21.dll.p/librte_mbuf-21.dll.symbols
[222/232] Generating symbol file lib/librte_hash-21.dll.p/librte_hash-21.dll.symbols
[223/232] Linking target lib/librte_net-21.dll
[224/232] Compiling C object drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj
FAILED: drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj
...
drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj.d -o drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj -c ../../root/dpdk/drivers/net/i40e/i40e_rxtx_vec_avx512.c
{standard input}: Assembler messages:
{standard input}:112: Error: invalid register for .seh_savexmm
{standard input}:114: Error: invalid register for .seh_savexmm
...
...
{standard input}:25351: Error: invalid register for .seh_savexmm
{standard input}:25352: Error: invalid register for .seh_savexmm
[225/232] Generating symbol file lib/librte_net-21.dll.p/librte_net-21.dll.symbols
ninja: build stopped: subcommand failed.
```

OS: Fedora 32
Meson: 0.55.3
MinGW: Fedora MinGW 9.2.1-6.fc32

- Ali

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-19 13:46               ` Ali Alnubani
@ 2021-01-20  6:25                 ` Tal Shnaiderman
  2021-01-20  8:36                   ` David Marchand
  0 siblings, 1 reply; 42+ messages in thread
From: Tal Shnaiderman @ 2021-01-20  6:25 UTC (permalink / raw)
  To: Ali Alnubani, Ferruh Yigit, Odi Assli, Rong, Leyi, Zhang, Qi Z,
	Lu, Wenzhuo, Richardson,  Bruce, Xing, Beilei, Kadam, Pallavi,
	Menon, Ranjit
  Cc: dev, NBU-Contact-Thomas Monjalon, Raslan Darawsheh

> Subject: RE: [PATCH v3 0/3] AVX512 vPMD on i40e
> 
> Hi,
> 
> > > Hi Guys,
> > >
> > > I tested it in my side and it works
> >
> > Thanks Assli, Leyi, I will proceed with the patch.
> >
> > > Can you please the Bugzilla bug 619?
> > >
> 
> The patch "add AVX512 vector path" also caused a build failure when cross
> compiling on Linux using mingw, and it's still reproducing in next-net
> (517969c95).
> 
> ```
> $ meson --werror --buildtype=debugoptimized --cross-file config/x86/cross-
> mingw -Dexamples=helloworld build  && ninja-build -C build -j32
> 
> [221/232] Generating symbol file lib/librte_mbuf-21.dll.p/librte_mbuf-
> 21.dll.symbols
> [222/232] Generating symbol file lib/librte_hash-21.dll.p/librte_hash-
> 21.dll.symbols
> [223/232] Linking target lib/librte_net-21.dll [224/232] Compiling C object
> drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj
> FAILED: drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj
> ...
> drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj.d -o
> drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj -c
> ../../root/dpdk/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> {standard input}: Assembler messages:
> {standard input}:112: Error: invalid register for .seh_savexmm {standard
> input}:114: Error: invalid register for .seh_savexmm ...
> ...
> {standard input}:25351: Error: invalid register for .seh_savexmm {standard
> input}:25352: Error: invalid register for .seh_savexmm [225/232] Generating
> symbol file lib/librte_net-21.dll.p/librte_net-21.dll.symbols
> ninja: build stopped: subcommand failed.
> ```
> 
> OS: Fedora 32
> Meson: 0.55.3
> MinGW: Fedora MinGW 9.2.1-6.fc32
> 
> - Ali

Those errors were detected in the CI tests for this patch [1]; we should pay more attention to those now that 2 PMDs are supported on Windows.

[1]: https://lab.dpdk.org/results/dashboard/patchsets/15164/


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-20  6:25                 ` Tal Shnaiderman
@ 2021-01-20  8:36                   ` David Marchand
  2021-01-20  9:18                     ` Ferruh Yigit
  2021-01-20  9:23                     ` Thomas Monjalon
  0 siblings, 2 replies; 42+ messages in thread
From: David Marchand @ 2021-01-20  8:36 UTC (permalink / raw)
  To: Tal Shnaiderman
  Cc: Ali Alnubani, Ferruh Yigit, Odi Assli, Rong, Leyi, Zhang, Qi Z,
	Lu, Wenzhuo, Richardson, Bruce, Xing, Beilei, Kadam, Pallavi,
	Menon, Ranjit, dev, NBU-Contact-Thomas Monjalon,
	Raslan Darawsheh

On Wed, Jan 20, 2021 at 7:26 AM Tal Shnaiderman <talshn@nvidia.com> wrote:
> > The patch "add AVX512 vector path" also caused a build failure when cross
> > compiling on Linux using mingw, and it's still reproducing in next-net
> > (517969c95).
> >
> > ```
> > $ meson --werror --buildtype=debugoptimized --cross-file config/x86/cross-
> > mingw -Dexamples=helloworld build  && ninja-build -C build -j32
> >
> > [221/232] Generating symbol file lib/librte_mbuf-21.dll.p/librte_mbuf-
> > 21.dll.symbols
> > [222/232] Generating symbol file lib/librte_hash-21.dll.p/librte_hash-
> > 21.dll.symbols
> > [223/232] Linking target lib/librte_net-21.dll [224/232] Compiling C object
> > drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj
> > FAILED: drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj
> > ...
> > drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj.d -o
> > drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj -c
> > ../../root/dpdk/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> > {standard input}: Assembler messages:
> > {standard input}:112: Error: invalid register for .seh_savexmm {standard
> > input}:114: Error: invalid register for .seh_savexmm ...
> > ...
> > {standard input}:25351: Error: invalid register for .seh_savexmm {standard
> > input}:25352: Error: invalid register for .seh_savexmm [225/232] Generating
> > symbol file lib/librte_net-21.dll.p/librte_net-21.dll.symbols
> > ninja: build stopped: subcommand failed.
> > ```
> >
> > OS: Fedora 32
> > Meson: 0.55.3
> > MinGW: Fedora MinGW 9.2.1-6.fc32
> >
> > - Ali
>
> Those errors were detected in the CI tests for this patch [1], we should pay more attention to those now that 2 PMDs are supported on Windows.
>
> [1]: https://lab.dpdk.org/results/dashboard/patchsets/15164/

I won't grmbl about how CI reports are not being looked at (or maybe I
just did :)).

This simple patch seems to work for me:

diff --git a/drivers/net/i40e/meson.build b/drivers/net/i40e/meson.build
index c0acdf4fd4..c9a1a50407 100644
--- a/drivers/net/i40e/meson.build
+++ b/drivers/net/i40e/meson.build
@@ -54,7 +54,7 @@ if arch_subdir == 'x86'
                cc.has_argument('-mavx512f') and
                cc.has_argument('-mavx512bw'))

-       if i40e_avx512_cpu_support == true or i40e_avx512_cc_support == true
+       if not is_windows and (i40e_avx512_cpu_support == true or
i40e_avx512_cc_support == true)
                cflags += ['-DCC_AVX512_SUPPORT']
                avx512_args = [cflags, '-mavx512f', '-mavx512bw']
                if cc.has_argument('-march=skylake-avx512')


If nobody has a better fix, I'll send it later.

-- 
David Marchand


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-20  8:36                   ` David Marchand
@ 2021-01-20  9:18                     ` Ferruh Yigit
  2021-01-20  9:23                     ` Thomas Monjalon
  1 sibling, 0 replies; 42+ messages in thread
From: Ferruh Yigit @ 2021-01-20  9:18 UTC (permalink / raw)
  To: David Marchand, Tal Shnaiderman
  Cc: Ali Alnubani, Odi Assli, Rong, Leyi, Zhang, Qi Z, Lu, Wenzhuo,
	Richardson, Bruce, Xing, Beilei, Kadam, Pallavi, Menon, Ranjit,
	dev, NBU-Contact-Thomas Monjalon, Raslan Darawsheh

On 1/20/2021 8:36 AM, David Marchand wrote:
> On Wed, Jan 20, 2021 at 7:26 AM Tal Shnaiderman <talshn@nvidia.com> wrote:
>>> The patch "add AVX512 vector path" also caused a build failure when cross
>>> compiling on Linux using mingw, and it's still reproducing in next-net
>>> (517969c95).
>>>
>>> ```
>>> $ meson --werror --buildtype=debugoptimized --cross-file config/x86/cross-
>>> mingw -Dexamples=helloworld build  && ninja-build -C build -j32
>>>
>>> [221/232] Generating symbol file lib/librte_mbuf-21.dll.p/librte_mbuf-
>>> 21.dll.symbols
>>> [222/232] Generating symbol file lib/librte_hash-21.dll.p/librte_hash-
>>> 21.dll.symbols
>>> [223/232] Linking target lib/librte_net-21.dll [224/232] Compiling C object
>>> drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj
>>> FAILED: drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj
>>> ...
>>> drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj.d -o
>>> drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj -c
>>> ../../root/dpdk/drivers/net/i40e/i40e_rxtx_vec_avx512.c
>>> {standard input}: Assembler messages:
>>> {standard input}:112: Error: invalid register for .seh_savexmm {standard
>>> input}:114: Error: invalid register for .seh_savexmm ...
>>> ...
>>> {standard input}:25351: Error: invalid register for .seh_savexmm {standard
>>> input}:25352: Error: invalid register for .seh_savexmm [225/232] Generating
>>> symbol file lib/librte_net-21.dll.p/librte_net-21.dll.symbols
>>> ninja: build stopped: subcommand failed.
>>> ```
>>>
>>> OS: Fedora 32
>>> Meson: 0.55.3
>>> MinGW: Fedora MinGW 9.2.1-6.fc32
>>>
>>> - Ali
>>
>> Those errors were detected in the CI tests for this patch [1], we should pay more attention to those now that 2 PMDs are supported on Windows.
>>
>> [1]: https://lab.dpdk.org/results/dashboard/patchsets/15164/
> 
> I won't grmbl about how CI reports are not being looked at (or maybe I
> just did :)).
> 

Noted and agreed, I will be more careful next time for next-net.


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-20  8:36                   ` David Marchand
  2021-01-20  9:18                     ` Ferruh Yigit
@ 2021-01-20  9:23                     ` Thomas Monjalon
  2021-01-20  9:53                       ` David Marchand
  1 sibling, 1 reply; 42+ messages in thread
From: Thomas Monjalon @ 2021-01-20  9:23 UTC (permalink / raw)
  To: Tal Shnaiderman, David Marchand
  Cc: Ali Alnubani, Ferruh Yigit, Odi Assli, Rong, Leyi, Zhang, Qi Z,
	Lu, Wenzhuo, Richardson, Bruce, Xing, Beilei, Kadam, Pallavi,
	Menon, Ranjit, dev, Raslan Darawsheh

20/01/2021 09:36, David Marchand:
> On Wed, Jan 20, 2021 at 7:26 AM Tal Shnaiderman <talshn@nvidia.com> wrote:
> > > The patch "add AVX512 vector path" also caused a build failure when cross
> > > compiling on Linux using mingw, and it's still reproducing in next-net
> > > (517969c95).
> > >
> > > ```
> > > $ meson --werror --buildtype=debugoptimized --cross-file config/x86/cross-
> > > mingw -Dexamples=helloworld build  && ninja-build -C build -j32
> > >
> > > [221/232] Generating symbol file lib/librte_mbuf-21.dll.p/librte_mbuf-
> > > 21.dll.symbols
> > > [222/232] Generating symbol file lib/librte_hash-21.dll.p/librte_hash-
> > > 21.dll.symbols
> > > [223/232] Linking target lib/librte_net-21.dll [224/232] Compiling C object
> > > drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj
> > > FAILED: drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj
> > > ...
> > > drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj.d -o
> > > drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj -c
> > > ../../root/dpdk/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> > > {standard input}: Assembler messages:
> > > {standard input}:112: Error: invalid register for .seh_savexmm {standard
> > > input}:114: Error: invalid register for .seh_savexmm ...
> > > ...
> > > {standard input}:25351: Error: invalid register for .seh_savexmm {standard
> > > input}:25352: Error: invalid register for .seh_savexmm [225/232] Generating
> > > symbol file lib/librte_net-21.dll.p/librte_net-21.dll.symbols
> > > ninja: build stopped: subcommand failed.
> > > ```
> > >
> > > OS: Fedora 32
> > > Meson: 0.55.3
> > > MinGW: Fedora MinGW 9.2.1-6.fc32
> > >
> > > - Ali
> >
> > Those errors were detected in the CI tests for this patch [1], we should pay more attention to those now that 2 PMDs are supported on Windows.
> >
> > [1]: https://lab.dpdk.org/results/dashboard/patchsets/15164/
> 
> I won't grmbl about how CI reports are not being looked at (or maybe I
> just did :)).
> 
> This simple patch seems to work for me:
> 
> diff --git a/drivers/net/i40e/meson.build b/drivers/net/i40e/meson.build
> index c0acdf4fd4..c9a1a50407 100644
> --- a/drivers/net/i40e/meson.build
> +++ b/drivers/net/i40e/meson.build
> @@ -54,7 +54,7 @@ if arch_subdir == 'x86'
>                 cc.has_argument('-mavx512f') and
>                 cc.has_argument('-mavx512bw'))
> 
> -       if i40e_avx512_cpu_support == true or i40e_avx512_cc_support == true
> +       if not is_windows and (i40e_avx512_cpu_support == true or
> i40e_avx512_cc_support == true)
>                 cflags += ['-DCC_AVX512_SUPPORT']
>                 avx512_args = [cflags, '-mavx512f', '-mavx512bw']
>                 if cc.has_argument('-march=skylake-avx512')
> 
> 
> If nobody has a better fix, I'll send it later.


For info, I don't reproduce the compilation issue on my machine.




^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-20  9:23                     ` Thomas Monjalon
@ 2021-01-20  9:53                       ` David Marchand
  2021-01-20 10:05                         ` Ali Alnubani
  0 siblings, 1 reply; 42+ messages in thread
From: David Marchand @ 2021-01-20  9:53 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: Tal Shnaiderman, Ali Alnubani, Ferruh Yigit, Odi Assli, Rong,
	Leyi, Zhang, Qi Z, Lu, Wenzhuo, Richardson, Bruce, Xing, Beilei,
	Kadam, Pallavi, Menon, Ranjit, dev, Raslan Darawsheh

On Wed, Jan 20, 2021 at 10:23 AM Thomas Monjalon <thomas@monjalon.net> wrote:
>
> 20/01/2021 09:36, David Marchand:
> > On Wed, Jan 20, 2021 at 7:26 AM Tal Shnaiderman <talshn@nvidia.com> wrote:
> > > > The patch "add AVX512 vector path" also caused a build failure when cross
> > > > compiling on Linux using mingw, and it's still reproducing in next-net
> > > > (517969c95).
> > > >
> > > > ```
> > > > $ meson --werror --buildtype=debugoptimized --cross-file config/x86/cross-
> > > > mingw -Dexamples=helloworld build  && ninja-build -C build -j32
> > > >
> > > > [221/232] Generating symbol file lib/librte_mbuf-21.dll.p/librte_mbuf-
> > > > 21.dll.symbols
> > > > [222/232] Generating symbol file lib/librte_hash-21.dll.p/librte_hash-
> > > > 21.dll.symbols
> > > > [223/232] Linking target lib/librte_net-21.dll [224/232] Compiling C object
> > > > drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj
> > > > FAILED: drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj
> > > > ...
> > > > drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj.d -o
> > > > drivers/net/i40e/libi40e_avx512_lib.a.p/i40e_rxtx_vec_avx512.c.obj -c
> > > > ../../root/dpdk/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> > > > {standard input}: Assembler messages:
> > > > {standard input}:112: Error: invalid register for .seh_savexmm {standard
> > > > input}:114: Error: invalid register for .seh_savexmm ...
> > > > ...
> > > > {standard input}:25351: Error: invalid register for .seh_savexmm {standard
> > > > input}:25352: Error: invalid register for .seh_savexmm [225/232] Generating
> > > > symbol file lib/librte_net-21.dll.p/librte_net-21.dll.symbols
> > > > ninja: build stopped: subcommand failed.
> > > > ```
> > > >
> > > > OS: Fedora 32
> > > > Meson: 0.55.3
> > > > MinGW: Fedora MinGW 9.2.1-6.fc32
> > > >
> > > > - Ali
> > >
> > > Those errors were detected in the CI tests for this patch [1], we should pay more attention to those now that 2 PMDs are supported on Windows.
> > >
> > > [1]: https://lab.dpdk.org/results/dashboard/patchsets/15164/
> >
> > I won't grmbl about how CI reports are not being looked at (or maybe I
> > just did :)).
> >
> > This simple patch seems to work for me:
> >
> > diff --git a/drivers/net/i40e/meson.build b/drivers/net/i40e/meson.build
> > index c0acdf4fd4..c9a1a50407 100644
> > --- a/drivers/net/i40e/meson.build
> > +++ b/drivers/net/i40e/meson.build
> > @@ -54,7 +54,7 @@ if arch_subdir == 'x86'
> >                 cc.has_argument('-mavx512f') and
> >                 cc.has_argument('-mavx512bw'))
> >
> > -       if i40e_avx512_cpu_support == true or i40e_avx512_cc_support == true
> > +       if not is_windows and (i40e_avx512_cpu_support == true or
> > i40e_avx512_cc_support == true)
> >                 cflags += ['-DCC_AVX512_SUPPORT']
> >                 avx512_args = [cflags, '-mavx512f', '-mavx512bw']
> >                 if cc.has_argument('-march=skylake-avx512')
> >
> >
> > If nobody has a better fix, I'll send it later.
>
>
> For info, I don't reproduce the compilation issue on my machine.

My build system has been upgraded from fc31 to fc32 so I guess this
has something to do with it.


-- 
David Marchand


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-20  9:53                       ` David Marchand
@ 2021-01-20 10:05                         ` Ali Alnubani
  2021-01-20 17:51                           ` Ferruh Yigit
  0 siblings, 1 reply; 42+ messages in thread
From: Ali Alnubani @ 2021-01-20 10:05 UTC (permalink / raw)
  To: David Marchand, NBU-Contact-Thomas Monjalon
  Cc: Tal Shnaiderman, Ferruh Yigit, Odi Assli, Rong, Leyi, Zhang,
	Qi Z, Lu, Wenzhuo, Richardson, Bruce, Xing, Beilei, Kadam,
	Pallavi, Menon, Ranjit, dev, Raslan Darawsheh

> > For info, I don't reproduce the compilation issue on my machine.
> 
> My build system has been upgraded from fc31 to fc32 so I guess this has
> something to do with it.
> 

I can reproduce with (Fedora MinGW 9.2.1-6.fc32), but not with (Fedora MinGW 10.2.1-2.fc33).

- Ali

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-20 10:05                         ` Ali Alnubani
@ 2021-01-20 17:51                           ` Ferruh Yigit
  2021-01-20 18:04                             ` Ferruh Yigit
  2021-01-21  5:01                             ` Kadam, Pallavi
  0 siblings, 2 replies; 42+ messages in thread
From: Ferruh Yigit @ 2021-01-20 17:51 UTC (permalink / raw)
  To: Ali Alnubani, David Marchand, NBU-Contact-Thomas Monjalon,
	Richardson, Bruce
  Cc: Tal Shnaiderman, Odi Assli, Rong, Leyi, Zhang, Qi Z, Lu, Wenzhuo,
	Xing, Beilei, Kadam, Pallavi, Menon, Ranjit, dev,
	Raslan Darawsheh

On 1/20/2021 10:05 AM, Ali Alnubani wrote:
>>> For info, I don't reproduce the compilation issue on my machine.
>>
>> My build system has been upgraded from fc31 to fc32 so I guess this has
>> something to do with it.
>>
> 
> I can reproduce with (Fedora MinGW 9.2.1-6.fc32), but not with (Fedora MinGW 10.2.1-2.fc33).
> 

First of all, for the patch that @Tal linked, the Windows build and the 
mingw build errors are different. The Windows build error should already be 
fixed by a patch that was squashed into next-net (https://patches.dpdk.org/patch/86804/).


And for mingw, I have the same result as Ali: I can reproduce with (Fedora 
MinGW 9.2.1-6.fc32).

But unlike the CI, I am getting the error [1] for all of the following files:
rte_random.c
i40e_rxtx_vec_sse.c
i40e_rxtx_vec_avx512.c
i40e_rxtx_vec_avx2.c
rte_ethdev.c

[1] Error: invalid register for .seh_savexmm


There is a stackoverflow entry for it:
https://stackoverflow.com/questions/43152633/invalid-register-for-seh-savexmm-in-cygwin

If I use '-fno-asynchronous-unwind-tables' as suggested there, the build works fine.

So the problem may not be just 'i40e_rxtx_vec_avx512.c'.


If I change the machine type from 'native' to 'corei7' [2], the build error is 
reduced to only 'i40e_rxtx_vec_avx512.c', so the problem seems to happen when 
avx512 is supported by the CPU; in this case the compiler seems to have a defect.
And since '-march=skylake-avx512' is explicitly set for 'i40e_rxtx_vec_avx512.c', 
the problem can be seen on all machines.

[2]
  diff --git a/config/x86/cross-mingw b/config/x86/cross-mingw
  index 4c15a7fa2e..7cee238add 100644
  --- a/config/x86/cross-mingw
  +++ b/config/x86/cross-mingw
  @@ -9,5 +9,5 @@ pkgconfig = 'x86_64-w64-mingw32-pkg-config'
   [host_machine]
   system = 'windows'
   cpu_family = 'x86_64'
  -cpu = 'native'
  +cpu = 'corei7'
   endian = 'little'



@Ranjit, @Pallavi,
Are you building using mingw, and if so are you observing same problem?

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-20 17:51                           ` Ferruh Yigit
@ 2021-01-20 18:04                             ` Ferruh Yigit
  2021-01-21  5:01                             ` Kadam, Pallavi
  1 sibling, 0 replies; 42+ messages in thread
From: Ferruh Yigit @ 2021-01-20 18:04 UTC (permalink / raw)
  To: Ali Alnubani, David Marchand, NBU-Contact-Thomas Monjalon,
	Richardson, Bruce
  Cc: Tal Shnaiderman, Odi Assli, Rong, Leyi, Zhang, Qi Z, Lu, Wenzhuo,
	Xing, Beilei, Kadam, Pallavi, Menon, Ranjit, dev,
	Raslan Darawsheh

On 1/20/2021 5:51 PM, Ferruh Yigit wrote:
> On 1/20/2021 10:05 AM, Ali Alnubani wrote:
>>>> For info, I don't reproduce the compilation issue on my machine.
>>>
>>> My build system has been upgraded from fc31 to fc32 so I guess this has
>>> something to do with it.
>>>
>>
>> I can reproduce with (Fedora MinGW 9.2.1-6.fc32), but not with (Fedora MinGW 
>> 10.2.1-2.fc33).
>>
> 
> First of all, for the patch @Tal provided its link, the windows build and the 
> mingw build errors are different. And the windows build error should be already 
> fixed by a patch that squashed in next-net (https://patches.dpdk.org/patch/86804/).
> 
> 
> And for the mingw, I have same result with Ali, I can reproduce with (Fedora 
> MinGW 9.2.1-6.fc32).
> 
> But different from the CI, I am getting the error [1] for all following files:
> rte_random.c
> i40e_rxtx_vec_sse.c
> i40e_rxtx_vec_avx512.c
> i40e_rxtx_vec_avx2.c
> rte_ethdev.c
> 
> [1] Error: invalid register for .seh_savexmm
> 

Build log for above errors: https://pastebin.com/jD4jRVzL

> 
> There is a stackoverflow entry for it:
> https://stackoverflow.com/questions/43152633/invalid-register-for-seh-savexmm-in-cygwin 
> 
> 
> If I use '-fno-asynchronous-unwind-tables' as suggested there, the build works 
> fine.
> 
> So the problem may not be just 'i40e_rxtx_vec_avx512.c'.
> 
> 
> If I change the machine type from 'native' to 'corei7' [2], the build error 
> reduced to only 'i40e_rxtx_vec_avx512.c', so the problem seems happens when 
> avx512 is supported by CPU, in this case compiler seems has a defect.
> And since for 'i40e_rxtx_vec_avx512.c' the '-march=skylake-avx512' explicitly 
> set can cause the problem seen in all machines.
> 
> [2]
>   diff --git a/config/x86/cross-mingw b/config/x86/cross-mingw
>   index 4c15a7fa2e..7cee238add 100644
>   --- a/config/x86/cross-mingw
>   +++ b/config/x86/cross-mingw
>   @@ -9,5 +9,5 @@ pkgconfig = 'x86_64-w64-mingw32-pkg-config'
>    [host_machine]
>    system = 'windows'
>    cpu_family = 'x86_64'
>   -cpu = 'native'
>   +cpu = 'corei7'
>    endian = 'little'
> 
> 
> 
> @Ranjit, @Pallavi,
> Are you building using mingw, and if so are you observing same problem?


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-20 17:51                           ` Ferruh Yigit
  2021-01-20 18:04                             ` Ferruh Yigit
@ 2021-01-21  5:01                             ` Kadam, Pallavi
  2021-01-25 14:35                               ` David Marchand
  1 sibling, 1 reply; 42+ messages in thread
From: Kadam, Pallavi @ 2021-01-21  5:01 UTC (permalink / raw)
  To: Ferruh Yigit, Ali Alnubani, David Marchand,
	NBU-Contact-Thomas Monjalon, Richardson, Bruce
  Cc: Tal Shnaiderman, Odi Assli, Rong, Leyi, Zhang, Qi Z, Lu, Wenzhuo,
	Xing, Beilei, Menon, Ranjit, dev, Raslan Darawsheh


On 1/20/2021 11:21 PM, Ferruh Yigit wrote:
> On 1/20/2021 10:05 AM, Ali Alnubani wrote:
>>>> For info, I don't reproduce the compilation issue on my machine.
>>>
>>> My build system has been upgraded from fc31 to fc32 so I guess this has
>>> something to do with it.
>>>
>>
>> I can reproduce with (Fedora MinGW 9.2.1-6.fc32), but not with 
>> (Fedora MinGW 10.2.1-2.fc33).
>>
>
> First of all, for the patch @Tal provided its link, the windows build 
> and the mingw build errors are different. And the windows build error 
> should be already fixed by a patch that squashed in next-net 
> (https://patches.dpdk.org/patch/86804/).
>
>
> And for the mingw, I have same result with Ali, I can reproduce with 
> (Fedora MinGW 9.2.1-6.fc32).
>
> But different from the CI, I am getting the error [1] for all 
> following files:
> rte_random.c
> i40e_rxtx_vec_sse.c
> i40e_rxtx_vec_avx512.c
> i40e_rxtx_vec_avx2.c
> rte_ethdev.c
>
> [1] Error: invalid register for .seh_savexmm
>
>
> There is a stackoverflow entry for it:
> https://stackoverflow.com/questions/43152633/invalid-register-for-seh-savexmm-in-cygwin 
>
>
> If I use '-fno-asynchronous-unwind-tables' as suggested there, the 
> build works fine.
>
> So the problem may not be just 'i40e_rxtx_vec_avx512.c'.
>
>
> If I change the machine type from 'native' to 'corei7' [2], the build 
> error reduced to only 'i40e_rxtx_vec_avx512.c', so the problem seems 
> happens when avx512 is supported by CPU, in this case compiler seems 
> has a defect.
> And since for 'i40e_rxtx_vec_avx512.c' the '-march=skylake-avx512' 
> explicitly set can cause the problem seen in all machines.
>
> [2]
>  diff --git a/config/x86/cross-mingw b/config/x86/cross-mingw
>  index 4c15a7fa2e..7cee238add 100644
>  --- a/config/x86/cross-mingw
>  +++ b/config/x86/cross-mingw
>  @@ -9,5 +9,5 @@ pkgconfig = 'x86_64-w64-mingw32-pkg-config'
>   [host_machine]
>   system = 'windows'
>   cpu_family = 'x86_64'
>  -cpu = 'native'
>  +cpu = 'corei7'
>   endian = 'little'
>
>
>
> @Ranjit, @Pallavi,
> Are you building using mingw, and if so are you observing same problem?

We usually build using Clang. However, we verify with mingw as well before submitting the patch.
As mentioned in the patch [1] comments, we replaced #include x86intrin.h with <rte_vect.h> in the file i40e_rxtx_vec_avx2.c
And this helped fix an error related to conflicting types for '__m_prefetchw' with Clang on Windows.
I was able to build this patch using Clang as well as mingw.
[1] http://patches.dpdk.org/patch/84770/

I verified patch fix submitted by Rong, Leyi, it builds successfully with Clang.
However, I am getting the same error "Error: invalid register for .seh_savexmm" using mingw for the 'i40e_rxtx_vec_avx512.c' file.

Thanks,
Pallavi


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-21  5:01                             ` Kadam, Pallavi
@ 2021-01-25 14:35                               ` David Marchand
  2021-01-26 16:17                                 ` Rong, Leyi
  0 siblings, 1 reply; 42+ messages in thread
From: David Marchand @ 2021-01-25 14:35 UTC (permalink / raw)
  To: Kadam, Pallavi
  Cc: Ferruh Yigit, Ali Alnubani, NBU-Contact-Thomas Monjalon,
	Richardson, Bruce, Tal Shnaiderman, Odi Assli, Rong, Leyi, Zhang,
	Qi Z, Lu, Wenzhuo, Xing, Beilei, Menon, Ranjit, dev,
	Raslan Darawsheh

On Thu, Jan 21, 2021 at 6:02 AM Kadam, Pallavi <pallavi.kadam@intel.com> wrote:
> On 1/20/2021 11:21 PM, Ferruh Yigit wrote:
>
> And for the mingw, I have same result with Ali, I can reproduce with (Fedora MinGW 9.2.1-6.fc32).
>
> But different from the CI, I am getting the error [1] for all following files:
> rte_random.c
> i40e_rxtx_vec_sse.c
> i40e_rxtx_vec_avx512.c
> i40e_rxtx_vec_avx2.c
> rte_ethdev.c
>
> [1] Error: invalid register for .seh_savexmm
>
>
> There is a stackoverflow entry for it:
> https://stackoverflow.com/questions/43152633/invalid-register-for-seh-savexmm-in-cygwin
>
> If I use '-fno-asynchronous-unwind-tables' as suggested there, the build works fine.
>
> So the problem may not be just 'i40e_rxtx_vec_avx512.c'.
>
>
> If I change the machine type from 'native' to 'corei7' [2], the build error reduced to only 'i40e_rxtx_vec_avx512.c', so the problem seems happens when avx512 is supported by CPU, in this case compiler seems has a defect.
> And since for 'i40e_rxtx_vec_avx512.c' the '-march=skylake-avx512' explicitly set can cause the problem seen in all machines.
>
> [2]
>  diff --git a/config/x86/cross-mingw b/config/x86/cross-mingw
>  index 4c15a7fa2e..7cee238add 100644
>  --- a/config/x86/cross-mingw
>  +++ b/config/x86/cross-mingw
>  @@ -9,5 +9,5 @@ pkgconfig = 'x86_64-w64-mingw32-pkg-config'
>   [host_machine]
>   system = 'windows'
>   cpu_family = 'x86_64'
>  -cpu = 'native'
>  +cpu = 'corei7'
>   endian = 'little'
>
> @Ranjit, @Pallavi,
> Are you building using mingw, and if so are you observing same problem?
>

Thanks Ferruh.


> We usually build using Clang. However, we verify with mingw as well before submitting the patch.
> As mentioned in the patch [1] comments, we replaced #include x86intrin.h with <rte_vect.h> in the file i40e_rxtx_vec_avx2.c
> And this helped fixing an error related to conflicting types for '__m_prefethw' with Clang on Windows.
> I was able to build this patch using Clang as well as mingw.
> [1] http://patches.dpdk.org/patch/84770/
>
> I verified patch fix submitted by Rong, Leyi, it builds successfully with Clang.
> However, I am getting same error "Error: invalid register for .seh_savexmm" using mingw for 'i40e_rxtx_vec_avx512.c' file.

The patch I sent https://patchwork.dpdk.org/patch/86999/ is not enough.
I have neither time nor environment to find a fix.

For now I simply stopped checking mingw builds.

Will it get fixed?


-- 
David Marchand


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-25 14:35                               ` David Marchand
@ 2021-01-26 16:17                                 ` Rong, Leyi
  2021-01-26 16:22                                   ` Thomas Monjalon
  0 siblings, 1 reply; 42+ messages in thread
From: Rong, Leyi @ 2021-01-26 16:17 UTC (permalink / raw)
  To: David Marchand, Kadam, Pallavi
  Cc: Yigit, Ferruh, Ali Alnubani, NBU-Contact-Thomas Monjalon,
	Richardson, Bruce, Tal Shnaiderman, Odi Assli, Zhang, Qi Z, Lu,
	Wenzhuo, Xing, Beilei, Menon, Ranjit, dev, Raslan Darawsheh


> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: Monday, January 25, 2021 10:35 PM
> To: Kadam, Pallavi <pallavi.kadam@intel.com>
> Cc: Yigit, Ferruh <ferruh.yigit@intel.com>; Ali Alnubani <alialnu@nvidia.com>;
> NBU-Contact-Thomas Monjalon <thomas@monjalon.net>; Richardson, Bruce
> <bruce.richardson@intel.com>; Tal Shnaiderman <talshn@nvidia.com>; Odi Assli
> <odia@nvidia.com>; Rong, Leyi <leyi.rong@intel.com>; Zhang, Qi Z
> <qi.z.zhang@intel.com>; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Xing, Beilei
> <beilei.xing@intel.com>; Menon, Ranjit <ranjit.menon@intel.com>;
> dev@dpdk.org; Raslan Darawsheh <rasland@nvidia.com>
> Subject: Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
> 
> On Thu, Jan 21, 2021 at 6:02 AM Kadam, Pallavi <pallavi.kadam@intel.com>
> wrote:
> > On 1/20/2021 11:21 PM, Ferruh Yigit wrote:
> >
> > And for the mingw, I have same result with Ali, I can reproduce with (Fedora
> MinGW 9.2.1-6.fc32).
> >
> > But different from the CI, I am getting the error [1] for all following files:
> > rte_random.c
> > i40e_rxtx_vec_sse.c
> > i40e_rxtx_vec_avx512.c
> > i40e_rxtx_vec_avx2.c
> > rte_ethdev.c
> >
> > [1] Error: invalid register for .seh_savexmm
> >
> >
> > There is a stackoverflow entry for it:
> > https://stackoverflow.com/questions/43152633/invalid-register-for-seh-
> > savexmm-in-cygwin
> >
> > If I use '-fno-asynchronous-unwind-tables' as suggested there, the build works
> fine.
> >
> > So the problem may not be just 'i40e_rxtx_vec_avx512.c'.
> >
> >
> > If I change the machine type from 'native' to 'corei7' [2], the build error
> reduced to only 'i40e_rxtx_vec_avx512.c', so the problem seems happens when
> avx512 is supported by CPU, in this case compiler seems has a defect.
> > And since for 'i40e_rxtx_vec_avx512.c' the '-march=skylake-avx512' explicitly
> set can cause the problem seen in all machines.
> >
> > [2]
> >  diff --git a/config/x86/cross-mingw b/config/x86/cross-mingw  index
> > 4c15a7fa2e..7cee238add 100644
> >  --- a/config/x86/cross-mingw
> >  +++ b/config/x86/cross-mingw
> >  @@ -9,5 +9,5 @@ pkgconfig = 'x86_64-w64-mingw32-pkg-config'
> >   [host_machine]
> >   system = 'windows'
> >   cpu_family = 'x86_64'
> >  -cpu = 'native'
> >  +cpu = 'corei7'
> >   endian = 'little'
> >
> > @Ranjit, @Pallavi,
> > Are you building using mingw, and if so are you observing same problem?
> >
> 
> Thanks Ferruh.
> 
> 
> > We usually build using Clang. However, we verify with mingw as well before
> submitting the patch.
> > As mentioned in the patch [1] comments, we replaced #include
> > x86intrin.h with <rte_vect.h> in the file i40e_rxtx_vec_avx2.c And this helped
> fixing an error related to conflicting types for '__m_prefethw' with Clang on
> Windows.
> > I was able to build this patch using Clang as well as mingw.
> > [1] http://patches.dpdk.org/patch/84770/
> >
> > I verified patch fix submitted by Rong, Leyi, it builds successfully with Clang.
> > However, I am getting same error "Error: invalid register for .seh_savexmm"
> using mingw for 'i40e_rxtx_vec_avx512.c' file.
> 
> The patch I sent https://patchwork.dpdk.org/patch/86999/ is not enough.
> I have neither time nor environment to find a fix.
> 
> For now I simply stopped checking mingw builds.
> 
> Will it get fixed?
> 
> 
> --
> David Marchand

Hi,

I sent patch https://patchwork.dpdk.org/patch/87349/ to disable AVX512 on Windows as a workaround.
Pallavi also tested this patch, and it passed both the mingw and clang builds. I will try to see if there is a better way to fix it.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-26 16:17                                 ` Rong, Leyi
@ 2021-01-26 16:22                                   ` Thomas Monjalon
  2021-01-26 16:39                                     ` Ferruh Yigit
  0 siblings, 1 reply; 42+ messages in thread
From: Thomas Monjalon @ 2021-01-26 16:22 UTC (permalink / raw)
  To: Rong, Leyi
  Cc: David Marchand, Kadam, Pallavi, dev, Yigit, Ferruh, Ali Alnubani,
	Richardson, Bruce, Tal Shnaiderman, Odi Assli, Zhang, Qi Z, Lu,
	Wenzhuo, Xing, Beilei, Menon, Ranjit, dev, Raslan Darawsheh

26/01/2021 17:17, Rong, Leyi:
> > > On 1/20/2021 11:21 PM, Ferruh Yigit wrote:
> > >
> > > And for the mingw, I have same result with Ali, I can reproduce with (Fedora
> > MinGW 9.2.1-6.fc32).
> > >
> > > But different from the CI, I am getting the error [1] for all following files:
> > > rte_random.c
> > > i40e_rxtx_vec_sse.c
> > > i40e_rxtx_vec_avx512.c
> > > i40e_rxtx_vec_avx2.c
> > > rte_ethdev.c
[...]
> Send patch https://patchwork.dpdk.org/patch/87349/ to disable avx512 on windows as the workaround.
> Pallavi also tested with this patch, and passed the mingw and clang build. Will try to see if has better way to fix.

Ferruh mentioned errors with other files outside of i40e.



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-26 16:22                                   ` Thomas Monjalon
@ 2021-01-26 16:39                                     ` Ferruh Yigit
  2021-01-26 16:48                                       ` Thomas Monjalon
  0 siblings, 1 reply; 42+ messages in thread
From: Ferruh Yigit @ 2021-01-26 16:39 UTC (permalink / raw)
  To: Thomas Monjalon, Rong, Leyi
  Cc: David Marchand, Kadam, Pallavi, dev, Ali Alnubani, Richardson,
	Bruce, Tal Shnaiderman, Odi Assli, Zhang, Qi Z, Lu, Wenzhuo,
	Xing, Beilei, Menon, Ranjit, Raslan Darawsheh

On 1/26/2021 4:22 PM, Thomas Monjalon wrote:
> 26/01/2021 17:17, Rong, Leyi:
>>>> On 1/20/2021 11:21 PM, Ferruh Yigit wrote:
>>>>
>>>> And for the mingw, I have same result with Ali, I can reproduce with (Fedora
>>> MinGW 9.2.1-6.fc32).
>>>>
>>>> But different from the CI, I am getting the error [1] for all following files:
>>>> rte_random.c
>>>> i40e_rxtx_vec_sse.c
>>>> i40e_rxtx_vec_avx512.c
>>>> i40e_rxtx_vec_avx2.c
>>>> rte_ethdev.c
> [...]
>> Send patch https://patchwork.dpdk.org/patch/87349/ to disable avx512 on windows as the workaround.
>> Pallavi also tested with this patch, and passed the mingw and clang build. Will try to see if has better way to fix.
> 
> Ferruh mentioned errors with other files outside of i40e.
> 

The patch globally disables AVX512 for the Windows cross build, not just for 
i40e, so I confirm it fixes the build errors in my environment.

But disabling AVX512 like this is not the best option.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-26 16:39                                     ` Ferruh Yigit
@ 2021-01-26 16:48                                       ` Thomas Monjalon
  2021-01-26 16:51                                         ` Ferruh Yigit
  0 siblings, 1 reply; 42+ messages in thread
From: Thomas Monjalon @ 2021-01-26 16:48 UTC (permalink / raw)
  To: Ferruh Yigit
  Cc: Rong, Leyi, David Marchand, Kadam, Pallavi, dev, Ali Alnubani,
	Richardson, Bruce, Tal Shnaiderman, Odi Assli, Zhang, Qi Z, Lu,
	Wenzhuo, Xing, Beilei, Menon, Ranjit, Raslan Darawsheh

26/01/2021 17:39, Ferruh Yigit:
> On 1/26/2021 4:22 PM, Thomas Monjalon wrote:
> > 26/01/2021 17:17, Rong, Leyi:
> >>>> On 1/20/2021 11:21 PM, Ferruh Yigit wrote:
> >>>>
> >>>> And for the mingw, I have same result with Ali, I can reproduce with (Fedora
> >>> MinGW 9.2.1-6.fc32).
> >>>>
> >>>> But different from the CI, I am getting the error [1] for all following files:
> >>>> rte_random.c
> >>>> i40e_rxtx_vec_sse.c
> >>>> i40e_rxtx_vec_avx512.c
> >>>> i40e_rxtx_vec_avx2.c
> >>>> rte_ethdev.c
> > [...]
> >> Send patch https://patchwork.dpdk.org/patch/87349/ to disable avx512 on windows as the workaround.
> >> Pallavi also tested with this patch, and passed the mingw and clang build. Will try to see if has better way to fix.
> > 
> > Ferruh mentioned errors with other files outside of i40e.
> > 
> 
> The patch globally disables the avx512 for windows cross build, not just for 
> i40e, so I confirm it fixes the build errors in my environment.
> 
> But disabling avx512 like this is not best option.

Some may argue that AVX512 is not the best option ;)



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-26 16:48                                       ` Thomas Monjalon
@ 2021-01-26 16:51                                         ` Ferruh Yigit
  2021-01-28 20:35                                           ` Dmitry Kozlyuk
  0 siblings, 1 reply; 42+ messages in thread
From: Ferruh Yigit @ 2021-01-26 16:51 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: Rong, Leyi, David Marchand, Kadam, Pallavi, dev, Ali Alnubani,
	Richardson, Bruce, Tal Shnaiderman, Odi Assli, Zhang, Qi Z, Lu,
	Wenzhuo, Xing, Beilei, Menon, Ranjit, Raslan Darawsheh

On 1/26/2021 4:48 PM, Thomas Monjalon wrote:
> 26/01/2021 17:39, Ferruh Yigit:
>> On 1/26/2021 4:22 PM, Thomas Monjalon wrote:
>>> 26/01/2021 17:17, Rong, Leyi:
>>>>>> On 1/20/2021 11:21 PM, Ferruh Yigit wrote:
>>>>>>
>>>>>> And for the mingw, I have same result with Ali, I can reproduce with (Fedora
>>>>> MinGW 9.2.1-6.fc32).
>>>>>>
>>>>>> But different from the CI, I am getting the error [1] for all following files:
>>>>>> rte_random.c
>>>>>> i40e_rxtx_vec_sse.c
>>>>>> i40e_rxtx_vec_avx512.c
>>>>>> i40e_rxtx_vec_avx2.c
>>>>>> rte_ethdev.c
>>> [...]
>>>> Send patch https://patchwork.dpdk.org/patch/87349/ to disable avx512 on windows as the workaround.
>>>> Pallavi also tested with this patch, and passed the mingw and clang build. Will try to see if has better way to fix.
>>>
>>> Ferruh mentioned errors with other files outside of i40e.
>>>
>>
>> The patch globally disables the avx512 for windows cross build, not just for
>> i40e, so I confirm it fixes the build errors in my environment.
>>
>> But disabling avx512 like this is not best option.
> 
> Some may argue that AVX512 is not the best option ;)
> 

flame war mode activated.


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-26 16:51                                         ` Ferruh Yigit
@ 2021-01-28 20:35                                           ` Dmitry Kozlyuk
  2021-01-28 21:24                                             ` Thomas Monjalon
  0 siblings, 1 reply; 42+ messages in thread
From: Dmitry Kozlyuk @ 2021-01-28 20:35 UTC (permalink / raw)
  To: Ferruh Yigit
  Cc: Thomas Monjalon, Rong, Leyi, David Marchand, Kadam, Pallavi, dev,
	Ali Alnubani, Richardson, Bruce, Tal Shnaiderman, Odi Assli,
	Zhang, Qi Z, Lu, Wenzhuo, Xing, Beilei, Menon, Ranjit,
	Raslan Darawsheh

On Tue, 26 Jan 2021 16:51:19 +0000, Ferruh Yigit wrote:
> On 1/26/2021 4:48 PM, Thomas Monjalon wrote:
> > 26/01/2021 17:39, Ferruh Yigit:  
> >> On 1/26/2021 4:22 PM, Thomas Monjalon wrote:  
> >>> 26/01/2021 17:17, Rong, Leyi:  
> >>>>>> On 1/20/2021 11:21 PM, Ferruh Yigit wrote:
> >>>>>>
> >>>>>> And for the mingw, I have same result with Ali, I can reproduce with (Fedora  
> >>>>> MinGW 9.2.1-6.fc32).  
> >>>>>>
> >>>>>> But different from the CI, I am getting the error [1] for all following files:
> >>>>>> rte_random.c
> >>>>>> i40e_rxtx_vec_sse.c
> >>>>>> i40e_rxtx_vec_avx512.c
> >>>>>> i40e_rxtx_vec_avx2.c
> >>>>>> rte_ethdev.c  
> >>> [...]  
> >>>> Send patch https://patchwork.dpdk.org/patch/87349/ to disable avx512 on windows as the workaround.
> >>>> Pallavi also tested with this patch, and passed the mingw and clang build. Will try to see if has better way to fix.  
> >>>
> >>> Ferruh mentioned errors with other files outside of i40e.
> >>>  
> >>
> >> The patch globally disables the avx512 for windows cross build, not just for
> >> i40e, so I confirm it fixes the build errors in my environment.
> >>
> >> But disabling avx512 like this is not best option.  
> > 
> > Some may argue that AVX512 is not the best option ;)
> >   
> 
> flame war mode activated.

I found the following fix working:

--- a/drivers/net/i40e/meson.build
+++ b/drivers/net/i40e/meson.build
@@ -60,6 +60,9 @@ if arch_subdir == 'x86'
 		if cc.has_argument('-march=skylake-avx512')
 			avx512_args += '-march=skylake-avx512'
 		endif
+		if is_windows and cc.get_id() == 'gcc'
+			avx512_args += '-fno-asynchronous-unwind-tables'
+		endif
 		i40e_avx512_lib = static_library('i40e_avx512_lib',
 				'i40e_rxtx_vec_avx512.c',
 				dependencies: [static_rte_ethdev,

It's admittedly from StackOverflow, but after reading about
-fno-asynchronous-unwind-tables I see no harm in disabling asynchronous
unwind tables, at least for just one file.



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e
  2021-01-28 20:35                                           ` Dmitry Kozlyuk
@ 2021-01-28 21:24                                             ` Thomas Monjalon
  0 siblings, 0 replies; 42+ messages in thread
From: Thomas Monjalon @ 2021-01-28 21:24 UTC (permalink / raw)
  To: Dmitry Kozlyuk
  Cc: Ferruh Yigit, dev, Rong, Leyi, David Marchand, Kadam, Pallavi,
	dev, Ali Alnubani, Richardson, Bruce, Tal Shnaiderman, Odi Assli,
	Zhang, Qi Z, Lu, Wenzhuo, Xing, Beilei, Menon, Ranjit,
	Raslan Darawsheh

28/01/2021 21:35, Dmitry Kozlyuk:
> On Tue, 26 Jan 2021 16:51:19 +0000, Ferruh Yigit wrote:
> > On 1/26/2021 4:48 PM, Thomas Monjalon wrote:
> > > 26/01/2021 17:39, Ferruh Yigit:  
> > >> On 1/26/2021 4:22 PM, Thomas Monjalon wrote:  
> > >>> 26/01/2021 17:17, Rong, Leyi:  
> > >>>>>> On 1/20/2021 11:21 PM, Ferruh Yigit wrote:
> > >>>>>>
> > >>>>>> And for the mingw, I have same result with Ali, I can reproduce with (Fedora  
> > >>>>> MinGW 9.2.1-6.fc32).  
> > >>>>>>
> > >>>>>> But different from the CI, I am getting the error [1] for all following files:
> > >>>>>> rte_random.c
> > >>>>>> i40e_rxtx_vec_sse.c
> > >>>>>> i40e_rxtx_vec_avx512.c
> > >>>>>> i40e_rxtx_vec_avx2.c
> > >>>>>> rte_ethdev.c  
> > >>> [...]  
> > >>>> Send patch https://patchwork.dpdk.org/patch/87349/ to disable avx512 on windows as the workaround.
> > >>>> Pallavi also tested with this patch, and passed the mingw and clang build. Will try to see if has better way to fix.  
> > >>>
> > >>> Ferruh mentioned errors with other files outside of i40e.
> > >>>  
> > >>
> > >> The patch globally disables the avx512 for windows cross build, not just for
> > >> i40e, so I confirm it fixes the build errors in my environment.
> > >>
> > >> But disabling avx512 like this is not best option.  
> > > 
> > > Some may argue that AVX512 is not the best option ;)
> > >   
> > 
> > flame war mode activated.
> 
> I found the following fix working:
> 
> --- a/drivers/net/i40e/meson.build
> +++ b/drivers/net/i40e/meson.build
> @@ -60,6 +60,9 @@ if arch_subdir == 'x86'
>  		if cc.has_argument('-march=skylake-avx512')
>  			avx512_args += '-march=skylake-avx512'
>  		endif
> +		if is_windows and cc.get_id() == 'gcc'
> +			avx512_args += '-fno-asynchronous-unwind-tables'
> +		endif
>  		i40e_avx512_lib = static_library('i40e_avx512_lib',
>  				'i40e_rxtx_vec_avx512.c',
>  				dependencies: [static_rte_ethdev,
> 
> It's admittedly from StackOverflow, but after reading about
> -fno-asynchronous-unwind-tables I see no harm disabling it, at least for just
> one file.

Thanks, it will require some tests and approvals.
For now, I will take the patch that simply disables AVX512 with MinGW.



^ permalink raw reply	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2021-01-28 21:24 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-15  2:19 [dpdk-dev] [PATCH 0/3] AVX512 vPMD on i40e Leyi Rong
2020-12-15  2:19 ` [dpdk-dev] [PATCH 1/3] net/i40e: remove devarg use-latest-supported-vec Leyi Rong
2020-12-15  2:19 ` [dpdk-dev] [PATCH 2/3] net/i40e: add AVX512 vector path Leyi Rong
2020-12-15  2:19 ` [dpdk-dev] [PATCH 3/3] net/i40e: optimize Tx by using AVX512 Leyi Rong
2021-01-07  7:44 ` [dpdk-dev] [PATCH v2 0/3] AVX512 vPMD on i40e Leyi Rong
2021-01-07  7:44   ` [dpdk-dev] [PATCH v2 1/3] net/i40e: remove devarg use-latest-supported-vec Leyi Rong
2021-01-13  6:12     ` Lu, Wenzhuo
2021-01-13 13:40     ` Ferruh Yigit
2021-01-07  7:44   ` [dpdk-dev] [PATCH v2 2/3] net/i40e: add AVX512 vector path Leyi Rong
2021-01-13  6:13     ` Lu, Wenzhuo
2021-01-07  7:44   ` [dpdk-dev] [PATCH v2 3/3] net/i40e: optimize Tx by using AVX512 Leyi Rong
2021-01-13  6:12     ` Lu, Wenzhuo
2021-01-13  9:53   ` [dpdk-dev] [PATCH v2 0/3] AVX512 vPMD on i40e Zhang, Qi Z
2021-01-14  6:39 ` [dpdk-dev] [PATCH v3 " Leyi Rong
2021-01-14  6:39   ` [dpdk-dev] [PATCH v3 1/3] net/i40e: remove devarg use-latest-supported-vec Leyi Rong
2021-01-15 13:36     ` Ferruh Yigit
2021-01-14  6:39   ` [dpdk-dev] [PATCH v3 2/3] net/i40e: add AVX512 vector path Leyi Rong
2021-01-14  6:39   ` [dpdk-dev] [PATCH v3 3/3] net/i40e: optimize Tx by using AVX512 Leyi Rong
2021-01-14  7:37   ` [dpdk-dev] [PATCH v3 0/3] AVX512 vPMD on i40e Zhang, Qi Z
2021-01-17 11:26     ` Odi Assli
2021-01-18 13:58       ` Rong, Leyi
2021-01-18 14:24         ` Ferruh Yigit
2021-01-18 14:53           ` Odi Assli
2021-01-18 16:36             ` Ferruh Yigit
2021-01-19 13:46               ` Ali Alnubani
2021-01-20  6:25                 ` Tal Shnaiderman
2021-01-20  8:36                   ` David Marchand
2021-01-20  9:18                     ` Ferruh Yigit
2021-01-20  9:23                     ` Thomas Monjalon
2021-01-20  9:53                       ` David Marchand
2021-01-20 10:05                         ` Ali Alnubani
2021-01-20 17:51                           ` Ferruh Yigit
2021-01-20 18:04                             ` Ferruh Yigit
2021-01-21  5:01                             ` Kadam, Pallavi
2021-01-25 14:35                               ` David Marchand
2021-01-26 16:17                                 ` Rong, Leyi
2021-01-26 16:22                                   ` Thomas Monjalon
2021-01-26 16:39                                     ` Ferruh Yigit
2021-01-26 16:48                                       ` Thomas Monjalon
2021-01-26 16:51                                         ` Ferruh Yigit
2021-01-28 20:35                                           ` Dmitry Kozlyuk
2021-01-28 21:24                                             ` Thomas Monjalon

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).