DPDK patches and discussions
 help / color / Atom feed
* [dpdk-dev] [PATCH v1 0/4] maximize vector rx burst for PMDs
@ 2020-08-27  7:54 Jeff Guo
  2020-08-27  7:54 ` [dpdk-dev] [PATCH v1 1/4] net/ixgbe: maximize vector rx burst for ixgbe Jeff Guo
                   ` (7 more replies)
  0 siblings, 8 replies; 48+ messages in thread
From: Jeff Guo @ 2020-08-27  7:54 UTC (permalink / raw)
  To: qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu
  Cc: bruce.richardson, dev, jia.guo, helin.zhang, mb, ferruh.yigit, barbette

The limitation of burst size in vector rx was removed, since it should
retrieve as many received packets as possible. Also, the scattered
receive path should use a wrapper function to achieve the goal of
burst maximizing.

This patch set aims to maximize vector rx burst for
ixgbe/i40e/ice/iavf PMDs.

Jeff Guo (4):
  net/ixgbe: maximize vector rx burst for ixgbe
  net/i40e: maximize vector rx burst for i40e
  net/ice: maximize vector rx burst for ice
  net/iavf: maximize vector rx burst for iavf

 drivers/net/i40e/i40e_rxtx_vec_altivec.c |  60 ++++++++-----
 drivers/net/i40e/i40e_rxtx_vec_avx2.c    |   6 +-
 drivers/net/i40e/i40e_rxtx_vec_neon.c    |  49 +++++++----
 drivers/net/i40e/i40e_rxtx_vec_sse.c     |  51 ++++++++----
 drivers/net/iavf/iavf_rxtx_vec_sse.c     | 102 +++++++++++++++++------
 drivers/net/ice/ice_rxtx_vec_sse.c       |  47 ++++++++---
 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c  |  51 +++++++++---
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c   |  48 ++++++++---
 8 files changed, 301 insertions(+), 113 deletions(-)

-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v1 1/4] net/ixgbe: maximize vector rx burst for ixgbe
  2020-08-27  7:54 [dpdk-dev] [PATCH v1 0/4] maximize vector rx burst for PMDs Jeff Guo
@ 2020-08-27  7:54 ` Jeff Guo
  2020-08-27  7:54 ` [dpdk-dev] [PATCH v1 2/4] net/i40e: maximize vector rx burst for i40e Jeff Guo
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 48+ messages in thread
From: Jeff Guo @ 2020-08-27  7:54 UTC (permalink / raw)
  To: qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu
  Cc: bruce.richardson, dev, jia.guo, helin.zhang, mb, ferruh.yigit, barbette

The limitation of burst size in vector rx was removed, since it should
retrieve as many received packets as possible. Also, the scattered
receive path should use a wrapper function to achieve the goal of
burst maximizing.

Signed-off-by: Jeff Guo <jia.guo@intel.com>
---
 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 51 ++++++++++++++++++++-----
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c  | 48 ++++++++++++++++-------
 2 files changed, 76 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
index aa27ee177..95e40aa13 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
@@ -206,6 +206,13 @@ desc_to_ptype_v(uint64x2_t descs[4], uint16_t pkt_type_mask,
 				vgetq_lane_u32(tunnel_check, 3));
 }
 
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
+ *
+ * Notice:
+ * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
+ * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
+ */
 static inline uint16_t
 _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		   uint16_t nb_pkts, uint8_t *split_packet)
@@ -226,9 +233,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	uint16x8_t crc_adjust = {0, 0, rxq->crc_len, 0,
 				 rxq->crc_len, 0, 0, 0};
 
-	/* nb_pkts shall be less equal than RTE_IXGBE_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);
 
@@ -382,7 +386,7 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
-/*
+/**
  * vPMD receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
  *
  * Notice:
@@ -399,19 +403,17 @@ ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/*
+/**
  * vPMD receive routine that reassembles scattered packets
  *
  * Notice:
  * - don't support ol_flags for rss and csum err
  * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
  * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
  */
-uint16_t
-ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-		uint16_t nb_pkts)
+static uint16_t
+ixgbe_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			       uint16_t nb_pkts)
 {
 	struct ixgbe_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[RTE_IXGBE_MAX_RX_BURST] = {0};
@@ -443,6 +445,35 @@ ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ * Main receive routine that can handle arbitrary burst sizes
+ * Notice:
+ * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
+ */
+uint16_t
+ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_IXGBE_MAX_RX_BURST) {
+		uint16_t burst;
+
+		burst = ixgbe_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       RTE_IXGBE_MAX_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_IXGBE_MAX_RX_BURST)
+			return retval;
+	}
+
+	return retval + ixgbe_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       nb_pkts);
+}
+
 static inline void
 vtx1(volatile union ixgbe_adv_tx_desc *txdp,
 		struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
index 517ca3166..96b207f94 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
@@ -302,13 +302,11 @@ desc_to_ptype_v(__m128i descs[4], uint16_t pkt_type_mask,
 		get_packet_type(3, pkt_info, etqf_check, tunnel_check);
 }
 
-/*
+/**
  * vPMD raw receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
  *
  * Notice:
  * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
  * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
  */
 static inline uint16_t
@@ -344,9 +342,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	__m128i mbuf_init;
 	uint8_t vlan_flags;
 
-	/* nb_pkts shall be less equal than RTE_IXGBE_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);
 
@@ -556,7 +551,7 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
-/*
+/**
  * vPMD receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
  *
  * Notice:
@@ -572,18 +567,16 @@ ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/*
+/**
  * vPMD receive routine that reassembles scattered packets
  *
  * Notice:
  * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
  * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
  */
-uint16_t
-ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-		uint16_t nb_pkts)
+static uint16_t
+ixgbe_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			       uint16_t nb_pkts)
 {
 	struct ixgbe_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[RTE_IXGBE_MAX_RX_BURST] = {0};
@@ -615,6 +608,35 @@ ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ * Main receive routine that can handle arbitrary burst sizes
+ * Notice:
+ * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
+ */
+uint16_t
+ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_IXGBE_MAX_RX_BURST) {
+		uint16_t burst;
+
+		burst = ixgbe_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       RTE_IXGBE_MAX_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_IXGBE_MAX_RX_BURST)
+			return retval;
+	}
+
+	return retval + ixgbe_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       nb_pkts);
+}
+
 static inline void
 vtx1(volatile union ixgbe_adv_tx_desc *txdp,
 		struct rte_mbuf *pkt, uint64_t flags)
-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v1 2/4] net/i40e: maximize vector rx burst for i40e
  2020-08-27  7:54 [dpdk-dev] [PATCH v1 0/4] maximize vector rx burst for PMDs Jeff Guo
  2020-08-27  7:54 ` [dpdk-dev] [PATCH v1 1/4] net/ixgbe: maximize vector rx burst for ixgbe Jeff Guo
@ 2020-08-27  7:54 ` Jeff Guo
  2020-08-27  7:54 ` [dpdk-dev] [PATCH v1 3/4] net/ice: maximize vector rx burst for ice Jeff Guo
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 48+ messages in thread
From: Jeff Guo @ 2020-08-27  7:54 UTC (permalink / raw)
  To: qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu
  Cc: bruce.richardson, dev, jia.guo, helin.zhang, mb, ferruh.yigit, barbette

The limitation of burst size in vector rx was removed, since it should
retrieve as many received packets as possible. Also, the scattered
receive path should use a wrapper function to achieve the goal of
burst maximizing.

Signed-off-by: Jeff Guo <jia.guo@intel.com>
---
 drivers/net/i40e/i40e_rxtx_vec_altivec.c | 60 ++++++++++++++++--------
 drivers/net/i40e/i40e_rxtx_vec_avx2.c    |  6 +--
 drivers/net/i40e/i40e_rxtx_vec_neon.c    | 49 +++++++++++++------
 drivers/net/i40e/i40e_rxtx_vec_sse.c     | 51 ++++++++++++++------
 4 files changed, 115 insertions(+), 51 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx_vec_altivec.c b/drivers/net/i40e/i40e_rxtx_vec_altivec.c
index 6862a017e..d26b13c33 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_altivec.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_altivec.c
@@ -190,8 +190,6 @@ desc_to_ptype_v(vector unsigned long descs[4], struct rte_mbuf **rx_pkts,
 
  /* Notice:
   * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
-  * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
-  *   numbers of DD bits
   */
 static inline uint16_t
 _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
@@ -214,9 +212,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		};
 	vector unsigned long dd_check, eop_check;
 
-	/* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
 
@@ -447,11 +442,10 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
- /* Notice:
-  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
-  * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
-  *   numbers of DD bits
-  */
+/**
+ * Notice:
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+ */
 uint16_t
 i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		   uint16_t nb_pkts)
@@ -459,15 +453,14 @@ i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
- /* vPMD receive routine that reassembles scattered packets
-  * Notice:
-  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
-  * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
-  *   numbers of DD bits
-  */
-uint16_t
-i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			     uint16_t nb_pkts)
+/**
+ * vPMD receive routine that reassembles scattered packets
+ * Notice:
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+ */
+static uint16_t
+i40e_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
 {
 	struct i40e_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};
@@ -500,6 +493,35 @@ i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ * Main receive routine that can handle arbitrary burst sizes
+ * Notice:
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+ */
+uint16_t
+i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      RTE_I40E_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_I40E_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
 static inline void
 vtx1(volatile struct i40e_tx_desc *txdp,
 	struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 3bcef1363..b3d3153f1 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -729,7 +729,7 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return received;
 }
 
-/*
+/**
  * Notice:
  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
  */
@@ -740,7 +740,7 @@ i40e_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/*
+/**
  * vPMD receive routine that reassembles single burst of 32 scattered packets
  * Notice:
  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
@@ -781,7 +781,7 @@ i40e_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
-/*
+/**
  * vPMD receive routine that reassembles scattered packets.
  * Main receive routine that can handle arbitrary burst sizes
  * Notice:
diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c
index 6f874e45b..5cbef7f57 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_neon.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c
@@ -190,8 +190,6 @@ desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **__rte_restrict rx_pkts,
  /*
  * Notice:
  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
  */
 static inline uint16_t
 _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
@@ -230,9 +228,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
 		0, 0, 0       /* ignore non-length fields */
 		};
 
-	/* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
 
@@ -426,11 +421,9 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
 	return nb_pkts_recd;
 }
 
- /*
+/**
  * Notice:
  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
  */
 uint16_t
 i40e_recv_pkts_vec(void *__rte_restrict rx_queue,
@@ -439,15 +432,14 @@ i40e_recv_pkts_vec(void *__rte_restrict rx_queue,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
- /* vPMD receive routine that reassembles scattered packets
+/**
+ * vPMD receive routine that reassembles scattered packets
  * Notice:
  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
  */
-uint16_t
-i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			     uint16_t nb_pkts)
+static uint16_t
+i40e_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
 {
 
 	struct i40e_rx_queue *rxq = rx_queue;
@@ -482,6 +474,35 @@ i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ * Main receive routine that can handle arbitrary burst sizes
+ * Notice:
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+ */
+uint16_t
+i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      RTE_I40E_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_I40E_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
 static inline void
 vtx1(volatile struct i40e_tx_desc *txdp,
 		struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 698518349..6a4bd4f45 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -342,11 +342,9 @@ desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
 	rx_pkts[3]->packet_type = ptype_tbl[_mm_extract_epi8(ptype1, 8)];
 }
 
- /*
+/**
  * Notice:
  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
  */
 static inline uint16_t
 _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
@@ -378,9 +376,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
 	__m128i dd_check, eop_check;
 
-	/* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
 
@@ -592,11 +587,9 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
- /*
+/**
  * Notice:
  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
  */
 uint16_t
 i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
@@ -605,15 +598,14 @@ i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
- /* vPMD receive routine that reassembles scattered packets
+/**
+ * vPMD receive routine that reassembles scattered packets
  * Notice:
  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
  */
-uint16_t
-i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			     uint16_t nb_pkts)
+static uint16_t
+i40e_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
 {
 
 	struct i40e_rx_queue *rxq = rx_queue;
@@ -648,6 +640,35 @@ i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ * Main receive routine that can handle arbitrary burst sizes
+ * Notice:
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+ */
+uint16_t
+i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      RTE_I40E_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_I40E_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
 static inline void
 vtx1(volatile struct i40e_tx_desc *txdp,
 		struct rte_mbuf *pkt, uint64_t flags)
-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v1 3/4] net/ice: maximize vector rx burst for ice
  2020-08-27  7:54 [dpdk-dev] [PATCH v1 0/4] maximize vector rx burst for PMDs Jeff Guo
  2020-08-27  7:54 ` [dpdk-dev] [PATCH v1 1/4] net/ixgbe: maximize vector rx burst for ixgbe Jeff Guo
  2020-08-27  7:54 ` [dpdk-dev] [PATCH v1 2/4] net/i40e: maximize vector rx burst for i40e Jeff Guo
@ 2020-08-27  7:54 ` Jeff Guo
  2020-08-27  7:54 ` [dpdk-dev] [PATCH v1 4/4] net/iavf: maximize vector rx burst for iavf Jeff Guo
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 48+ messages in thread
From: Jeff Guo @ 2020-08-27  7:54 UTC (permalink / raw)
  To: qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu
  Cc: bruce.richardson, dev, jia.guo, helin.zhang, mb, ferruh.yigit, barbette

The limitation of burst size in vector rx was removed, since it should
retrieve as many received packets as possible. Also, the scattered
receive path should use a wrapper function to achieve the goal of
burst maximizing.

Signed-off-by: Jeff Guo <jia.guo@intel.com>
---
 drivers/net/ice/ice_rxtx_vec_sse.c | 47 +++++++++++++++++++++---------
 1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ice/ice_rxtx_vec_sse.c b/drivers/net/ice/ice_rxtx_vec_sse.c
index 382ef31f3..05f2efa10 100644
--- a/drivers/net/ice/ice_rxtx_vec_sse.c
+++ b/drivers/net/ice/ice_rxtx_vec_sse.c
@@ -207,8 +207,6 @@ ice_rx_desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
 /**
  * Notice:
  * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
- *   numbers of DD bits
  */
 static inline uint16_t
 _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
@@ -264,9 +262,6 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
 						 0x0000000200000002LL);
 
-	/* nb_pkts shall be less equal than ICE_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, ICE_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP);
 
@@ -444,8 +439,6 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 /**
  * Notice:
  * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
- *   numbers of DD bits
  */
 uint16_t
 ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
@@ -454,15 +447,14 @@ ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _ice_recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/* vPMD receive routine that reassembles scattered packets
+/**
+ * vPMD receive routine that reassembles scattered packets
  * Notice:
  * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
- *   numbers of DD bits
  */
-uint16_t
-ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			    uint16_t nb_pkts)
+static uint16_t
+ice_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
 {
 	struct ice_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
@@ -496,6 +488,35 @@ ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 					     &split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ * Main receive routine that can handle arbitrary burst sizes
+ * Notice:
+ * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
+ */
+uint16_t
+ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			    uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > ICE_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = ice_recv_scattered_burst_vec(rx_queue,
+						     rx_pkts + retval,
+						     ICE_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < ICE_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + ice_recv_scattered_burst_vec(rx_queue,
+						     rx_pkts + retval,
+						     nb_pkts);
+}
+
 static inline void
 ice_vtx1(volatile struct ice_tx_desc *txdp, struct rte_mbuf *pkt,
 	 uint64_t flags)
-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v1 4/4] net/iavf: maximize vector rx burst for iavf
  2020-08-27  7:54 [dpdk-dev] [PATCH v1 0/4] maximize vector rx burst for PMDs Jeff Guo
                   ` (2 preceding siblings ...)
  2020-08-27  7:54 ` [dpdk-dev] [PATCH v1 3/4] net/ice: maximize vector rx burst for ice Jeff Guo
@ 2020-08-27  7:54 ` Jeff Guo
  2020-08-27  8:40 ` [dpdk-dev] [RFC] ethdev: rte_eth_rx_burst() requirements for nb_pkts Morten Brørup
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 48+ messages in thread
From: Jeff Guo @ 2020-08-27  7:54 UTC (permalink / raw)
  To: qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu
  Cc: bruce.richardson, dev, jia.guo, helin.zhang, mb, ferruh.yigit, barbette

The limitation of burst size in vector rx was removed, since it should
retrieve as many received packets as possible. Also, the scattered
receive path should use a wrapper function to achieve the goal of
burst maximizing.

Signed-off-by: Jeff Guo <jia.guo@intel.com>
---
 drivers/net/iavf/iavf_rxtx_vec_sse.c | 102 ++++++++++++++++++++-------
 1 file changed, 76 insertions(+), 26 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx_vec_sse.c b/drivers/net/iavf/iavf_rxtx_vec_sse.c
index 85c5bd4af..a4c97e77d 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_sse.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_sse.c
@@ -379,10 +379,9 @@ flex_desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
 	rx_pkts[3]->packet_type = type_table[_mm_extract_epi16(ptype_all, 7)];
 }
 
-/* Notice:
+/**
+ * Notice:
  * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- *   numbers of DD bits
  */
 static inline uint16_t
 _recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
@@ -413,9 +412,6 @@ _recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
 	__m128i dd_check, eop_check;
 
-	/* nb_pkts shall be less equal than IAVF_VPMD_RX_MAX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, IAVF_VPMD_RX_MAX_BURST);
-
 	/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_VPMD_DESCS_PER_LOOP);
 
@@ -627,10 +623,9 @@ _recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
-/* Notice:
+/**
+ * Notice:
  * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- *   numbers of DD bits
  */
 static inline uint16_t
 _recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq,
@@ -688,9 +683,6 @@ _recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq,
 	const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
 						 0x0000000200000002LL);
 
-	/* nb_pkts shall be less equal than IAVF_VPMD_RX_MAX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, IAVF_VPMD_RX_MAX_BURST);
-
 	/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_VPMD_DESCS_PER_LOOP);
 
@@ -921,7 +913,8 @@ _recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq,
 	return nb_pkts_recd;
 }
 
-/* Notice:
+/**
+ * Notice:
  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
  *   numbers of DD bits
@@ -933,7 +926,8 @@ iavf_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/* Notice:
+/**
+ * Notice:
  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
  *   numbers of DD bits
@@ -945,15 +939,14 @@ iavf_recv_pkts_vec_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec_flex_rxd(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/* vPMD receive routine that reassembles scattered packets
+/**
+ * vPMD receive routine that reassembles scattered packets
  * Notice:
  * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- *   numbers of DD bits
  */
-uint16_t
-iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			    uint16_t nb_pkts)
+static uint16_t
+iavf_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
 {
 	struct iavf_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
@@ -986,16 +979,44 @@ iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
-/* vPMD receive routine that reassembles scattered packets for flex RxD
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ * Main receive routine that can handle arbitrary burst sizes
  * Notice:
  * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- *   numbers of DD bits
  */
 uint16_t
-iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
-				      struct rte_mbuf **rx_pkts,
-				      uint16_t nb_pkts)
+iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
+		uint16_t burst;
+
+		burst = iavf_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      IAVF_VPMD_RX_MAX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < IAVF_VPMD_RX_MAX_BURST)
+			return retval;
+	}
+
+	return retval + iavf_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
+/**
+ * vPMD receive routine that reassembles scattered packets for flex RxD
+ * Notice:
+ * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
+ */
+static uint16_t
+iavf_recv_scattered_burst_vec_flex_rxd(void *rx_queue,
+				       struct rte_mbuf **rx_pkts,
+				       uint16_t nb_pkts)
 {
 	struct iavf_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
@@ -1028,6 +1049,35 @@ iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ * Main receive routine that can handle arbitrary burst sizes
+ * Notice:
+ * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
+ */
+uint16_t
+iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
+				      struct rte_mbuf **rx_pkts,
+				      uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
+		uint16_t burst;
+
+		burst = iavf_recv_scattered_burst_vec_flex_rxd(rx_queue,
+						rx_pkts + retval,
+						IAVF_VPMD_RX_MAX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < IAVF_VPMD_RX_MAX_BURST)
+			return retval;
+	}
+
+	return retval + iavf_recv_scattered_burst_vec_flex_rxd(rx_queue,
+						rx_pkts + retval, nb_pkts);
+}
+
 static inline void
 vtx1(volatile struct iavf_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags)
 {
-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [RFC] ethdev: rte_eth_rx_burst() requirements for nb_pkts
  2020-08-27  7:54 [dpdk-dev] [PATCH v1 0/4] maximize vector rx burst for PMDs Jeff Guo
                   ` (3 preceding siblings ...)
  2020-08-27  7:54 ` [dpdk-dev] [PATCH v1 4/4] net/iavf: maximize vector rx burst for iavf Jeff Guo
@ 2020-08-27  8:40 ` Morten Brørup
  2020-08-27  9:09   ` Bruce Richardson
  2020-09-09  6:36 ` [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs Jeff Guo
                   ` (2 subsequent siblings)
  7 siblings, 1 reply; 48+ messages in thread
From: Morten Brørup @ 2020-08-27  8:40 UTC (permalink / raw)
  To: Jeff Guo, Thomas Monjalon, Ferruh Yigit, Andrew Rybchenko
  Cc: qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu,
	bruce.richardson, dev, helin.zhang, ferruh.yigit, barbette

Jeff and Ethernet API maintainers Thomas, Ferruh and Andrew,

I'm hijacking this patch thread to propose a small API modification that prevents unnecessary performance degradations.

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jeff Guo
> Sent: Thursday, August 27, 2020 9:55 AM
> 
> The limitation of burst size in vector rx was removed, since it should
> retrieve as many received packets as possible. And also the scattered
> receive path should use a wrapper function to achieve the goal of
> burst maximizing.
> 
> This patch set aims to maximize vector rx burst for
> ixgbe/i40e/ice/iavf PMDs.
> 

Now I'm going to be pedantic and say that it still doesn't conform to the rte_eth_rx_burst() API, because the API does not specify any minimum requirement for nb_pkts.

In theory, that could also be fixed in the driver by calling the non-vector function from the vector functions if nb_pkts is too small for the vector implementation.

However, I think that calling rte_eth_rx_burst() with a small nb_pkts is silly and not in the spirit of DPDK, and introducing an additional comparison for a small nb_pkts in the driver vector functions would degrade their performance (only slightly, but anyway).

Instead, I propose that the rte_eth_rx_burst() API is updated with a minimum requirement for nb_pkts. This minimum requirement should be supported by all Ethernet drivers, instead of having minimum requirements for nb_pkts depending on driver and vector function.


I also have a small comment about the description for all the main rx functions:

+/**
+ * vPMD receive routine that reassembles scattered packets.
+ * Main receive routine that can handle arbitrary burst sizes
+ * Notice:
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+ */

It says "can handle arbitrary burst sizes", but bears a notice that it really cannot. So please remove that line from all these functions.


Med venlig hilsen / kind regards
- Morten Brørup

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [RFC] ethdev: rte_eth_rx_burst() requirements for nb_pkts
  2020-08-27  8:40 ` [dpdk-dev] [RFC] ethdev: rte_eth_rx_burst() requirements for nb_pkts Morten Brørup
@ 2020-08-27  9:09   ` Bruce Richardson
  2020-08-27  9:31     ` Morten Brørup
  0 siblings, 1 reply; 48+ messages in thread
From: Bruce Richardson @ 2020-08-27  9:09 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Jeff Guo, Thomas Monjalon, Ferruh Yigit, Andrew Rybchenko,
	qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu,
	dev, helin.zhang, barbette

On Thu, Aug 27, 2020 at 10:40:11AM +0200, Morten Brørup wrote:
> Jeff and Ethernet API maintainers Thomas, Ferruh and Andrew,
> 
> I'm hijacking this patch thread to propose a small API modification that prevents unnecessary performance degradations.
> 
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jeff Guo
> > Sent: Thursday, August 27, 2020 9:55 AM
> > 
> > The limitation of burst size in vector rx was removed, since it should
> > retrieve as many received packets as possible. And also the
> > receive path should use a wrapper function to achieve the goal of
> > burst maximizing.
> > 
> > This patch set aims to maximize vector rx burst for
> > ixgbe/i40e/ice/iavf PMDs.
> > 
> 
> Now I'm going to be pedantic and say that it still doesn't conform to the rte_eth_rx_burst() API, because the API does not specify any minimum requirement for nb_pkts.
> 
> In theory, that could also be fixed in the driver by calling the non-vector function from the vector functions if nb_pkts is too small for the vector implementation.
> 
> However, I think that calling rte_eth_rx_burst() with a small nb_pkts is silly and not in the spirit of DPDK, and introducing an additional comparison for a small nb_pkts in the driver vector functions would degrade their performance (only slightly, but anyway).
> 

Actually, I'd like to see a confirmed measurement showing a slowdown before
we discard such an option. :-) While I agree that using small bursts is not
keeping with the design approach of DPDK of using large bursts to amortize
costs and allow prefetching, there are cases where a user/app may want a
small burst size, e.g. 4, for latency reasons, and we need a way to support
that.

Since the path selection is dynamic, we need to either:
a) provide a way for the user to specify that they will use smaller bursts
and so that vector functions should not be used
b) have the vector functions transparently fallback to the scalar ones if
used with smaller bursts

Of these, option b) is simpler, and should be low cost since any check is
just once per burst, and - assuming an app is written using the same
request-size each time - should be entirely predictable after the first
call.

/Bruce


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [RFC] ethdev: rte_eth_rx_burst() requirements for nb_pkts
  2020-08-27  9:09   ` Bruce Richardson
@ 2020-08-27  9:31     ` Morten Brørup
  2020-08-27  9:43       ` Bruce Richardson
  0 siblings, 1 reply; 48+ messages in thread
From: Morten Brørup @ 2020-08-27  9:31 UTC (permalink / raw)
  To: Bruce Richardson
  Cc: Jeff Guo, Thomas Monjalon, Ferruh Yigit, Andrew Rybchenko,
	qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu,
	dev, helin.zhang, barbette

> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> Sent: Thursday, August 27, 2020 11:10 AM
> 
> On Thu, Aug 27, 2020 at 10:40:11AM +0200, Morten Brørup wrote:
> > Jeff and Ethernet API maintainers Thomas, Ferruh and Andrew,
> >
> > I'm hijacking this patch thread to propose a small API modification
> that prevents unnecessary performance degradations.
> >
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jeff Guo
> > > Sent: Thursday, August 27, 2020 9:55 AM
> > >
> > > The limitation of burst size in vector rx was removed, since it
> should
> > > retrieve as many received packets as possible. And also the
> scattered
> > > receive path should use a wrapper function to achieve the goal of
> > > burst maximizing.
> > >
> > > This patch set aims to maximize vector rx burst for
> > > ixgbe/i40e/ice/iavf PMDs.
> > >
> >
> > Now I'm going to be pedantic and say that it still doesn't conform to
> the rte_eth_rx_burst() API, because the API does not specify any
> minimum requirement for nb_pkts.
> >
> > In theory, that could also be fixed in the driver by calling the non-
> vector function from the vector functions if nb_pkts is too small for
> the vector implementation.
> >
> > However, I think that calling rte_eth_rx_burst() with a small nb_pkts
> is silly and not in the spirit of DPDK, and introducing an additional
> comparison for a small nb_pkts in the driver vector functions would
> degrade their performance (only slightly, but anyway).
> >
> 
> Actually, I'd like to see a confirmed measurement showing a slowdown
> before
> we discard such an option. :-)

Good point!

> While I agree that using small bursts is
> not
> keeping with the design approach of DPDK of using large bursts to
> amortize
> costs and allow prefetching, there are cases where a user/app may want
> a
> small burst size, e.g. 4, for latency reasons, and we need a way to
> support
> that.
> 
I assume that calling rte_eth_rx_burst() with nb_pkts=32 returns 4 packets if only 4 packets are available, so you would need to be extremely latency sensitive to call it with a smaller nb_pkts. I guess that high frequency trading is the only real life scenario here.

> Since the path selection is dynamic, we need to either:
> a) provide a way for the user to specify that they will use smaller
> bursts
> and so that vector functions should not be used
> b) have the vector functions transparently fallback to the scalar ones
> if
> used with smaller bursts
> 
> Of these, option b) is simpler, and should be low cost since any check
> is
> just once per burst, and - assuming an app is written using the same
> request-size each time - should be entirely predictable after the first
> call.
> 
Why does everyone assume that DPDK applications are so simple that the branch predictor will cover the entire data path? I hear this argument over and over again, and by principle I disagree with it!

How about c): add rte_eth_rx() and rte_eth_tx() functions for receiving/transmitting a single packet. The ring library has such functions.

Optimized single-packet functions might even perform better than calling the burst functions with nb_pkts=1. Great for latency focused applications. :-)

> /Bruce
> 


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [RFC] ethdev: rte_eth_rx_burst() requirements for nb_pkts
  2020-08-27  9:31     ` Morten Brørup
@ 2020-08-27  9:43       ` Bruce Richardson
  2020-08-27 10:13         ` [dpdk-dev] [RFC] ethdev: rte_eth_rx_burst() requirements for nb_pkts Morten Brørup
  0 siblings, 1 reply; 48+ messages in thread
From: Bruce Richardson @ 2020-08-27  9:43 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Jeff Guo, Thomas Monjalon, Ferruh Yigit, Andrew Rybchenko,
	qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu,
	dev, helin.zhang, barbette

On Thu, Aug 27, 2020 at 11:31:15AM +0200, Morten Brørup wrote:
> > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > Sent: Thursday, August 27, 2020 11:10 AM
> > 
> > On Thu, Aug 27, 2020 at 10:40:11AM +0200, Morten Brørup wrote:
> > > Jeff and Ethernet API maintainers Thomas, Ferruh and Andrew,
> > >
> > > I'm hijacking this patch thread to propose a small API modification
> > that prevents unnecessary performance degradations.
> > >
> > > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jeff Guo
> > > > Sent: Thursday, August 27, 2020 9:55 AM
> > > >
> > > > The limitation of burst size in vector rx was removed, since it
> > should
> > > > retrieve as many received packets as possible. And also the
> > scattered
> > > > receive path should use a wrapper function to achieve the goal of
> > > > burst maximizing.
> > > >
> > > > This patch set aims to maximize vector rx burst for
> > > > ixgbe/i40e/ice/iavf PMDs.
> > > >
> > >
> > > Now I'm going to be pedantic and say that it still doesn't conform to
> > the rte_eth_rx_burst() API, because the API does not specify any
> > minimum requirement for nb_pkts.
> > >
> > > In theory, that could also be fixed in the driver by calling the non-
> > vector function from the vector functions if nb_pkts is too small for
> > the vector implementation.
> > >
> > > However, I think that calling rte_eth_rx_burst() with a small nb_pkts
> > is silly and not in the spirit of DPDK, and introducing an additional
> > comparison for a small nb_pkts in the driver vector functions would
> > degrade their performance (only slightly, but anyway).
> > >
> > 
> > Actually, I'd like to see a confirmed measurement showing a slowdown
> > before
> > we discard such an option. :-)
> 
> Good point!
> 
> > While I agree that using small bursts is
> > not
> > keeping with the design approach of DPDK of using large bursts to
> > amortize
> > costs and allow prefetching, there are cases where a user/app may want
> > a
> > small burst size, e.g. 4, for latency reasons, and we need a way to
> > support
> > that.
> > 
> I assume that calling rte_eth_rx_burst() with nb_pkts=32 returns 4 packets if only 4 packets are available, so you would need to be extremely latency sensitive to call it with a smaller nb_pkts. I guess that high frequency trading is the only real life scenario here.
>
Yes, it really boils down to whether you are prepared to accept lower
max throughput or dropped packets in order to gain lower latency.
 
> > Since the path selection is dynamic, we need to either:
> > a) provide a way for the user to specify that they will use smaller
> > bursts
> > and so that vector functions should not be used
> > b) have the vector functions transparently fallback to the scalar ones
> > if
> > used with smaller bursts
> > 
> > Of these, option b) is simpler, and should be low cost since any check
> > is
> > just once per burst, and - assuming an app is written using the same
> > request-size each time - should be entirely predictable after the first
> > call.
> > 
> Why does everyone assume that DPDK applications are so simple that the branch predictor will cover the entire data path? I hear this argument over and over again, and by principle I disagree with it!
> 

Fair enough, that was an assumption on my part. Do you see in your apps
many cases where branches are getting mispredicted despite going the same
way each time though the code?

> How about c): add rte_eth_rx() and rte_eth_tx() functions for receiving/transmitting a single packet. The ring library has such functions.
> 
> Optimized single-packet functions might even perform better than calling the burst functions with nb_pkts=1. Great for latency focused applications. :-)
>
That is another option, yes.
A further option is to add to the vector code a one-off switch to check first
time it's called that the request size is not lower than the min supported
(again basing on the assumption that one is not going to be varying the
burst size asked - which may not be true in all cases but won't leave us
any worse off than we are now!).

/Bruce

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [RFC] ethdev: rte_eth_rx_burst() requirements for nb_pkts
  2020-08-27  9:43       ` Bruce Richardson
@ 2020-08-27 10:13         ` Morten Brørup
  2020-08-27 11:41           ` Bruce Richardson
  0 siblings, 1 reply; 48+ messages in thread
From: Morten Brørup @ 2020-08-27 10:13 UTC (permalink / raw)
  To: Bruce Richardson
  Cc: Jeff Guo, Thomas Monjalon, Ferruh Yigit, Andrew Rybchenko,
	qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu,
	dev, helin.zhang, barbette

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce Richardson
> Sent: Thursday, August 27, 2020 11:44 AM
> 
> On Thu, Aug 27, 2020 at 11:31:15AM +0200, Morten Brørup wrote:
> > > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > Sent: Thursday, August 27, 2020 11:10 AM
> > >
> > > On Thu, Aug 27, 2020 at 10:40:11AM +0200, Morten Brørup wrote:
> > > > Jeff and Ethernet API maintainers Thomas, Ferruh and Andrew,
> > > >
> > > > I'm hijacking this patch thread to propose a small API
> modification
> > > that prevents unnecessary performance degradations.
> > > >
> > > > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jeff Guo
> > > > > Sent: Thursday, August 27, 2020 9:55 AM
> > > > >
> > > > > The limitation of burst size in vector rx was removed, since it
> > > should
> > > > > retrieve as many received packets as possible. And also the
> > > scattered
> > > > > receive path should use a wrapper function to achieve the goal
> of
> > > > > burst maximizing.
> > > > >
> > > > > This patch set aims to maximize vector rx burst for
> > > > > ixgbe/i40e/ice/iavf PMDs.
> > > > >
> > > >
> > > > Now I'm going to be pedantic and say that it still doesn't
> conform to
> > > the rte_eth_rx_burst() API, because the API does not specify any
> > > minimum requirement for nb_pkts.
> > > >
> > > > In theory, that could also be fixed in the driver by calling the
> non-
> > > vector function from the vector functions if nb_pkts is too small
> for
> > > the vector implementation.
> > > >
> > > > However, I think that calling rte_eth_rx_burst() with a small
> nb_pkts
> > > is silly and not in the spirit of DPDK, and introducing an
> additional
> > > comparison for a small nb_pkts in the driver vector functions would
> > > degrade their performance (only slightly, but anyway).
> > > >
> > >
> > > Actually, I'd like to see a confirmed measurement showing a
> slowdown
> > > before
> > > we discard such an option. :-)
> >
> > Good point!
> >
> > > While I agree that using small bursts is
> > > not
> > > keeping with the design approach of DPDK of using large bursts to
> > > amortize
> > > costs and allow prefetching, there are cases where a user/app may
> want
> > > a
> > > small burst size, e.g. 4, for latency reasons, and we need a way to
> > > support
> > > that.
> > >
> > I assume that calling rte_eth_rx_burst() with nb_pkts=32 returns 4
> packets if only 4 packets are available, so you would need to be
> extremely latency sensitive to call it with a smaller nb_pkts. I guess
> that high frequency trading is the only real life scenario here.
> >
> Yes, it really boils down to whether you are prepared to accept lower
> max throughput or dropped packets in order to gain lower latency.
> 
> > > Since the path selection is dynamic, we need to either:
> > > a) provide a way for the user to specify that they will use smaller
> > > bursts
> > > and so that vector functions should not be used
> > > b) have the vector functions transparently fallback to the scalar
> ones
> > > if
> > > used with smaller bursts
> > >
> > > Of these, option b) is simpler, and should be low cost since any
> check
> > > is
> > > just once per burst, and - assuming an app is written using the
> same
> > > request-size each time - should be entirely predictable after the
> first
> > > call.
> > >
> > Why does everyone assume that DPDK applications are so simple that
> the branch predictor will cover the entire data path? I hear this
> argument over and over again, and by principle I disagree with it!
> >
> 
> Fair enough, that was an assumption on my part. Do you see in your apps
> many cases where branches are getting mispredicted despite going the
> same
> way each time though the code?
> 
We haven't looked deeply into this, but I don't think so.

My objection is of a more general nature. As a library, DPDK cannot assume that applications using it are simple, and - based on that assumption - take away resources that could have been available for the application.

The Intel general optimization guidelines specifies that code should be arranged to be consistent with the static branch prediction algorithm: make the fall-through code following a conditional branch be the likely target for a branch with a forward target, and make the fall-through code following a conditional branch be the unlikely target for a branch with a backward target.

It also says: Conditional branches that are never taken do not consume BTB resources.

Somehow this last detail is completely ignored by DPDK developers.

We put a lot of effort into conserving resources in most areas in DPDK, but when it comes to the branch prediction target buffer (BTB), we gladly organize code with branches turning the wrong way, thus unnecessarily consuming BTB entries. And the argument goes: The branch predictor will catch it after the first time.

> > How about c): add rte_eth_rx() and rte_eth_tx() functions for
> receiving/transmitting a single packet. The ring library has such
> functions.
> >
> > Optimized single-packet functions might even perform better than
> calling the burst functions with nb_pkts=1. Great for latency focused
> applications. :-)
> >
> That is another option, yes.
> A further option is to add to the vector code a one-off switch to check
> first
> time it's called that the request size is not lower than the min
> supported
> (again basing on the assumption that one is not going to be varying the
> burst size asked - which may not be true in all cases but won't leave
> us
> any worse off than we are now!).

I certainly don't support this option. But it was worth mentioning.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [RFC] ethdev: rte_eth_rx_burst() requirements for nb_pkts
  2020-08-27 10:13         ` [dpdk-dev] [RFC] ethdev: rte_eth_rx_burst() requirements for nb_pkts Morten Brørup
@ 2020-08-27 11:41           ` Bruce Richardson
  2020-08-28  9:03             ` Morten Brørup
  0 siblings, 1 reply; 48+ messages in thread
From: Bruce Richardson @ 2020-08-27 11:41 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Jeff Guo, Thomas Monjalon, Ferruh Yigit, Andrew Rybchenko,
	qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu,
	dev, helin.zhang, barbette

On Thu, Aug 27, 2020 at 12:13:51PM +0200, Morten Brørup wrote:
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce Richardson
> > Sent: Thursday, August 27, 2020 11:44 AM
> > 
> > On Thu, Aug 27, 2020 at 11:31:15AM +0200, Morten Brørup wrote:
> > > > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > > Sent: Thursday, August 27, 2020 11:10 AM
> > > >
> > > > On Thu, Aug 27, 2020 at 10:40:11AM +0200, Morten Brørup wrote:
> > > > > Jeff and Ethernet API maintainers Thomas, Ferruh and Andrew,
> > > > >
> > > > > I'm hijacking this patch thread to propose a small API
> > modification
> > > > that prevents unnecessary performance degradations.
> > > > >
> > > > > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jeff Guo
> > > > > > Sent: Thursday, August 27, 2020 9:55 AM
> > > > > >
> > > > > > The limitation of burst size in vector rx was removed, since it
> > > > should
> > > > > > retrieve as many received packets as possible. And also the
> > > > scattered
> > > > > > receive path should use a wrapper function to achieve the goal
> > of
> > > > > > burst maximizing.
> > > > > >
> > > > > > This patch set aims to maximize vector rx burst for
> > > > > > ixgbe/i40e/ice/iavf PMDs.
> > > > > >
> > > > >
> > > > > Now I'm going to be pedantic and say that it still doesn't
> > conform to
> > > > the rte_eth_rx_burst() API, because the API does not specify any
> > > > minimum requirement for nb_pkts.
> > > > >
> > > > > In theory, that could also be fixed in the driver by calling the
> > non-
> > > > vector function from the vector functions if nb_pkts is too small
> > for
> > > > the vector implementation.
> > > > >
> > > > > However, I think that calling rte_eth_rx_burst() with a small
> > nb_pkts
> > > > is silly and not in the spirit of DPDK, and introducing an
> > additional
> > > > comparison for a small nb_pkts in the driver vector functions would
> > > > degrade their performance (only slightly, but anyway).
> > > > >
> > > >
> > > > Actually, I'd like to see a confirmed measurement showing a
> > slowdown
> > > > before
> > > > we discard such an option. :-)
> > >
> > > Good point!
> > >
> > > > While I agree that using small bursts is
> > > > not
> > > > keeping with the design approach of DPDK of using large bursts to
> > > > amortize
> > > > costs and allow prefetching, there are cases where a user/app may
> > want
> > > > a
> > > > small burst size, e.g. 4, for latency reasons, and we need a way to
> > > > support
> > > > that.
> > > >
> > > I assume that calling rte_eth_rx_burst() with nb_pkts=32 returns 4
> > packets if only 4 packets are available, so you would need to be
> > extremely latency sensitive to call it with a smaller nb_pkts. I guess
> > that high frequency trading is the only real life scenario here.
> > >
> > Yes, it really boils down to whether you are prepared to accept lower
> > max throughput or dropped packets in order to gain lower latency.
> > 
> > > > Since the path selection is dynamic, we need to either:
> > > > a) provide a way for the user to specify that they will use smaller
> > > > bursts
> > > > and so that vector functions should not be used
> > > > b) have the vector functions transparently fallback to the scalar
> > ones
> > > > if
> > > > used with smaller bursts
> > > >
> > > > Of these, option b) is simpler, and should be low cost since any
> > check
> > > > is
> > > > just once per burst, and - assuming an app is written using the
> > same
> > > > request-size each time - should be entirely predictable after the
> > first
> > > > call.
> > > >
> > > Why does everyone assume that DPDK applications are so simple that
> > the branch predictor will cover the entire data path? I hear this
> > argument over and over again, and by principle I disagree with it!
> > >
> > 
> > Fair enough, that was an assumption on my part. Do you see in your apps
> > many cases where branches are getting mispredicted despite going the
> > same
> > way each time though the code?
> > 
> We haven't looked deeply into this, but I don't think so.
> 
> My objection is of a more general nature. As a library, DPDK cannot assume that applications using it are simple, and - based on that assumption - take away resources that could have been available for the application.
> 
> The Intel general optimization guidelines specifies that code should be arranged to be consistent with the static branch prediction algorithm: make the fall-through code following a conditional branch be the likely target for a branch with a forward target, and make the fall-through code following a conditional branch be the unlikely target for a branch with a backward target.
> 
> It also says: Conditional branches that are never taken do not consume BTB resources.
> 
> Somehow this last detail is completely ignored by DPDK developers.
> 
> We put a lot of effort into conserving resources in most areas in DPDK, but when it comes to the branch prediction target buffer (BTB), we gladly organize code with branches turning the wrong way, thus unnecessarily consuming BTB entries. And the argument goes: The branch predictor will catch it after the first time.
>

Looks like something to investigate more. Thanks for bringing this up.
 
> > > How about c): add rte_eth_rx() and rte_eth_tx() functions for
> > receiving/transmitting a single packet. The ring library has such
> > functions.
> > >
> > > Optimized single-packet functions might even perform better than
> > calling the burst functions with nb_pkts=1. Great for latency focused
> > applications. :-)
> > >
> > That is another option, yes.
> > A further option is to add to the vector code a one-off switch to check
> > first
> > time it's called that the request size is not lower than the min
> > supported
> > (again basing on the assumption that one is not going to be varying the
> > burst size asked - which may not be true in all cases but won't leave
> > us
> > any worse off than we are now!).
> 
> I certainly don't support this option. But it was worth mentioning.
> 

Right. For now then, it seems like just documenting a minimum burst size is
reasonable.

/Bruce

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [RFC] ethdev: rte_eth_rx_burst() requirements for nb_pkts
  2020-08-27 11:41           ` Bruce Richardson
@ 2020-08-28  9:03             ` Morten Brørup
  2020-08-28 10:07               ` Bruce Richardson
  0 siblings, 1 reply; 48+ messages in thread
From: Morten Brørup @ 2020-08-28  9:03 UTC (permalink / raw)
  To: Bruce Richardson
  Cc: Jeff Guo, Thomas Monjalon, Ferruh Yigit, Andrew Rybchenko,
	qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu,
	dev, helin.zhang, barbette

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce Richardson
> Sent: Thursday, August 27, 2020 1:41 PM
> 
> On Thu, Aug 27, 2020 at 12:13:51PM +0200, Morten Brørup wrote:
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce
> Richardson
> > > Sent: Thursday, August 27, 2020 11:44 AM
> > >
> > > On Thu, Aug 27, 2020 at 11:31:15AM +0200, Morten Brørup wrote:
> > > > > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > > > Sent: Thursday, August 27, 2020 11:10 AM
> > > > >
> > > > > On Thu, Aug 27, 2020 at 10:40:11AM +0200, Morten Brørup wrote:
> > > > > > Jeff and Ethernet API maintainers Thomas, Ferruh and Andrew,
> > > > > >
> > > > > > I'm hijacking this patch thread to propose a small API
> > > modification
> > > > > that prevents unnecessarily performance degradations.
> > > > > >
> > > > > > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jeff
> Guo
> > > > > > > Sent: Thursday, August 27, 2020 9:55 AM
> > > > > > >
> > > > > > > The limitation of burst size in vector rx was removed,
> since it
> > > > > should
> > > > > > > retrieve as much received packets as possible. And also the
> > > > > scattered
> > > > > > > receive path should use a wrapper function to achieve the
> goal
> > > of
> > > > > > > burst maximizing.
> > > > > > >
> > > > > > > This patch set aims to maximize vector rx burst for for
> > > > > > > ixgbe/i40e/ice/iavf PMDs.
> > > > > > >
> > > > > >
> > > > > > Now I'm going to be pedantic and say that it still doesn't
> > > conform to
> > > > > the rte_eth_rx_burst() API, because the API does not specify
> any
> > > > > minimum requirement for nb_pkts.
> > > > > >
> > > > > > In theory, that could also be fixed in the driver by calling
> the
> > > non-
> > > > > vector function from the vector functions if nb_pkts is too
> small
> > > for
> > > > > the vector implementation.
> > > > > >
> > > > > > However, I think that calling rte_eth_rx_burst() with a small
> > > nb_pkts
> > > > > is silly and not in the spirit of DPDK, and introducing an
> > > additional
> > > > > comparison for a small nb_pkts in the driver vector functions
> would
> > > > > degrade their performance (only slightly, but anyway).
> > > > > >
> > > > >
> > > > > Actually, I'd like to see a confirmed measurement showing a
> > > slowdown
> > > > > before
> > > > > we discard such an option. :-)
> > > >
> > > > Good point!
> > > >
> > > > > While I agree that using small bursts is
> > > > > not
> > > > > keeping with the design approach of DPDK of using large bursts
> to
> > > > > amortize
> > > > > costs and allow prefetching, there are cases where a user/app
> may
> > > want
> > > > > a
> > > > > small burst size, e.g. 4, for latency reasons, and we need a
> way to
> > > > > support
> > > > > that.
> > > > >
> > > > I assume that calling rte_eth_rx_burst() with nb_pkts=32 returns
> 4
> > > packets if only 4 packets are available, so you would need to be
> > > extremely latency sensitive to call it with a smaller nb_pkts. I
> guess
> > > that high frequency trading is the only real life scenario here.
> > > >
> > > Yes, it really boils down to whether you are prepared to accept
> lower
> > > max throughput or dropped packets in order to gain lower latency.
> > >
> > > > > Since the path selection is dynamic, we need to either:
> > > > > a) provide a way for the user to specify that they will use
> smaller
> > > > > bursts
> > > > > and so that vector functions should not be used

After thinking about it, and also inspired by Haiyue's comment in the other thread, this may be a good option. Configure the driver with the burst size that the application is going to use, so the driver can select the appropriate function. The driver could even select another vector function, optimized for the specific size.

It could be a field in the rte_eth_rxmode structure, which is part of the rte_eth_conf structure used in the rte_eth_dev_configure() function.

This would also make the nb_pkts in the rte_eth_rx_burst() obsolete. So we could add a function without the nb_pkts parameter.

Alternatively, we could introduce new variants of the rte_eth_rx_burst() function with fixed burst size, e.g. rte_eth_rx_burst32() to receive a burst of 32 packets, rte_eth_rx_burst64() for 64 packets, and rte_eth_rx_burst128(). Assuming that the performance difference beyond a certain vector size is insignificant, e.g. 128 packets, we don't need to add functions for larger vectors than that.

Or my suggestion here might just be a case of over-optimizing with insignificant performance benefits.

> > > > > b) have the vector functions transparently fallback to the
> scalar
> > > ones
> > > > > if
> > > > > used with smaller bursts
> > > > >
> > > > > Of these, option b) is simpler, and should be low cost since
> any
> > > check
> > > > > is
> > > > > just once per burst, and - assuming an app is written using the
> > > same
> > > > > request-size each time - should be entirely predictable after
> the
> > > first
> > > > > call.
> > > > >
> > > > Why does everyone assume that DPDK applications are so simple
> that
> > > the branch predictor will cover the entire data path? I hear this
> > > argument over and over again, and by principle I disagree with it!
> > > >
> > >
> > > Fair enough, that was an assumption on my part. Do you see in your
> apps
> > > many cases where branches are getting mispredicted despite going
> the
> > > same
> > > way each time though the code?
> > >
> > We haven't looked deeply into this, but I don't think so.
> >
> > My objection is of a more general nature. As a library, DPDK cannot
> assume that applications using it are simple, and - based on that
> assumption - take away resources that could have been available for the
> application.
> >
> > The Intel general optimization guidelines specifies that code should
> be arranged to be consistent with the static branch prediction
> algorithm: make the fall-through code following a conditional branch be
> the likely target for a branch with a forward target, and make the
> fall-through code following a conditional branch be the unlikely target
> for a branch with a backward target.
> >
> > It also says: Conditional branches that are never taken do not
> consume BTB resources.
> >
> > Somehow this last detail is completely ignored by DPDK developers.
> >
> > We put a lot of effort into conserving resources in most areas in
> DPDK, but when it comes to the branch prediction target buffer (BTB),
> we gladly organize code with branches turning the wrong way, thus
> unnecessarily consuming BTB entries. And the argument goes: The branch
> predictor will catch it after the first time.
> >
> 
> Looks like something to investigate more. Thanks for bringing this up.
> 
> > > > How about c): add rte_eth_rx() and rte_eth_tx() functions for
> > > receiving/transmitting a single packet. The ring library has such
> > > functions.
> > > >
> > > > Optimized single-packet functions might even perform better than
> > > calling the burst functions with nb_pkts=1. Great for latency
> focused
> > > applications. :-)
> > > >
> > > That is another option, yes.
> > > A further option is to add to the vector code a one-off switch to
> check
> > > first
> > > time it's called that the request size is not lower than the min
> > > supported
> > > (again basing on the assumption that one is not going to be varying
> the
> > > burst size asked - which may not be true in call cases but won't
> leave
> > > us
> > > any worse off than we are now!).
> >
> > I certainly don't support this option. But it was worth mentioning.
> >
> 
> Right. For now then, it seems like just documenting a minimum burst
> size is
> reasonable.

I agree. It is so far from the spirit of DPDK to call rte_eth_rx_burst() with a small nb_pkts that the driver developers didn't even consider it. The API documentation needs fixing, not the drivers.

It doesn't take care of your example 4 packet latency sensitive application, though. Which BTW also doesn’t work today on drivers with vector support. So it might not be a real world scenario anyway. :-)

> 
> /Bruce


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [RFC] ethdev: rte_eth_rx_burst() requirements for nb_pkts
  2020-08-28  9:03             ` Morten Brørup
@ 2020-08-28 10:07               ` Bruce Richardson
  2020-08-28 10:50                 ` Morten Brørup
  2020-08-29 10:15                 ` Morten Brørup
  0 siblings, 2 replies; 48+ messages in thread
From: Bruce Richardson @ 2020-08-28 10:07 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Jeff Guo, Thomas Monjalon, Ferruh Yigit, Andrew Rybchenko,
	qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu,
	dev, helin.zhang, barbette

On Fri, Aug 28, 2020 at 11:03:59AM +0200, Morten Brørup wrote:
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce Richardson
> >
<snip>
> > 
> > Right. For now then, it seems like just documenting a minimum burst
> > size is
> > reasonable.
> 
> I agree. It is so far from the spirit of DPDK to call rte_eth_rx_burst() with a small nb_pkts that the driver developers didn't even consider it. The API documentation needs fixing, not the drivers.
> 
> It doesn't take care of your example 4 packet latency sensitive application, though. Which BTW also doesn’t work today on drivers with vector support. So it might not be a real world scenario anyway. :-)
> 
AFAIK, 8 is the smallest burst guaranteed to work everywhere, but I think
just about everything bar the AVX2 i40e code path also supports 4 as a
burst size. Therefore adjusting to 4 as min-burst might well be reasonable.

/Bruce

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [RFC] ethdev: rte_eth_rx_burst() requirements for nb_pkts
  2020-08-28 10:07               ` Bruce Richardson
@ 2020-08-28 10:50                 ` Morten Brørup
  2020-08-29 10:15                 ` Morten Brørup
  1 sibling, 0 replies; 48+ messages in thread
From: Morten Brørup @ 2020-08-28 10:50 UTC (permalink / raw)
  To: Bruce Richardson
  Cc: Jeff Guo, Thomas Monjalon, Ferruh Yigit, Andrew Rybchenko,
	qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu,
	dev, helin.zhang, barbette

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce Richardson
> Sent: Friday, August 28, 2020 12:07 PM
> 
> On Fri, Aug 28, 2020 at 11:03:59AM +0200, Morten Brørup wrote:
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce
> Richardson
> > >
> <snip>
> > >
> > > Right. For now then, it seems like just documenting a minimum burst
> > > size is
> > > reasonable.
> >
> > I agree. It is so far from the spirit of DPDK to call
> rte_eth_rx_burst() with a small nb_pkts that the driver developers
> didn't even consider it. The API documentation needs fixing, not the
> drivers.
> >
> > It doesn't take care of your example 4 packet latency sensitive
> application, though. Which BTW also doesn’t work today on drivers with
> vector support. So it might not be a real world scenario anyway. :-)
> >
> AFAIK, 8 is the smallest burst guaranteed to work everywhere, but I
> think
> just about everything bar the AVX2 i40e code path also supports 4 as a
> burst size. Therefore adjusting to 4 as min-burst might well be
> reasonable.
> 
> /Bruce

There must be a reason the i40e AVX2 driver chose to step up to 8 from the previous convention of 4.

Considering Intel's stance on the controversial vector instructions, a larger number seems more future proof. HPC benefits from the vector instructions, and DPDK seems to benefit from them too. Let's not prevent that.

Since I don't have insight into Intel's (or any other CPU vendors') plans for future vector instructions, I will assume that 8 suffices for the foreseeable future, and thus I am leaning towards 8 rather than 4.

-Morten

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [RFC] ethdev: rte_eth_rx_burst() requirements for nb_pkts
  2020-08-28 10:07               ` Bruce Richardson
  2020-08-28 10:50                 ` Morten Brørup
@ 2020-08-29 10:15                 ` Morten Brørup
  1 sibling, 0 replies; 48+ messages in thread
From: Morten Brørup @ 2020-08-29 10:15 UTC (permalink / raw)
  To: Morten Brørup, Bruce Richardson
  Cc: Jeff Guo, Thomas Monjalon, Ferruh Yigit, Andrew Rybchenko,
	qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu,
	dev, helin.zhang, barbette

> From: Morten Brørup
> Sent: Friday, August 28, 2020 12:51 PM
> 
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce Richardson
> > Sent: Friday, August 28, 2020 12:07 PM
> >
> > On Fri, Aug 28, 2020 at 11:03:59AM +0200, Morten Brørup wrote:
> > > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce
> > Richardson
> > > >
> > <snip>
> > > >
> > > > Right. For now then, it seems like just documenting a minimum burst
> > > > size is
> > > > reasonable.
> > >
> > > I agree. It is so far from the spirit of DPDK to call
> > rte_eth_rx_burst() with a small nb_pkts that the driver developers
> > didn't even consider it. The API documentation needs fixing, not the
> > drivers.
> > >
> > > It doesn't take care of your example 4 packet latency sensitive
> > application, though. Which BTW also doesn’t work today on drivers with
> > vector support. So it might not be a real world scenario anyway. :-)
> > >
> > AFAIK, 8 is the smallest burst guaranteed to work everywhere, but I
> > think
> > just about everything bar the AVX2 i40e code path also supports 4 as a
> > burst size. Therefore adjusting to 4 as min-burst might well be
> > reasonable.
> >
> > /Bruce
> 
> There must be a reason the i40e AVX2 driver chose to step up to 8 from the
> previous convention of 4.
> 
> Considering Intel's stance on the controversial vector instructions, a
> larger numbers seems more future proof. HPC benefits from the vector
> instructions, and DPDK seems to benefit from them too. Let's not prevent
> that.
> 
> Since I don't have insight into Intel's (or any other CPU vendors') plans
> for future vector instructions, I will assume that 8 suffices for the
> foreseeable future, and thus I am leaning towards 8 rather than 4.
> 

nb_pkts must be >= 8 and divisible by 8.

Alternatively to being divisible by 8, would there be any benefit in requiring that it is a power-of-two?

-Morten


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs
  2020-08-27  7:54 [dpdk-dev] [PATCH v1 0/4] maximize vector rx burst for PMDs Jeff Guo
                   ` (4 preceding siblings ...)
  2020-08-27  8:40 ` [dpdk-dev] [RFC] ethdev: rte_eth_rx_burst() requirements for nb_pkts Morten Brørup
@ 2020-09-09  6:36 ` Jeff Guo
  2020-09-09  6:36   ` [dpdk-dev] [PATCH v3 1/5] net/iavf: fix vector rx burst for iavf Jeff Guo
                     ` (5 more replies)
  2020-09-17  7:58 ` [dpdk-dev] [PATCH v4 " Jeff Guo
  2020-10-16  9:44 ` [dpdk-dev] [PATCH v5 0/5] fix vector rx burst for PMDs Jeff Guo
  7 siblings, 6 replies; 48+ messages in thread
From: Jeff Guo @ 2020-09-09  6:36 UTC (permalink / raw)
  To: qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu
  Cc: bruce.richardson, dev, jia.guo, helin.zhang, mb, ferruh.yigit,
	haiyue.wang, stephen, barbette

The limitation of burst size in vector rx was removed, since it should
retrieve as much received packets as possible. And also the scattered
receive path should use a wrapper function to achieve the goal of burst
maximizing.

This patch set aims to maximize vector rx burst for
ixgbe/i40e/ice/iavf/fm10k PMDs, and also do some code cleaning.

v3->v2:
1:move define into header file.
2:delete some useless doc.

v2->v1:
1:add fm10k driver case
2:refine some doc

Jeff Guo (5):
  net/iavf: fix vector rx burst for iavf
  net/ixgbe: fix vector rx burst for ixgbe
  net/i40e: fix vector rx burst for i40e
  net/ice: fix vector rx burst for ice
  net/fm10k: fix vector rx burst for fm10k

 drivers/net/fm10k/fm10k_rxtx_vec.c       |  42 ++++++--
 drivers/net/i40e/i40e_rxtx.h             |   1 +
 drivers/net/i40e/i40e_rxtx_vec_altivec.c |  64 +++++++-----
 drivers/net/i40e/i40e_rxtx_vec_avx2.c    |  29 +++---
 drivers/net/i40e/i40e_rxtx_vec_neon.c    |  58 +++++++----
 drivers/net/i40e/i40e_rxtx_vec_sse.c     |  58 +++++++----
 drivers/net/iavf/iavf_rxtx.h             |   1 +
 drivers/net/iavf/iavf_rxtx_vec_avx2.c    |  78 +++++++--------
 drivers/net/iavf/iavf_rxtx_vec_sse.c     | 119 +++++++++++++++--------
 drivers/net/ice/ice_rxtx.h               |   1 +
 drivers/net/ice/ice_rxtx_vec_avx2.c      |  23 +++--
 drivers/net/ice/ice_rxtx_vec_sse.c       |  56 +++++++----
 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c  |  77 ++++++++-------
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c   |  61 +++++++-----
 14 files changed, 412 insertions(+), 256 deletions(-)

-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v3 1/5] net/iavf: fix vector rx burst for iavf
  2020-09-09  6:36 ` [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs Jeff Guo
@ 2020-09-09  6:36   ` Jeff Guo
  2020-09-09  6:36   ` [dpdk-dev] [PATCH v3 2/5] net/ixgbe: fix vector rx burst for ixgbe Jeff Guo
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 48+ messages in thread
From: Jeff Guo @ 2020-09-09  6:36 UTC (permalink / raw)
  To: qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu
  Cc: bruce.richardson, dev, jia.guo, helin.zhang, mb, ferruh.yigit,
	haiyue.wang, stephen, barbette

The limitation of burst size in vector rx was removed, since it should
retrieve as much received packets as possible. And also the scattered
receive path should use a wrapper function to achieve the goal of
burst maximizing. And do some code cleaning for vector rx path.

Signed-off-by: Jeff Guo <jia.guo@intel.com>
---
 drivers/net/iavf/iavf_rxtx.h          |   1 +
 drivers/net/iavf/iavf_rxtx_vec_avx2.c |  78 ++++++++---------
 drivers/net/iavf/iavf_rxtx_vec_sse.c  | 119 ++++++++++++++++++--------
 3 files changed, 121 insertions(+), 77 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 59625a979..f71f9fbdb 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -21,6 +21,7 @@
 #define IAVF_VPMD_TX_MAX_BURST    32
 #define IAVF_RXQ_REARM_THRESH     32
 #define IAVF_VPMD_DESCS_PER_LOOP  4
+#define IAVF_VPMD_DESCS_PER_LOOP_AVX  8
 #define IAVF_VPMD_TX_MAX_FREE_BUF 64
 
 #define IAVF_NO_VECTOR_FLAGS (				 \
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx2.c b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
index e5e0fd309..9816adbaa 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx2.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
@@ -29,7 +29,7 @@ iavf_rxq_rearm(struct iavf_rx_queue *rxq)
 			__m128i dma_addr0;
 
 			dma_addr0 = _mm_setzero_si128();
-			for (i = 0; i < IAVF_VPMD_DESCS_PER_LOOP; i++) {
+			for (i = 0; i < IAVF_VPMD_DESCS_PER_LOOP_AVX; i++) {
 				rxp[i] = &rxq->fake_mbuf;
 				_mm_store_si128((__m128i *)&rxdp[i].read,
 						dma_addr0);
@@ -134,13 +134,19 @@ iavf_rxq_rearm(struct iavf_rx_queue *rxq)
 
 #define PKTLEN_SHIFT     10
 
+/**
+ * vPMD raw receive routine for flex RxD,
+ * only accept(nb_pkts >= IAVF_VPMD_DESCS_PER_LOOP_AVX)
+ *
+ * Notice:
+ * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP_AVX, just return no packet
+ * - floor align nb_pkts to a IAVF_VPMD_DESCS_PER_LOOP_AVX power-of-two
+ */
 static inline uint16_t
 _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 			     struct rte_mbuf **rx_pkts,
 			     uint16_t nb_pkts, uint8_t *split_packet)
 {
-#define IAVF_DESCS_PER_LOOP_AVX 8
-
 	/* const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl; */
 	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
 
@@ -153,8 +159,8 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 
 	rte_prefetch0(rxdp);
 
-	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
-	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
+	/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP_AVX */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_VPMD_DESCS_PER_LOOP_AVX);
 
 	/* See if we need to rearm the RX queue - gives the prefetch a bit
 	 * of time to act
@@ -297,8 +303,8 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 	uint16_t i, received;
 
 	for (i = 0, received = 0; i < nb_pkts;
-	     i += IAVF_DESCS_PER_LOOP_AVX,
-	     rxdp += IAVF_DESCS_PER_LOOP_AVX) {
+	     i += IAVF_VPMD_DESCS_PER_LOOP_AVX,
+	     rxdp += IAVF_VPMD_DESCS_PER_LOOP_AVX) {
 		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
 		_mm256_storeu_si256((void *)&rx_pkts[i],
 				    _mm256_loadu_si256((void *)&sw_ring[i]));
@@ -368,7 +374,7 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 		if (split_packet) {
 			int j;
 
-			for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++)
+			for (j = 0; j < IAVF_VPMD_DESCS_PER_LOOP_AVX; j++)
 				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
 		}
 
@@ -583,7 +589,7 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
 			*(uint64_t *)split_packet =
 				_mm_cvtsi128_si64(split_bits);
-			split_packet += IAVF_DESCS_PER_LOOP_AVX;
+			split_packet += IAVF_VPMD_DESCS_PER_LOOP_AVX;
 		}
 
 		/* perform dd_check */
@@ -599,7 +605,7 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 				(_mm_cvtsi128_si64
 					(_mm256_castsi256_si128(status0_7)));
 		received += burst;
-		if (burst != IAVF_DESCS_PER_LOOP_AVX)
+		if (burst != IAVF_VPMD_DESCS_PER_LOOP_AVX)
 			break;
 	}
 
@@ -633,13 +639,19 @@ flex_rxd_to_fdir_flags_vec_avx2(const __m256i fdir_id0_7)
 	return fdir_flags;
 }
 
+/**
+ * vPMD raw receive routine,
+ * only accept(nb_pkts >= IAVF_VPMD_DESCS_PER_LOOP_AVX)
+ *
+ * Notice:
+ * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP_AVX, just return no packet
+ * - floor align nb_pkts to a IAVF_VPMD_DESCS_PER_LOOP_AVX power-of-two
+ */
 static inline uint16_t
 _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 				      struct rte_mbuf **rx_pkts,
 				      uint16_t nb_pkts, uint8_t *split_packet)
 {
-#define IAVF_DESCS_PER_LOOP_AVX 8
-
 	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
@@ -650,8 +662,8 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 
 	rte_prefetch0(rxdp);
 
-	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
-	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
+	/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP_AVX */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_VPMD_DESCS_PER_LOOP_AVX);
 
 	/* See if we need to rearm the RX queue - gives the prefetch a bit
 	 * of time to act
@@ -794,8 +806,8 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 	uint16_t i, received;
 
 	for (i = 0, received = 0; i < nb_pkts;
-	     i += IAVF_DESCS_PER_LOOP_AVX,
-	     rxdp += IAVF_DESCS_PER_LOOP_AVX) {
+	     i += IAVF_VPMD_DESCS_PER_LOOP_AVX,
+	     rxdp += IAVF_VPMD_DESCS_PER_LOOP_AVX) {
 		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
 		_mm256_storeu_si256((void *)&rx_pkts[i],
 				    _mm256_loadu_si256((void *)&sw_ring[i]));
@@ -851,7 +863,7 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 		if (split_packet) {
 			int j;
 
-			for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++)
+			for (j = 0; j < IAVF_VPMD_DESCS_PER_LOOP_AVX; j++)
 				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
 		}
 
@@ -1193,7 +1205,7 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
 			*(uint64_t *)split_packet =
 				_mm_cvtsi128_si64(split_bits);
-			split_packet += IAVF_DESCS_PER_LOOP_AVX;
+			split_packet += IAVF_VPMD_DESCS_PER_LOOP_AVX;
 		}
 
 		/* perform dd_check */
@@ -1209,7 +1221,7 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 				(_mm_cvtsi128_si64
 					(_mm256_castsi256_si128(status0_7)));
 		received += burst;
-		if (burst != IAVF_DESCS_PER_LOOP_AVX)
+		if (burst != IAVF_VPMD_DESCS_PER_LOOP_AVX)
 			break;
 	}
 
@@ -1224,10 +1236,6 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 	return received;
 }
 
-/**
- * Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
- */
 uint16_t
 iavf_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 			uint16_t nb_pkts)
@@ -1235,10 +1243,6 @@ iavf_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _iavf_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/**
- * Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
- */
 uint16_t
 iavf_recv_pkts_vec_avx2_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
 				 uint16_t nb_pkts)
@@ -1249,8 +1253,6 @@ iavf_recv_pkts_vec_avx2_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 /**
  * vPMD receive routine that reassembles single burst of 32 scattered packets
- * Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  */
 static uint16_t
 iavf_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
@@ -1259,6 +1261,9 @@ iavf_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 	struct iavf_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 
+	/* split_flags only can support max of IAVF_VPMD_RX_MAX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, IAVF_VPMD_RX_MAX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx2(rxq, rx_pkts, nb_pkts,
 						       split_flags);
@@ -1290,9 +1295,6 @@ iavf_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 /**
  * vPMD receive routine that reassembles scattered packets.
- * Main receive routine that can handle arbitrary burst sizes
- * Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  */
 uint16_t
 iavf_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
@@ -1313,10 +1315,8 @@ iavf_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 }
 
 /**
- * vPMD receive routine that reassembles single burst of
- * 32 scattered packets for flex RxD
- * Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
+ * for flex RxD
  */
 static uint16_t
 iavf_recv_scattered_burst_vec_avx2_flex_rxd(void *rx_queue,
@@ -1326,6 +1326,9 @@ iavf_recv_scattered_burst_vec_avx2_flex_rxd(void *rx_queue,
 	struct iavf_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 
+	/* split_flags only can support max of IAVF_VPMD_RX_MAX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, IAVF_VPMD_RX_MAX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx2_flex_rxd(rxq,
 					rx_pkts, nb_pkts, split_flags);
@@ -1357,9 +1360,6 @@ iavf_recv_scattered_burst_vec_avx2_flex_rxd(void *rx_queue,
 
 /**
  * vPMD receive routine that reassembles scattered packets for flex RxD.
- * Main receive routine that can handle arbitrary burst sizes
- * Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  */
 uint16_t
 iavf_recv_scattered_pkts_vec_avx2_flex_rxd(void *rx_queue,
diff --git a/drivers/net/iavf/iavf_rxtx_vec_sse.c b/drivers/net/iavf/iavf_rxtx_vec_sse.c
index 85c5bd4af..b5362ecf3 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_sse.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_sse.c
@@ -379,10 +379,12 @@ flex_desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
 	rx_pkts[3]->packet_type = type_table[_mm_extract_epi16(ptype_all, 7)];
 }
 
-/* Notice:
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= IAVF_VPMD_DESCS_PER_LOOP)
+ *
+ * Notice:
  * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- *   numbers of DD bits
+ * - floor align nb_pkts to a IAVF_VPMD_DESCS_PER_LOOP power-of-two
  */
 static inline uint16_t
 _recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
@@ -413,9 +415,6 @@ _recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
 	__m128i dd_check, eop_check;
 
-	/* nb_pkts shall be less equal than IAVF_VPMD_RX_MAX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, IAVF_VPMD_RX_MAX_BURST);
-
 	/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_VPMD_DESCS_PER_LOOP);
 
@@ -627,10 +626,13 @@ _recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
-/* Notice:
+/**
+ * vPMD raw receive routine for flex RxD,
+ * only accept(nb_pkts >= IAVF_VPMD_DESCS_PER_LOOP)
+ *
+ * Notice:
  * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- *   numbers of DD bits
+ * - floor align nb_pkts to a IAVF_VPMD_DESCS_PER_LOOP power-of-two
  */
 static inline uint16_t
 _recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq,
@@ -688,9 +690,6 @@ _recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq,
 	const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
 						 0x0000000200000002LL);
 
-	/* nb_pkts shall be less equal than IAVF_VPMD_RX_MAX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, IAVF_VPMD_RX_MAX_BURST);
-
 	/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_VPMD_DESCS_PER_LOOP);
 
@@ -921,11 +920,6 @@ _recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq,
 	return nb_pkts_recd;
 }
 
-/* Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- *   numbers of DD bits
- */
 uint16_t
 iavf_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		  uint16_t nb_pkts)
@@ -933,11 +927,6 @@ iavf_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/* Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- *   numbers of DD bits
- */
 uint16_t
 iavf_recv_pkts_vec_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
 			    uint16_t nb_pkts)
@@ -945,20 +934,20 @@ iavf_recv_pkts_vec_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec_flex_rxd(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/* vPMD receive routine that reassembles scattered packets
- * Notice:
- * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- *   numbers of DD bits
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
  */
-uint16_t
-iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			    uint16_t nb_pkts)
+static uint16_t
+iavf_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
 {
 	struct iavf_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 	unsigned int i = 0;
 
+	/* split_flags only can support max of IAVF_VPMD_RX_MAX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, IAVF_VPMD_RX_MAX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 					      split_flags);
@@ -986,21 +975,48 @@ iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
-/* vPMD receive routine that reassembles scattered packets for flex RxD
- * Notice:
- * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- *   numbers of DD bits
+/**
+ * vPMD receive routine that reassembles scattered packets.
  */
 uint16_t
-iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
-				      struct rte_mbuf **rx_pkts,
-				      uint16_t nb_pkts)
+iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
+		uint16_t burst;
+
+		burst = iavf_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      IAVF_VPMD_RX_MAX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < IAVF_VPMD_RX_MAX_BURST)
+			return retval;
+	}
+
+	return retval + iavf_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
+ * for flex RxD
+ */
+static uint16_t
+iavf_recv_scattered_burst_vec_flex_rxd(void *rx_queue,
+				       struct rte_mbuf **rx_pkts,
+				       uint16_t nb_pkts)
 {
 	struct iavf_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 	unsigned int i = 0;
 
+	/* split_flags only can support max of IAVF_VPMD_RX_MAX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, IAVF_VPMD_RX_MAX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec_flex_rxd(rxq, rx_pkts, nb_pkts,
 					      split_flags);
@@ -1028,6 +1044,33 @@ iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets for flex RxD
+ */
+uint16_t
+iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
+				      struct rte_mbuf **rx_pkts,
+				      uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
+		uint16_t burst;
+
+		burst = iavf_recv_scattered_burst_vec_flex_rxd(rx_queue,
+						rx_pkts + retval,
+						IAVF_VPMD_RX_MAX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < IAVF_VPMD_RX_MAX_BURST)
+			return retval;
+	}
+
+	return retval + iavf_recv_scattered_burst_vec_flex_rxd(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
 static inline void
 vtx1(volatile struct iavf_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags)
 {
-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v3 2/5] net/ixgbe: fix vector rx burst for ixgbe
  2020-09-09  6:36 ` [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs Jeff Guo
  2020-09-09  6:36   ` [dpdk-dev] [PATCH v3 1/5] net/iavf: fix vector rx burst for iavf Jeff Guo
@ 2020-09-09  6:36   ` Jeff Guo
       [not found]     ` <VI1PR0802MB23518C6B517B6EAD8E018CD49E260@VI1PR0802MB2351.eurprd08.prod.outlook.com>
  2020-09-09  6:36   ` [dpdk-dev] [PATCH v3 3/5] net/i40e: fix vector rx burst for i40e Jeff Guo
                     ` (3 subsequent siblings)
  5 siblings, 1 reply; 48+ messages in thread
From: Jeff Guo @ 2020-09-09  6:36 UTC (permalink / raw)
  To: qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu
  Cc: bruce.richardson, dev, jia.guo, helin.zhang, mb, ferruh.yigit,
	haiyue.wang, stephen, barbette

The limitation of burst size in vector rx was removed, since it should
retrieve as many received packets as possible. In addition, the scattered
receive path now uses a wrapper function to maximize the burst, and some
code cleanup is done in the vector rx path.

Signed-off-by: Jeff Guo <jia.guo@intel.com>
---
 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 77 +++++++++++++------------
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c  | 61 +++++++++++---------
 2 files changed, 76 insertions(+), 62 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
index aa27ee177..7692c5d59 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
@@ -130,17 +130,6 @@ desc_to_olflags_v(uint8x16x2_t sterr_tmp1, uint8x16x2_t sterr_tmp2,
 	rx_pkts[3]->ol_flags = vol.e[3];
 }
 
-/*
- * vPMD raw receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
- *
- * Notice:
- * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
- * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
- * - don't support ol_flags for rss and csum err
- */
-
 #define IXGBE_VPMD_DESC_EOP_MASK	0x02020202
 #define IXGBE_UINT8_BIT			(CHAR_BIT * sizeof(uint8_t))
 
@@ -206,6 +195,13 @@ desc_to_ptype_v(uint64x2_t descs[4], uint16_t pkt_type_mask,
 				vgetq_lane_u32(tunnel_check, 3));
 }
 
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
+ *
+ * Notice:
+ * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
+ * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
+ */
 static inline uint16_t
 _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		   uint16_t nb_pkts, uint8_t *split_packet)
@@ -226,9 +222,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	uint16x8_t crc_adjust = {0, 0, rxq->crc_len, 0,
 				 rxq->crc_len, 0, 0, 0};
 
-	/* nb_pkts shall be less equal than RTE_IXGBE_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);
 
@@ -382,16 +375,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
-/*
- * vPMD receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
- *
- * Notice:
- * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
- * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
- * - don't support ol_flags for rss and csum err
- */
 uint16_t
 ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		uint16_t nb_pkts)
@@ -399,23 +382,19 @@ ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/*
- * vPMD receive routine that reassembles scattered packets
- *
- * Notice:
- * - don't support ol_flags for rss and csum err
- * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
- * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
  */
-uint16_t
-ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-		uint16_t nb_pkts)
+static uint16_t
+ixgbe_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			       uint16_t nb_pkts)
 {
 	struct ixgbe_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[RTE_IXGBE_MAX_RX_BURST] = {0};
 
+	/* split_flags only can support max of RTE_IXGBE_MAX_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 			split_flags);
@@ -443,6 +422,32 @@ ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_IXGBE_MAX_RX_BURST) {
+		uint16_t burst;
+
+		burst = ixgbe_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       RTE_IXGBE_MAX_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_IXGBE_MAX_RX_BURST)
+			return retval;
+	}
+
+	return retval + ixgbe_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       nb_pkts);
+}
+
 static inline void
 vtx1(volatile union ixgbe_adv_tx_desc *txdp,
 		struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
index 517ca3166..cf54ff128 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
@@ -302,13 +302,11 @@ desc_to_ptype_v(__m128i descs[4], uint16_t pkt_type_mask,
 		get_packet_type(3, pkt_info, etqf_check, tunnel_check);
 }
 
-/*
+/**
  * vPMD raw receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
  *
  * Notice:
  * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
  * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
  */
 static inline uint16_t
@@ -344,9 +342,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	__m128i mbuf_init;
 	uint8_t vlan_flags;
 
-	/* nb_pkts shall be less equal than RTE_IXGBE_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);
 
@@ -556,15 +551,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
-/*
- * vPMD receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
- *
- * Notice:
- * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
- * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
- */
 uint16_t
 ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		uint16_t nb_pkts)
@@ -572,22 +558,19 @@ ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/*
- * vPMD receive routine that reassembles scattered packets
- *
- * Notice:
- * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
- * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
  */
-uint16_t
-ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-		uint16_t nb_pkts)
+static uint16_t
+ixgbe_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			       uint16_t nb_pkts)
 {
 	struct ixgbe_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[RTE_IXGBE_MAX_RX_BURST] = {0};
 
+	/* split_flags only can support max of RTE_IXGBE_MAX_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 			split_flags);
@@ -615,6 +598,32 @@ ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_IXGBE_MAX_RX_BURST) {
+		uint16_t burst;
+
+		burst = ixgbe_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       RTE_IXGBE_MAX_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_IXGBE_MAX_RX_BURST)
+			return retval;
+	}
+
+	return retval + ixgbe_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       nb_pkts);
+}
+
 static inline void
 vtx1(volatile union ixgbe_adv_tx_desc *txdp,
 		struct rte_mbuf *pkt, uint64_t flags)
-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v3 3/5] net/i40e: fix vector rx burst for i40e
  2020-09-09  6:36 ` [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs Jeff Guo
  2020-09-09  6:36   ` [dpdk-dev] [PATCH v3 1/5] net/iavf: fix vector rx burst for iavf Jeff Guo
  2020-09-09  6:36   ` [dpdk-dev] [PATCH v3 2/5] net/ixgbe: fix vector rx burst for ixgbe Jeff Guo
@ 2020-09-09  6:36   ` Jeff Guo
  2020-09-09  6:36   ` [dpdk-dev] [PATCH v3 4/5] net/ice: fix vector rx burst for ice Jeff Guo
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 48+ messages in thread
From: Jeff Guo @ 2020-09-09  6:36 UTC (permalink / raw)
  To: qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu
  Cc: bruce.richardson, dev, jia.guo, helin.zhang, mb, ferruh.yigit,
	haiyue.wang, stephen, barbette

The limitation of burst size in vector rx was removed, since it should
retrieve as many received packets as possible. In addition, the scattered
receive path now uses a wrapper function to maximize the burst, and some
code cleanup is done in the vector rx path.

Signed-off-by: Jeff Guo <jia.guo@intel.com>
---
 drivers/net/i40e/i40e_rxtx.h             |  1 +
 drivers/net/i40e/i40e_rxtx_vec_altivec.c | 64 ++++++++++++++++--------
 drivers/net/i40e/i40e_rxtx_vec_avx2.c    | 29 ++++++-----
 drivers/net/i40e/i40e_rxtx_vec_neon.c    | 58 +++++++++++++--------
 drivers/net/i40e/i40e_rxtx_vec_sse.c     | 58 +++++++++++++--------
 5 files changed, 133 insertions(+), 77 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h
index 57d7b4160..01d4609f9 100644
--- a/drivers/net/i40e/i40e_rxtx.h
+++ b/drivers/net/i40e/i40e_rxtx.h
@@ -14,6 +14,7 @@
 #define RTE_I40E_MAX_RX_BURST          RTE_I40E_RXQ_REARM_THRESH
 #define RTE_I40E_TX_MAX_FREE_BUF_SZ    64
 #define RTE_I40E_DESCS_PER_LOOP    4
+#define RTE_I40E_DESCS_PER_LOOP_AVX    8
 
 #define I40E_RXBUF_SZ_1024 1024
 #define I40E_RXBUF_SZ_2048 2048
diff --git a/drivers/net/i40e/i40e_rxtx_vec_altivec.c b/drivers/net/i40e/i40e_rxtx_vec_altivec.c
index 6862a017e..345c63aa7 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_altivec.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_altivec.c
@@ -188,11 +188,13 @@ desc_to_ptype_v(vector unsigned long descs[4], struct rte_mbuf **rx_pkts,
 		ptype_tbl[(*(vector unsigned char *)&ptype1)[8]];
 }
 
- /* Notice:
-  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
-  * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
-  *   numbers of DD bits
-  */
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_I40E_DESCS_PER_LOOP)
+ *
+ * Notice:
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+ * - floor align nb_pkts to a RTE_I40E_DESCS_PER_LOOP power-of-two
+ */
 static inline uint16_t
 _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		   uint16_t nb_pkts, uint8_t *split_packet)
@@ -214,9 +216,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		};
 	vector unsigned long dd_check, eop_check;
 
-	/* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
 
@@ -447,11 +446,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
- /* Notice:
-  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
-  * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
-  *   numbers of DD bits
-  */
 uint16_t
 i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		   uint16_t nb_pkts)
@@ -459,19 +453,19 @@ i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
- /* vPMD receive routine that reassembles scattered packets
-  * Notice:
-  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
-  * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
-  *   numbers of DD bits
-  */
-uint16_t
-i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			     uint16_t nb_pkts)
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
+ */
+static uint16_t
+i40e_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
 {
 	struct i40e_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};
 
+	/* split_flags only can support max of RTE_I40E_VPMD_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_VPMD_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 			split_flags);
@@ -500,6 +494,32 @@ i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      RTE_I40E_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_I40E_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
 static inline void
 vtx1(volatile struct i40e_tx_desc *txdp,
 	struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 3bcef1363..b5e6867d0 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -36,7 +36,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 		    rxq->nb_rx_desc) {
 			__m128i dma_addr0;
 			dma_addr0 = _mm_setzero_si128();
-			for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+			for (i = 0; i < RTE_I40E_DESCS_PER_LOOP_AVX; i++) {
 				rxep[i].mbuf = &rxq->fake_mbuf;
 				_mm_store_si128((__m128i *)&rxdp[i].read,
 						dma_addr0);
@@ -219,13 +219,18 @@ desc_fdir_processing_32b(volatile union i40e_rx_desc *rxdp,
 
 #define PKTLEN_SHIFT     10
 
-/* Force inline as some compilers will not inline by default. */
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_I40E_DESCS_PER_LOOP_AVX)
+ *
+ * Notice:
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP_AVX, just return no packet
+ * - floor align nb_pkts to a RTE_I40E_DESCS_PER_LOOP_AVX power-of-two
+ * - force inline as some compilers will not inline by default
+ */
 static __rte_always_inline uint16_t
 _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		uint16_t nb_pkts, uint8_t *split_packet)
 {
-#define RTE_I40E_DESCS_PER_LOOP_AVX 8
-
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
 			0, rxq->mbuf_initializer);
@@ -729,10 +734,6 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return received;
 }
 
-/*
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- */
 uint16_t
 i40e_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 		   uint16_t nb_pkts)
@@ -740,10 +741,8 @@ i40e_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/*
+/**
  * vPMD receive routine that reassembles single burst of 32 scattered packets
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
  */
 static uint16_t
 i40e_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
@@ -752,6 +751,9 @@ i40e_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 	struct i40e_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};
 
+	/* split_flags only can support max of RTE_I40E_VPMD_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_VPMD_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec_avx2(rxq, rx_pkts, nb_pkts,
 			split_flags);
@@ -781,11 +783,8 @@ i40e_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
-/*
+/**
  * vPMD receive routine that reassembles scattered packets.
- * Main receive routine that can handle arbitrary burst sizes
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
  */
 uint16_t
 i40e_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c
index 6f874e45b..143cdf4a5 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_neon.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c
@@ -187,11 +187,12 @@ desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **__rte_restrict rx_pkts,
 
 }
 
- /*
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_I40E_DESCS_PER_LOOP)
+ *
  * Notice:
  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
+ * - floor align nb_pkts to a RTE_I40E_DESCS_PER_LOOP power-of-two
  */
 static inline uint16_t
 _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
@@ -230,9 +231,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
 		0, 0, 0       /* ignore non-length fields */
 		};
 
-	/* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
 
@@ -426,12 +424,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
 	return nb_pkts_recd;
 }
 
- /*
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
- */
 uint16_t
 i40e_recv_pkts_vec(void *__rte_restrict rx_queue,
 		struct rte_mbuf **__rte_restrict rx_pkts, uint16_t nb_pkts)
@@ -439,20 +431,20 @@ i40e_recv_pkts_vec(void *__rte_restrict rx_queue,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
- /* vPMD receive routine that reassembles scattered packets
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
  */
-uint16_t
-i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			     uint16_t nb_pkts)
+static uint16_t
+i40e_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
 {
 
 	struct i40e_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};
 
+	/* split_flags only can support max of RTE_I40E_VPMD_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_VPMD_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 			split_flags);
@@ -482,6 +474,32 @@ i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      RTE_I40E_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_I40E_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
 static inline void
 vtx1(volatile struct i40e_tx_desc *txdp,
 		struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 698518349..605912246 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -342,11 +342,12 @@ desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
 	rx_pkts[3]->packet_type = ptype_tbl[_mm_extract_epi8(ptype1, 8)];
 }
 
- /*
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_I40E_DESCS_PER_LOOP)
+ *
  * Notice:
  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
+ * - floor align nb_pkts to a RTE_I40E_DESCS_PER_LOOP power-of-two
  */
 static inline uint16_t
 _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
@@ -378,9 +379,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
 	__m128i dd_check, eop_check;
 
-	/* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
 
@@ -592,12 +590,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
- /*
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
- */
 uint16_t
 i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		   uint16_t nb_pkts)
@@ -605,20 +597,20 @@ i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
- /* vPMD receive routine that reassembles scattered packets
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
  */
-uint16_t
-i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			     uint16_t nb_pkts)
+static uint16_t
+i40e_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
 {
 
 	struct i40e_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};
 
+	/* split_flags only can support max of RTE_I40E_VPMD_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_VPMD_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 			split_flags);
@@ -648,6 +640,32 @@ i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      RTE_I40E_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_I40E_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
 static inline void
 vtx1(volatile struct i40e_tx_desc *txdp,
 		struct rte_mbuf *pkt, uint64_t flags)
-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v3 4/5] net/ice: fix vector rx burst for ice
  2020-09-09  6:36 ` [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs Jeff Guo
                     ` (2 preceding siblings ...)
  2020-09-09  6:36   ` [dpdk-dev] [PATCH v3 3/5] net/i40e: fix vector rx burst for i40e Jeff Guo
@ 2020-09-09  6:36   ` Jeff Guo
  2020-09-15  7:10     ` Han, YingyaX
  2020-09-09  6:36   ` [dpdk-dev] [PATCH v3 5/5] net/fm10k: fix vector rx burst for fm10k Jeff Guo
  2020-09-09  6:45   ` [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs Wang, Haiyue
  5 siblings, 1 reply; 48+ messages in thread
From: Jeff Guo @ 2020-09-09  6:36 UTC (permalink / raw)
  To: qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu
  Cc: bruce.richardson, dev, jia.guo, helin.zhang, mb, ferruh.yigit,
	haiyue.wang, stephen, barbette

The limitation of burst size in vector rx was removed, since it should
retrieve as many received packets as possible. In addition, the scattered
receive path now uses a wrapper function to maximize the burst, and some
code cleanup is done in the vector rx path.

Signed-off-by: Jeff Guo <jia.guo@intel.com>
---
 drivers/net/ice/ice_rxtx.h          |  1 +
 drivers/net/ice/ice_rxtx_vec_avx2.c | 23 ++++++------
 drivers/net/ice/ice_rxtx_vec_sse.c  | 56 +++++++++++++++++++----------
 3 files changed, 49 insertions(+), 31 deletions(-)

diff --git a/drivers/net/ice/ice_rxtx.h b/drivers/net/ice/ice_rxtx.h
index 2fdcfb7d0..3ef5f300d 100644
--- a/drivers/net/ice/ice_rxtx.h
+++ b/drivers/net/ice/ice_rxtx.h
@@ -35,6 +35,7 @@
 #define ICE_MAX_RX_BURST            ICE_RXQ_REARM_THRESH
 #define ICE_TX_MAX_FREE_BUF_SZ      64
 #define ICE_DESCS_PER_LOOP          4
+#define ICE_DESCS_PER_LOOP_AVX	    8
 
 #define ICE_FDIR_PKT_LEN	512
 
diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c b/drivers/net/ice/ice_rxtx_vec_avx2.c
index be50677c2..843e4f32a 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx2.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
@@ -29,7 +29,7 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
 			__m128i dma_addr0;
 
 			dma_addr0 = _mm_setzero_si128();
-			for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
+			for (i = 0; i < ICE_DESCS_PER_LOOP_AVX; i++) {
 				rxep[i].mbuf = &rxq->fake_mbuf;
 				_mm_store_si128((__m128i *)&rxdp[i].read,
 						dma_addr0);
@@ -132,12 +132,17 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
 	ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
 }
 
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= ICE_DESCS_PER_LOOP_AVX)
+ *
+ * Notice:
+ * - nb_pkts < ICE_DESCS_PER_LOOP_AVX, just return no packet
+ * - floor align nb_pkts to a ICE_DESCS_PER_LOOP_AVX power-of-two
+ */
 static inline uint16_t
 _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 			    uint16_t nb_pkts, uint8_t *split_packet)
 {
-#define ICE_DESCS_PER_LOOP_AVX 8
-
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
 			0, rxq->mbuf_initializer);
@@ -603,10 +608,6 @@ _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return received;
 }
 
-/**
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- */
 uint16_t
 ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 		       uint16_t nb_pkts)
@@ -616,8 +617,6 @@ ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 /**
  * vPMD receive routine that reassembles single burst of 32 scattered packets
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
  */
 static uint16_t
 ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
@@ -626,6 +625,9 @@ ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 	struct ice_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
 
+	/* split_flags only can support max of ICE_VPMD_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, ICE_VPMD_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx2(rxq, rx_pkts, nb_pkts,
 						       split_flags);
@@ -657,9 +659,6 @@ ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 /**
  * vPMD receive routine that reassembles scattered packets.
- * Main receive routine that can handle arbitrary burst sizes
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
  */
 uint16_t
 ice_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
diff --git a/drivers/net/ice/ice_rxtx_vec_sse.c b/drivers/net/ice/ice_rxtx_vec_sse.c
index 382ef31f3..c03e24092 100644
--- a/drivers/net/ice/ice_rxtx_vec_sse.c
+++ b/drivers/net/ice/ice_rxtx_vec_sse.c
@@ -205,10 +205,11 @@ ice_rx_desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
 }
 
 /**
+ * vPMD raw receive routine, only accept(nb_pkts >= ICE_DESCS_PER_LOOP)
+ *
  * Notice:
  * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
- *   numbers of DD bits
+ * - floor align nb_pkts to a ICE_DESCS_PER_LOOP power-of-two
  */
 static inline uint16_t
 _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
@@ -264,9 +265,6 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
 						 0x0000000200000002LL);
 
-	/* nb_pkts shall be less equal than ICE_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, ICE_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP);
 
@@ -441,12 +439,6 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
-/**
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
- *   numbers of DD bits
- */
 uint16_t
 ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		  uint16_t nb_pkts)
@@ -454,19 +446,19 @@ ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _ice_recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/* vPMD receive routine that reassembles scattered packets
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
- *   numbers of DD bits
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
  */
-uint16_t
-ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			    uint16_t nb_pkts)
+static uint16_t
+ice_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
 {
 	struct ice_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
 
+	/* split_flags only can support max of ICE_VPMD_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, ICE_VPMD_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _ice_recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 						  split_flags);
@@ -496,6 +488,32 @@ ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 					     &split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			    uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > ICE_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = ice_recv_scattered_burst_vec(rx_queue,
+						     rx_pkts + retval,
+						     ICE_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < ICE_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + ice_recv_scattered_burst_vec(rx_queue,
+						     rx_pkts + retval,
+						     nb_pkts);
+}
+
 static inline void
 ice_vtx1(volatile struct ice_tx_desc *txdp, struct rte_mbuf *pkt,
 	 uint64_t flags)
-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v3 5/5] net/fm10k: fix vector rx burst for fm10k
  2020-09-09  6:36 ` [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs Jeff Guo
                     ` (3 preceding siblings ...)
  2020-09-09  6:36   ` [dpdk-dev] [PATCH v3 4/5] net/ice: fix vector rx burst for ice Jeff Guo
@ 2020-09-09  6:36   ` Jeff Guo
  2020-09-09  6:45   ` [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs Wang, Haiyue
  5 siblings, 0 replies; 48+ messages in thread
From: Jeff Guo @ 2020-09-09  6:36 UTC (permalink / raw)
  To: qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu
  Cc: bruce.richardson, dev, jia.guo, helin.zhang, mb, ferruh.yigit,
	haiyue.wang, stephen, barbette

The scattered receive path should use a wrapper function to achieve the
goal of maximizing the burst size, and do some code cleaning for the
vector rx path.

Signed-off-by: Jeff Guo <jia.guo@intel.com>
---
 drivers/net/fm10k/fm10k_rxtx_vec.c | 42 +++++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/drivers/net/fm10k/fm10k_rxtx_vec.c b/drivers/net/fm10k/fm10k_rxtx_vec.c
index eff3933b5..3b25c570b 100644
--- a/drivers/net/fm10k/fm10k_rxtx_vec.c
+++ b/drivers/net/fm10k/fm10k_rxtx_vec.c
@@ -645,25 +645,23 @@ fm10k_reassemble_packets(struct fm10k_rx_queue *rxq,
 	return pkt_idx;
 }
 
-/*
- * vPMD receive routine that reassembles scattered packets
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
  *
  * Notice:
  * - don't support ol_flags for rss and csum err
- * - nb_pkts > RTE_FM10K_MAX_RX_BURST, only scan RTE_FM10K_MAX_RX_BURST
- *   numbers of DD bit
  */
-uint16_t
-fm10k_recv_scattered_pkts_vec(void *rx_queue,
-				struct rte_mbuf **rx_pkts,
-				uint16_t nb_pkts)
+static uint16_t
+fm10k_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			       uint16_t nb_pkts)
 {
 	struct fm10k_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[RTE_FM10K_MAX_RX_BURST] = {0};
 	unsigned i = 0;
 
-	/* Split_flags only can support max of RTE_FM10K_MAX_RX_BURST */
+	/* split_flags only can support max of RTE_FM10K_MAX_RX_BURST */
 	nb_pkts = RTE_MIN(nb_pkts, RTE_FM10K_MAX_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = fm10k_recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 			split_flags);
@@ -691,6 +689,32 @@ fm10k_recv_scattered_pkts_vec(void *rx_queue,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+fm10k_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_FM10K_MAX_RX_BURST) {
+		uint16_t burst;
+
+		burst = fm10k_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       RTE_FM10K_MAX_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_FM10K_MAX_RX_BURST)
+			return retval;
+	}
+
+	return retval + fm10k_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       nb_pkts);
+}
+
 static const struct fm10k_txq_ops vec_txq_ops = {
 	.reset = fm10k_reset_tx_queue,
 };
-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs
  2020-09-09  6:36 ` [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs Jeff Guo
                     ` (4 preceding siblings ...)
  2020-09-09  6:36   ` [dpdk-dev] [PATCH v3 5/5] net/fm10k: fix vector rx burst for fm10k Jeff Guo
@ 2020-09-09  6:45   ` Wang, Haiyue
  2020-09-09  7:03     ` Guo, Jia
  5 siblings, 1 reply; 48+ messages in thread
From: Wang, Haiyue @ 2020-09-09  6:45 UTC (permalink / raw)
  To: Guo, Jia, Yang, Qiming, Xing, Beilei, Zhao1, Wei, Zhang, Qi Z,
	Wu, Jingjing
  Cc: Richardson, Bruce, dev, Zhang, Helin, mb, Yigit, Ferruh, stephen,
	barbette

> -----Original Message-----
> From: Guo, Jia <jia.guo@intel.com>
> Sent: Wednesday, September 9, 2020 14:37
> To: Yang, Qiming <qiming.yang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Zhao1, Wei
> <wei.zhao1@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>
> Cc: Richardson, Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Guo, Jia <jia.guo@intel.com>; Zhang,
> Helin <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh <ferruh.yigit@intel.com>; Wang,
> Haiyue <haiyue.wang@intel.com>; stephen@networkplumber.org; barbette@kth.se
> Subject: [PATCH v3 0/5] fix vector rx burst for PMDs
> 
> The limitation of burst size in vector rx was removed, since it should
> retrieve as much received packets as possible. And also the scattered
> receive path should use a wrapper function to achieve the goal of burst
> maximizing.
> 
> This patch set aims to maximize vector rx burst for for
> ixgbe/i40e/ice/iavf/fm10k PMDs, and also do some code cleaning.

IMO, this can be "enhance the vector ...", but not "fix ...".
If "fix", it needs "Fixes:" tags ... ;-)

> 
> --
> 2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs
  2020-09-09  6:45   ` [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs Wang, Haiyue
@ 2020-09-09  7:03     ` Guo, Jia
  2020-09-09  7:05       ` Wang, Haiyue
  0 siblings, 1 reply; 48+ messages in thread
From: Guo, Jia @ 2020-09-09  7:03 UTC (permalink / raw)
  To: Wang, Haiyue, Yang, Qiming, Xing, Beilei, Zhao1, Wei, Zhang,
	Qi Z, Wu, Jingjing
  Cc: Richardson, Bruce, dev, Zhang, Helin, mb, Yigit, Ferruh, stephen,
	barbette

Hi, haiyue

> -----Original Message-----
> From: Wang, Haiyue <haiyue.wang@intel.com>
> Sent: Wednesday, September 9, 2020 2:45 PM
> To: Guo, Jia <jia.guo@intel.com>; Yang, Qiming <qiming.yang@intel.com>;
> Xing, Beilei <beilei.xing@intel.com>; Zhao1, Wei <wei.zhao1@intel.com>;
> Zhang, Qi Z <qi.z.zhang@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>
> Cc: Richardson, Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Zhang,
> Helin <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> <ferruh.yigit@intel.com>; stephen@networkplumber.org; barbette@kth.se
> Subject: RE: [PATCH v3 0/5] fix vector rx burst for PMDs
> 
> > -----Original Message-----
> > From: Guo, Jia <jia.guo@intel.com>
> > Sent: Wednesday, September 9, 2020 14:37
> > To: Yang, Qiming <qiming.yang@intel.com>; Xing, Beilei
> > <beilei.xing@intel.com>; Zhao1, Wei <wei.zhao1@intel.com>; Zhang, Qi Z
> > <qi.z.zhang@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>
> > Cc: Richardson, Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Guo,
> > Jia <jia.guo@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> > mb@smartsharesystems.com; Yigit, Ferruh <ferruh.yigit@intel.com>;
> > Wang, Haiyue <haiyue.wang@intel.com>; stephen@networkplumber.org;
> > barbette@kth.se
> > Subject: [PATCH v3 0/5] fix vector rx burst for PMDs
> >
> > The limitation of burst size in vector rx was removed, since it should
> > retrieve as much received packets as possible. And also the scattered
> > receive path should use a wrapper function to achieve the goal of
> > burst maximizing.
> >
> > This patch set aims to maximize vector rx burst for for
> > ixgbe/i40e/ice/iavf/fm10k PMDs, and also do some code cleaning.
> 
> IMO, this can be "enhance the vector ...", but not "fix ...".
> If "fix", it needs "Fixes:" tags ... ;-)
> 

Ok, so I will prefer to use "refine the vector rx burst for PMDs", thanks.
> >
> > --
> > 2.20.1
> 


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs
  2020-09-09  7:03     ` Guo, Jia
@ 2020-09-09  7:05       ` Wang, Haiyue
  2020-09-09  7:43         ` Morten Brørup
  0 siblings, 1 reply; 48+ messages in thread
From: Wang, Haiyue @ 2020-09-09  7:05 UTC (permalink / raw)
  To: Guo, Jia, Yang, Qiming, Xing, Beilei, Zhao1, Wei, Zhang, Qi Z,
	Wu, Jingjing
  Cc: Richardson, Bruce, dev, Zhang, Helin, mb, Yigit, Ferruh, stephen,
	barbette

> -----Original Message-----
> From: Guo, Jia <jia.guo@intel.com>
> Sent: Wednesday, September 9, 2020 15:03
> To: Wang, Haiyue <haiyue.wang@intel.com>; Yang, Qiming <qiming.yang@intel.com>; Xing, Beilei
> <beilei.xing@intel.com>; Zhao1, Wei <wei.zhao1@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Wu,
> Jingjing <jingjing.wu@intel.com>
> Cc: Richardson, Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin <helin.zhang@intel.com>;
> mb@smartsharesystems.com; Yigit, Ferruh <ferruh.yigit@intel.com>; stephen@networkplumber.org;
> barbette@kth.se
> Subject: RE: [PATCH v3 0/5] fix vector rx burst for PMDs
> 
> Hi, haiyue
> 
> > -----Original Message-----
> > From: Wang, Haiyue <haiyue.wang@intel.com>
> > Sent: Wednesday, September 9, 2020 2:45 PM
> > To: Guo, Jia <jia.guo@intel.com>; Yang, Qiming <qiming.yang@intel.com>;
> > Xing, Beilei <beilei.xing@intel.com>; Zhao1, Wei <wei.zhao1@intel.com>;
> > Zhang, Qi Z <qi.z.zhang@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>
> > Cc: Richardson, Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Zhang,
> > Helin <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> > <ferruh.yigit@intel.com>; stephen@networkplumber.org; barbette@kth.se
> > Subject: RE: [PATCH v3 0/5] fix vector rx burst for PMDs
> >
> > > -----Original Message-----
> > > From: Guo, Jia <jia.guo@intel.com>
> > > Sent: Wednesday, September 9, 2020 14:37
> > > To: Yang, Qiming <qiming.yang@intel.com>; Xing, Beilei
> > > <beilei.xing@intel.com>; Zhao1, Wei <wei.zhao1@intel.com>; Zhang, Qi Z
> > > <qi.z.zhang@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>
> > > Cc: Richardson, Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Guo,
> > > Jia <jia.guo@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> > > mb@smartsharesystems.com; Yigit, Ferruh <ferruh.yigit@intel.com>;
> > > Wang, Haiyue <haiyue.wang@intel.com>; stephen@networkplumber.org;
> > > barbette@kth.se
> > > Subject: [PATCH v3 0/5] fix vector rx burst for PMDs
> > >
> > > The limitation of burst size in vector rx was removed, since it should
> > > retrieve as much received packets as possible. And also the scattered
> > > receive path should use a wrapper function to achieve the goal of
> > > burst maximizing.
> > >
> > > This patch set aims to maximize vector rx burst for for
> > > ixgbe/i40e/ice/iavf/fm10k PMDs, and also do some code cleaning.
> >
> > IMO, this can be "enhance the vector ...", but not "fix ...".
> > If "fix", it needs "Fixes:" tags ... ;-)
> >
> 
> Ok, so I will prefer to use "refine the vector rx burst for PMDs", thanks.

Better than "fix" now.

> > >
> > > --
> > > 2.20.1
> >
> 


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs
  2020-09-09  7:05       ` Wang, Haiyue
@ 2020-09-09  7:43         ` Morten Brørup
  2020-09-09  7:55           ` Wang, Haiyue
  0 siblings, 1 reply; 48+ messages in thread
From: Morten Brørup @ 2020-09-09  7:43 UTC (permalink / raw)
  To: Wang, Haiyue, Guo, Jia, Yang, Qiming, Xing, Beilei, Zhao1, Wei,
	Zhang, Qi Z, Wu, Jingjing
  Cc: Richardson, Bruce, dev, Zhang, Helin, Yigit, Ferruh, stephen, barbette

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Wang, Haiyue
> Sent: Wednesday, September 9, 2020 9:05 AM
> 
> > From: Guo, Jia <jia.guo@intel.com>
> > Sent: Wednesday, September 9, 2020 15:03
> >
> > Hi, haiyue
> >
> > > From: Wang, Haiyue <haiyue.wang@intel.com>
> > > Sent: Wednesday, September 9, 2020 2:45 PM
> > >
> > > > From: Guo, Jia <jia.guo@intel.com>
> > > > Sent: Wednesday, September 9, 2020 14:37
> > > >
> > > > The limitation of burst size in vector rx was removed, since it
> should
> > > > retrieve as much received packets as possible. And also the
> scattered
> > > > receive path should use a wrapper function to achieve the goal of
> > > > burst maximizing.
> > > >
> > > > This patch set aims to maximize vector rx burst for for
> > > > ixgbe/i40e/ice/iavf/fm10k PMDs, and also do some code cleaning.
> > >
> > > IMO, this can be "enhance the vector ...", but not "fix ...".
> > > If "fix", it needs "Fixes:" tags ... ;-)
> > >
> >
> > Ok, so I will prefer to use "refine the vector rx burst for PMDs",
> thanks.
> 
> Better than "fix" now.
> 

It does fix a bug: https://bugs.dpdk.org/show_bug.cgi?id=516


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs
  2020-09-09  7:43         ` Morten Brørup
@ 2020-09-09  7:55           ` Wang, Haiyue
  2020-09-09  8:01             ` Guo, Jia
  0 siblings, 1 reply; 48+ messages in thread
From: Wang, Haiyue @ 2020-09-09  7:55 UTC (permalink / raw)
  To: Morten Brørup, Guo, Jia, Yang, Qiming, Xing, Beilei, Zhao1,
	Wei, Zhang, Qi Z, Wu, Jingjing
  Cc: Richardson, Bruce, dev, Zhang, Helin, Yigit, Ferruh, stephen, barbette

> -----Original Message-----
> From: Morten Brørup <mb@smartsharesystems.com>
> Sent: Wednesday, September 9, 2020 15:43
> To: Wang, Haiyue <haiyue.wang@intel.com>; Guo, Jia <jia.guo@intel.com>; Yang, Qiming
> <qiming.yang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Zhao1, Wei <wei.zhao1@intel.com>; Zhang,
> Qi Z <qi.z.zhang@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>
> Cc: Richardson, Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin <helin.zhang@intel.com>;
> Yigit, Ferruh <ferruh.yigit@intel.com>; stephen@networkplumber.org; barbette@kth.se
> Subject: RE: [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs
> 
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Wang, Haiyue
> > Sent: Wednesday, September 9, 2020 9:05 AM
> >
> > > From: Guo, Jia <jia.guo@intel.com>
> > > Sent: Wednesday, September 9, 2020 15:03
> > >
> > > Hi, haiyue
> > >
> > > > From: Wang, Haiyue <haiyue.wang@intel.com>
> > > > Sent: Wednesday, September 9, 2020 2:45 PM
> > > >
> > > > > From: Guo, Jia <jia.guo@intel.com>
> > > > > Sent: Wednesday, September 9, 2020 14:37
> > > > >
> > > > > The limitation of burst size in vector rx was removed, since it
> > should
> > > > > retrieve as much received packets as possible. And also the
> > scattered
> > > > > receive path should use a wrapper function to achieve the goal of
> > > > > burst maximizing.
> > > > >
> > > > > This patch set aims to maximize vector rx burst for for
> > > > > ixgbe/i40e/ice/iavf/fm10k PMDs, and also do some code cleaning.
> > > >
> > > > IMO, this can be "enhance the vector ...", but not "fix ...".
> > > > If "fix", it needs "Fixes:" tags ... ;-)
> > > >
> > >
> > > Ok, so I will prefer to use "refine the vector rx burst for PMDs",
> > thanks.
> >
> > Better than "fix" now.
> >
> 
> It does fix a bug: https://bugs.dpdk.org/show_bug.cgi?id=516

Looks like a backport is needed.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs
  2020-09-09  7:55           ` Wang, Haiyue
@ 2020-09-09  8:01             ` Guo, Jia
  0 siblings, 0 replies; 48+ messages in thread
From: Guo, Jia @ 2020-09-09  8:01 UTC (permalink / raw)
  To: Wang, Haiyue, Morten Brørup, Yang, Qiming, Xing, Beilei,
	Zhao1, Wei, Zhang, Qi Z, Wu, Jingjing
  Cc: Richardson, Bruce, dev, Zhang, Helin, Yigit, Ferruh, stephen, barbette

> -----Original Message-----
> From: Wang, Haiyue <haiyue.wang@intel.com>
> Sent: Wednesday, September 9, 2020 3:55 PM
> To: Morten Brørup <mb@smartsharesystems.com>; Guo, Jia
> <jia.guo@intel.com>; Yang, Qiming <qiming.yang@intel.com>; Xing, Beilei
> <beilei.xing@intel.com>; Zhao1, Wei <wei.zhao1@intel.com>; Zhang, Qi Z
> <qi.z.zhang@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>
> Cc: Richardson, Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Zhang,
> Helin <helin.zhang@intel.com>; Yigit, Ferruh <ferruh.yigit@intel.com>;
> stephen@networkplumber.org; barbette@kth.se
> Subject: RE: [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs
> 
> > -----Original Message-----
> > From: Morten Brørup <mb@smartsharesystems.com>
> > Sent: Wednesday, September 9, 2020 15:43
> > To: Wang, Haiyue <haiyue.wang@intel.com>; Guo, Jia
> > <jia.guo@intel.com>; Yang, Qiming <qiming.yang@intel.com>; Xing,
> > Beilei <beilei.xing@intel.com>; Zhao1, Wei <wei.zhao1@intel.com>;
> > Zhang, Qi Z <qi.z.zhang@intel.com>; Wu, Jingjing
> > <jingjing.wu@intel.com>
> > Cc: Richardson, Bruce <bruce.richardson@intel.com>; dev@dpdk.org;
> > Zhang, Helin <helin.zhang@intel.com>; Yigit, Ferruh
> > <ferruh.yigit@intel.com>; stephen@networkplumber.org;
> barbette@kth.se
> > Subject: RE: [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs
> >
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Wang, Haiyue
> > > Sent: Wednesday, September 9, 2020 9:05 AM
> > >
> > > > From: Guo, Jia <jia.guo@intel.com>
> > > > Sent: Wednesday, September 9, 2020 15:03
> > > >
> > > > Hi, haiyue
> > > >
> > > > > From: Wang, Haiyue <haiyue.wang@intel.com>
> > > > > Sent: Wednesday, September 9, 2020 2:45 PM
> > > > >
> > > > > > From: Guo, Jia <jia.guo@intel.com>
> > > > > > Sent: Wednesday, September 9, 2020 14:37
> > > > > >
> > > > > > The limitation of burst size in vector rx was removed, since
> > > > > > it
> > > should
> > > > > > retrieve as much received packets as possible. And also the
> > > scattered
> > > > > > receive path should use a wrapper function to achieve the goal
> > > > > > of burst maximizing.
> > > > > >
> > > > > > This patch set aims to maximize vector rx burst for for
> > > > > > ixgbe/i40e/ice/iavf/fm10k PMDs, and also do some code cleaning.
> > > > >
> > > > > IMO, this can be "enhance the vector ...", but not "fix ...".
> > > > > If "fix", it needs "Fixes:" tags ... ;-)
> > > > >
> > > >
> > > > Ok, so I will prefer to use "refine the vector rx burst for PMDs",
> > > thanks.
> > >
> > > Better than "fix" now.
> > >
> >
> > It does fix a bug: https://bugs.dpdk.org/show_bug.cgi?id=516
> 

Oh, I missed it. So the Fixes tag and bug info can be expected in the commit log in the next version. Thanks, Morten.

> Looks like a backport is needed.
> 


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] 回复:  [PATCH v3 2/5] net/ixgbe: fix vector rx burst for ixgbe
       [not found]     ` <VI1PR0802MB23518C6B517B6EAD8E018CD49E260@VI1PR0802MB2351.eurprd08.prod.outlook.com>
@ 2020-09-09  9:54       ` " Feifei Wang
  0 siblings, 0 replies; 48+ messages in thread
From: Feifei Wang @ 2020-09-09  9:54 UTC (permalink / raw)
  To: Jeff Guo <jia.guo@intel.com>; <Jeff Guo,
	qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu
  Cc: bruce.richardson, dev, jia.guo, helin.zhang, mb, ferruh.yigit,
	haiyue.wang, stephen, barbette, nd, nd

Hi, Jeff

> From: dev <dev-bounces@dpdk.org> On Behalf Of Jeff Guo
> Sent: Wednesday, September 9, 2020 2:37 PM
> To: qiming.yang@intel.com; beilei.xing@intel.com; wei.zhao1@intel.com;
> qi.z.zhang@intel.com; jingjing.wu@intel.com
> Cc: bruce.richardson@intel.com; dev@dpdk.org; jia.guo@intel.com;
> helin.zhang@intel.com; mb@smartsharesystems.com; ferruh.yigit@intel.com;
> haiyue.wang@intel.com; stephen@networkplumber.org; barbette@kth.se
> Subject: [dpdk-dev] [PATCH v3 2/5] net/ixgbe: fix vector rx burst for ixgbe
> 
> The limitation of burst size in vector rx was removed, since it should retrieve as
> much received packets as possible. And also the scattered receive path should
> use a wrapper function to achieve the goal of burst maximizing. And do some
> code cleaning for vector rx path.
> 
> Signed-off-by: Jeff Guo <jia.guo@intel.com>
> ---
>  drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 77 +++++++++++++------------
This patch has passed the test on the aarch64 platform with the NEON path.
Tested-by: Feifei Wang <Feifei.wang2@arm.com>
> drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c  | 61 +++++++++++---------
>  2 files changed, 76 insertions(+), 62 deletions(-)
> 
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
> b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
> index aa27ee177..7692c5d59 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
> +++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
> @@ -130,17 +130,6 @@ desc_to_olflags_v(uint8x16x2_t sterr_tmp1,
> uint8x16x2_t sterr_tmp2,  rx_pkts[3]->ol_flags = vol.e[3];  }
> 
> -/*
> - * vPMD raw receive routine, only accept(nb_pkts >=
> RTE_IXGBE_DESCS_PER_LOOP)
> - *
> - * Notice:
> - * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
> - * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan
> RTE_IXGBE_MAX_RX_BURST
> - *   numbers of DD bit
> - * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
> - * - don't support ol_flags for rss and csum err
> - */
> -
>  #define IXGBE_VPMD_DESC_EOP_MASK0x02020202
>  #define IXGBE_UINT8_BIT(CHAR_BIT * sizeof(uint8_t))
> 
> @@ -206,6 +195,13 @@ desc_to_ptype_v(uint64x2_t descs[4], uint16_t
> pkt_type_mask,  vgetq_lane_u32(tunnel_check, 3));  }
> 
> +/**
> + * vPMD raw receive routine, only accept(nb_pkts >=
> +RTE_IXGBE_DESCS_PER_LOOP)
> + *
> + * Notice:
> + * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
> + * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two  */
>  static inline uint16_t
>  _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
>     uint16_t nb_pkts, uint8_t *split_packet) @@ -226,9 +222,6 @@
> _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
> uint16x8_t crc_adjust = {0, 0, rxq->crc_len, 0,
>   rxq->crc_len, 0, 0, 0};
> 
> -/* nb_pkts shall be less equal than RTE_IXGBE_MAX_RX_BURST */ -nb_pkts =
> RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
> -
>  /* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */  nb_pkts
> = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);
> 
> @@ -382,16 +375,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq,
> struct rte_mbuf **rx_pkts,  return nb_pkts_recd;  }
> 
> -/*
> - * vPMD receive routine, only accept(nb_pkts >=
> RTE_IXGBE_DESCS_PER_LOOP)
> - *
> - * Notice:
> - * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
> - * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan
> RTE_IXGBE_MAX_RX_BURST
> - *   numbers of DD bit
> - * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
> - * - don't support ol_flags for rss and csum err
> - */
>  uint16_t
>  ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,  uint16_t
> nb_pkts) @@ -399,23 +382,19 @@ ixgbe_recv_pkts_vec(void *rx_queue, struct
> rte_mbuf **rx_pkts,  return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts,
> NULL);  }
> 
> -/*
> - * vPMD receive routine that reassembles scattered packets
> - *
> - * Notice:
> - * - don't support ol_flags for rss and csum err
> - * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
> - * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan
> RTE_IXGBE_MAX_RX_BURST
> - *   numbers of DD bit
> - * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
> +/**
> + * vPMD receive routine that reassembles single burst of 32 scattered
> +packets
>   */
> -uint16_t
> -ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, -
> uint16_t nb_pkts)
> +static uint16_t
> +ixgbe_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> +       uint16_t nb_pkts)
>  {
>  struct ixgbe_rx_queue *rxq = rx_queue;
>  uint8_t split_flags[RTE_IXGBE_MAX_RX_BURST] = {0};
> 
> +/* split_flags only can support max of RTE_IXGBE_MAX_RX_BURST */
> +nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
> +
>  /* get some new buffers */
>  uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,  split_flags);
> @@ -443,6 +422,32 @@ ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct
> rte_mbuf **rx_pkts,  &split_flags[i]);  }
> 
> +/**
> + * vPMD receive routine that reassembles scattered packets.
> + */
> +uint16_t
> +ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> +      uint16_t nb_pkts)
> +{
> +uint16_t retval = 0;
> +
> +while (nb_pkts > RTE_IXGBE_MAX_RX_BURST) { uint16_t burst;
> +
> +burst = ixgbe_recv_scattered_burst_vec(rx_queue,
> +       rx_pkts + retval,
> +       RTE_IXGBE_MAX_RX_BURST);
> +retval += burst;
> +nb_pkts -= burst;
> +if (burst < RTE_IXGBE_MAX_RX_BURST)
> +return retval;
> +}
> +
> +return retval + ixgbe_recv_scattered_burst_vec(rx_queue,
> +       rx_pkts + retval,
> +       nb_pkts);
> +}
> +
>  static inline void
>  vtx1(volatile union ixgbe_adv_tx_desc *txdp,  struct rte_mbuf *pkt, uint64_t
> flags) diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
> b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
> index 517ca3166..cf54ff128 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
> +++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
> @@ -302,13 +302,11 @@ desc_to_ptype_v(__m128i descs[4], uint16_t
> pkt_type_mask,  get_packet_type(3, pkt_info, etqf_check, tunnel_check);  }
> 
> -/*
> +/**
>   * vPMD raw receive routine, only accept(nb_pkts >=
> RTE_IXGBE_DESCS_PER_LOOP)
>   *
>   * Notice:
>   * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
> - * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan
> RTE_IXGBE_MAX_RX_BURST
> - *   numbers of DD bit
>   * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
>   */
>  static inline uint16_t
> @@ -344,9 +342,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct
> rte_mbuf **rx_pkts,  __m128i mbuf_init;  uint8_t vlan_flags;
> 
> -/* nb_pkts shall be less equal than RTE_IXGBE_MAX_RX_BURST */ -nb_pkts =
> RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
> -
>  /* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */  nb_pkts
> = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);
> 
> @@ -556,15 +551,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq,
> struct rte_mbuf **rx_pkts,  return nb_pkts_recd;  }
> 
> -/*
> - * vPMD receive routine, only accept(nb_pkts >=
> RTE_IXGBE_DESCS_PER_LOOP)
> - *
> - * Notice:
> - * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
> - * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan
> RTE_IXGBE_MAX_RX_BURST
> - *   numbers of DD bit
> - * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
> - */
>  uint16_t
>  ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,  uint16_t
> nb_pkts) @@ -572,22 +558,19 @@ ixgbe_recv_pkts_vec(void *rx_queue, struct
> rte_mbuf **rx_pkts,  return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts,
> NULL);  }
> 
> -/*
> - * vPMD receive routine that reassembles scattered packets
> - *
> - * Notice:
> - * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
> - * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan
> RTE_IXGBE_MAX_RX_BURST
> - *   numbers of DD bit
> - * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
> +/**
> + * vPMD receive routine that reassembles single burst of 32 scattered
> +packets
>   */
> -uint16_t
> -ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, -
> uint16_t nb_pkts)
> +static uint16_t
> +ixgbe_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> +       uint16_t nb_pkts)
>  {
>  struct ixgbe_rx_queue *rxq = rx_queue;
>  uint8_t split_flags[RTE_IXGBE_MAX_RX_BURST] = {0};
> 
> +/* split_flags only can support max of RTE_IXGBE_MAX_RX_BURST */
> +nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
> +
>  /* get some new buffers */
>  uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,  split_flags);
> @@ -615,6 +598,32 @@ ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct
> rte_mbuf **rx_pkts,  &split_flags[i]);  }
> 
> +/**
> + * vPMD receive routine that reassembles scattered packets.
> + */
> +uint16_t
> +ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> +      uint16_t nb_pkts)
> +{
> +uint16_t retval = 0;
> +
> +while (nb_pkts > RTE_IXGBE_MAX_RX_BURST) { uint16_t burst;
> +
> +burst = ixgbe_recv_scattered_burst_vec(rx_queue,
> +       rx_pkts + retval,
> +       RTE_IXGBE_MAX_RX_BURST);
> +retval += burst;
> +nb_pkts -= burst;
> +if (burst < RTE_IXGBE_MAX_RX_BURST)
> +return retval;
> +}
> +
> +return retval + ixgbe_recv_scattered_burst_vec(rx_queue,
> +       rx_pkts + retval,
> +       nb_pkts);
> +}
> +
>  static inline void
>  vtx1(volatile union ixgbe_adv_tx_desc *txdp,  struct rte_mbuf *pkt, uint64_t
> flags)
> --
> 2.20.1
> 


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [PATCH v3 4/5] net/ice: fix vector rx burst for ice
  2020-09-09  6:36   ` [dpdk-dev] [PATCH v3 4/5] net/ice: fix vector rx burst for ice Jeff Guo
@ 2020-09-15  7:10     ` Han, YingyaX
  0 siblings, 0 replies; 48+ messages in thread
From: Han, YingyaX @ 2020-09-15  7:10 UTC (permalink / raw)
  To: Guo, Jia, Yang, Qiming, Xing, Beilei, Zhao1, Wei, Zhang, Qi Z,
	Wu, Jingjing
  Cc: Richardson, Bruce, dev, Guo, Jia, Zhang, Helin, mb, Yigit,
	Ferruh, Wang, Haiyue, stephen, barbette

Tested-by: Yingya Han <yingyax.han@intel.com>

-----Original Message-----
From: dev <dev-bounces@dpdk.org> On Behalf Of Jeff Guo
Sent: Wednesday, September 9, 2020 2:37 PM
To: Yang, Qiming <qiming.yang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Zhao1, Wei <wei.zhao1@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>
Cc: Richardson, Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Guo, Jia <jia.guo@intel.com>; Zhang, Helin <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh <ferruh.yigit@intel.com>; Wang, Haiyue <haiyue.wang@intel.com>; stephen@networkplumber.org; barbette@kth.se
Subject: [dpdk-dev] [PATCH v3 4/5] net/ice: fix vector rx burst for ice

The limitation of burst size in vector rx was removed, since it should retrieve as many received packets as possible. And also the scattered receive path should use a wrapper function to achieve the goal of burst maximizing. And do some code cleaning for vector rx path.

Signed-off-by: Jeff Guo <jia.guo@intel.com>
---
 drivers/net/ice/ice_rxtx.h          |  1 +
 drivers/net/ice/ice_rxtx_vec_avx2.c | 23 ++++++------  drivers/net/ice/ice_rxtx_vec_sse.c  | 56 +++++++++++++++++++----------
 3 files changed, 49 insertions(+), 31 deletions(-)

diff --git a/drivers/net/ice/ice_rxtx.h b/drivers/net/ice/ice_rxtx.h index 2fdcfb7d0..3ef5f300d 100644
--- a/drivers/net/ice/ice_rxtx.h
+++ b/drivers/net/ice/ice_rxtx.h
@@ -35,6 +35,7 @@
 #define ICE_MAX_RX_BURST            ICE_RXQ_REARM_THRESH
 #define ICE_TX_MAX_FREE_BUF_SZ      64
 #define ICE_DESCS_PER_LOOP          4
+#define ICE_DESCS_PER_LOOP_AVX	    8
 
 #define ICE_FDIR_PKT_LEN	512
 
diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c b/drivers/net/ice/ice_rxtx_vec_avx2.c
index be50677c2..843e4f32a 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx2.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
@@ -29,7 +29,7 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
 			__m128i dma_addr0;
 
 			dma_addr0 = _mm_setzero_si128();
-			for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
+			for (i = 0; i < ICE_DESCS_PER_LOOP_AVX; i++) {
 				rxep[i].mbuf = &rxq->fake_mbuf;
 				_mm_store_si128((__m128i *)&rxdp[i].read,
 						dma_addr0);
@@ -132,12 +132,17 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
 	ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);  }
 
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= 
+ICE_DESCS_PER_LOOP_AVX)
+ *
+ * Notice:
+ * - nb_pkts < ICE_DESCS_PER_LOOP_AVX, just return no packet
+ * - floor align nb_pkts to a ICE_DESCS_PER_LOOP_AVX power-of-two  */
 static inline uint16_t
 _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 			    uint16_t nb_pkts, uint8_t *split_packet)  { -#define ICE_DESCS_PER_LOOP_AVX 8
-
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
 			0, rxq->mbuf_initializer);
@@ -603,10 +608,6 @@ _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return received;
 }
 
-/**
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- */
 uint16_t
 ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 		       uint16_t nb_pkts)
@@ -616,8 +617,6 @@ ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 /**
  * vPMD receive routine that reassembles single burst of 32 scattered packets
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
  */
 static uint16_t
 ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, @@ -626,6 +625,9 @@ ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 	struct ice_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
 
+	/* split_flags only can support max of ICE_VPMD_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, ICE_VPMD_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx2(rxq, rx_pkts, nb_pkts,
 						       split_flags);
@@ -657,9 +659,6 @@ ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 /**
  * vPMD receive routine that reassembles scattered packets.
- * Main receive routine that can handle arbitrary burst sizes
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
  */
 uint16_t
 ice_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, diff --git a/drivers/net/ice/ice_rxtx_vec_sse.c b/drivers/net/ice/ice_rxtx_vec_sse.c
index 382ef31f3..c03e24092 100644
--- a/drivers/net/ice/ice_rxtx_vec_sse.c
+++ b/drivers/net/ice/ice_rxtx_vec_sse.c
@@ -205,10 +205,11 @@ ice_rx_desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,  }
 
 /**
+ * vPMD raw receive routine, only accept(nb_pkts >= ICE_DESCS_PER_LOOP)
+ *
  * Notice:
  * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
- *   numbers of DD bits
+ * - floor align nb_pkts to a ICE_DESCS_PER_LOOP power-of-two
  */
 static inline uint16_t
 _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts, @@ -264,9 +265,6 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
 						 0x0000000200000002LL);
 
-	/* nb_pkts shall be less equal than ICE_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, ICE_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP);
 
@@ -441,12 +439,6 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
-/**
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
- *   numbers of DD bits
- */
 uint16_t
 ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		  uint16_t nb_pkts)
@@ -454,19 +446,19 @@ ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _ice_recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);  }
 
-/* vPMD receive routine that reassembles scattered packets
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
- *   numbers of DD bits
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered 
+packets
  */
-uint16_t
-ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			    uint16_t nb_pkts)
+static uint16_t
+ice_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
 {
 	struct ice_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
 
+	/* split_flags only can support max of ICE_VPMD_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, ICE_VPMD_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _ice_recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 						  split_flags);
@@ -496,6 +488,32 @@ ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 					     &split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			    uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > ICE_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = ice_recv_scattered_burst_vec(rx_queue,
+						     rx_pkts + retval,
+						     ICE_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < ICE_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + ice_recv_scattered_burst_vec(rx_queue,
+						     rx_pkts + retval,
+						     nb_pkts);
+}
+
 static inline void
 ice_vtx1(volatile struct ice_tx_desc *txdp, struct rte_mbuf *pkt,
 	 uint64_t flags)
--
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v4 0/5] fix vector rx burst for PMDs
  2020-08-27  7:54 [dpdk-dev] [PATCH v1 0/4] maximize vector rx burst for PMDs Jeff Guo
                   ` (5 preceding siblings ...)
  2020-09-09  6:36 ` [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs Jeff Guo
@ 2020-09-17  7:58 ` " Jeff Guo
  2020-09-17  7:58   ` [dpdk-dev] [PATCH v4 1/5] net/iavf: fix vector rx burst for iavf Jeff Guo
                     ` (4 more replies)
  2020-10-16  9:44 ` [dpdk-dev] [PATCH v5 0/5] fix vector rx burst for PMDs Jeff Guo
  7 siblings, 5 replies; 48+ messages in thread
From: Jeff Guo @ 2020-09-17  7:58 UTC (permalink / raw)
  To: qiming.yang, beilei.xing, qi.z.zhang, jingjing.wu, haiyue.wang
  Cc: wei.zhao1, bruce.richardson, dev, jia.guo, helin.zhang, mb,
	ferruh.yigit, stephen, barbette

The limitation of burst size in vector rx was removed, since it should
retrieve as many received packets as possible. And also the scattered
receive path should use a wrapper function to achieve the goal of burst
maximizing.

This patch set aims to maximize vector rx burst for
ixgbe/i40e/ice/iavf/fm10k PMDs, and also do some code cleaning.

Bugzilla ID: 516

v4->v3:
add Fixes tag.

v3->v2:
1:move define into header file.
2:delete some useless doc.

v2->v1:
1:add fm10k driver case
2:refine some doc

Jeff Guo (5):
  net/iavf: fix vector rx burst for iavf
  net/ixgbe: fix vector rx burst for ixgbe
  net/i40e: fix vector rx burst for i40e
  net/ice: fix vector rx burst for ice
  net/fm10k: fix vector rx burst for fm10k

 drivers/net/fm10k/fm10k_rxtx_vec.c       |  42 ++++++--
 drivers/net/i40e/i40e_rxtx.h             |   1 +
 drivers/net/i40e/i40e_rxtx_vec_altivec.c |  64 +++++++-----
 drivers/net/i40e/i40e_rxtx_vec_avx2.c    |  29 +++---
 drivers/net/i40e/i40e_rxtx_vec_neon.c    |  58 +++++++----
 drivers/net/i40e/i40e_rxtx_vec_sse.c     |  58 +++++++----
 drivers/net/iavf/iavf_rxtx.h             |   1 +
 drivers/net/iavf/iavf_rxtx_vec_avx2.c    |  78 +++++++--------
 drivers/net/iavf/iavf_rxtx_vec_sse.c     | 119 +++++++++++++++--------
 drivers/net/ice/ice_rxtx.h               |   1 +
 drivers/net/ice/ice_rxtx_vec_avx2.c      |  23 +++--
 drivers/net/ice/ice_rxtx_vec_sse.c       |  56 +++++++----
 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c  |  77 ++++++++-------
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c   |  61 +++++++-----
 14 files changed, 412 insertions(+), 256 deletions(-)

-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v4 1/5] net/iavf: fix vector rx burst for iavf
  2020-09-17  7:58 ` [dpdk-dev] [PATCH v4 " Jeff Guo
@ 2020-09-17  7:58   ` Jeff Guo
  2020-09-17  7:58   ` [dpdk-dev] [PATCH v4 2/5] net/ixgbe: fix vector rx burst for ixgbe Jeff Guo
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 48+ messages in thread
From: Jeff Guo @ 2020-09-17  7:58 UTC (permalink / raw)
  To: qiming.yang, beilei.xing, qi.z.zhang, jingjing.wu, haiyue.wang
  Cc: wei.zhao1, bruce.richardson, dev, jia.guo, helin.zhang, mb,
	ferruh.yigit, stephen, barbette

The limitation of burst size in vector rx was removed, since it should
retrieve as many received packets as possible. And also the scattered
receive path should use a wrapper function to achieve the goal of
burst maximizing. And do some code cleaning for vector rx path.

Bugzilla ID: 516
Fixes: af0c246a3800 ("net/iavf: enable AVX2 for iavf")
Fixes: 319c421f3890 ("net/avf: enable SSE Rx Tx")
Fixes: 1162f5a0ef31 ("net/iavf: support flexible Rx descriptor in SSE path")
Fixes: 5b6e8859081d ("net/iavf: support flexible Rx descriptor in AVX path")

Signed-off-by: Jeff Guo <jia.guo@intel.com>
---
 drivers/net/iavf/iavf_rxtx.h          |   1 +
 drivers/net/iavf/iavf_rxtx_vec_avx2.c |  78 ++++++++---------
 drivers/net/iavf/iavf_rxtx_vec_sse.c  | 119 ++++++++++++++++++--------
 3 files changed, 121 insertions(+), 77 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 59625a979..f71f9fbdb 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -21,6 +21,7 @@
 #define IAVF_VPMD_TX_MAX_BURST    32
 #define IAVF_RXQ_REARM_THRESH     32
 #define IAVF_VPMD_DESCS_PER_LOOP  4
+#define IAVF_VPMD_DESCS_PER_LOOP_AVX  8
 #define IAVF_VPMD_TX_MAX_FREE_BUF 64
 
 #define IAVF_NO_VECTOR_FLAGS (				 \
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx2.c b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
index e5e0fd309..c6ca5a4a8 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx2.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
@@ -29,7 +29,7 @@ iavf_rxq_rearm(struct iavf_rx_queue *rxq)
 			__m128i dma_addr0;
 
 			dma_addr0 = _mm_setzero_si128();
-			for (i = 0; i < IAVF_VPMD_DESCS_PER_LOOP; i++) {
+			for (i = 0; i < IAVF_VPMD_DESCS_PER_LOOP_AVX; i++) {
 				rxp[i] = &rxq->fake_mbuf;
 				_mm_store_si128((__m128i *)&rxdp[i].read,
 						dma_addr0);
@@ -134,13 +134,19 @@ iavf_rxq_rearm(struct iavf_rx_queue *rxq)
 
 #define PKTLEN_SHIFT     10
 
+/**
+ * vPMD raw receive routine,
+ * only accept(nb_pkts >= IAVF_VPMD_DESCS_PER_LOOP_AVX)
+ *
+ * Notice:
+ * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP_AVX, just return no packet
+ * - floor align nb_pkts to a IAVF_VPMD_DESCS_PER_LOOP_AVX power-of-two
+ */
 static inline uint16_t
 _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 			     struct rte_mbuf **rx_pkts,
 			     uint16_t nb_pkts, uint8_t *split_packet)
 {
-#define IAVF_DESCS_PER_LOOP_AVX 8
-
 	/* const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl; */
 	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
 
@@ -153,8 +159,8 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 
 	rte_prefetch0(rxdp);
 
-	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
-	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
+	/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP_AVX */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_VPMD_DESCS_PER_LOOP_AVX);
 
 	/* See if we need to rearm the RX queue - gives the prefetch a bit
 	 * of time to act
@@ -297,8 +303,8 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 	uint16_t i, received;
 
 	for (i = 0, received = 0; i < nb_pkts;
-	     i += IAVF_DESCS_PER_LOOP_AVX,
-	     rxdp += IAVF_DESCS_PER_LOOP_AVX) {
+	     i += IAVF_VPMD_DESCS_PER_LOOP_AVX,
+	     rxdp += IAVF_VPMD_DESCS_PER_LOOP_AVX) {
 		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
 		_mm256_storeu_si256((void *)&rx_pkts[i],
 				    _mm256_loadu_si256((void *)&sw_ring[i]));
@@ -368,7 +374,7 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 		if (split_packet) {
 			int j;
 
-			for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++)
+			for (j = 0; j < IAVF_VPMD_DESCS_PER_LOOP_AVX; j++)
 				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
 		}
 
@@ -583,7 +589,7 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
 			*(uint64_t *)split_packet =
 				_mm_cvtsi128_si64(split_bits);
-			split_packet += IAVF_DESCS_PER_LOOP_AVX;
+			split_packet += IAVF_VPMD_DESCS_PER_LOOP_AVX;
 		}
 
 		/* perform dd_check */
@@ -599,7 +605,7 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 				(_mm_cvtsi128_si64
 					(_mm256_castsi256_si128(status0_7)));
 		received += burst;
-		if (burst != IAVF_DESCS_PER_LOOP_AVX)
+		if (burst != IAVF_VPMD_DESCS_PER_LOOP_AVX)
 			break;
 	}
 
@@ -633,13 +639,19 @@ flex_rxd_to_fdir_flags_vec_avx2(const __m256i fdir_id0_7)
 	return fdir_flags;
 }
 
+/**
+ * vPMD raw receive routine  for flex RxD,
+ * only accept(nb_pkts >= IAVF_VPMD_DESCS_PER_LOOP_AVX)
+ *
+ * Notice:
+ * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP_AVX, just return no packet
+ * - floor align nb_pkts to a IAVF_VPMD_DESCS_PER_LOOP_AVX power-of-two
+ */
 static inline uint16_t
 _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 				      struct rte_mbuf **rx_pkts,
 				      uint16_t nb_pkts, uint8_t *split_packet)
 {
-#define IAVF_DESCS_PER_LOOP_AVX 8
-
 	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
@@ -650,8 +662,8 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 
 	rte_prefetch0(rxdp);
 
-	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
-	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
+	/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP_AVX */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_VPMD_DESCS_PER_LOOP_AVX);
 
 	/* See if we need to rearm the RX queue - gives the prefetch a bit
 	 * of time to act
@@ -794,8 +806,8 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 	uint16_t i, received;
 
 	for (i = 0, received = 0; i < nb_pkts;
-	     i += IAVF_DESCS_PER_LOOP_AVX,
-	     rxdp += IAVF_DESCS_PER_LOOP_AVX) {
+	     i += IAVF_VPMD_DESCS_PER_LOOP_AVX,
+	     rxdp += IAVF_VPMD_DESCS_PER_LOOP_AVX) {
 		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
 		_mm256_storeu_si256((void *)&rx_pkts[i],
 				    _mm256_loadu_si256((void *)&sw_ring[i]));
@@ -851,7 +863,7 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 		if (split_packet) {
 			int j;
 
-			for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++)
+			for (j = 0; j < IAVF_VPMD_DESCS_PER_LOOP_AVX; j++)
 				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
 		}
 
@@ -1193,7 +1205,7 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
 			*(uint64_t *)split_packet =
 				_mm_cvtsi128_si64(split_bits);
-			split_packet += IAVF_DESCS_PER_LOOP_AVX;
+			split_packet += IAVF_VPMD_DESCS_PER_LOOP_AVX;
 		}
 
 		/* perform dd_check */
@@ -1209,7 +1221,7 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 				(_mm_cvtsi128_si64
 					(_mm256_castsi256_si128(status0_7)));
 		received += burst;
-		if (burst != IAVF_DESCS_PER_LOOP_AVX)
+		if (burst != IAVF_VPMD_DESCS_PER_LOOP_AVX)
 			break;
 	}
 
@@ -1224,10 +1236,6 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 	return received;
 }
 
-/**
- * Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
- */
 uint16_t
 iavf_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 			uint16_t nb_pkts)
@@ -1235,10 +1243,6 @@ iavf_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _iavf_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/**
- * Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
- */
 uint16_t
 iavf_recv_pkts_vec_avx2_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
 				 uint16_t nb_pkts)
@@ -1249,8 +1253,6 @@ iavf_recv_pkts_vec_avx2_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 /**
  * vPMD receive routine that reassembles single burst of 32 scattered packets
- * Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  */
 static uint16_t
 iavf_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
@@ -1259,6 +1261,9 @@ iavf_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 	struct iavf_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 
+	/* split_flags only can support max of IAVF_VPMD_RX_MAX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, IAVF_VPMD_RX_MAX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx2(rxq, rx_pkts, nb_pkts,
 						       split_flags);
@@ -1290,9 +1295,6 @@ iavf_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 /**
  * vPMD receive routine that reassembles scattered packets.
- * Main receive routine that can handle arbitrary burst sizes
- * Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  */
 uint16_t
 iavf_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
@@ -1313,10 +1315,8 @@ iavf_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 }
 
 /**
- * vPMD receive routine that reassembles single burst of
- * 32 scattered packets for flex RxD
- * Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
+ * for flex RxD
  */
 static uint16_t
 iavf_recv_scattered_burst_vec_avx2_flex_rxd(void *rx_queue,
@@ -1326,6 +1326,9 @@ iavf_recv_scattered_burst_vec_avx2_flex_rxd(void *rx_queue,
 	struct iavf_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 
+	/* split_flags only can support max of IAVF_VPMD_RX_MAX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, IAVF_VPMD_RX_MAX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx2_flex_rxd(rxq,
 					rx_pkts, nb_pkts, split_flags);
@@ -1357,9 +1360,6 @@ iavf_recv_scattered_burst_vec_avx2_flex_rxd(void *rx_queue,
 
 /**
  * vPMD receive routine that reassembles scattered packets for flex RxD.
- * Main receive routine that can handle arbitrary burst sizes
- * Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  */
 uint16_t
 iavf_recv_scattered_pkts_vec_avx2_flex_rxd(void *rx_queue,
diff --git a/drivers/net/iavf/iavf_rxtx_vec_sse.c b/drivers/net/iavf/iavf_rxtx_vec_sse.c
index 85c5bd4af..b5362ecf3 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_sse.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_sse.c
@@ -379,10 +379,12 @@ flex_desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
 	rx_pkts[3]->packet_type = type_table[_mm_extract_epi16(ptype_all, 7)];
 }
 
-/* Notice:
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= IAVF_VPMD_DESCS_PER_LOOP)
+ *
+ * Notice:
  * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- *   numbers of DD bits
+ * - floor align nb_pkts to a IAVF_VPMD_DESCS_PER_LOOP power-of-two
  */
 static inline uint16_t
 _recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
@@ -413,9 +415,6 @@ _recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
 	__m128i dd_check, eop_check;
 
-	/* nb_pkts shall be less equal than IAVF_VPMD_RX_MAX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, IAVF_VPMD_RX_MAX_BURST);
-
 	/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_VPMD_DESCS_PER_LOOP);
 
@@ -627,10 +626,13 @@ _recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
-/* Notice:
+/**
+ * vPMD raw receive routine for flex RxD,
+ * only accept(nb_pkts >= IAVF_VPMD_DESCS_PER_LOOP)
+ *
+ * Notice:
  * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- *   numbers of DD bits
+ * - floor align nb_pkts to a IAVF_VPMD_DESCS_PER_LOOP power-of-two
  */
 static inline uint16_t
 _recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq,
@@ -688,9 +690,6 @@ _recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq,
 	const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
 						 0x0000000200000002LL);
 
-	/* nb_pkts shall be less equal than IAVF_VPMD_RX_MAX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, IAVF_VPMD_RX_MAX_BURST);
-
 	/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_VPMD_DESCS_PER_LOOP);
 
@@ -921,11 +920,6 @@ _recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq,
 	return nb_pkts_recd;
 }
 
-/* Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- *   numbers of DD bits
- */
 uint16_t
 iavf_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		  uint16_t nb_pkts)
@@ -933,11 +927,6 @@ iavf_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/* Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- *   numbers of DD bits
- */
 uint16_t
 iavf_recv_pkts_vec_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
 			    uint16_t nb_pkts)
@@ -945,20 +934,20 @@ iavf_recv_pkts_vec_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec_flex_rxd(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/* vPMD receive routine that reassembles scattered packets
- * Notice:
- * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- *   numbers of DD bits
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
  */
-uint16_t
-iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			    uint16_t nb_pkts)
+static uint16_t
+iavf_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
 {
 	struct iavf_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 	unsigned int i = 0;
 
+	/* split_flags only can support max of IAVF_VPMD_RX_MAX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, IAVF_VPMD_RX_MAX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 					      split_flags);
@@ -986,21 +975,48 @@ iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
-/* vPMD receive routine that reassembles scattered packets for flex RxD
- * Notice:
- * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- *   numbers of DD bits
+/**
+ * vPMD receive routine that reassembles scattered packets.
  */
 uint16_t
-iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
-				      struct rte_mbuf **rx_pkts,
-				      uint16_t nb_pkts)
+iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
+		uint16_t burst;
+
+		burst = iavf_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      IAVF_VPMD_RX_MAX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < IAVF_VPMD_RX_MAX_BURST)
+			return retval;
+	}
+
+	return retval + iavf_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
+ * for flex RxD
+ */
+static uint16_t
+iavf_recv_scattered_burst_vec_flex_rxd(void *rx_queue,
+				       struct rte_mbuf **rx_pkts,
+				       uint16_t nb_pkts)
 {
 	struct iavf_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 	unsigned int i = 0;
 
+	/* split_flags only can support max of IAVF_VPMD_RX_MAX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, IAVF_VPMD_RX_MAX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec_flex_rxd(rxq, rx_pkts, nb_pkts,
 					      split_flags);
@@ -1028,6 +1044,33 @@ iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets for flex RxD
+ */
+uint16_t
+iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
+				      struct rte_mbuf **rx_pkts,
+				      uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
+		uint16_t burst;
+
+		burst = iavf_recv_scattered_burst_vec_flex_rxd(rx_queue,
+						rx_pkts + retval,
+						IAVF_VPMD_RX_MAX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < IAVF_VPMD_RX_MAX_BURST)
+			return retval;
+	}
+
+	return retval + iavf_recv_scattered_burst_vec_flex_rxd(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
 static inline void
 vtx1(volatile struct iavf_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags)
 {
-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v4 2/5] net/ixgbe: fix vector rx burst for ixgbe
  2020-09-17  7:58 ` [dpdk-dev] [PATCH v4 " Jeff Guo
  2020-09-17  7:58   ` [dpdk-dev] [PATCH v4 1/5] net/iavf: fix vector rx burst for iavf Jeff Guo
@ 2020-09-17  7:58   ` Jeff Guo
  2020-09-17  7:58   ` [dpdk-dev] [PATCH v4 3/5] net/i40e: fix vector rx burst for i40e Jeff Guo
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 48+ messages in thread
From: Jeff Guo @ 2020-09-17  7:58 UTC (permalink / raw)
  To: qiming.yang, beilei.xing, qi.z.zhang, jingjing.wu, haiyue.wang
  Cc: wei.zhao1, bruce.richardson, dev, jia.guo, helin.zhang, mb,
	ferruh.yigit, stephen, barbette, Feifei Wang

The limitation of burst size in vector rx was removed, since it should
retrieve as many received packets as possible. And also the scattered
receive path should use a wrapper function to achieve the goal of
burst maximizing. And do some code cleaning for vector rx path.

Bugzilla ID: 516
Fixes: b20971b6cca0 ("net/ixgbe: implement vector driver for ARM")
Fixes: 0e51f9dc4860 ("net/ixgbe: rename x86 vector driver file")

Signed-off-by: Jeff Guo <jia.guo@intel.com>
Tested-by: Feifei Wang <Feifei.wang2@arm.com>
---
 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 77 +++++++++++++------------
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c  | 61 +++++++++++---------
 2 files changed, 76 insertions(+), 62 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
index aa27ee177..7692c5d59 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
@@ -130,17 +130,6 @@ desc_to_olflags_v(uint8x16x2_t sterr_tmp1, uint8x16x2_t sterr_tmp2,
 	rx_pkts[3]->ol_flags = vol.e[3];
 }
 
-/*
- * vPMD raw receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
- *
- * Notice:
- * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
- * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
- * - don't support ol_flags for rss and csum err
- */
-
 #define IXGBE_VPMD_DESC_EOP_MASK	0x02020202
 #define IXGBE_UINT8_BIT			(CHAR_BIT * sizeof(uint8_t))
 
@@ -206,6 +195,13 @@ desc_to_ptype_v(uint64x2_t descs[4], uint16_t pkt_type_mask,
 				vgetq_lane_u32(tunnel_check, 3));
 }
 
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
+ *
+ * Notice:
+ * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
+ * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
+ */
 static inline uint16_t
 _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		   uint16_t nb_pkts, uint8_t *split_packet)
@@ -226,9 +222,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	uint16x8_t crc_adjust = {0, 0, rxq->crc_len, 0,
 				 rxq->crc_len, 0, 0, 0};
 
-	/* nb_pkts shall be less equal than RTE_IXGBE_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);
 
@@ -382,16 +375,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
-/*
- * vPMD receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
- *
- * Notice:
- * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
- * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
- * - don't support ol_flags for rss and csum err
- */
 uint16_t
 ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		uint16_t nb_pkts)
@@ -399,23 +382,19 @@ ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/*
- * vPMD receive routine that reassembles scattered packets
- *
- * Notice:
- * - don't support ol_flags for rss and csum err
- * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
- * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
  */
-uint16_t
-ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-		uint16_t nb_pkts)
+static uint16_t
+ixgbe_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			       uint16_t nb_pkts)
 {
 	struct ixgbe_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[RTE_IXGBE_MAX_RX_BURST] = {0};
 
+	/* split_flags only can support max of RTE_IXGBE_MAX_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 			split_flags);
@@ -443,6 +422,32 @@ ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_IXGBE_MAX_RX_BURST) {
+		uint16_t burst;
+
+		burst = ixgbe_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       RTE_IXGBE_MAX_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_IXGBE_MAX_RX_BURST)
+			return retval;
+	}
+
+	return retval + ixgbe_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       nb_pkts);
+}
+
 static inline void
 vtx1(volatile union ixgbe_adv_tx_desc *txdp,
 		struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
index 517ca3166..cf54ff128 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
@@ -302,13 +302,11 @@ desc_to_ptype_v(__m128i descs[4], uint16_t pkt_type_mask,
 		get_packet_type(3, pkt_info, etqf_check, tunnel_check);
 }
 
-/*
+/**
  * vPMD raw receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
  *
  * Notice:
  * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
  * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
  */
 static inline uint16_t
@@ -344,9 +342,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	__m128i mbuf_init;
 	uint8_t vlan_flags;
 
-	/* nb_pkts shall be less equal than RTE_IXGBE_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);
 
@@ -556,15 +551,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
-/*
- * vPMD receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
- *
- * Notice:
- * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
- * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
- */
 uint16_t
 ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		uint16_t nb_pkts)
@@ -572,22 +558,19 @@ ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/*
- * vPMD receive routine that reassembles scattered packets
- *
- * Notice:
- * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
- * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
  */
-uint16_t
-ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-		uint16_t nb_pkts)
+static uint16_t
+ixgbe_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			       uint16_t nb_pkts)
 {
 	struct ixgbe_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[RTE_IXGBE_MAX_RX_BURST] = {0};
 
+	/* split_flags only can support max of RTE_IXGBE_MAX_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 			split_flags);
@@ -615,6 +598,32 @@ ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_IXGBE_MAX_RX_BURST) {
+		uint16_t burst;
+
+		burst = ixgbe_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       RTE_IXGBE_MAX_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_IXGBE_MAX_RX_BURST)
+			return retval;
+	}
+
+	return retval + ixgbe_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       nb_pkts);
+}
+
 static inline void
 vtx1(volatile union ixgbe_adv_tx_desc *txdp,
 		struct rte_mbuf *pkt, uint64_t flags)
-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v4 3/5] net/i40e: fix vector rx burst for i40e
  2020-09-17  7:58 ` [dpdk-dev] [PATCH v4 " Jeff Guo
  2020-09-17  7:58   ` [dpdk-dev] [PATCH v4 1/5] net/iavf: fix vector rx burst for iavf Jeff Guo
  2020-09-17  7:58   ` [dpdk-dev] [PATCH v4 2/5] net/ixgbe: fix vector rx burst for ixgbe Jeff Guo
@ 2020-09-17  7:58   ` Jeff Guo
  2020-09-17  7:58   ` [dpdk-dev] [PATCH v4 4/5] net/ice: fix vector rx burst for ice Jeff Guo
  2020-09-17  7:58   ` [dpdk-dev] [PATCH v4 5/5] net/fm10k: fix vector rx burst for fm10k Jeff Guo
  4 siblings, 0 replies; 48+ messages in thread
From: Jeff Guo @ 2020-09-17  7:58 UTC (permalink / raw)
  To: qiming.yang, beilei.xing, qi.z.zhang, jingjing.wu, haiyue.wang
  Cc: wei.zhao1, bruce.richardson, dev, jia.guo, helin.zhang, mb,
	ferruh.yigit, stephen, barbette

The limitation of burst size in vector rx was removed, since it should
retrieve as many received packets as possible. Also, the scattered
receive path should use a wrapper function to achieve the goal of
maximizing the burst. Some code cleaning was done for the vector rx path.

Bugzilla ID: 516
Fixes: 5b463eda8d26 ("net/i40e: make vector driver filenames consistent")
Fixes: ae0eb310f253 ("net/i40e: implement vector PMD for ARM")
Fixes: dafadd73762e ("net/i40e: add AVX2 Rx function")
Fixes: c3def6a8724c ("net/i40e: implement vector PMD for altivec")

Signed-off-by: Jeff Guo <jia.guo@intel.com>
---
 drivers/net/i40e/i40e_rxtx.h             |  1 +
 drivers/net/i40e/i40e_rxtx_vec_altivec.c | 64 ++++++++++++++++--------
 drivers/net/i40e/i40e_rxtx_vec_avx2.c    | 29 ++++++-----
 drivers/net/i40e/i40e_rxtx_vec_neon.c    | 58 +++++++++++++--------
 drivers/net/i40e/i40e_rxtx_vec_sse.c     | 58 +++++++++++++--------
 5 files changed, 133 insertions(+), 77 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h
index 57d7b4160..01d4609f9 100644
--- a/drivers/net/i40e/i40e_rxtx.h
+++ b/drivers/net/i40e/i40e_rxtx.h
@@ -14,6 +14,7 @@
 #define RTE_I40E_MAX_RX_BURST          RTE_I40E_RXQ_REARM_THRESH
 #define RTE_I40E_TX_MAX_FREE_BUF_SZ    64
 #define RTE_I40E_DESCS_PER_LOOP    4
+#define RTE_I40E_DESCS_PER_LOOP_AVX    8
 
 #define I40E_RXBUF_SZ_1024 1024
 #define I40E_RXBUF_SZ_2048 2048
diff --git a/drivers/net/i40e/i40e_rxtx_vec_altivec.c b/drivers/net/i40e/i40e_rxtx_vec_altivec.c
index 6862a017e..345c63aa7 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_altivec.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_altivec.c
@@ -188,11 +188,13 @@ desc_to_ptype_v(vector unsigned long descs[4], struct rte_mbuf **rx_pkts,
 		ptype_tbl[(*(vector unsigned char *)&ptype1)[8]];
 }
 
- /* Notice:
-  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
-  * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
-  *   numbers of DD bits
-  */
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_I40E_DESCS_PER_LOOP)
+ *
+ * Notice:
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+ * - floor align nb_pkts to a RTE_I40E_DESCS_PER_LOOP power-of-two
+ */
 static inline uint16_t
 _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		   uint16_t nb_pkts, uint8_t *split_packet)
@@ -214,9 +216,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		};
 	vector unsigned long dd_check, eop_check;
 
-	/* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
 
@@ -447,11 +446,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
- /* Notice:
-  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
-  * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
-  *   numbers of DD bits
-  */
 uint16_t
 i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		   uint16_t nb_pkts)
@@ -459,19 +453,19 @@ i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
- /* vPMD receive routine that reassembles scattered packets
-  * Notice:
-  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
-  * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
-  *   numbers of DD bits
-  */
-uint16_t
-i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			     uint16_t nb_pkts)
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
+ */
+static uint16_t
+i40e_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
 {
 	struct i40e_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};
 
+	/* split_flags only can support max of RTE_I40E_VPMD_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_VPMD_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 			split_flags);
@@ -500,6 +494,32 @@ i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      RTE_I40E_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_I40E_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
 static inline void
 vtx1(volatile struct i40e_tx_desc *txdp,
 	struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 3bcef1363..b5e6867d0 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -36,7 +36,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 		    rxq->nb_rx_desc) {
 			__m128i dma_addr0;
 			dma_addr0 = _mm_setzero_si128();
-			for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+			for (i = 0; i < RTE_I40E_DESCS_PER_LOOP_AVX; i++) {
 				rxep[i].mbuf = &rxq->fake_mbuf;
 				_mm_store_si128((__m128i *)&rxdp[i].read,
 						dma_addr0);
@@ -219,13 +219,18 @@ desc_fdir_processing_32b(volatile union i40e_rx_desc *rxdp,
 
 #define PKTLEN_SHIFT     10
 
-/* Force inline as some compilers will not inline by default. */
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_I40E_DESCS_PER_LOOP_AVX)
+ *
+ * Notice:
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP_AVX, just return no packet
+ * - floor align nb_pkts to a RTE_I40E_DESCS_PER_LOOP_AVX power-of-two
+ * - force inline as some compilers will not inline by default
+ */
 static __rte_always_inline uint16_t
 _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		uint16_t nb_pkts, uint8_t *split_packet)
 {
-#define RTE_I40E_DESCS_PER_LOOP_AVX 8
-
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
 			0, rxq->mbuf_initializer);
@@ -729,10 +734,6 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return received;
 }
 
-/*
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- */
 uint16_t
 i40e_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 		   uint16_t nb_pkts)
@@ -740,10 +741,8 @@ i40e_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/*
+/**
  * vPMD receive routine that reassembles single burst of 32 scattered packets
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
  */
 static uint16_t
 i40e_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
@@ -752,6 +751,9 @@ i40e_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 	struct i40e_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};
 
+	/* split_flags only can support max of RTE_I40E_VPMD_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_VPMD_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec_avx2(rxq, rx_pkts, nb_pkts,
 			split_flags);
@@ -781,11 +783,8 @@ i40e_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
-/*
+/**
  * vPMD receive routine that reassembles scattered packets.
- * Main receive routine that can handle arbitrary burst sizes
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
  */
 uint16_t
 i40e_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c
index 6f874e45b..143cdf4a5 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_neon.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c
@@ -187,11 +187,12 @@ desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **__rte_restrict rx_pkts,
 
 }
 
- /*
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_I40E_DESCS_PER_LOOP)
+ *
  * Notice:
  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
+ * - floor align nb_pkts to a RTE_I40E_DESCS_PER_LOOP power-of-two
  */
 static inline uint16_t
 _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
@@ -230,9 +231,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
 		0, 0, 0       /* ignore non-length fields */
 		};
 
-	/* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
 
@@ -426,12 +424,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
 	return nb_pkts_recd;
 }
 
- /*
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
- */
 uint16_t
 i40e_recv_pkts_vec(void *__rte_restrict rx_queue,
 		struct rte_mbuf **__rte_restrict rx_pkts, uint16_t nb_pkts)
@@ -439,20 +431,20 @@ i40e_recv_pkts_vec(void *__rte_restrict rx_queue,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
- /* vPMD receive routine that reassembles scattered packets
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
  */
-uint16_t
-i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			     uint16_t nb_pkts)
+static uint16_t
+i40e_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
 {
 
 	struct i40e_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};
 
+	/* split_flags only can support max of RTE_I40E_VPMD_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_VPMD_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 			split_flags);
@@ -482,6 +474,32 @@ i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      RTE_I40E_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_I40E_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
 static inline void
 vtx1(volatile struct i40e_tx_desc *txdp,
 		struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 698518349..605912246 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -342,11 +342,12 @@ desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
 	rx_pkts[3]->packet_type = ptype_tbl[_mm_extract_epi8(ptype1, 8)];
 }
 
- /*
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_I40E_DESCS_PER_LOOP)
+ *
  * Notice:
  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
+ * - floor align nb_pkts to a RTE_I40E_DESCS_PER_LOOP power-of-two
  */
 static inline uint16_t
 _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
@@ -378,9 +379,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
 	__m128i dd_check, eop_check;
 
-	/* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
 
@@ -592,12 +590,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
- /*
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
- */
 uint16_t
 i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		   uint16_t nb_pkts)
@@ -605,20 +597,20 @@ i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
- /* vPMD receive routine that reassembles scattered packets
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
  */
-uint16_t
-i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			     uint16_t nb_pkts)
+static uint16_t
+i40e_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
 {
 
 	struct i40e_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};
 
+	/* split_flags only can support max of RTE_I40E_VPMD_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_VPMD_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 			split_flags);
@@ -648,6 +640,32 @@ i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      RTE_I40E_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_I40E_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
 static inline void
 vtx1(volatile struct i40e_tx_desc *txdp,
 		struct rte_mbuf *pkt, uint64_t flags)
-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v4 4/5] net/ice: fix vector rx burst for ice
  2020-09-17  7:58 ` [dpdk-dev] [PATCH v4 " Jeff Guo
                     ` (2 preceding siblings ...)
  2020-09-17  7:58   ` [dpdk-dev] [PATCH v4 3/5] net/i40e: fix vector rx burst for i40e Jeff Guo
@ 2020-09-17  7:58   ` Jeff Guo
  2020-09-17 11:03     ` Zhang, Qi Z
  2020-09-17  7:58   ` [dpdk-dev] [PATCH v4 5/5] net/fm10k: fix vector rx burst for fm10k Jeff Guo
  4 siblings, 1 reply; 48+ messages in thread
From: Jeff Guo @ 2020-09-17  7:58 UTC (permalink / raw)
  To: qiming.yang, beilei.xing, qi.z.zhang, jingjing.wu, haiyue.wang
  Cc: wei.zhao1, bruce.richardson, dev, jia.guo, helin.zhang, mb,
	ferruh.yigit, stephen, barbette, Yingya Han

The limitation of burst size in vector rx was removed, since it should
retrieve as many received packets as possible. Also, the scattered
receive path should use a wrapper function to achieve the goal of
maximizing the burst. Some code cleaning was done for the vector rx path.

Bugzilla ID: 516
Fixes: c68a52b8b38c ("net/ice: support vector SSE in Rx")
Fixes: ae60d3c9b227 ("net/ice: support Rx AVX2 vector")

Signed-off-by: Jeff Guo <jia.guo@intel.com>
Tested-by: Yingya Han <yingyax.han@intel.com>
---
 drivers/net/ice/ice_rxtx.h          |  1 +
 drivers/net/ice/ice_rxtx_vec_avx2.c | 23 ++++++------
 drivers/net/ice/ice_rxtx_vec_sse.c  | 56 +++++++++++++++++++----------
 3 files changed, 49 insertions(+), 31 deletions(-)

diff --git a/drivers/net/ice/ice_rxtx.h b/drivers/net/ice/ice_rxtx.h
index 2fdcfb7d0..3ef5f300d 100644
--- a/drivers/net/ice/ice_rxtx.h
+++ b/drivers/net/ice/ice_rxtx.h
@@ -35,6 +35,7 @@
 #define ICE_MAX_RX_BURST            ICE_RXQ_REARM_THRESH
 #define ICE_TX_MAX_FREE_BUF_SZ      64
 #define ICE_DESCS_PER_LOOP          4
+#define ICE_DESCS_PER_LOOP_AVX	    8
 
 #define ICE_FDIR_PKT_LEN	512
 
diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c b/drivers/net/ice/ice_rxtx_vec_avx2.c
index be50677c2..843e4f32a 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx2.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
@@ -29,7 +29,7 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
 			__m128i dma_addr0;
 
 			dma_addr0 = _mm_setzero_si128();
-			for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
+			for (i = 0; i < ICE_DESCS_PER_LOOP_AVX; i++) {
 				rxep[i].mbuf = &rxq->fake_mbuf;
 				_mm_store_si128((__m128i *)&rxdp[i].read,
 						dma_addr0);
@@ -132,12 +132,17 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
 	ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
 }
 
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= ICE_DESCS_PER_LOOP_AVX)
+ *
+ * Notice:
+ * - nb_pkts < ICE_DESCS_PER_LOOP_AVX, just return no packet
+ * - floor align nb_pkts to a ICE_DESCS_PER_LOOP_AVX power-of-two
+ */
 static inline uint16_t
 _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 			    uint16_t nb_pkts, uint8_t *split_packet)
 {
-#define ICE_DESCS_PER_LOOP_AVX 8
-
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
 			0, rxq->mbuf_initializer);
@@ -603,10 +608,6 @@ _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return received;
 }
 
-/**
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- */
 uint16_t
 ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 		       uint16_t nb_pkts)
@@ -616,8 +617,6 @@ ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 /**
  * vPMD receive routine that reassembles single burst of 32 scattered packets
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
  */
 static uint16_t
 ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
@@ -626,6 +625,9 @@ ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 	struct ice_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
 
+	/* split_flags only can support max of ICE_VPMD_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, ICE_VPMD_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx2(rxq, rx_pkts, nb_pkts,
 						       split_flags);
@@ -657,9 +659,6 @@ ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 /**
  * vPMD receive routine that reassembles scattered packets.
- * Main receive routine that can handle arbitrary burst sizes
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
  */
 uint16_t
 ice_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
diff --git a/drivers/net/ice/ice_rxtx_vec_sse.c b/drivers/net/ice/ice_rxtx_vec_sse.c
index 382ef31f3..c03e24092 100644
--- a/drivers/net/ice/ice_rxtx_vec_sse.c
+++ b/drivers/net/ice/ice_rxtx_vec_sse.c
@@ -205,10 +205,11 @@ ice_rx_desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
 }
 
 /**
+ * vPMD raw receive routine, only accept(nb_pkts >= ICE_DESCS_PER_LOOP)
+ *
  * Notice:
  * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
- *   numbers of DD bits
+ * - floor align nb_pkts to a ICE_DESCS_PER_LOOP power-of-two
  */
 static inline uint16_t
 _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
@@ -264,9 +265,6 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
 						 0x0000000200000002LL);
 
-	/* nb_pkts shall be less equal than ICE_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, ICE_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP);
 
@@ -441,12 +439,6 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
-/**
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
- *   numbers of DD bits
- */
 uint16_t
 ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		  uint16_t nb_pkts)
@@ -454,19 +446,19 @@ ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _ice_recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/* vPMD receive routine that reassembles scattered packets
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
- *   numbers of DD bits
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
  */
-uint16_t
-ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			    uint16_t nb_pkts)
+static uint16_t
+ice_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
 {
 	struct ice_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
 
+	/* split_flags only can support max of ICE_VPMD_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, ICE_VPMD_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _ice_recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 						  split_flags);
@@ -496,6 +488,32 @@ ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 					     &split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			    uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > ICE_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = ice_recv_scattered_burst_vec(rx_queue,
+						     rx_pkts + retval,
+						     ICE_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < ICE_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + ice_recv_scattered_burst_vec(rx_queue,
+						     rx_pkts + retval,
+						     nb_pkts);
+}
+
 static inline void
 ice_vtx1(volatile struct ice_tx_desc *txdp, struct rte_mbuf *pkt,
 	 uint64_t flags)
-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v4 5/5] net/fm10k: fix vector rx burst for fm10k
  2020-09-17  7:58 ` [dpdk-dev] [PATCH v4 " Jeff Guo
                     ` (3 preceding siblings ...)
  2020-09-17  7:58   ` [dpdk-dev] [PATCH v4 4/5] net/ice: fix vector rx burst for ice Jeff Guo
@ 2020-09-17  7:58   ` Jeff Guo
  4 siblings, 0 replies; 48+ messages in thread
From: Jeff Guo @ 2020-09-17  7:58 UTC (permalink / raw)
  To: qiming.yang, beilei.xing, qi.z.zhang, jingjing.wu, haiyue.wang
  Cc: wei.zhao1, bruce.richardson, dev, jia.guo, helin.zhang, mb,
	ferruh.yigit, stephen, barbette

The scattered receive path should use a wrapper function to achieve the
goal of burst maximizing. And do some code cleaning for vector rx path.

Bugzilla ID: 516
Fixes: fe65e1e1ce61 ("fm10k: add vector scatter Rx")

Signed-off-by: Jeff Guo <jia.guo@intel.com>
---
 drivers/net/fm10k/fm10k_rxtx_vec.c | 42 +++++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/drivers/net/fm10k/fm10k_rxtx_vec.c b/drivers/net/fm10k/fm10k_rxtx_vec.c
index eff3933b5..3b25c570b 100644
--- a/drivers/net/fm10k/fm10k_rxtx_vec.c
+++ b/drivers/net/fm10k/fm10k_rxtx_vec.c
@@ -645,25 +645,23 @@ fm10k_reassemble_packets(struct fm10k_rx_queue *rxq,
 	return pkt_idx;
 }
 
-/*
- * vPMD receive routine that reassembles scattered packets
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
  *
  * Notice:
  * - don't support ol_flags for rss and csum err
- * - nb_pkts > RTE_FM10K_MAX_RX_BURST, only scan RTE_FM10K_MAX_RX_BURST
- *   numbers of DD bit
  */
-uint16_t
-fm10k_recv_scattered_pkts_vec(void *rx_queue,
-				struct rte_mbuf **rx_pkts,
-				uint16_t nb_pkts)
+static uint16_t
+fm10k_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			       uint16_t nb_pkts)
 {
 	struct fm10k_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[RTE_FM10K_MAX_RX_BURST] = {0};
 	unsigned i = 0;
 
-	/* Split_flags only can support max of RTE_FM10K_MAX_RX_BURST */
+	/* split_flags only can support max of RTE_FM10K_MAX_RX_BURST */
 	nb_pkts = RTE_MIN(nb_pkts, RTE_FM10K_MAX_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = fm10k_recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 			split_flags);
@@ -691,6 +689,32 @@ fm10k_recv_scattered_pkts_vec(void *rx_queue,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+fm10k_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_FM10K_MAX_RX_BURST) {
+		uint16_t burst;
+
+		burst = fm10k_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       RTE_FM10K_MAX_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_FM10K_MAX_RX_BURST)
+			return retval;
+	}
+
+	return retval + fm10k_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       nb_pkts);
+}
+
 static const struct fm10k_txq_ops vec_txq_ops = {
 	.reset = fm10k_reset_tx_queue,
 };
-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [PATCH v4 4/5] net/ice: fix vector rx burst for ice
  2020-09-17  7:58   ` [dpdk-dev] [PATCH v4 4/5] net/ice: fix vector rx burst for ice Jeff Guo
@ 2020-09-17 11:03     ` Zhang, Qi Z
  2020-09-18  3:20       ` Guo, Jia
  0 siblings, 1 reply; 48+ messages in thread
From: Zhang, Qi Z @ 2020-09-17 11:03 UTC (permalink / raw)
  To: Guo, Jia, Yang, Qiming, Xing, Beilei, Wu, Jingjing, Wang, Haiyue
  Cc: Zhao1, Wei, Richardson, Bruce, dev, Zhang, Helin, mb, Yigit,
	Ferruh, stephen, barbette, Han, YingyaX



> -----Original Message-----
> From: Guo, Jia <jia.guo@intel.com>
> Sent: Thursday, September 17, 2020 3:59 PM
> To: Yang, Qiming <qiming.yang@intel.com>; Xing, Beilei
> <beilei.xing@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Wu, Jingjing
> <jingjing.wu@intel.com>; Wang, Haiyue <haiyue.wang@intel.com>
> Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; dev@dpdk.org; Guo, Jia <jia.guo@intel.com>;
> Zhang, Helin <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit,
> Ferruh <ferruh.yigit@intel.com>; stephen@networkplumber.org;
> barbette@kth.se; Han, YingyaX <yingyax.han@intel.com>
> Subject: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> 
> The limitation of burst size in vector rx was removed, since it should retrieve as
> much received packets as possible. And also the scattered receive path should
> use a wrapper function to achieve the goal of burst maximizing. And do some
> code cleaning for vector rx path.
> 
> Bugzilla ID: 516
> Fixes: c68a52b8b38c ("net/ice: support vector SSE in Rx")
> Fixes: ae60d3c9b227 ("net/ice: support Rx AVX2 vector")
> 
> Signed-off-by: Jeff Guo <jia.guo@intel.com>
> Tested-by: Yingya Han <yingyax.han@intel.com>
> ---
>  drivers/net/ice/ice_rxtx.h          |  1 +
>  drivers/net/ice/ice_rxtx_vec_avx2.c | 23 ++++++------
> drivers/net/ice/ice_rxtx_vec_sse.c  | 56 +++++++++++++++++++----------
>  3 files changed, 49 insertions(+), 31 deletions(-)
> 
> diff --git a/drivers/net/ice/ice_rxtx.h b/drivers/net/ice/ice_rxtx.h index
> 2fdcfb7d0..3ef5f300d 100644
> --- a/drivers/net/ice/ice_rxtx.h
> +++ b/drivers/net/ice/ice_rxtx.h
> @@ -35,6 +35,7 @@
>  #define ICE_MAX_RX_BURST            ICE_RXQ_REARM_THRESH
>  #define ICE_TX_MAX_FREE_BUF_SZ      64
>  #define ICE_DESCS_PER_LOOP          4
> +#define ICE_DESCS_PER_LOOP_AVX	    8

No need to expose this if no external link, better to keep all avx stuff inside avx.c

> 
>  #define ICE_FDIR_PKT_LEN	512
> 
> diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c
> b/drivers/net/ice/ice_rxtx_vec_avx2.c
> index be50677c2..843e4f32a 100644
> --- a/drivers/net/ice/ice_rxtx_vec_avx2.c
> +++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
> @@ -29,7 +29,7 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
>  			__m128i dma_addr0;
> 
>  			dma_addr0 = _mm_setzero_si128();
> -			for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
> +			for (i = 0; i < ICE_DESCS_PER_LOOP_AVX; i++) {
>  				rxep[i].mbuf = &rxq->fake_mbuf;
>  				_mm_store_si128((__m128i *)&rxdp[i].read,
>  						dma_addr0);
> @@ -132,12 +132,17 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
>  	ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);  }
> 
> +/**
> + * vPMD raw receive routine, only accept(nb_pkts >=
> +ICE_DESCS_PER_LOOP_AVX)
> + *
> + * Notice:
> + * - nb_pkts < ICE_DESCS_PER_LOOP_AVX, just return no packet
> + * - floor align nb_pkts to a ICE_DESCS_PER_LOOP_AVX power-of-two  */

The comment is misleading; it reads as if we are going to floor-align nb_pkts to 2^8. Better to reword it.

>  static inline uint16_t
>  _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf
> **rx_pkts,
>  			    uint16_t nb_pkts, uint8_t *split_packet)  { -#define
> ICE_DESCS_PER_LOOP_AVX 8
> -
>  	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
>  	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
>  			0, rxq->mbuf_initializer);
> @@ -603,10 +608,6 @@ _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue
> *rxq, struct rte_mbuf **rx_pkts,
>  	return received;
>  }
> 
> -/*
> - * Notice:
> - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> - */
>  uint16_t
>  ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
>  		       uint16_t nb_pkts)
> @@ -616,8 +617,6 @@ ice_recv_pkts_vec_avx2(void *rx_queue, struct
> rte_mbuf **rx_pkts,
> 
>  /**
>   * vPMD receive routine that reassembles single burst of 32 scattered
> packets
> - * Notice:
> - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
>   */

Why we need to remove this? is it still true for this function?

>  static uint16_t
>  ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf
> **rx_pkts, @@ -626,6 +625,9 @@ ice_recv_scattered_burst_vec_avx2(void
> *rx_queue, struct rte_mbuf **rx_pkts,
>  	struct ice_rx_queue *rxq = rx_queue;
>  	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
> 
> +	/* split_flags only can support max of ICE_VPMD_RX_BURST */
> +	nb_pkts = RTE_MIN(nb_pkts, ICE_VPMD_RX_BURST);

Is this necessary? The only consumer of this function is ice_recv_scattered_pkts_vec_avx2,
so I think nb_pkts <= ICE_VPMD_RX_BURST is already guaranteed.
> +
>  	/* get some new buffers */
>  	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx2(rxq, rx_pkts, nb_pkts,
>  						       split_flags);
> @@ -657,9 +659,6 @@ ice_recv_scattered_burst_vec_avx2(void *rx_queue,
> struct rte_mbuf **rx_pkts,
> 
>  /**
>   * vPMD receive routine that reassembles scattered packets.
> - * Main receive routine that can handle arbitrary burst sizes
> - * Notice:
> - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
>   */

Why we need to remove this? isn't it the main routine that be able to handle arbitrary burst size?

Btw, I suggest putting all AVX2 changes into a separate patch, because this looks like code cleanup and fixes.
It's not related to the main purpose of the patch set.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [PATCH v4 4/5] net/ice: fix vector rx burst for ice
  2020-09-17 11:03     ` Zhang, Qi Z
@ 2020-09-18  3:20       ` Guo, Jia
  2020-09-18  3:41         ` Zhang, Qi Z
  0 siblings, 1 reply; 48+ messages in thread
From: Guo, Jia @ 2020-09-18  3:20 UTC (permalink / raw)
  To: Zhang, Qi Z, Yang, Qiming, Xing, Beilei, Wu, Jingjing, Wang, Haiyue
  Cc: Zhao1, Wei, Richardson, Bruce, dev, Zhang, Helin, mb, Yigit,
	Ferruh, stephen, barbette, Han, YingyaX

Hi, qi

> -----Original Message-----
> From: Zhang, Qi Z <qi.z.zhang@intel.com>
> Sent: Thursday, September 17, 2020 7:03 PM
> To: Guo, Jia <jia.guo@intel.com>; Yang, Qiming <qiming.yang@intel.com>;
> Xing, Beilei <beilei.xing@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>;
> Wang, Haiyue <haiyue.wang@intel.com>
> Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin
> <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> <ferruh.yigit@intel.com>; stephen@networkplumber.org; barbette@kth.se;
> Han, YingyaX <yingyax.han@intel.com>
> Subject: RE: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> 
> 
> 
> > -----Original Message-----
> > From: Guo, Jia <jia.guo@intel.com>
> > Sent: Thursday, September 17, 2020 3:59 PM
> > To: Yang, Qiming <qiming.yang@intel.com>; Xing, Beilei
> > <beilei.xing@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Wu,
> > Jingjing <jingjing.wu@intel.com>; Wang, Haiyue <haiyue.wang@intel.com>
> > Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>; dev@dpdk.org; Guo, Jia
> > <jia.guo@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> > mb@smartsharesystems.com; Yigit, Ferruh <ferruh.yigit@intel.com>;
> > stephen@networkplumber.org; barbette@kth.se; Han, YingyaX
> > <yingyax.han@intel.com>
> > Subject: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> >
> > The limitation of burst size in vector rx was removed, since it should
> > retrieve as much received packets as possible. And also the scattered
> > receive path should use a wrapper function to achieve the goal of
> > burst maximizing. And do some code cleaning for vector rx path.
> >
> > Bugzilla ID: 516
> > Fixes: c68a52b8b38c ("net/ice: support vector SSE in Rx")
> > Fixes: ae60d3c9b227 ("net/ice: support Rx AVX2 vector")
> >
> > Signed-off-by: Jeff Guo <jia.guo@intel.com>
> > Tested-by: Yingya Han <yingyax.han@intel.com>
> > ---
> >  drivers/net/ice/ice_rxtx.h          |  1 +
> >  drivers/net/ice/ice_rxtx_vec_avx2.c | 23 ++++++------
> > drivers/net/ice/ice_rxtx_vec_sse.c  | 56 +++++++++++++++++++----------
> >  3 files changed, 49 insertions(+), 31 deletions(-)
> >
> > diff --git a/drivers/net/ice/ice_rxtx.h b/drivers/net/ice/ice_rxtx.h
> > index 2fdcfb7d0..3ef5f300d 100644
> > --- a/drivers/net/ice/ice_rxtx.h
> > +++ b/drivers/net/ice/ice_rxtx.h
> > @@ -35,6 +35,7 @@
> >  #define ICE_MAX_RX_BURST            ICE_RXQ_REARM_THRESH
> >  #define ICE_TX_MAX_FREE_BUF_SZ      64
> >  #define ICE_DESCS_PER_LOOP          4
> > +#define ICE_DESCS_PER_LOOP_AVX	    8
> 
> No need to expose this if no external link, better to keep all avx stuff inside
> avx.c
> 

Ok, so define it in avx.c is the best choice if avx should not in rxtx.h.

> >
> >  #define ICE_FDIR_PKT_LEN	512
> >
> > diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c
> > b/drivers/net/ice/ice_rxtx_vec_avx2.c
> > index be50677c2..843e4f32a 100644
> > --- a/drivers/net/ice/ice_rxtx_vec_avx2.c
> > +++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
> > @@ -29,7 +29,7 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
> >  			__m128i dma_addr0;
> >
> >  			dma_addr0 = _mm_setzero_si128();
> > -			for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
> > +			for (i = 0; i < ICE_DESCS_PER_LOOP_AVX; i++) {
> >  				rxep[i].mbuf = &rxq->fake_mbuf;
> >  				_mm_store_si128((__m128i *)&rxdp[i].read,
> >  						dma_addr0);
> > @@ -132,12 +132,17 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
> >  	ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);  }
> >
> > +/**
> > + * vPMD raw receive routine, only accept(nb_pkts >=
> > +ICE_DESCS_PER_LOOP_AVX)
> > + *
> > + * Notice:
> > + * - nb_pkts < ICE_DESCS_PER_LOOP_AVX, just return no packet
> > + * - floor align nb_pkts to a ICE_DESCS_PER_LOOP_AVX power-of-two  */
> 
> The comment is misleading, it looks like we are going to floor align nb_pkts to
> 2^8, better to reword .
> 

It should be, agree.

> >  static inline uint16_t
> >  _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf
> > **rx_pkts,
> >  			    uint16_t nb_pkts, uint8_t *split_packet)  { -#define
> > ICE_DESCS_PER_LOOP_AVX 8
> > -
> >  	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
> >  	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
> >  			0, rxq->mbuf_initializer);
> > @@ -603,10 +608,6 @@ _ice_recv_raw_pkts_vec_avx2(struct
> ice_rx_queue
> > *rxq, struct rte_mbuf **rx_pkts,
> >  	return received;
> >  }
> >
> > -/*
> > - * Notice:
> > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> > - */
> >  uint16_t
> >  ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
> >  		       uint16_t nb_pkts)
> > @@ -616,8 +617,6 @@ ice_recv_pkts_vec_avx2(void *rx_queue, struct
> > rte_mbuf **rx_pkts,
> >
> >  /**
> >   * vPMD receive routine that reassembles single burst of 32 scattered
> > packets
> > - * Notice:
> > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> >   */
> 
> Why we need to remove this? is it still true for this function?
> 

The reason is that this comment is in the calling function " _ice_recv_raw_pkts_vec_avx2" which process the related thing, no need to add it more and more in the caller function. 

> >  static uint16_t
> >  ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf
> > **rx_pkts, @@ -626,6 +625,9 @@
> ice_recv_scattered_burst_vec_avx2(void
> > *rx_queue, struct rte_mbuf **rx_pkts,
> >  	struct ice_rx_queue *rxq = rx_queue;
> >  	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
> >
> > +	/* split_flags only can support max of ICE_VPMD_RX_BURST */
> > +	nb_pkts = RTE_MIN(nb_pkts, ICE_VPMD_RX_BURST);
> 
> Is this necessary?  the only consumer of this function is
> ice_recv_scattered_pkts_vec_avx2, I think nb_pkts <=
> ICE_VPMD_RX_BURST it already be guaranteed.

The reason is that we remove "nb_pkts <= ICE_VPMD_RX_BURST" and in this function split_flags have a limit for ICE_VPMD_RX_BURST, so a checking is need in the function.

> > +
> >  	/* get some new buffers */
> >  	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx2(rxq, rx_pkts,
> nb_pkts,
> >  						       split_flags);
> > @@ -657,9 +659,6 @@ ice_recv_scattered_burst_vec_avx2(void
> *rx_queue,
> > struct rte_mbuf **rx_pkts,
> >
> >  /**
> >   * vPMD receive routine that reassembles scattered packets.
> > - * Main receive routine that can handle arbitrary burst sizes
> > - * Notice:
> > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> >   */
> 
> Why we need to remove this? isn't it the main routine that be able to handle
> arbitrary burst size?
> 

The question is why we need to mention arbitrary sizes if we process and return as many received packets as possible. I think it is not only a useless comment but may also cause some confusion.

> Btw, I will suggest all AVX2 changes can be in a separate patch, because this
> looks like some code clean and fix.
> its not related with the main purpose of the patch set.

I considered it and asked for any objections before, so I do not disagree with separating it at all; but since the purpose of the patch set is to clean up some misleading behavior in the vec (SSE/AVX) burst paths, the patches could still stay in one set even if split into separate patches.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [PATCH v4 4/5] net/ice: fix vector rx burst for ice
  2020-09-18  3:20       ` Guo, Jia
@ 2020-09-18  3:41         ` Zhang, Qi Z
  2020-09-18  4:41           ` Guo, Jia
  0 siblings, 1 reply; 48+ messages in thread
From: Zhang, Qi Z @ 2020-09-18  3:41 UTC (permalink / raw)
  To: Guo, Jia, Yang, Qiming, Xing, Beilei, Wu, Jingjing, Wang, Haiyue
  Cc: Zhao1, Wei, Richardson, Bruce, dev, Zhang, Helin, mb, Yigit,
	Ferruh, stephen, barbette, Han, YingyaX



> -----Original Message-----
> From: Guo, Jia <jia.guo@intel.com>
> Sent: Friday, September 18, 2020 11:20 AM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>; Yang, Qiming
> <qiming.yang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Wu, Jingjing
> <jingjing.wu@intel.com>; Wang, Haiyue <haiyue.wang@intel.com>
> Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin
> <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> <ferruh.yigit@intel.com>; stephen@networkplumber.org; barbette@kth.se;
> Han, YingyaX <yingyax.han@intel.com>
> Subject: RE: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> 
> Hi, qi
> 
> > -----Original Message-----
> > From: Zhang, Qi Z <qi.z.zhang@intel.com>
> > Sent: Thursday, September 17, 2020 7:03 PM
> > To: Guo, Jia <jia.guo@intel.com>; Yang, Qiming
> > <qiming.yang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Wu,
> > Jingjing <jingjing.wu@intel.com>; Wang, Haiyue <haiyue.wang@intel.com>
> > Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin
> > <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> > <ferruh.yigit@intel.com>; stephen@networkplumber.org; barbette@kth.se;
> > Han, YingyaX <yingyax.han@intel.com>
> > Subject: RE: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> >
> >
> >
> > > -----Original Message-----
> > > From: Guo, Jia <jia.guo@intel.com>
> > > Sent: Thursday, September 17, 2020 3:59 PM
> > > To: Yang, Qiming <qiming.yang@intel.com>; Xing, Beilei
> > > <beilei.xing@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Wu,
> > > Jingjing <jingjing.wu@intel.com>; Wang, Haiyue
> > > <haiyue.wang@intel.com>
> > > Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> > > <bruce.richardson@intel.com>; dev@dpdk.org; Guo, Jia
> > > <jia.guo@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> > > mb@smartsharesystems.com; Yigit, Ferruh <ferruh.yigit@intel.com>;
> > > stephen@networkplumber.org; barbette@kth.se; Han, YingyaX
> > > <yingyax.han@intel.com>
> > > Subject: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> > >
> > > The limitation of burst size in vector rx was removed, since it
> > > should retrieve as much received packets as possible. And also the
> > > scattered receive path should use a wrapper function to achieve the
> > > goal of burst maximizing. And do some code cleaning for vector rx path.
> > >
> > > Bugzilla ID: 516
> > > Fixes: c68a52b8b38c ("net/ice: support vector SSE in Rx")
> > > Fixes: ae60d3c9b227 ("net/ice: support Rx AVX2 vector")
> > >
> > > Signed-off-by: Jeff Guo <jia.guo@intel.com>
> > > Tested-by: Yingya Han <yingyax.han@intel.com>
> > > ---
> > >  drivers/net/ice/ice_rxtx.h          |  1 +
> > >  drivers/net/ice/ice_rxtx_vec_avx2.c | 23 ++++++------
> > > drivers/net/ice/ice_rxtx_vec_sse.c  | 56
> > > +++++++++++++++++++----------
> > >  3 files changed, 49 insertions(+), 31 deletions(-)
> > >
> > > diff --git a/drivers/net/ice/ice_rxtx.h b/drivers/net/ice/ice_rxtx.h
> > > index 2fdcfb7d0..3ef5f300d 100644
> > > --- a/drivers/net/ice/ice_rxtx.h
> > > +++ b/drivers/net/ice/ice_rxtx.h
> > > @@ -35,6 +35,7 @@
> > >  #define ICE_MAX_RX_BURST            ICE_RXQ_REARM_THRESH
> > >  #define ICE_TX_MAX_FREE_BUF_SZ      64
> > >  #define ICE_DESCS_PER_LOOP          4
> > > +#define ICE_DESCS_PER_LOOP_AVX	    8
> >
> > No need to expose this if no external link, better to keep all avx
> > stuff inside avx.c
> >
> 
> Ok, so define it in avx.c is the best choice if avx should not in rxtx.h.
> 
> > >
> > >  #define ICE_FDIR_PKT_LEN	512
> > >
> > > diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > b/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > index be50677c2..843e4f32a 100644
> > > --- a/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > +++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > @@ -29,7 +29,7 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
> > >  			__m128i dma_addr0;
> > >
> > >  			dma_addr0 = _mm_setzero_si128();
> > > -			for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
> > > +			for (i = 0; i < ICE_DESCS_PER_LOOP_AVX; i++) {
> > >  				rxep[i].mbuf = &rxq->fake_mbuf;
> > >  				_mm_store_si128((__m128i *)&rxdp[i].read,
> > >  						dma_addr0);
> > > @@ -132,12 +132,17 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
> > >  	ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);  }
> > >
> > > +/**
> > > + * vPMD raw receive routine, only accept(nb_pkts >=
> > > +ICE_DESCS_PER_LOOP_AVX)
> > > + *
> > > + * Notice:
> > > + * - nb_pkts < ICE_DESCS_PER_LOOP_AVX, just return no packet
> > > + * - floor align nb_pkts to a ICE_DESCS_PER_LOOP_AVX power-of-two
> > > +*/
> >
> > The comment is misleading, it looks like we are going to floor align
> > nb_pkts to 2^8, better to reword .
> >
> 
> It should be, agree.
> 
> > >  static inline uint16_t
> > >  _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct
> > > rte_mbuf **rx_pkts,
> > >  			    uint16_t nb_pkts, uint8_t *split_packet)  { -#define
> > > ICE_DESCS_PER_LOOP_AVX 8
> > > -
> > >  	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
> > >  	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
> > >  			0, rxq->mbuf_initializer);
> > > @@ -603,10 +608,6 @@ _ice_recv_raw_pkts_vec_avx2(struct
> > ice_rx_queue
> > > *rxq, struct rte_mbuf **rx_pkts,
> > >  	return received;
> > >  }
> > >
> > > -/*
> > > - * Notice:
> > > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> > > - */
> > >  uint16_t
> > >  ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
> > >  		       uint16_t nb_pkts)
> > > @@ -616,8 +617,6 @@ ice_recv_pkts_vec_avx2(void *rx_queue, struct
> > > rte_mbuf **rx_pkts,
> > >
> > >  /**
> > >   * vPMD receive routine that reassembles single burst of 32
> > > scattered packets
> > > - * Notice:
> > > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> > >   */
> >
> > Why we need to remove this? is it still true for this function?
> >
> 
> The reason is that this comment is in the calling function "
> _ice_recv_raw_pkts_vec_avx2" which process the related thing, no need to
> add it more and more in the caller function.

I think you remove related comment from the calling function also :)

Also, I think it is better to keep this even if it is a little duplicated, as it helps people understand the internal logic.

> 
> > >  static uint16_t
> > >  ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf
> > > **rx_pkts, @@ -626,6 +625,9 @@
> > ice_recv_scattered_burst_vec_avx2(void
> > > *rx_queue, struct rte_mbuf **rx_pkts,
> > >  	struct ice_rx_queue *rxq = rx_queue;
> > >  	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
> > >
> > > +	/* split_flags only can support max of ICE_VPMD_RX_BURST */
> > > +	nb_pkts = RTE_MIN(nb_pkts, ICE_VPMD_RX_BURST);
> >
> > Is this necessary?  the only consumer of this function is
> > ice_recv_scattered_pkts_vec_avx2, I think nb_pkts <= ICE_VPMD_RX_BURST
> > it already be guaranteed.
> 
> The reason is that we remove "nb_pkts <= ICE_VPMD_RX_BURST" and in this
> function split_flags have a limit for ICE_VPMD_RX_BURST, so a checking is
> need in the function.

I can't follow this — could you tell me whether there is any case where nb_pkts > ICE_VPMD_RX_BURST?


> 
> > > +
> > >  	/* get some new buffers */
> > >  	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx2(rxq, rx_pkts,
> > nb_pkts,
> > >  						       split_flags);
> > > @@ -657,9 +659,6 @@ ice_recv_scattered_burst_vec_avx2(void
> > *rx_queue,
> > > struct rte_mbuf **rx_pkts,
> > >
> > >  /**
> > >   * vPMD receive routine that reassembles scattered packets.
> > > - * Main receive routine that can handle arbitrary burst sizes
> > > - * Notice:
> > > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> > >   */
> >
> > Why we need to remove this? isn't it the main routine that be able to
> > handle arbitrary burst size?
> >
> 
> The question is why we need to said the arbitrary sizes if we process and return
> what we could receive packet for maximum? It is not only useless comment but
> also maybe bring some confuse I think.

Yes, the arbitrary-size description can be removed, as this is assumed to be the default behavior.
But the description for nb_pkts should still be kept.

> 
> > Btw, I will suggest all AVX2 changes can be in a separate patch,
> > because this looks like some code clean and fix.
> > its not related with the main purpose of the patch set.
> 
> I consider it and ask any objection before, so totally I am not disagree on
> separate it, but I think if  the purpose of the patch set is to clean some
> misleading for vec(sse/avx) burst, it could still be on a set even separate it to
> patch.

I will not insist on separating the patches, but if you separate them, some of the fixes can be merged early without waiting for the parts that need more review.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [PATCH v4 4/5] net/ice: fix vector rx burst for ice
  2020-09-18  3:41         ` Zhang, Qi Z
@ 2020-09-18  4:41           ` Guo, Jia
  2020-09-18  5:39             ` Zhang, Qi Z
  0 siblings, 1 reply; 48+ messages in thread
From: Guo, Jia @ 2020-09-18  4:41 UTC (permalink / raw)
  To: Zhang, Qi Z, Yang, Qiming, Xing, Beilei, Wu, Jingjing, Wang, Haiyue
  Cc: Zhao1, Wei, Richardson, Bruce, dev, Zhang, Helin, mb, Yigit,
	Ferruh, stephen, barbette, Han, YingyaX


> -----Original Message-----
> From: Zhang, Qi Z <qi.z.zhang@intel.com>
> Sent: Friday, September 18, 2020 11:41 AM
> To: Guo, Jia <jia.guo@intel.com>; Yang, Qiming <qiming.yang@intel.com>;
> Xing, Beilei <beilei.xing@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>;
> Wang, Haiyue <haiyue.wang@intel.com>
> Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin
> <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> <ferruh.yigit@intel.com>; stephen@networkplumber.org; barbette@kth.se;
> Han, YingyaX <yingyax.han@intel.com>
> Subject: RE: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> 
> 
> 
> > -----Original Message-----
> > From: Guo, Jia <jia.guo@intel.com>
> > Sent: Friday, September 18, 2020 11:20 AM
> > To: Zhang, Qi Z <qi.z.zhang@intel.com>; Yang, Qiming
> > <qiming.yang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Wu,
> > Jingjing <jingjing.wu@intel.com>; Wang, Haiyue <haiyue.wang@intel.com>
> > Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin
> > <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> > <ferruh.yigit@intel.com>; stephen@networkplumber.org;
> barbette@kth.se;
> > Han, YingyaX <yingyax.han@intel.com>
> > Subject: RE: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> >
> > Hi, qi
> >
> > > -----Original Message-----
> > > From: Zhang, Qi Z <qi.z.zhang@intel.com>
> > > Sent: Thursday, September 17, 2020 7:03 PM
> > > To: Guo, Jia <jia.guo@intel.com>; Yang, Qiming
> > > <qiming.yang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Wu,
> > > Jingjing <jingjing.wu@intel.com>; Wang, Haiyue
> > > <haiyue.wang@intel.com>
> > > Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> > > <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin
> > > <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> > > <ferruh.yigit@intel.com>; stephen@networkplumber.org;
> > > barbette@kth.se; Han, YingyaX <yingyax.han@intel.com>
> > > Subject: RE: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> > >
> > >
> > >
> > > > -----Original Message-----
> > > > From: Guo, Jia <jia.guo@intel.com>
> > > > Sent: Thursday, September 17, 2020 3:59 PM
> > > > To: Yang, Qiming <qiming.yang@intel.com>; Xing, Beilei
> > > > <beilei.xing@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Wu,
> > > > Jingjing <jingjing.wu@intel.com>; Wang, Haiyue
> > > > <haiyue.wang@intel.com>
> > > > Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> > > > <bruce.richardson@intel.com>; dev@dpdk.org; Guo, Jia
> > > > <jia.guo@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> > > > mb@smartsharesystems.com; Yigit, Ferruh <ferruh.yigit@intel.com>;
> > > > stephen@networkplumber.org; barbette@kth.se; Han, YingyaX
> > > > <yingyax.han@intel.com>
> > > > Subject: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> > > >
> > > > The limitation of burst size in vector rx was removed, since it
> > > > should retrieve as much received packets as possible. And also the
> > > > scattered receive path should use a wrapper function to achieve
> > > > the goal of burst maximizing. And do some code cleaning for vector rx
> path.
> > > >
> > > > Bugzilla ID: 516
> > > > Fixes: c68a52b8b38c ("net/ice: support vector SSE in Rx")
> > > > Fixes: ae60d3c9b227 ("net/ice: support Rx AVX2 vector")
> > > >
> > > > Signed-off-by: Jeff Guo <jia.guo@intel.com>
> > > > Tested-by: Yingya Han <yingyax.han@intel.com>
> > > > ---
> > > >  drivers/net/ice/ice_rxtx.h          |  1 +
> > > >  drivers/net/ice/ice_rxtx_vec_avx2.c | 23 ++++++------
> > > > drivers/net/ice/ice_rxtx_vec_sse.c  | 56
> > > > +++++++++++++++++++----------
> > > >  3 files changed, 49 insertions(+), 31 deletions(-)
> > > >
> > > > diff --git a/drivers/net/ice/ice_rxtx.h
> > > > b/drivers/net/ice/ice_rxtx.h index 2fdcfb7d0..3ef5f300d 100644
> > > > --- a/drivers/net/ice/ice_rxtx.h
> > > > +++ b/drivers/net/ice/ice_rxtx.h
> > > > @@ -35,6 +35,7 @@
> > > >  #define ICE_MAX_RX_BURST            ICE_RXQ_REARM_THRESH
> > > >  #define ICE_TX_MAX_FREE_BUF_SZ      64
> > > >  #define ICE_DESCS_PER_LOOP          4
> > > > +#define ICE_DESCS_PER_LOOP_AVX	    8
> > >
> > > No need to expose this if no external link, better to keep all avx
> > > stuff inside avx.c
> > >
> >
> > Ok, so define it in avx.c is the best choice if avx should not in rxtx.h.
> >
> > > >
> > > >  #define ICE_FDIR_PKT_LEN	512
> > > >
> > > > diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > > b/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > > index be50677c2..843e4f32a 100644
> > > > --- a/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > > +++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > > @@ -29,7 +29,7 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
> > > >  			__m128i dma_addr0;
> > > >
> > > >  			dma_addr0 = _mm_setzero_si128();
> > > > -			for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
> > > > +			for (i = 0; i < ICE_DESCS_PER_LOOP_AVX; i++) {
> > > >  				rxep[i].mbuf = &rxq->fake_mbuf;
> > > >  				_mm_store_si128((__m128i *)&rxdp[i].read,
> > > >  						dma_addr0);
> > > > @@ -132,12 +132,17 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
> > > >  	ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);  }
> > > >
> > > > +/**
> > > > + * vPMD raw receive routine, only accept(nb_pkts >=
> > > > +ICE_DESCS_PER_LOOP_AVX)
> > > > + *
> > > > + * Notice:
> > > > + * - nb_pkts < ICE_DESCS_PER_LOOP_AVX, just return no packet
> > > > + * - floor align nb_pkts to a ICE_DESCS_PER_LOOP_AVX power-of-two
> > > > +*/
> > >
> > > The comment is misleading, it looks like we are going to floor align
> > > nb_pkts to 2^8, better to reword .
> > >
> >
> > It should be, agree.
> >
> > > >  static inline uint16_t
> > > >  _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct
> > > > rte_mbuf **rx_pkts,
> > > >  			    uint16_t nb_pkts, uint8_t *split_packet)  { -#define
> > > > ICE_DESCS_PER_LOOP_AVX 8
> > > > -
> > > >  	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
> > > >  	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
> > > >  			0, rxq->mbuf_initializer);
> > > > @@ -603,10 +608,6 @@ _ice_recv_raw_pkts_vec_avx2(struct
> > > ice_rx_queue
> > > > *rxq, struct rte_mbuf **rx_pkts,
> > > >  	return received;
> > > >  }
> > > >
> > > > -/*
> > > > - * Notice:
> > > > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> > > > - */
> > > >  uint16_t
> > > >  ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
> > > >  		       uint16_t nb_pkts)
> > > > @@ -616,8 +617,6 @@ ice_recv_pkts_vec_avx2(void *rx_queue, struct
> > > > rte_mbuf **rx_pkts,
> > > >
> > > >  /**
> > > >   * vPMD receive routine that reassembles single burst of 32
> > > > scattered packets
> > > > - * Notice:
> > > > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> > > >   */
> > >
> > > Why we need to remove this? is it still true for this function?
> > >
> >
> > The reason is that this comment is in the calling function "
> > _ice_recv_raw_pkts_vec_avx2" which process the related thing, no need
> > to add it more and more in the caller function.
> 
> I think you remove related comment from the calling function also :)
> 
> Also I think better to keep this even it's a little bit duplicate, that help people
> to understand the internal logic
> 
> >
> > > >  static uint16_t
> > > >  ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf
> > > > **rx_pkts, @@ -626,6 +625,9 @@
> > > ice_recv_scattered_burst_vec_avx2(void
> > > > *rx_queue, struct rte_mbuf **rx_pkts,
> > > >  	struct ice_rx_queue *rxq = rx_queue;
> > > >  	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
> > > >
> > > > +	/* split_flags only can support max of ICE_VPMD_RX_BURST */
> > > > +	nb_pkts = RTE_MIN(nb_pkts, ICE_VPMD_RX_BURST);
> > >
> > > Is this necessary?  the only consumer of this function is
> > > ice_recv_scattered_pkts_vec_avx2, I think nb_pkts <=
> > > ICE_VPMD_RX_BURST it already be guaranteed.
> >
> > The reason is that we remove "nb_pkts <= ICE_VPMD_RX_BURST" and in
> > this function split_flags have a limit for ICE_VPMD_RX_BURST, so a
> > checking is need in the function.
> 
> Can't get this, could tell me is there any case that nb_pkts >
> ICE_VPMD_RX_BURST?
> 

I know we just set a hard value here and there is currently only one usage, but I think only the caller knows what the input parameter will be; the called function should not assume what the input will be. Even if there were no callers at all, the called function would still need to be complete in itself.

> 
> >
> > > > +
> > > >  	/* get some new buffers */
> > > >  	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx2(rxq, rx_pkts,
> > > nb_pkts,
> > > >  						       split_flags);
> > > > @@ -657,9 +659,6 @@ ice_recv_scattered_burst_vec_avx2(void
> > > *rx_queue,
> > > > struct rte_mbuf **rx_pkts,
> > > >
> > > >  /**
> > > >   * vPMD receive routine that reassembles scattered packets.
> > > > - * Main receive routine that can handle arbitrary burst sizes
> > > > - * Notice:
> > > > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> > > >   */
> > >
> > > Why we need to remove this? isn't it the main routine that be able
> > > to handle arbitrary burst size?
> > >
> >
> > The question is why we need to said the arbitrary sizes if we process
> > and return what we could receive packet for maximum? It is not only
> > useless comment but also maybe bring some confuse I think.
> 
> Yes arbitrary size description can be removed, as this is assumed to be the
> default behavior.
> But the description for nb_pkts should still be kept.
> 
> >
> > > Btw, I will suggest all AVX2 changes can be in a separate patch,
> > > because this looks like some code clean and fix.
> > > its not related with the main purpose of the patch set.
> >
> > I consider it and ask any objection before, so totally I am not
> > disagree on separate it, but I think if  the purpose of the patch set
> > is to clean some misleading for vec(sse/avx) burst, it could still be
> > on a set even separate it to patch.
> 
> I will not be insist on patch separate, but if you separate them, some of fixes
> can be merged early and no need to wait for those part need more review.

OK, it seems there is still something to discuss on the code-cleaning patch; let me separate it out for better review.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [PATCH v4 4/5] net/ice: fix vector rx burst for ice
  2020-09-18  4:41           ` Guo, Jia
@ 2020-09-18  5:39             ` Zhang, Qi Z
  0 siblings, 0 replies; 48+ messages in thread
From: Zhang, Qi Z @ 2020-09-18  5:39 UTC (permalink / raw)
  To: Guo, Jia, Yang, Qiming, Xing, Beilei, Wu, Jingjing, Wang, Haiyue
  Cc: Zhao1, Wei, Richardson, Bruce, dev, Zhang, Helin, mb, Yigit,
	Ferruh, stephen, barbette, Han, YingyaX



> -----Original Message-----
> From: Guo, Jia <jia.guo@intel.com>
> Sent: Friday, September 18, 2020 12:41 PM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>; Yang, Qiming
> <qiming.yang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Wu, Jingjing
> <jingjing.wu@intel.com>; Wang, Haiyue <haiyue.wang@intel.com>
> Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin
> <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> <ferruh.yigit@intel.com>; stephen@networkplumber.org; barbette@kth.se;
> Han, YingyaX <yingyax.han@intel.com>
> Subject: RE: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> 
> 
> > -----Original Message-----
> > From: Zhang, Qi Z <qi.z.zhang@intel.com>
> > Sent: Friday, September 18, 2020 11:41 AM
> > To: Guo, Jia <jia.guo@intel.com>; Yang, Qiming
> > <qiming.yang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Wu,
> > Jingjing <jingjing.wu@intel.com>; Wang, Haiyue <haiyue.wang@intel.com>
> > Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin
> > <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> > <ferruh.yigit@intel.com>; stephen@networkplumber.org; barbette@kth.se;
> > Han, YingyaX <yingyax.han@intel.com>
> > Subject: RE: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> >
> >
> >
> > > -----Original Message-----
> > > From: Guo, Jia <jia.guo@intel.com>
> > > Sent: Friday, September 18, 2020 11:20 AM
> > > To: Zhang, Qi Z <qi.z.zhang@intel.com>; Yang, Qiming
> > > <qiming.yang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Wu,
> > > Jingjing <jingjing.wu@intel.com>; Wang, Haiyue
> > > <haiyue.wang@intel.com>
> > > Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> > > <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin
> > > <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> > > <ferruh.yigit@intel.com>; stephen@networkplumber.org;
> > barbette@kth.se;
> > > Han, YingyaX <yingyax.han@intel.com>
> > > Subject: RE: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> > >
> > > Hi, qi
> > >
> > > > -----Original Message-----
> > > > From: Zhang, Qi Z <qi.z.zhang@intel.com>
> > > > Sent: Thursday, September 17, 2020 7:03 PM
> > > > To: Guo, Jia <jia.guo@intel.com>; Yang, Qiming
> > > > <qiming.yang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Wu,
> > > > Jingjing <jingjing.wu@intel.com>; Wang, Haiyue
> > > > <haiyue.wang@intel.com>
> > > > Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> > > > <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin
> > > > <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> > > > <ferruh.yigit@intel.com>; stephen@networkplumber.org;
> > > > barbette@kth.se; Han, YingyaX <yingyax.han@intel.com>
> > > > Subject: RE: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> > > >
> > > >
> > > >
> > > > > -----Original Message-----
> > > > > From: Guo, Jia <jia.guo@intel.com>
> > > > > Sent: Thursday, September 17, 2020 3:59 PM
> > > > > To: Yang, Qiming <qiming.yang@intel.com>; Xing, Beilei
> > > > > <beilei.xing@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Wu,
> > > > > Jingjing <jingjing.wu@intel.com>; Wang, Haiyue
> > > > > <haiyue.wang@intel.com>
> > > > > Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> > > > > <bruce.richardson@intel.com>; dev@dpdk.org; Guo, Jia
> > > > > <jia.guo@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> > > > > mb@smartsharesystems.com; Yigit, Ferruh
> > > > > <ferruh.yigit@intel.com>; stephen@networkplumber.org;
> > > > > barbette@kth.se; Han, YingyaX <yingyax.han@intel.com>
> > > > > Subject: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> > > > >
> > > > > The limitation of burst size in vector rx was removed, since it
> > > > > should retrieve as much received packets as possible. And also
> > > > > the scattered receive path should use a wrapper function to
> > > > > achieve the goal of burst maximizing. And do some code cleaning
> > > > > for vector rx
> > path.
> > > > >
> > > > > Bugzilla ID: 516
> > > > > Fixes: c68a52b8b38c ("net/ice: support vector SSE in Rx")
> > > > > Fixes: ae60d3c9b227 ("net/ice: support Rx AVX2 vector")
> > > > >
> > > > > Signed-off-by: Jeff Guo <jia.guo@intel.com>
> > > > > Tested-by: Yingya Han <yingyax.han@intel.com>
> > > > > ---
> > > > >  drivers/net/ice/ice_rxtx.h          |  1 +
> > > > >  drivers/net/ice/ice_rxtx_vec_avx2.c | 23 ++++++------
> > > > > drivers/net/ice/ice_rxtx_vec_sse.c  | 56
> > > > > +++++++++++++++++++----------
> > > > >  3 files changed, 49 insertions(+), 31 deletions(-)
> > > > >
> > > > > diff --git a/drivers/net/ice/ice_rxtx.h
> > > > > b/drivers/net/ice/ice_rxtx.h index 2fdcfb7d0..3ef5f300d 100644
> > > > > --- a/drivers/net/ice/ice_rxtx.h
> > > > > +++ b/drivers/net/ice/ice_rxtx.h
> > > > > @@ -35,6 +35,7 @@
> > > > >  #define ICE_MAX_RX_BURST
> ICE_RXQ_REARM_THRESH
> > > > >  #define ICE_TX_MAX_FREE_BUF_SZ      64
> > > > >  #define ICE_DESCS_PER_LOOP          4
> > > > > +#define ICE_DESCS_PER_LOOP_AVX	    8
> > > >
> > > > No need to expose this if no external link, better to keep all avx
> > > > stuff inside avx.c
> > > >
> > >
> > > Ok, so define it in avx.c is the best choice if avx should not in rxtx.h.
> > >
> > > > >
> > > > >  #define ICE_FDIR_PKT_LEN	512
> > > > >
> > > > > diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > > > b/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > > > index be50677c2..843e4f32a 100644
> > > > > --- a/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > > > +++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > > > @@ -29,7 +29,7 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
> > > > >  			__m128i dma_addr0;
> > > > >
> > > > >  			dma_addr0 = _mm_setzero_si128();
> > > > > -			for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
> > > > > +			for (i = 0; i < ICE_DESCS_PER_LOOP_AVX; i++) {
> > > > >  				rxep[i].mbuf = &rxq->fake_mbuf;
> > > > >  				_mm_store_si128((__m128i *)&rxdp[i].read,
> > > > >  						dma_addr0);
> > > > > @@ -132,12 +132,17 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
> > > > >  	ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);  }
> > > > >
> > > > > +/**
> > > > > + * vPMD raw receive routine, only accept(nb_pkts >=
> > > > > +ICE_DESCS_PER_LOOP_AVX)
> > > > > + *
> > > > > + * Notice:
> > > > > + * - nb_pkts < ICE_DESCS_PER_LOOP_AVX, just return no packet
> > > > > + * - floor align nb_pkts to a ICE_DESCS_PER_LOOP_AVX
> > > > > +power-of-two */
> > > >
> > > > The comment is misleading, it looks like we are going to floor
> > > > align nb_pkts to 2^8, better to reword .
> > > >
> > >
> > > It should be, agree.
> > >
> > > > >  static inline uint16_t
> > > > >  _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct
> > > > > rte_mbuf **rx_pkts,
> > > > >  			    uint16_t nb_pkts, uint8_t *split_packet)  { -#define
> > > > > ICE_DESCS_PER_LOOP_AVX 8
> > > > > -
> > > > >  	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
> > > > >  	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
> > > > >  			0, rxq->mbuf_initializer);
> > > > > @@ -603,10 +608,6 @@ _ice_recv_raw_pkts_vec_avx2(struct
> > > > ice_rx_queue
> > > > > *rxq, struct rte_mbuf **rx_pkts,
> > > > >  	return received;
> > > > >  }
> > > > >
> > > > > -/*
> > > > > - * Notice:
> > > > > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> > > > > - */
> > > > >  uint16_t
> > > > >  ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
> > > > >  		       uint16_t nb_pkts)
> > > > > @@ -616,8 +617,6 @@ ice_recv_pkts_vec_avx2(void *rx_queue,
> > > > > struct rte_mbuf **rx_pkts,
> > > > >
> > > > >  /**
> > > > >   * vPMD receive routine that reassembles single burst of 32
> > > > > scattered packets
> > > > > - * Notice:
> > > > > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> > > > >   */
> > > >
> > > > Why we need to remove this? is it still true for this function?
> > > >
> > >
> > > The reason is that this comment is in the calling function "
> > > _ice_recv_raw_pkts_vec_avx2" which process the related thing, no
> > > need to add it more and more in the caller function.
> >
> > I think you remove related comment from the calling function also :)
> >
> > Also I think better to keep this even it's a little bit duplicate,
> > that help people to understand the internal logic
> >
> > >
> > > > >  static uint16_t
> > > > >  ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct
> > > > > rte_mbuf **rx_pkts, @@ -626,6 +625,9 @@
> > > > ice_recv_scattered_burst_vec_avx2(void
> > > > > *rx_queue, struct rte_mbuf **rx_pkts,
> > > > >  	struct ice_rx_queue *rxq = rx_queue;
> > > > >  	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
> > > > >
> > > > > +	/* split_flags only can support max of ICE_VPMD_RX_BURST */
> > > > > +	nb_pkts = RTE_MIN(nb_pkts, ICE_VPMD_RX_BURST);
> > > >
> > > > Is this necessary?  the only consumer of this function is
> > > > ice_recv_scattered_pkts_vec_avx2, I think nb_pkts <=
> > > > ICE_VPMD_RX_BURST it already be guaranteed.
> > >
> > > The reason is that we remove "nb_pkts <= ICE_VPMD_RX_BURST" and in
> > > this function split_flags have a limit for ICE_VPMD_RX_BURST, so a
> > > checking is need in the function.
> >
> > Can't get this, could tell me is there any case that nb_pkts >
> > ICE_VPMD_RX_BURST?
> >
> 
> I know we just set the hard value here and only one case usage, but I think only
> the caller know what would be the input param, but the calling should not know
> the input param will be, even there is no any caller but the calling still need to
> be complete.

It's in the data path where performance is sensitive, and this is just an internal function — we know all the details, so skipping the unnecessary check is reasonable. 
To avoid bugs and give a necessary warning for future scaling, I think RTE_ASSERT is the right way.
> 
> >
> > >
> > > > > +
> > > > >  	/* get some new buffers */
> > > > >  	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx2(rxq, rx_pkts,
> > > > nb_pkts,
> > > > >  						       split_flags);
> > > > > @@ -657,9 +659,6 @@ ice_recv_scattered_burst_vec_avx2(void
> > > > *rx_queue,
> > > > > struct rte_mbuf **rx_pkts,
> > > > >
> > > > >  /**
> > > > >   * vPMD receive routine that reassembles scattered packets.
> > > > > - * Main receive routine that can handle arbitrary burst sizes
> > > > > - * Notice:
> > > > > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> > > > >   */
> > > >
> > > > Why we need to remove this? isn't it the main routine that be able
> > > > to handle arbitrary burst size?
> > > >
> > >
> > > The question is why we need to said the arbitrary sizes if we
> > > process and return what we could receive packet for maximum? It is
> > > not only useless comment but also maybe bring some confuse I think.
> >
> > Yes arbitrary size description can be removed, as this is assumed to
> > be the default behavior.
> > But the description for nb_pkts should still be kept.
> >
> > >
> > > > Btw, I will suggest all AVX2 changes can be in a separate patch,
> > > > because this looks like some code clean and fix.
> > > > its not related with the main purpose of the patch set.
> > >
> > > I consider it and ask any objection before, so totally I am not
> > > disagree on separate it, but I think if  the purpose of the patch
> > > set is to clean some misleading for vec(sse/avx) burst, it could
> > > still be on a set even separate it to patch.
> >
> > I will not be insist on patch separate, but if you separate them, some
> > of fixes can be merged early and no need to wait for those part need more
> review.
> 
> Ok, seems that there still something discuss on the code cleaning patch, let me
> separate it for better review.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v5 0/5] fix vector rx burst for PMDs
  2020-08-27  7:54 [dpdk-dev] [PATCH v1 0/4] maximize vector rx burst for PMDs Jeff Guo
                   ` (6 preceding siblings ...)
  2020-09-17  7:58 ` [dpdk-dev] [PATCH v4 " Jeff Guo
@ 2020-10-16  9:44 ` Jeff Guo
  2020-10-16  9:44   ` [dpdk-dev] [PATCH v5 1/5] net/ixgbe: fix vector rx burst for ixgbe Jeff Guo
                     ` (5 more replies)
  7 siblings, 6 replies; 48+ messages in thread
From: Jeff Guo @ 2020-10-16  9:44 UTC (permalink / raw)
  To: jingjing.wu, qi.z.zhang, beilei.xing, haiyue.wang, qiming.yang
  Cc: dev, ferruh.yigit, mb, stephen, barbette, Feifei.wang2,
	bruce.richardson, jia.guo, helin.zhang

The limitation of burst size in vector rx was removed, since it should
retrieve as much received packets as possible. And also the scattered
receive path should use a wrapper function to achieve the goal of burst
maximizing.

This patch set aims to maximize vector rx burst for
ixgbe/i40e/ice/iavf/fm10k PMDs.

Bugzilla ID: 516

v5->v4:
split patch set, this patch only for max burst size issue.
Add back Acked-by which has been added at v2.

v4->v3:
add Fixes tag.

v3->v2:
1:move define into header file.
2:delete some useless doc.

v2->v1:
1:add fm10k driver case
2:refine some doc

Jeff Guo (5):
  net/ixgbe: fix vector rx burst for ixgbe
  net/i40e: fix vector rx burst for i40e
  net/ice: fix vector rx burst for ice
  net/fm10k: fix vector rx burst for fm10k
  net/iavf: fix vector rx burst for iavf

 drivers/net/fm10k/fm10k_rxtx_vec.c       |  39 +++++++--
 drivers/net/i40e/i40e_rxtx_vec_altivec.c |  59 +++++++++----
 drivers/net/i40e/i40e_rxtx_vec_neon.c    |  48 ++++++++---
 drivers/net/i40e/i40e_rxtx_vec_sse.c     |  48 ++++++++---
 drivers/net/iavf/iavf_rxtx_vec_sse.c     | 103 +++++++++++++++++------
 drivers/net/ice/ice_rxtx_vec_sse.c       |  46 +++++++---
 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c  |  61 +++++++++-----
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c   |  47 +++++++----
 8 files changed, 328 insertions(+), 123 deletions(-)

-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v5 1/5] net/ixgbe: fix vector rx burst for ixgbe
  2020-10-16  9:44 ` [dpdk-dev] [PATCH v5 0/5] fix vector rx burst for PMDs Jeff Guo
@ 2020-10-16  9:44   ` Jeff Guo
  2020-10-16  9:44   ` [dpdk-dev] [PATCH v5 2/5] net/i40e: fix vector rx burst for i40e Jeff Guo
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 48+ messages in thread
From: Jeff Guo @ 2020-10-16  9:44 UTC (permalink / raw)
  To: jingjing.wu, qi.z.zhang, beilei.xing, haiyue.wang, qiming.yang
  Cc: dev, ferruh.yigit, mb, stephen, barbette, Feifei.wang2,
	bruce.richardson, jia.guo, helin.zhang

The limitation of burst size in vector rx was removed, since it should
retrieve as much received packets as possible. And also the scattered
receive path should use a wrapper function to achieve the goal of
burst maximizing.

Bugzilla ID: 516
Fixes: b20971b6cca0 ("net/ixgbe: implement vector driver for ARM")
Fixes: 0e51f9dc4860 ("net/ixgbe: rename x86 vector driver file")

Signed-off-by: Jeff Guo <jia.guo@intel.com>
Tested-by: Feifei Wang <Feifei.wang2@arm.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 61 +++++++++++++++----------
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c  | 47 +++++++++++++------
 2 files changed, 70 insertions(+), 38 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
index aa27ee1777..4c81ae9dcf 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
@@ -130,17 +130,6 @@ desc_to_olflags_v(uint8x16x2_t sterr_tmp1, uint8x16x2_t sterr_tmp2,
 	rx_pkts[3]->ol_flags = vol.e[3];
 }
 
-/*
- * vPMD raw receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
- *
- * Notice:
- * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
- * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
- * - don't support ol_flags for rss and csum err
- */
-
 #define IXGBE_VPMD_DESC_EOP_MASK	0x02020202
 #define IXGBE_UINT8_BIT			(CHAR_BIT * sizeof(uint8_t))
 
@@ -206,6 +195,13 @@ desc_to_ptype_v(uint64x2_t descs[4], uint16_t pkt_type_mask,
 				vgetq_lane_u32(tunnel_check, 3));
 }
 
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
+ *
+ * Notice:
+ * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
+ * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
+ */
 static inline uint16_t
 _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		   uint16_t nb_pkts, uint8_t *split_packet)
@@ -226,9 +222,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	uint16x8_t crc_adjust = {0, 0, rxq->crc_len, 0,
 				 rxq->crc_len, 0, 0, 0};
 
-	/* nb_pkts shall be less equal than RTE_IXGBE_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);
 
@@ -382,13 +375,11 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
-/*
+/**
  * vPMD receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
  *
  * Notice:
  * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
  * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
  * - don't support ol_flags for rss and csum err
  */
@@ -399,19 +390,17 @@ ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/*
+/**
  * vPMD receive routine that reassembles scattered packets
  *
  * Notice:
  * - don't support ol_flags for rss and csum err
  * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
  * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
  */
-uint16_t
-ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-		uint16_t nb_pkts)
+static uint16_t
+ixgbe_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			       uint16_t nb_pkts)
 {
 	struct ixgbe_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[RTE_IXGBE_MAX_RX_BURST] = {0};
@@ -443,6 +432,32 @@ ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_IXGBE_MAX_RX_BURST) {
+		uint16_t burst;
+
+		burst = ixgbe_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       RTE_IXGBE_MAX_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_IXGBE_MAX_RX_BURST)
+			return retval;
+	}
+
+	return retval + ixgbe_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       nb_pkts);
+}
+
 static inline void
 vtx1(volatile union ixgbe_adv_tx_desc *txdp,
 		struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
index e77a7f31ce..2bea39a41c 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
@@ -302,13 +302,11 @@ desc_to_ptype_v(__m128i descs[4], uint16_t pkt_type_mask,
 		get_packet_type(3, pkt_info, etqf_check, tunnel_check);
 }
 
-/*
+/**
  * vPMD raw receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
  *
  * Notice:
  * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
  * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
  */
 static inline uint16_t
@@ -344,9 +342,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	__m128i mbuf_init;
 	uint8_t vlan_flags;
 
-	/* nb_pkts shall be less equal than RTE_IXGBE_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);
 
@@ -556,13 +551,11 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
-/*
+/**
  * vPMD receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
  *
  * Notice:
  * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
  * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
  */
 uint16_t
@@ -572,18 +565,16 @@ ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/*
+/**
  * vPMD receive routine that reassembles scattered packets
  *
  * Notice:
  * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
  * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
  */
-uint16_t
-ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-		uint16_t nb_pkts)
+static uint16_t
+ixgbe_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			       uint16_t nb_pkts)
 {
 	struct ixgbe_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[RTE_IXGBE_MAX_RX_BURST] = {0};
@@ -615,6 +606,32 @@ ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_IXGBE_MAX_RX_BURST) {
+		uint16_t burst;
+
+		burst = ixgbe_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       RTE_IXGBE_MAX_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_IXGBE_MAX_RX_BURST)
+			return retval;
+	}
+
+	return retval + ixgbe_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       nb_pkts);
+}
+
 static inline void
 vtx1(volatile union ixgbe_adv_tx_desc *txdp,
 		struct rte_mbuf *pkt, uint64_t flags)
-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v5 2/5] net/i40e: fix vector rx burst for i40e
  2020-10-16  9:44 ` [dpdk-dev] [PATCH v5 0/5] fix vector rx burst for PMDs Jeff Guo
  2020-10-16  9:44   ` [dpdk-dev] [PATCH v5 1/5] net/ixgbe: fix vector rx burst for ixgbe Jeff Guo
@ 2020-10-16  9:44   ` Jeff Guo
  2020-10-16  9:44   ` [dpdk-dev] [PATCH v5 3/5] net/ice: fix vector rx burst for ice Jeff Guo
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 48+ messages in thread
From: Jeff Guo @ 2020-10-16  9:44 UTC (permalink / raw)
  To: jingjing.wu, qi.z.zhang, beilei.xing, haiyue.wang, qiming.yang
  Cc: dev, ferruh.yigit, mb, stephen, barbette, Feifei.wang2,
	bruce.richardson, jia.guo, helin.zhang

The limitation of burst size in vector rx was removed, since it should
retrieve as much received packets as possible. And also the scattered
receive path should use a wrapper function to achieve the goal of
burst maximizing.

Bugzilla ID: 516
Fixes: 5b463eda8d26 ("net/i40e: make vector driver filenames consistent")
Fixes: ae0eb310f253 ("net/i40e: implement vector PMD for ARM")
Fixes: c3def6a8724c ("net/i40e: implement vector PMD for altivec")

Signed-off-by: Jeff Guo <jia.guo@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
 drivers/net/i40e/i40e_rxtx_vec_altivec.c | 59 +++++++++++++++++-------
 drivers/net/i40e/i40e_rxtx_vec_neon.c    | 48 ++++++++++++++-----
 drivers/net/i40e/i40e_rxtx_vec_sse.c     | 48 ++++++++++++++-----
 3 files changed, 114 insertions(+), 41 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx_vec_altivec.c b/drivers/net/i40e/i40e_rxtx_vec_altivec.c
index 6862a017e1..d3238bfb6a 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_altivec.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_altivec.c
@@ -188,11 +188,13 @@ desc_to_ptype_v(vector unsigned long descs[4], struct rte_mbuf **rx_pkts,
 		ptype_tbl[(*(vector unsigned char *)&ptype1)[8]];
 }
 
- /* Notice:
-  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
-  * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
-  *   numbers of DD bits
-  */
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_I40E_DESCS_PER_LOOP)
+ *
+ * Notice:
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+ * - floor align nb_pkts to a RTE_I40E_DESCS_PER_LOOP power-of-two
+ */
 static inline uint16_t
 _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		   uint16_t nb_pkts, uint8_t *split_packet)
@@ -214,9 +216,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		};
 	vector unsigned long dd_check, eop_check;
 
-	/* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
 
@@ -459,15 +458,15 @@ i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
- /* vPMD receive routine that reassembles scattered packets
-  * Notice:
-  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
-  * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
-  *   numbers of DD bits
-  */
-uint16_t
-i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			     uint16_t nb_pkts)
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
+ *
+ * Notice:
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+ */
+static uint16_t
+i40e_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
 {
 	struct i40e_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};
@@ -500,6 +499,32 @@ i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      RTE_I40E_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_I40E_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
 static inline void
 vtx1(volatile struct i40e_tx_desc *txdp,
 	struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c
index 543ecadb07..f094de69ae 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_neon.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c
@@ -187,11 +187,12 @@ desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **__rte_restrict rx_pkts,
 
 }
 
- /*
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_I40E_DESCS_PER_LOOP)
+ *
  * Notice:
  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
+ * - floor align nb_pkts to a RTE_I40E_DESCS_PER_LOOP power-of-two
  */
 static inline uint16_t
 _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
@@ -230,9 +231,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
 		0, 0, 0       /* ignore non-length fields */
 		};
 
-	/* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
 
@@ -439,15 +437,15 @@ i40e_recv_pkts_vec(void *__rte_restrict rx_queue,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
- /* vPMD receive routine that reassembles scattered packets
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
+ *
  * Notice:
  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
  */
-uint16_t
-i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			     uint16_t nb_pkts)
+static uint16_t
+i40e_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
 {
 
 	struct i40e_rx_queue *rxq = rx_queue;
@@ -482,6 +480,32 @@ i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      RTE_I40E_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_I40E_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
 static inline void
 vtx1(volatile struct i40e_tx_desc *txdp,
 		struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 240ce478ab..4b2b6a28fc 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -342,11 +342,12 @@ desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
 	rx_pkts[3]->packet_type = ptype_tbl[_mm_extract_epi8(ptype1, 8)];
 }
 
- /*
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_I40E_DESCS_PER_LOOP)
+ *
  * Notice:
  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
+ * - floor align nb_pkts to a RTE_I40E_DESCS_PER_LOOP power-of-two
  */
 static inline uint16_t
 _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
@@ -378,9 +379,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
 	__m128i dd_check, eop_check;
 
-	/* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
 
@@ -605,15 +603,15 @@ i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
- /* vPMD receive routine that reassembles scattered packets
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
+ *
  * Notice:
  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- *   numbers of DD bits
  */
-uint16_t
-i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			     uint16_t nb_pkts)
+static uint16_t
+i40e_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
 {
 
 	struct i40e_rx_queue *rxq = rx_queue;
@@ -648,6 +646,32 @@ i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      RTE_I40E_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_I40E_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + i40e_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
 static inline void
 vtx1(volatile struct i40e_tx_desc *txdp,
 		struct rte_mbuf *pkt, uint64_t flags)
-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v5 3/5] net/ice: fix vector rx burst for ice
  2020-10-16  9:44 ` [dpdk-dev] [PATCH v5 0/5] fix vector rx burst for PMDs Jeff Guo
  2020-10-16  9:44   ` [dpdk-dev] [PATCH v5 1/5] net/ixgbe: fix vector rx burst for ixgbe Jeff Guo
  2020-10-16  9:44   ` [dpdk-dev] [PATCH v5 2/5] net/i40e: fix vector rx burst for i40e Jeff Guo
@ 2020-10-16  9:44   ` Jeff Guo
  2020-10-16  9:44   ` [dpdk-dev] [PATCH v5 4/5] net/fm10k: fix vector rx burst for fm10k Jeff Guo
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 48+ messages in thread
From: Jeff Guo @ 2020-10-16  9:44 UTC (permalink / raw)
  To: jingjing.wu, qi.z.zhang, beilei.xing, haiyue.wang, qiming.yang
  Cc: dev, ferruh.yigit, mb, stephen, barbette, Feifei.wang2,
	bruce.richardson, jia.guo, helin.zhang, Yingya Han

The limitation of burst size in vector rx was removed, since it should
retrieve as many received packets as possible. In addition, the scattered
receive path should use a wrapper function to achieve the goal of
maximizing the burst size.

Bugzilla ID: 516
Fixes: c68a52b8b38c ("net/ice: support vector SSE in Rx")

Signed-off-by: Jeff Guo <jia.guo@intel.com>
Tested-by: Yingya Han <yingyax.han@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
 drivers/net/ice/ice_rxtx_vec_sse.c | 46 +++++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ice/ice_rxtx_vec_sse.c b/drivers/net/ice/ice_rxtx_vec_sse.c
index 1afd96ac9d..e950c1b922 100644
--- a/drivers/net/ice/ice_rxtx_vec_sse.c
+++ b/drivers/net/ice/ice_rxtx_vec_sse.c
@@ -254,10 +254,11 @@ ice_rx_desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
 }
 
 /**
+ * vPMD raw receive routine, only accept(nb_pkts >= ICE_DESCS_PER_LOOP)
+ *
  * Notice:
  * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
- *   numbers of DD bits
+ * - floor align nb_pkts to a ICE_DESCS_PER_LOOP power-of-two
  */
 static inline uint16_t
 _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
@@ -314,9 +315,6 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
 						 0x0000000200000002LL);
 
-	/* nb_pkts shall be less equal than ICE_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, ICE_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP);
 
@@ -560,15 +558,15 @@ ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _ice_recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/* vPMD receive routine that reassembles scattered packets
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
+ *
  * Notice:
  * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
- *   numbers of DD bits
  */
-uint16_t
-ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			    uint16_t nb_pkts)
+static uint16_t
+ice_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
 {
 	struct ice_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
@@ -602,6 +600,32 @@ ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 					     &split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			    uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > ICE_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = ice_recv_scattered_burst_vec(rx_queue,
+						     rx_pkts + retval,
+						     ICE_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < ICE_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + ice_recv_scattered_burst_vec(rx_queue,
+						     rx_pkts + retval,
+						     nb_pkts);
+}
+
 static inline void
 ice_vtx1(volatile struct ice_tx_desc *txdp, struct rte_mbuf *pkt,
 	 uint64_t flags)
-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v5 4/5] net/fm10k: fix vector rx burst for fm10k
  2020-10-16  9:44 ` [dpdk-dev] [PATCH v5 0/5] fix vector rx burst for PMDs Jeff Guo
                     ` (2 preceding siblings ...)
  2020-10-16  9:44   ` [dpdk-dev] [PATCH v5 3/5] net/ice: fix vector rx burst for ice Jeff Guo
@ 2020-10-16  9:44   ` Jeff Guo
  2020-10-16  9:44   ` [dpdk-dev] [PATCH v5 5/5] net/iavf: fix vector rx burst for iavf Jeff Guo
  2020-10-23 10:11   ` [dpdk-dev] [PATCH v5 0/5] fix vector rx burst for PMDs Zhang, Qi Z
  5 siblings, 0 replies; 48+ messages in thread
From: Jeff Guo @ 2020-10-16  9:44 UTC (permalink / raw)
  To: jingjing.wu, qi.z.zhang, beilei.xing, haiyue.wang, qiming.yang
  Cc: dev, ferruh.yigit, mb, stephen, barbette, Feifei.wang2,
	bruce.richardson, jia.guo, helin.zhang

The scattered receive path should use a wrapper function to achieve the
goal of maximizing the burst size.

Bugzilla ID: 516
Fixes: fe65e1e1ce61 ("fm10k: add vector scatter Rx")

Signed-off-by: Jeff Guo <jia.guo@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
 drivers/net/fm10k/fm10k_rxtx_vec.c | 39 ++++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/drivers/net/fm10k/fm10k_rxtx_vec.c b/drivers/net/fm10k/fm10k_rxtx_vec.c
index eff3933b5c..6fcc939ad9 100644
--- a/drivers/net/fm10k/fm10k_rxtx_vec.c
+++ b/drivers/net/fm10k/fm10k_rxtx_vec.c
@@ -645,18 +645,15 @@ fm10k_reassemble_packets(struct fm10k_rx_queue *rxq,
 	return pkt_idx;
 }
 
-/*
- * vPMD receive routine that reassembles scattered packets
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
  *
  * Notice:
  * - don't support ol_flags for rss and csum err
- * - nb_pkts > RTE_FM10K_MAX_RX_BURST, only scan RTE_FM10K_MAX_RX_BURST
- *   numbers of DD bit
  */
-uint16_t
-fm10k_recv_scattered_pkts_vec(void *rx_queue,
-				struct rte_mbuf **rx_pkts,
-				uint16_t nb_pkts)
+static uint16_t
+fm10k_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			       uint16_t nb_pkts)
 {
 	struct fm10k_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[RTE_FM10K_MAX_RX_BURST] = {0};
@@ -691,6 +688,32 @@ fm10k_recv_scattered_pkts_vec(void *rx_queue,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+fm10k_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > RTE_FM10K_MAX_RX_BURST) {
+		uint16_t burst;
+
+		burst = fm10k_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       RTE_FM10K_MAX_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < RTE_FM10K_MAX_RX_BURST)
+			return retval;
+	}
+
+	return retval + fm10k_recv_scattered_burst_vec(rx_queue,
+						       rx_pkts + retval,
+						       nb_pkts);
+}
+
 static const struct fm10k_txq_ops vec_txq_ops = {
 	.reset = fm10k_reset_tx_queue,
 };
-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [dpdk-dev] [PATCH v5 5/5] net/iavf: fix vector rx burst for iavf
  2020-10-16  9:44 ` [dpdk-dev] [PATCH v5 0/5] fix vector rx burst for PMDs Jeff Guo
                     ` (3 preceding siblings ...)
  2020-10-16  9:44   ` [dpdk-dev] [PATCH v5 4/5] net/fm10k: fix vector rx burst for fm10k Jeff Guo
@ 2020-10-16  9:44   ` Jeff Guo
  2020-10-23  5:09     ` Ling, WeiX
  2020-10-23 10:11   ` [dpdk-dev] [PATCH v5 0/5] fix vector rx burst for PMDs Zhang, Qi Z
  5 siblings, 1 reply; 48+ messages in thread
From: Jeff Guo @ 2020-10-16  9:44 UTC (permalink / raw)
  To: jingjing.wu, qi.z.zhang, beilei.xing, haiyue.wang, qiming.yang
  Cc: dev, ferruh.yigit, mb, stephen, barbette, Feifei.wang2,
	bruce.richardson, jia.guo, helin.zhang

The limitation of burst size in vector rx was removed, since it should
retrieve as many received packets as possible. In addition, the scattered
receive path should use a wrapper function to achieve the goal of
maximizing the burst size.

Bugzilla ID: 516
Fixes: 319c421f3890 ("net/avf: enable SSE Rx Tx")
Fixes: 1162f5a0ef31 ("net/iavf: support flexible Rx descriptor in SSE path")
Fixes: 5b6e8859081d ("net/iavf: support flexible Rx descriptor in AVX path")

Signed-off-by: Jeff Guo <jia.guo@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
 drivers/net/iavf/iavf_rxtx_vec_sse.c | 103 ++++++++++++++++++++-------
 1 file changed, 78 insertions(+), 25 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx_vec_sse.c b/drivers/net/iavf/iavf_rxtx_vec_sse.c
index 85c5bd4af0..11acaa029e 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_sse.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_sse.c
@@ -379,10 +379,12 @@ flex_desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
 	rx_pkts[3]->packet_type = type_table[_mm_extract_epi16(ptype_all, 7)];
 }
 
-/* Notice:
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= IAVF_VPMD_DESCS_PER_LOOP)
+ *
+ * Notice:
  * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- *   numbers of DD bits
+ * - floor align nb_pkts to a IAVF_VPMD_DESCS_PER_LOOP power-of-two
  */
 static inline uint16_t
 _recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
@@ -413,9 +415,6 @@ _recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
 	__m128i dd_check, eop_check;
 
-	/* nb_pkts shall be less equal than IAVF_VPMD_RX_MAX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, IAVF_VPMD_RX_MAX_BURST);
-
 	/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_VPMD_DESCS_PER_LOOP);
 
@@ -627,10 +626,13 @@ _recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
-/* Notice:
+/**
+ * vPMD raw receive routine for flex RxD,
+ * only accept(nb_pkts >= IAVF_VPMD_DESCS_PER_LOOP)
+ *
+ * Notice:
  * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- *   numbers of DD bits
+ * - floor align nb_pkts to a IAVF_VPMD_DESCS_PER_LOOP power-of-two
  */
 static inline uint16_t
 _recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq,
@@ -688,9 +690,6 @@ _recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq,
 	const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
 						 0x0000000200000002LL);
 
-	/* nb_pkts shall be less equal than IAVF_VPMD_RX_MAX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, IAVF_VPMD_RX_MAX_BURST);
-
 	/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_VPMD_DESCS_PER_LOOP);
 
@@ -945,15 +944,15 @@ iavf_recv_pkts_vec_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec_flex_rxd(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/* vPMD receive routine that reassembles scattered packets
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
+ *
  * Notice:
  * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- *   numbers of DD bits
  */
-uint16_t
-iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			    uint16_t nb_pkts)
+static uint16_t
+iavf_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
 {
 	struct iavf_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
@@ -986,16 +985,43 @@ iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		&split_flags[i]);
 }
 
-/* vPMD receive routine that reassembles scattered packets for flex RxD
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
+		uint16_t burst;
+
+		burst = iavf_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      IAVF_VPMD_RX_MAX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < IAVF_VPMD_RX_MAX_BURST)
+			return retval;
+	}
+
+	return retval + iavf_recv_scattered_burst_vec(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
+ * for flex RxD
+ *
  * Notice:
  * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- *   numbers of DD bits
  */
-uint16_t
-iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
-				      struct rte_mbuf **rx_pkts,
-				      uint16_t nb_pkts)
+static uint16_t
+iavf_recv_scattered_burst_vec_flex_rxd(void *rx_queue,
+				       struct rte_mbuf **rx_pkts,
+				       uint16_t nb_pkts)
 {
 	struct iavf_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
@@ -1028,6 +1054,33 @@ iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
 		&split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets for flex RxD
+ */
+uint16_t
+iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
+				      struct rte_mbuf **rx_pkts,
+				      uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
+		uint16_t burst;
+
+		burst = iavf_recv_scattered_burst_vec_flex_rxd(rx_queue,
+						rx_pkts + retval,
+						IAVF_VPMD_RX_MAX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < IAVF_VPMD_RX_MAX_BURST)
+			return retval;
+	}
+
+	return retval + iavf_recv_scattered_burst_vec_flex_rxd(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
 static inline void
 vtx1(volatile struct iavf_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags)
 {
-- 
2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [PATCH v5 5/5] net/iavf: fix vector rx burst for iavf
  2020-10-16  9:44   ` [dpdk-dev] [PATCH v5 5/5] net/iavf: fix vector rx burst for iavf Jeff Guo
@ 2020-10-23  5:09     ` Ling, WeiX
  0 siblings, 0 replies; 48+ messages in thread
From: Ling, WeiX @ 2020-10-23  5:09 UTC (permalink / raw)
  To: Guo, Jia, Wu, Jingjing, Zhang, Qi Z, Xing, Beilei, Wang, Haiyue,
	Yang, Qiming
  Cc: dev, Yigit, Ferruh, mb, stephen, barbette, Feifei.wang2,
	Richardson, Bruce, Guo, Jia, Zhang, Helin

Tested-by: Ling, Wei <weix.ling@intel.com>

Regards,
Ling Wei

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Jeff Guo
> Sent: Friday, October 16, 2020 05:45 PM
> To: Wu, Jingjing <jingjing.wu@intel.com>; Zhang, Qi Z
> <qi.z.zhang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Wang, Haiyue
> <haiyue.wang@intel.com>; Yang, Qiming <qiming.yang@intel.com>
> Cc: dev@dpdk.org; Yigit, Ferruh <ferruh.yigit@intel.com>;
> mb@smartsharesystems.com; stephen@networkplumber.org;
> barbette@kth.se; Feifei.wang2@arm.com; Richardson, Bruce
> <bruce.richardson@intel.com>; Guo, Jia <jia.guo@intel.com>; Zhang, Helin
> <helin.zhang@intel.com>
> Subject: [dpdk-dev] [PATCH v5 5/5] net/iavf: fix vector rx burst for iavf
> 
> The limitation of burst size in vector rx was removed, since it should retrieve
> as much received packets as possible. And also the scattered receive path
> should use a wrapper function to achieve the goal of burst maximizing.
> 
> Bugzilla ID: 516
> Fixes: 319c421f3890 ("net/avf: enable SSE Rx Tx")
> Fixes: 1162f5a0ef31 ("net/iavf: support flexible Rx descriptor in SSE path")
> Fixes: 5b6e8859081d ("net/iavf: support flexible Rx descriptor in AVX path")
> 
> Signed-off-by: Jeff Guo <jia.guo@intel.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> ---
>  drivers/net/iavf/iavf_rxtx_vec_sse.c | 103 ++++++++++++++++++++-------
>  1 file changed, 78 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/net/iavf/iavf_rxtx_vec_sse.c
> b/drivers/net/iavf/iavf_rxtx_vec_sse.c
> index 85c5bd4af0..11acaa029e 100644
> --- a/drivers/net/iavf/iavf_rxtx_vec_sse.c
> +++ b/drivers/net/iavf/iavf_rxtx_vec_sse.c
> @@ -379,10 +379,12 @@ flex_desc_to_ptype_v(__m128i descs[4], struct
> rte_mbuf **rx_pkts,
>  	rx_pkts[3]->packet_type =
> type_table[_mm_extract_epi16(ptype_all, 7)];  }
> 
> -/* Notice:
> +/**
> + * vPMD raw receive routine, only accept(nb_pkts >=
> +IAVF_VPMD_DESCS_PER_LOOP)
> + *
> + * Notice:
>   * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
> - * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan
> IAVF_VPMD_RX_MAX_BURST
> - *   numbers of DD bits
> + * - floor align nb_pkts to a IAVF_VPMD_DESCS_PER_LOOP power-of-two
>   */
>  static inline uint16_t
>  _recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
> @@ -413,9 +415,6 @@ _recv_raw_pkts_vec(struct iavf_rx_queue *rxq,
> struct rte_mbuf **rx_pkts,
>  			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
>  	__m128i dd_check, eop_check;
> 
> -	/* nb_pkts shall be less equal than IAVF_VPMD_RX_MAX_BURST */
> -	nb_pkts = RTE_MIN(nb_pkts, IAVF_VPMD_RX_MAX_BURST);
> -
>  	/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP
> */
>  	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts,
> IAVF_VPMD_DESCS_PER_LOOP);
> 
> @@ -627,10 +626,13 @@ _recv_raw_pkts_vec(struct iavf_rx_queue *rxq,
> struct rte_mbuf **rx_pkts,
>  	return nb_pkts_recd;
>  }
> 
> -/* Notice:
> +/**
> + * vPMD raw receive routine for flex RxD,
> + * only accept(nb_pkts >= IAVF_VPMD_DESCS_PER_LOOP)
> + *
> + * Notice:
>   * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
> - * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan
> IAVF_VPMD_RX_MAX_BURST
> - *   numbers of DD bits
> + * - floor align nb_pkts to a IAVF_VPMD_DESCS_PER_LOOP power-of-two
>   */
>  static inline uint16_t
>  _recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq, @@ -688,9 +690,6
> @@ _recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq,
>  	const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
>  						 0x0000000200000002LL);
> 
> -	/* nb_pkts shall be less equal than IAVF_VPMD_RX_MAX_BURST */
> -	nb_pkts = RTE_MIN(nb_pkts, IAVF_VPMD_RX_MAX_BURST);
> -
>  	/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP
> */
>  	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts,
> IAVF_VPMD_DESCS_PER_LOOP);
> 
> @@ -945,15 +944,15 @@ iavf_recv_pkts_vec_flex_rxd(void *rx_queue,
> struct rte_mbuf **rx_pkts,
>  	return _recv_raw_pkts_vec_flex_rxd(rx_queue, rx_pkts, nb_pkts,
> NULL);  }
> 
> -/* vPMD receive routine that reassembles scattered packets
> +/**
> + * vPMD receive routine that reassembles single burst of 32 scattered
> +packets
> + *
>   * Notice:
>   * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
> - * - nb_pkts > VPMD_RX_MAX_BURST, only scan
> IAVF_VPMD_RX_MAX_BURST
> - *   numbers of DD bits
>   */
> -uint16_t
> -iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> -			    uint16_t nb_pkts)
> +static uint16_t
> +iavf_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> +			      uint16_t nb_pkts)
>  {
>  	struct iavf_rx_queue *rxq = rx_queue;
>  	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0}; @@ -986,16
> +985,43 @@ iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf
> **rx_pkts,
>  		&split_flags[i]);
>  }
> 
> -/* vPMD receive routine that reassembles scattered packets for flex RxD
> +/**
> + * vPMD receive routine that reassembles scattered packets.
> + */
> +uint16_t
> +iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> +			     uint16_t nb_pkts)
> +{
> +	uint16_t retval = 0;
> +
> +	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
> +		uint16_t burst;
> +
> +		burst = iavf_recv_scattered_burst_vec(rx_queue,
> +						      rx_pkts + retval,
> +
> IAVF_VPMD_RX_MAX_BURST);
> +		retval += burst;
> +		nb_pkts -= burst;
> +		if (burst < IAVF_VPMD_RX_MAX_BURST)
> +			return retval;
> +	}
> +
> +	return retval + iavf_recv_scattered_burst_vec(rx_queue,
> +						      rx_pkts + retval,
> +						      nb_pkts);
> +}
> +
> +/**
> + * vPMD receive routine that reassembles single burst of 32 scattered
> +packets
> + * for flex RxD
> + *
>   * Notice:
>   * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
> - * - nb_pkts > VPMD_RX_MAX_BURST, only scan
> IAVF_VPMD_RX_MAX_BURST
> - *   numbers of DD bits
>   */
> -uint16_t
> -iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
> -				      struct rte_mbuf **rx_pkts,
> -				      uint16_t nb_pkts)
> +static uint16_t
> +iavf_recv_scattered_burst_vec_flex_rxd(void *rx_queue,
> +				       struct rte_mbuf **rx_pkts,
> +				       uint16_t nb_pkts)
>  {
>  	struct iavf_rx_queue *rxq = rx_queue;
>  	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0}; @@ -1028,6
> +1054,33 @@ iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
>  		&split_flags[i]);
>  }
> 
> +/**
> + * vPMD receive routine that reassembles scattered packets for flex RxD
> +*/ uint16_t iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
> +				      struct rte_mbuf **rx_pkts,
> +				      uint16_t nb_pkts)
> +{
> +	uint16_t retval = 0;
> +
> +	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
> +		uint16_t burst;
> +
> +		burst = iavf_recv_scattered_burst_vec_flex_rxd(rx_queue,
> +						rx_pkts + retval,
> +
> 	IAVF_VPMD_RX_MAX_BURST);
> +		retval += burst;
> +		nb_pkts -= burst;
> +		if (burst < IAVF_VPMD_RX_MAX_BURST)
> +			return retval;
> +	}
> +
> +	return retval + iavf_recv_scattered_burst_vec_flex_rxd(rx_queue,
> +						      rx_pkts + retval,
> +						      nb_pkts);
> +}
> +
>  static inline void
>  vtx1(volatile struct iavf_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags)
> {
> --
> 2.20.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [dpdk-dev] [PATCH v5 0/5] fix vector rx burst for PMDs
  2020-10-16  9:44 ` [dpdk-dev] [PATCH v5 0/5] fix vector rx burst for PMDs Jeff Guo
                     ` (4 preceding siblings ...)
  2020-10-16  9:44   ` [dpdk-dev] [PATCH v5 5/5] net/iavf: fix vector rx burst for iavf Jeff Guo
@ 2020-10-23 10:11   ` Zhang, Qi Z
  5 siblings, 0 replies; 48+ messages in thread
From: Zhang, Qi Z @ 2020-10-23 10:11 UTC (permalink / raw)
  To: Guo, Jia, Wu, Jingjing, Xing, Beilei, Wang, Haiyue, Yang, Qiming
  Cc: dev, Yigit, Ferruh, mb, stephen, barbette, Feifei.wang2,
	Richardson, Bruce, Zhang, Helin



> -----Original Message-----
> From: Guo, Jia <jia.guo@intel.com>
> Sent: Friday, October 16, 2020 5:44 PM
> To: Wu, Jingjing <jingjing.wu@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>;
> Xing, Beilei <beilei.xing@intel.com>; Wang, Haiyue <haiyue.wang@intel.com>;
> Yang, Qiming <qiming.yang@intel.com>
> Cc: dev@dpdk.org; Yigit, Ferruh <ferruh.yigit@intel.com>;
> mb@smartsharesystems.com; stephen@networkplumber.org;
> barbette@kth.se; Feifei.wang2@arm.com; Richardson, Bruce
> <bruce.richardson@intel.com>; Guo, Jia <jia.guo@intel.com>; Zhang, Helin
> <helin.zhang@intel.com>
> Subject: [PATCH v5 0/5] fix vector rx burst for PMDs
> 
> The limitation of burst size in vector rx was removed, since it should retrieve as
> much received packets as possible. And also the scattered receive path should
> use a wrapper function to achieve the goal of burst maximizing.
> 
> This patch set aims to maximize vector rx burst for for
> ixgbe/i40e/ice/iavf/fm10k PMDs.
> 
> Bugzilla ID: 516
> 
> v5->v4:
> split patch set, this patch only for max burst size issue.
> Add back Acked-by which has been added at v2.
> 
> v4->v3:
> add Fixes tag.
> 
> v3->v2:
> 1:move define into header file.
> 2:delete some useless doc.
> 
> v2->v1:
> 1:add fm10k driver case
> 2:refine some doc
> 
> Jeff Guo (5):
>   net/ixgbe: fix vector rx burst for ixgbe
>   net/i40e: fix vector rx burst for i40e
>   net/ice: fix vector rx burst for ice
>   net/fm10k: fix vector rx burst for fm10k
>   net/iavf: fix vector rx burst for iavf
> 
>  drivers/net/fm10k/fm10k_rxtx_vec.c       |  39 +++++++--
>  drivers/net/i40e/i40e_rxtx_vec_altivec.c |  59 +++++++++----
>  drivers/net/i40e/i40e_rxtx_vec_neon.c    |  48 ++++++++---
>  drivers/net/i40e/i40e_rxtx_vec_sse.c     |  48 ++++++++---
>  drivers/net/iavf/iavf_rxtx_vec_sse.c     | 103 +++++++++++++++++------
>  drivers/net/ice/ice_rxtx_vec_sse.c       |  46 +++++++---
>  drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c  |  61 +++++++++-----
>  drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c   |  47 +++++++----
>  8 files changed, 328 insertions(+), 123 deletions(-)
> 
> --
> 2.20.1

Acked-by: Qi Zhang <qi.z.zhang@intel.com>

Applied to dpdk-next-net-intel.

Thanks
Qi


^ permalink raw reply	[flat|nested] 48+ messages in thread

end of thread, back to index

Thread overview: 48+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-08-27  7:54 [dpdk-dev] [PATCH v1 0/4] maximize vector rx burst for PMDs Jeff Guo
2020-08-27  7:54 ` [dpdk-dev] [PATCH v1 1/4] net/ixgbe: maximize vector rx burst for ixgbe Jeff Guo
2020-08-27  7:54 ` [dpdk-dev] [PATCH v1 2/4] net/i40e: maximize vector rx burst for i40e Jeff Guo
2020-08-27  7:54 ` [dpdk-dev] [PATCH v1 3/4] net/ice: maximize vector rx burst for ice Jeff Guo
2020-08-27  7:54 ` [dpdk-dev] [PATCH v1 4/4] net/iavf: maximize vector rx burst for iavf Jeff Guo
2020-08-27  8:40 ` [dpdk-dev] [RFC] ethdev: rte_eth_rx_burst() requirements for nb_pkts Morten Brørup
2020-08-27  9:09   ` Bruce Richardson
2020-08-27  9:31     ` Morten Brørup
2020-08-27  9:43       ` Bruce Richardson
2020-08-27 10:13         ` [dpdk-dev] [RFC] ethdev: rte_eth_rx_burst() requirements fornb_pkts Morten Brørup
2020-08-27 11:41           ` Bruce Richardson
2020-08-28  9:03             ` Morten Brørup
2020-08-28 10:07               ` Bruce Richardson
2020-08-28 10:50                 ` Morten Brørup
2020-08-29 10:15                 ` Morten Brørup
2020-09-09  6:36 ` [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs Jeff Guo
2020-09-09  6:36   ` [dpdk-dev] [PATCH v3 1/5] net/iavf: fix vector rx burst for iavf Jeff Guo
2020-09-09  6:36   ` [dpdk-dev] [PATCH v3 2/5] net/ixgbe: fix vector rx burst for ixgbe Jeff Guo
     [not found]     ` <VI1PR0802MB23518C6B517B6EAD8E018CD49E260@VI1PR0802MB2351.eurprd08.prod.outlook.com>
2020-09-09  9:54       ` [dpdk-dev] 回复: " Feifei Wang
2020-09-09  6:36   ` [dpdk-dev] [PATCH v3 3/5] net/i40e: fix vector rx burst for i40e Jeff Guo
2020-09-09  6:36   ` [dpdk-dev] [PATCH v3 4/5] net/ice: fix vector rx burst for ice Jeff Guo
2020-09-15  7:10     ` Han, YingyaX
2020-09-09  6:36   ` [dpdk-dev] [PATCH v3 5/5] net/fm10k: fix vector rx burst for fm10k Jeff Guo
2020-09-09  6:45   ` [dpdk-dev] [PATCH v3 0/5] fix vector rx burst for PMDs Wang, Haiyue
2020-09-09  7:03     ` Guo, Jia
2020-09-09  7:05       ` Wang, Haiyue
2020-09-09  7:43         ` Morten Brørup
2020-09-09  7:55           ` Wang, Haiyue
2020-09-09  8:01             ` Guo, Jia
2020-09-17  7:58 ` [dpdk-dev] [PATCH v4 " Jeff Guo
2020-09-17  7:58   ` [dpdk-dev] [PATCH v4 1/5] net/iavf: fix vector rx burst for iavf Jeff Guo
2020-09-17  7:58   ` [dpdk-dev] [PATCH v4 2/5] net/ixgbe: fix vector rx burst for ixgbe Jeff Guo
2020-09-17  7:58   ` [dpdk-dev] [PATCH v4 3/5] net/i40e: fix vector rx burst for i40e Jeff Guo
2020-09-17  7:58   ` [dpdk-dev] [PATCH v4 4/5] net/ice: fix vector rx burst for ice Jeff Guo
2020-09-17 11:03     ` Zhang, Qi Z
2020-09-18  3:20       ` Guo, Jia
2020-09-18  3:41         ` Zhang, Qi Z
2020-09-18  4:41           ` Guo, Jia
2020-09-18  5:39             ` Zhang, Qi Z
2020-09-17  7:58   ` [dpdk-dev] [PATCH v4 5/5] net/fm10k: fix vector rx burst for fm10k Jeff Guo
2020-10-16  9:44 ` [dpdk-dev] [PATCH v5 0/5] fix vector rx burst for PMDs Jeff Guo
2020-10-16  9:44   ` [dpdk-dev] [PATCH v5 1/5] net/ixgbe: fix vector rx burst for ixgbe Jeff Guo
2020-10-16  9:44   ` [dpdk-dev] [PATCH v5 2/5] net/i40e: fix vector rx burst for i40e Jeff Guo
2020-10-16  9:44   ` [dpdk-dev] [PATCH v5 3/5] net/ice: fix vector rx burst for ice Jeff Guo
2020-10-16  9:44   ` [dpdk-dev] [PATCH v5 4/5] net/fm10k: fix vector rx burst for fm10k Jeff Guo
2020-10-16  9:44   ` [dpdk-dev] [PATCH v5 5/5] net/iavf: fix vector rx burst for iavf Jeff Guo
2020-10-23  5:09     ` Ling, WeiX
2020-10-23 10:11   ` [dpdk-dev] [PATCH v5 0/5] fix vector rx burst for PMDs Zhang, Qi Z

DPDK patches and discussions

Archives are clonable:
	git clone --mirror https://inbox.dpdk.org/dev/0 dev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 dev dev/ https://inbox.dpdk.org/dev \
		dev@dpdk.org
	public-inbox-index dev


Newsgroup available over NNTP:
	nntp://inbox.dpdk.org/inbox.dpdk.dev


AGPL code for this site: git clone https://public-inbox.org/ public-inbox