* [dpdk-dev] [PATCH v2 1/5] net/ixgbe: maximize vector rx burst for ixgbe
2020-08-27 10:10 [dpdk-dev] [PATCH v2 0/5] maximize vector rx burst for PMDs Jeff Guo
@ 2020-08-27 10:10 ` Jeff Guo
2020-08-27 10:10 ` [dpdk-dev] [PATCH v2 2/5] net/i40e: maximize vector rx burst for i40e Jeff Guo
` (4 subsequent siblings)
5 siblings, 0 replies; 14+ messages in thread
From: Jeff Guo @ 2020-08-27 10:10 UTC (permalink / raw)
To: qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu
Cc: bruce.richardson, dev, jia.guo, helin.zhang, mb, ferruh.yigit, barbette
The limitation on burst size in vector rx is removed, since the receive
path should retrieve as many received packets as possible. In addition,
the scattered receive path now uses a wrapper function to achieve the
goal of maximizing the burst.
Signed-off-by: Jeff Guo <jia.guo@intel.com>
---
drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 72 +++++++++++++------------
drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c | 56 ++++++++++---------
2 files changed, 68 insertions(+), 60 deletions(-)
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
index aa27ee177..580a5ed12 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
@@ -130,17 +130,6 @@ desc_to_olflags_v(uint8x16x2_t sterr_tmp1, uint8x16x2_t sterr_tmp2,
rx_pkts[3]->ol_flags = vol.e[3];
}
-/*
- * vPMD raw receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
- *
- * Notice:
- * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- * numbers of DD bit
- * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
- * - don't support ol_flags for rss and csum err
- */
-
#define IXGBE_VPMD_DESC_EOP_MASK 0x02020202
#define IXGBE_UINT8_BIT (CHAR_BIT * sizeof(uint8_t))
@@ -206,6 +195,13 @@ desc_to_ptype_v(uint64x2_t descs[4], uint16_t pkt_type_mask,
vgetq_lane_u32(tunnel_check, 3));
}
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
+ *
+ * Notice:
+ * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
+ * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
+ */
static inline uint16_t
_recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts, uint8_t *split_packet)
@@ -226,9 +222,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
uint16x8_t crc_adjust = {0, 0, rxq->crc_len, 0,
rxq->crc_len, 0, 0, 0};
- /* nb_pkts shall be less equal than RTE_IXGBE_MAX_RX_BURST */
- nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
-
/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);
@@ -382,16 +375,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
return nb_pkts_recd;
}
-/*
- * vPMD receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
- *
- * Notice:
- * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- * numbers of DD bit
- * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
- * - don't support ol_flags for rss and csum err
- */
uint16_t
ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts)
@@ -399,19 +382,12 @@ ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}
-/*
+/**
* vPMD receive routine that reassembles scattered packets
- *
- * Notice:
- * - don't support ol_flags for rss and csum err
- * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- * numbers of DD bit
- * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
*/
-uint16_t
-ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
- uint16_t nb_pkts)
+static uint16_t
+ixgbe_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
{
struct ixgbe_rx_queue *rxq = rx_queue;
uint8_t split_flags[RTE_IXGBE_MAX_RX_BURST] = {0};
@@ -443,6 +419,32 @@ ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
&split_flags[i]);
}
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ uint16_t retval = 0;
+
+ while (nb_pkts > RTE_IXGBE_MAX_RX_BURST) {
+ uint16_t burst;
+
+ burst = ixgbe_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ RTE_IXGBE_MAX_RX_BURST);
+ retval += burst;
+ nb_pkts -= burst;
+ if (burst < RTE_IXGBE_MAX_RX_BURST)
+ return retval;
+ }
+
+ return retval + ixgbe_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ nb_pkts);
+}
+
static inline void
vtx1(volatile union ixgbe_adv_tx_desc *txdp,
struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
index 517ca3166..fb381ca04 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
@@ -302,13 +302,11 @@ desc_to_ptype_v(__m128i descs[4], uint16_t pkt_type_mask,
get_packet_type(3, pkt_info, etqf_check, tunnel_check);
}
-/*
+/**
* vPMD raw receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
*
* Notice:
* - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- * numbers of DD bit
* - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
*/
static inline uint16_t
@@ -344,9 +342,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
__m128i mbuf_init;
uint8_t vlan_flags;
- /* nb_pkts shall be less equal than RTE_IXGBE_MAX_RX_BURST */
- nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
-
/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);
@@ -556,15 +551,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
return nb_pkts_recd;
}
-/*
- * vPMD receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
- *
- * Notice:
- * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- * numbers of DD bit
- * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
- */
uint16_t
ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts)
@@ -572,18 +558,12 @@ ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}
-/*
+/**
* vPMD receive routine that reassembles scattered packets
- *
- * Notice:
- * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- * numbers of DD bit
- * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
*/
-uint16_t
-ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
- uint16_t nb_pkts)
+static uint16_t
+ixgbe_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
{
struct ixgbe_rx_queue *rxq = rx_queue;
uint8_t split_flags[RTE_IXGBE_MAX_RX_BURST] = {0};
@@ -615,6 +595,32 @@ ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
&split_flags[i]);
}
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ uint16_t retval = 0;
+
+ while (nb_pkts > RTE_IXGBE_MAX_RX_BURST) {
+ uint16_t burst;
+
+ burst = ixgbe_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ RTE_IXGBE_MAX_RX_BURST);
+ retval += burst;
+ nb_pkts -= burst;
+ if (burst < RTE_IXGBE_MAX_RX_BURST)
+ return retval;
+ }
+
+ return retval + ixgbe_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ nb_pkts);
+}
+
static inline void
vtx1(volatile union ixgbe_adv_tx_desc *txdp,
struct rte_mbuf *pkt, uint64_t flags)
--
2.20.1
^ permalink raw reply [flat|nested] 14+ messages in thread
* [dpdk-dev] [PATCH v2 2/5] net/i40e: maximize vector rx burst for i40e
2020-08-27 10:10 [dpdk-dev] [PATCH v2 0/5] maximize vector rx burst for PMDs Jeff Guo
2020-08-27 10:10 ` [dpdk-dev] [PATCH v2 1/5] net/ixgbe: maximize vector rx burst for ixgbe Jeff Guo
@ 2020-08-27 10:10 ` Jeff Guo
2020-08-27 10:10 ` [dpdk-dev] [PATCH v2 3/5] net/ice: maximize vector rx burst for ice Jeff Guo
` (3 subsequent siblings)
5 siblings, 0 replies; 14+ messages in thread
From: Jeff Guo @ 2020-08-27 10:10 UTC (permalink / raw)
To: qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu
Cc: bruce.richardson, dev, jia.guo, helin.zhang, mb, ferruh.yigit, barbette
The limitation on burst size in vector rx is removed, since the receive
path should retrieve as many received packets as possible. In addition,
the scattered receive path now uses a wrapper function to achieve the
goal of maximizing the burst.
Signed-off-by: Jeff Guo <jia.guo@intel.com>
---
drivers/net/i40e/i40e_rxtx_vec_altivec.c | 61 +++++++++++++++---------
drivers/net/i40e/i40e_rxtx_vec_avx2.c | 13 +----
drivers/net/i40e/i40e_rxtx_vec_neon.c | 55 +++++++++++++--------
drivers/net/i40e/i40e_rxtx_vec_sse.c | 55 +++++++++++++--------
4 files changed, 111 insertions(+), 73 deletions(-)
diff --git a/drivers/net/i40e/i40e_rxtx_vec_altivec.c b/drivers/net/i40e/i40e_rxtx_vec_altivec.c
index 6862a017e..2eb45d5a5 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_altivec.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_altivec.c
@@ -188,11 +188,13 @@ desc_to_ptype_v(vector unsigned long descs[4], struct rte_mbuf **rx_pkts,
ptype_tbl[(*(vector unsigned char *)&ptype1)[8]];
}
- /* Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- * numbers of DD bits
- */
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_I40E_DESCS_PER_LOOP)
+ *
+ * Notice:
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+ * - floor align nb_pkts to a RTE_I40E_DESCS_PER_LOOP power-of-two
+ */
static inline uint16_t
_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts, uint8_t *split_packet)
@@ -214,9 +216,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
};
vector unsigned long dd_check, eop_check;
- /* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
- nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);
-
/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
@@ -447,11 +446,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
return nb_pkts_recd;
}
- /* Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- * numbers of DD bits
- */
uint16_t
i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts)
@@ -459,15 +453,12 @@ i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}
- /* vPMD receive routine that reassembles scattered packets
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- * numbers of DD bits
- */
-uint16_t
-i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
- uint16_t nb_pkts)
+/**
+ * vPMD receive routine that reassembles scattered packets
+ */
+static uint16_t
+i40e_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
{
struct i40e_rx_queue *rxq = rx_queue;
uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};
@@ -500,6 +491,32 @@ i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
&split_flags[i]);
}
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ uint16_t retval = 0;
+
+ while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
+ uint16_t burst;
+
+ burst = i40e_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ RTE_I40E_VPMD_RX_BURST);
+ retval += burst;
+ nb_pkts -= burst;
+ if (burst < RTE_I40E_VPMD_RX_BURST)
+ return retval;
+ }
+
+ return retval + i40e_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ nb_pkts);
+}
+
static inline void
vtx1(volatile struct i40e_tx_desc *txdp,
struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 3bcef1363..690d3d2ba 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -729,10 +729,6 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
return received;
}
-/*
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- */
uint16_t
i40e_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts)
@@ -740,10 +736,8 @@ i40e_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
return _recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);
}
-/*
+/**
* vPMD receive routine that reassembles single burst of 32 scattered packets
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
*/
static uint16_t
i40e_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
@@ -781,11 +775,8 @@ i40e_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
&split_flags[i]);
}
-/*
+/**
* vPMD receive routine that reassembles scattered packets.
- * Main receive routine that can handle arbitrary burst sizes
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
*/
uint16_t
i40e_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c
index 6f874e45b..68558826c 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_neon.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c
@@ -187,11 +187,12 @@ desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **__rte_restrict rx_pkts,
}
- /*
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_I40E_DESCS_PER_LOOP)
+ *
* Notice:
* - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- * numbers of DD bits
+ * - floor align nb_pkts to a RTE_I40E_DESCS_PER_LOOP power-of-two
*/
static inline uint16_t
_recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
@@ -230,9 +231,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
0, 0, 0 /* ignore non-length fields */
};
- /* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
- nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);
-
/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
@@ -426,12 +424,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
return nb_pkts_recd;
}
- /*
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- * numbers of DD bits
- */
uint16_t
i40e_recv_pkts_vec(void *__rte_restrict rx_queue,
struct rte_mbuf **__rte_restrict rx_pkts, uint16_t nb_pkts)
@@ -439,15 +431,12 @@ i40e_recv_pkts_vec(void *__rte_restrict rx_queue,
return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}
- /* vPMD receive routine that reassembles scattered packets
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- * numbers of DD bits
+/**
+ * vPMD receive routine that reassembles scattered packets
*/
-uint16_t
-i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
- uint16_t nb_pkts)
+static uint16_t
+i40e_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
{
struct i40e_rx_queue *rxq = rx_queue;
@@ -482,6 +471,32 @@ i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
&split_flags[i]);
}
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ uint16_t retval = 0;
+
+ while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
+ uint16_t burst;
+
+ burst = i40e_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ RTE_I40E_VPMD_RX_BURST);
+ retval += burst;
+ nb_pkts -= burst;
+ if (burst < RTE_I40E_VPMD_RX_BURST)
+ return retval;
+ }
+
+ return retval + i40e_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ nb_pkts);
+}
+
static inline void
vtx1(volatile struct i40e_tx_desc *txdp,
struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 698518349..d785a495c 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -342,11 +342,12 @@ desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
rx_pkts[3]->packet_type = ptype_tbl[_mm_extract_epi8(ptype1, 8)];
}
- /*
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_I40E_DESCS_PER_LOOP)
+ *
* Notice:
* - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- * numbers of DD bits
+ * - floor align nb_pkts to a RTE_I40E_DESCS_PER_LOOP power-of-two
*/
static inline uint16_t
_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
@@ -378,9 +379,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
__m128i dd_check, eop_check;
- /* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
- nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);
-
/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
@@ -592,12 +590,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
return nb_pkts_recd;
}
- /*
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- * numbers of DD bits
- */
uint16_t
i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts)
@@ -605,15 +597,12 @@ i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}
- /* vPMD receive routine that reassembles scattered packets
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- * numbers of DD bits
+/**
+ * vPMD receive routine that reassembles scattered packets
*/
-uint16_t
-i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
- uint16_t nb_pkts)
+static uint16_t
+i40e_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
{
struct i40e_rx_queue *rxq = rx_queue;
@@ -648,6 +637,32 @@ i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
&split_flags[i]);
}
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ uint16_t retval = 0;
+
+ while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
+ uint16_t burst;
+
+ burst = i40e_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ RTE_I40E_VPMD_RX_BURST);
+ retval += burst;
+ nb_pkts -= burst;
+ if (burst < RTE_I40E_VPMD_RX_BURST)
+ return retval;
+ }
+
+ return retval + i40e_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ nb_pkts);
+}
+
static inline void
vtx1(volatile struct i40e_tx_desc *txdp,
struct rte_mbuf *pkt, uint64_t flags)
--
2.20.1
^ permalink raw reply [flat|nested] 14+ messages in thread
* [dpdk-dev] [PATCH v2 3/5] net/ice: maximize vector rx burst for ice
2020-08-27 10:10 [dpdk-dev] [PATCH v2 0/5] maximize vector rx burst for PMDs Jeff Guo
2020-08-27 10:10 ` [dpdk-dev] [PATCH v2 1/5] net/ixgbe: maximize vector rx burst for ixgbe Jeff Guo
2020-08-27 10:10 ` [dpdk-dev] [PATCH v2 2/5] net/i40e: maximize vector rx burst for i40e Jeff Guo
@ 2020-08-27 10:10 ` Jeff Guo
2020-08-31 4:41 ` Zhang, Qi Z
2020-08-27 10:10 ` [dpdk-dev] [PATCH v2 4/5] net/iavf: maximize vector rx burst for iavf Jeff Guo
` (2 subsequent siblings)
5 siblings, 1 reply; 14+ messages in thread
From: Jeff Guo @ 2020-08-27 10:10 UTC (permalink / raw)
To: qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu
Cc: bruce.richardson, dev, jia.guo, helin.zhang, mb, ferruh.yigit, barbette
The limitation on burst size in vector rx is removed, since the receive
path should retrieve as many received packets as possible. In addition,
the scattered receive path now uses a wrapper function to achieve the
goal of maximizing the burst.
Signed-off-by: Jeff Guo <jia.guo@intel.com>
---
drivers/net/ice/ice_rxtx_vec_avx2.c | 11 +------
drivers/net/ice/ice_rxtx_vec_sse.c | 49 ++++++++++++++++++++---------
2 files changed, 35 insertions(+), 25 deletions(-)
diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c b/drivers/net/ice/ice_rxtx_vec_avx2.c
index be50677c2..b7e624fda 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx2.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
@@ -603,10 +603,6 @@ _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
return received;
}
-/**
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- */
uint16_t
ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts)
@@ -615,9 +611,7 @@ ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
}
/**
- * vPMD receive routine that reassembles single burst of 32 scattered packets
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
+ * vPMD receive routine that reassembles scattered packets
*/
static uint16_t
ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
@@ -657,9 +651,6 @@ ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
/**
* vPMD receive routine that reassembles scattered packets.
- * Main receive routine that can handle arbitrary burst sizes
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
*/
uint16_t
ice_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
diff --git a/drivers/net/ice/ice_rxtx_vec_sse.c b/drivers/net/ice/ice_rxtx_vec_sse.c
index 382ef31f3..25ae368cc 100644
--- a/drivers/net/ice/ice_rxtx_vec_sse.c
+++ b/drivers/net/ice/ice_rxtx_vec_sse.c
@@ -205,10 +205,11 @@ ice_rx_desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
}
/**
+ * vPMD raw receive routine, only accept(nb_pkts >= ICE_DESCS_PER_LOOP)
+ *
* Notice:
* - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
- * numbers of DD bits
+ * - floor align nb_pkts to a ICE_DESCS_PER_LOOP power-of-two
*/
static inline uint16_t
_ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
@@ -264,9 +265,6 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
0x0000000200000002LL);
- /* nb_pkts shall be less equal than ICE_MAX_RX_BURST */
- nb_pkts = RTE_MIN(nb_pkts, ICE_MAX_RX_BURST);
-
/* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP */
nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP);
@@ -444,8 +442,6 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
/**
* Notice:
* - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
- * numbers of DD bits
*/
uint16_t
ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
@@ -454,15 +450,12 @@ ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
return _ice_recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}
-/* vPMD receive routine that reassembles scattered packets
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
- * numbers of DD bits
+/**
+ * vPMD receive routine that reassembles scattered packets
*/
-uint16_t
-ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
- uint16_t nb_pkts)
+static uint16_t
+ice_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
{
struct ice_rx_queue *rxq = rx_queue;
uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
@@ -496,6 +489,32 @@ ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
&split_flags[i]);
}
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ uint16_t retval = 0;
+
+ while (nb_pkts > ICE_VPMD_RX_BURST) {
+ uint16_t burst;
+
+ burst = ice_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ ICE_VPMD_RX_BURST);
+ retval += burst;
+ nb_pkts -= burst;
+ if (burst < ICE_VPMD_RX_BURST)
+ return retval;
+ }
+
+ return retval + ice_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ nb_pkts);
+}
+
static inline void
ice_vtx1(volatile struct ice_tx_desc *txdp, struct rte_mbuf *pkt,
uint64_t flags)
--
2.20.1
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [dpdk-dev] [PATCH v2 3/5] net/ice: maximize vector rx burst for ice
2020-08-27 10:10 ` [dpdk-dev] [PATCH v2 3/5] net/ice: maximize vector rx burst for ice Jeff Guo
@ 2020-08-31 4:41 ` Zhang, Qi Z
2020-08-31 5:24 ` Jeff Guo
0 siblings, 1 reply; 14+ messages in thread
From: Zhang, Qi Z @ 2020-08-31 4:41 UTC (permalink / raw)
To: Guo, Jia, Yang, Qiming, Xing, Beilei, Zhao1, Wei, Wu, Jingjing
Cc: Richardson, Bruce, dev, Zhang, Helin, mb, Yigit, Ferruh, barbette
> -----Original Message-----
> From: Guo, Jia <jia.guo@intel.com>
> Sent: Thursday, August 27, 2020 6:10 PM
> To: Yang, Qiming <qiming.yang@intel.com>; Xing, Beilei
> <beilei.xing@intel.com>; Zhao1, Wei <wei.zhao1@intel.com>; Zhang, Qi Z
> <qi.z.zhang@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>
> Cc: Richardson, Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Guo, Jia
> <jia.guo@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> mb@smartsharesystems.com; Yigit, Ferruh <ferruh.yigit@intel.com>;
> barbette@kth.se
> Subject: [PATCH v2 3/5] net/ice: maximize vector rx burst for ice
>
> The limitation of burst size in vector rx was removed, since it should retrieve as
> much received packets as possible. And also the scattered receive path should
> use a wrapper function to achieve the goal of burst maximizing.
>
> Signed-off-by: Jeff Guo <jia.guo@intel.com>
> ---
> drivers/net/ice/ice_rxtx_vec_avx2.c | 11 +------
> drivers/net/ice/ice_rxtx_vec_sse.c | 49 ++++++++++++++++++++---------
> 2 files changed, 35 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c
> b/drivers/net/ice/ice_rxtx_vec_avx2.c
> index be50677c2..b7e624fda 100644
> --- a/drivers/net/ice/ice_rxtx_vec_avx2.c
> +++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
> @@ -603,10 +603,6 @@ _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue
> *rxq, struct rte_mbuf **rx_pkts,
> return received;
> }
>
> -/**
> - * Notice:
> - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> - */
Actually, this is a question for all the patches in the patchset.
Why do we remove the above comment? I think the patch should only target the case when nb_pkts > ICE_VPMD_RX_BURST?
For the small packet number case, nothing changes, right?
> uint16_t
> ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
> uint16_t nb_pkts)
> @@ -615,9 +611,7 @@ ice_recv_pkts_vec_avx2(void *rx_queue, struct
> rte_mbuf **rx_pkts, }
>
> /**
> - * vPMD receive routine that reassembles single burst of 32 scattered packets
> - * Notice:
> - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> + * vPMD receive routine that reassembles scattered packets
> */
> static uint16_t
> ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf
> **rx_pkts, @@ -657,9 +651,6 @@ ice_recv_scattered_burst_vec_avx2(void
> *rx_queue, struct rte_mbuf **rx_pkts,
>
> /**
> * vPMD receive routine that reassembles scattered packets.
> - * Main receive routine that can handle arbitrary burst sizes
> - * Notice:
> - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> */
> uint16_t
> ice_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
> diff --git a/drivers/net/ice/ice_rxtx_vec_sse.c
> b/drivers/net/ice/ice_rxtx_vec_sse.c
> index 382ef31f3..25ae368cc 100644
> --- a/drivers/net/ice/ice_rxtx_vec_sse.c
> +++ b/drivers/net/ice/ice_rxtx_vec_sse.c
> @@ -205,10 +205,11 @@ ice_rx_desc_to_ptype_v(__m128i descs[4], struct
> rte_mbuf **rx_pkts, }
>
> /**
> + * vPMD raw receive routine, only accept(nb_pkts >= ICE_DESCS_PER_LOOP)
> + *
> * Notice:
> * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> - * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
> - * numbers of DD bits
> + * - floor align nb_pkts to a ICE_DESCS_PER_LOOP power-of-two
> */
> static inline uint16_t
> _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
> @@ -264,9 +265,6 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq,
> struct rte_mbuf **rx_pkts,
> const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
> 0x0000000200000002LL);
>
> - /* nb_pkts shall be less equal than ICE_MAX_RX_BURST */
> - nb_pkts = RTE_MIN(nb_pkts, ICE_MAX_RX_BURST);
> -
> /* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP */
> nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP);
>
> @@ -444,8 +442,6 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq,
> struct rte_mbuf **rx_pkts,
> /**
> * Notice:
> * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> - * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
> - * numbers of DD bits
> */
> uint16_t
> ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, @@ -454,15
> +450,12 @@ ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> return _ice_recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL); }
>
> -/* vPMD receive routine that reassembles scattered packets
> - * Notice:
> - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> - * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
> - * numbers of DD bits
> +/**
> + * vPMD receive routine that reassembles scattered packets
> */
> -uint16_t
> -ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> - uint16_t nb_pkts)
> +static uint16_t
> +ice_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> + uint16_t nb_pkts)
> {
> struct ice_rx_queue *rxq = rx_queue;
> uint8_t split_flags[ICE_VPMD_RX_BURST] = {0}; @@ -496,6 +489,32 @@
> ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> &split_flags[i]);
> }
>
> +/**
> + * vPMD receive routine that reassembles scattered packets.
> + */
> +uint16_t
> +ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> + uint16_t nb_pkts)
> +{
> + uint16_t retval = 0;
> +
> + while (nb_pkts > ICE_VPMD_RX_BURST) {
> + uint16_t burst;
> +
> + burst = ice_recv_scattered_burst_vec(rx_queue,
> + rx_pkts + retval,
> + ICE_VPMD_RX_BURST);
> + retval += burst;
> + nb_pkts -= burst;
> + if (burst < ICE_VPMD_RX_BURST)
> + return retval;
> + }
> +
> + return retval + ice_recv_scattered_burst_vec(rx_queue,
> + rx_pkts + retval,
> + nb_pkts);
> +}
> +
> static inline void
> ice_vtx1(volatile struct ice_tx_desc *txdp, struct rte_mbuf *pkt,
> uint64_t flags)
> --
> 2.20.1
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [dpdk-dev] [PATCH v2 3/5] net/ice: maximize vector rx burst for ice
2020-08-31 4:41 ` Zhang, Qi Z
@ 2020-08-31 5:24 ` Jeff Guo
0 siblings, 0 replies; 14+ messages in thread
From: Jeff Guo @ 2020-08-31 5:24 UTC (permalink / raw)
To: Zhang, Qi Z, Yang, Qiming, Xing, Beilei, Zhao1, Wei, Wu, Jingjing
Cc: Richardson, Bruce, dev, Zhang, Helin, mb, Yigit, Ferruh, barbette
hi, qi
On 8/31/2020 12:41 PM, Zhang, Qi Z wrote:
>
>> -----Original Message-----
>> From: Guo, Jia <jia.guo@intel.com>
>> Sent: Thursday, August 27, 2020 6:10 PM
>> To: Yang, Qiming <qiming.yang@intel.com>; Xing, Beilei
>> <beilei.xing@intel.com>; Zhao1, Wei <wei.zhao1@intel.com>; Zhang, Qi Z
>> <qi.z.zhang@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>
>> Cc: Richardson, Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Guo, Jia
>> <jia.guo@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
>> mb@smartsharesystems.com; Yigit, Ferruh <ferruh.yigit@intel.com>;
>> barbette@kth.se
>> Subject: [PATCH v2 3/5] net/ice: maximize vector rx burst for ice
>>
>> The limitation of burst size in vector rx was removed, since it should retrieve as
>> much received packets as possible. And also the scattered receive path should
>> use a wrapper function to achieve the goal of burst maximizing.
>>
>> Signed-off-by: Jeff Guo <jia.guo@intel.com>
>> ---
>> drivers/net/ice/ice_rxtx_vec_avx2.c | 11 +------
>> drivers/net/ice/ice_rxtx_vec_sse.c | 49 ++++++++++++++++++++---------
>> 2 files changed, 35 insertions(+), 25 deletions(-)
>>
>> diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c
>> b/drivers/net/ice/ice_rxtx_vec_avx2.c
>> index be50677c2..b7e624fda 100644
>> --- a/drivers/net/ice/ice_rxtx_vec_avx2.c
>> +++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
>> @@ -603,10 +603,6 @@ _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue
>> *rxq, struct rte_mbuf **rx_pkts,
>> return received;
>> }
>>
>> -/**
>> - * Notice:
>> - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
>> - */
> Actually this is question for all the patches in the patchset.
> Why we remove above comment? I think the patch should only target for the case when nb_pkgs > ICE_VPMD_RX_BURST?
> For small packet number case, nothing changed, right?
This is just a doc cleanup: there is no need to
duplicate the doc on these layer-by-layer helper functions when it is not
reflected in the code.
And yes, nothing is changed for the small packet number case.
>
>> uint16_t
>> ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
>> uint16_t nb_pkts)
>> @@ -615,9 +611,7 @@ ice_recv_pkts_vec_avx2(void *rx_queue, struct
>> rte_mbuf **rx_pkts, }
>>
>> /**
>> - * vPMD receive routine that reassembles single burst of 32 scattered packets
>> - * Notice:
>> - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
>> + * vPMD receive routine that reassembles scattered packets
>> */
>> static uint16_t
>> ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf
>> **rx_pkts, @@ -657,9 +651,6 @@ ice_recv_scattered_burst_vec_avx2(void
>> *rx_queue, struct rte_mbuf **rx_pkts,
>>
>> /**
>> * vPMD receive routine that reassembles scattered packets.
>> - * Main receive routine that can handle arbitrary burst sizes
>> - * Notice:
>> - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
>> */
>> uint16_t
>> ice_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
>> diff --git a/drivers/net/ice/ice_rxtx_vec_sse.c
>> b/drivers/net/ice/ice_rxtx_vec_sse.c
>> index 382ef31f3..25ae368cc 100644
>> --- a/drivers/net/ice/ice_rxtx_vec_sse.c
>> +++ b/drivers/net/ice/ice_rxtx_vec_sse.c
>> @@ -205,10 +205,11 @@ ice_rx_desc_to_ptype_v(__m128i descs[4], struct
>> rte_mbuf **rx_pkts, }
>>
>> /**
>> + * vPMD raw receive routine, only accept(nb_pkts >= ICE_DESCS_PER_LOOP)
>> + *
>> * Notice:
>> * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
>> - * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
>> - * numbers of DD bits
>> + * - floor align nb_pkts to a ICE_DESCS_PER_LOOP power-of-two
>> */
>> static inline uint16_t
>> _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
>> @@ -264,9 +265,6 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq,
>> struct rte_mbuf **rx_pkts,
>> const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
>> 0x0000000200000002LL);
>>
>> - /* nb_pkts shall be less equal than ICE_MAX_RX_BURST */
>> - nb_pkts = RTE_MIN(nb_pkts, ICE_MAX_RX_BURST);
>> -
>> /* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP */
>> nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP);
>>
>> @@ -444,8 +442,6 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq,
>> struct rte_mbuf **rx_pkts,
>> /**
>> * Notice:
>> * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
>> - * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
>> - * numbers of DD bits
>> */
>> uint16_t
>> ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, @@ -454,15
>> +450,12 @@ ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
>> return _ice_recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL); }
>>
>> -/* vPMD receive routine that reassembles scattered packets
>> - * Notice:
>> - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
>> - * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
>> - * numbers of DD bits
>> +/**
>> + * vPMD receive routine that reassembles scattered packets
>> */
>> -uint16_t
>> -ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
>> - uint16_t nb_pkts)
>> +static uint16_t
>> +ice_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
>> + uint16_t nb_pkts)
>> {
>> struct ice_rx_queue *rxq = rx_queue;
>> uint8_t split_flags[ICE_VPMD_RX_BURST] = {0}; @@ -496,6 +489,32 @@
>> ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
>> &split_flags[i]);
>> }
>>
>> +/**
>> + * vPMD receive routine that reassembles scattered packets.
>> + */
>> +uint16_t
>> +ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
>> + uint16_t nb_pkts)
>> +{
>> + uint16_t retval = 0;
>> +
>> + while (nb_pkts > ICE_VPMD_RX_BURST) {
>> + uint16_t burst;
>> +
>> + burst = ice_recv_scattered_burst_vec(rx_queue,
>> + rx_pkts + retval,
>> + ICE_VPMD_RX_BURST);
>> + retval += burst;
>> + nb_pkts -= burst;
>> + if (burst < ICE_VPMD_RX_BURST)
>> + return retval;
>> + }
>> +
>> + return retval + ice_recv_scattered_burst_vec(rx_queue,
>> + rx_pkts + retval,
>> + nb_pkts);
>> +}
>> +
>> static inline void
>> ice_vtx1(volatile struct ice_tx_desc *txdp, struct rte_mbuf *pkt,
>> uint64_t flags)
>> --
>> 2.20.1
^ permalink raw reply [flat|nested] 14+ messages in thread
* [dpdk-dev] [PATCH v2 4/5] net/iavf: maximize vector rx burst for iavf
2020-08-27 10:10 [dpdk-dev] [PATCH v2 0/5] maximize vector rx burst for PMDs Jeff Guo
` (2 preceding siblings ...)
2020-08-27 10:10 ` [dpdk-dev] [PATCH v2 3/5] net/ice: maximize vector rx burst for ice Jeff Guo
@ 2020-08-27 10:10 ` Jeff Guo
2020-08-27 10:10 ` [dpdk-dev] [PATCH v2 5/5] net/fm10k: maximize vector rx burst for fm10k Jeff Guo
2020-08-27 12:38 ` [dpdk-dev] [PATCH v2 0/5] maximize vector rx burst for PMDs Morten Brørup
5 siblings, 0 replies; 14+ messages in thread
From: Jeff Guo @ 2020-08-27 10:10 UTC (permalink / raw)
To: qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu
Cc: bruce.richardson, dev, jia.guo, helin.zhang, mb, ferruh.yigit, barbette
The limitation of burst size in vector rx was removed, since it should
retrieve as many received packets as possible. And also the scattered
receive path should use a wrapper function to achieve the goal of
burst maximizing.
Signed-off-by: Jeff Guo <jia.guo@intel.com>
---
drivers/net/iavf/iavf_rxtx_vec_avx2.c | 21 +----
drivers/net/iavf/iavf_rxtx_vec_sse.c | 107 ++++++++++++++++----------
2 files changed, 68 insertions(+), 60 deletions(-)
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx2.c b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
index e5e0fd309..35166d4c6 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx2.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
@@ -1224,10 +1224,6 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
return received;
}
-/**
- * Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
- */
uint16_t
iavf_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts)
@@ -1235,10 +1231,6 @@ iavf_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
return _iavf_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);
}
-/**
- * Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
- */
uint16_t
iavf_recv_pkts_vec_avx2_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts)
@@ -1249,8 +1241,6 @@ iavf_recv_pkts_vec_avx2_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
/**
* vPMD receive routine that reassembles single burst of 32 scattered packets
- * Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
*/
static uint16_t
iavf_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
@@ -1290,9 +1280,6 @@ iavf_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
/**
* vPMD receive routine that reassembles scattered packets.
- * Main receive routine that can handle arbitrary burst sizes
- * Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
*/
uint16_t
iavf_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
@@ -1313,10 +1300,7 @@ iavf_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
}
/**
- * vPMD receive routine that reassembles single burst of
- * 32 scattered packets for flex RxD
- * Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
+ * vPMD receive routine that reassembles scattered packets for flex RxD
*/
static uint16_t
iavf_recv_scattered_burst_vec_avx2_flex_rxd(void *rx_queue,
@@ -1357,9 +1341,6 @@ iavf_recv_scattered_burst_vec_avx2_flex_rxd(void *rx_queue,
/**
* vPMD receive routine that reassembles scattered packets for flex RxD.
- * Main receive routine that can handle arbitrary burst sizes
- * Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
*/
uint16_t
iavf_recv_scattered_pkts_vec_avx2_flex_rxd(void *rx_queue,
diff --git a/drivers/net/iavf/iavf_rxtx_vec_sse.c b/drivers/net/iavf/iavf_rxtx_vec_sse.c
index 85c5bd4af..a78a741dd 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_sse.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_sse.c
@@ -379,10 +379,12 @@ flex_desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
rx_pkts[3]->packet_type = type_table[_mm_extract_epi16(ptype_all, 7)];
}
-/* Notice:
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= IAVF_VPMD_DESCS_PER_LOOP)
+ *
+ * Notice:
* - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- * numbers of DD bits
+ * - floor align nb_pkts to a IAVF_VPMD_DESCS_PER_LOOP power-of-two
*/
static inline uint16_t
_recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
@@ -413,9 +415,6 @@ _recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
__m128i dd_check, eop_check;
- /* nb_pkts shall be less equal than IAVF_VPMD_RX_MAX_BURST */
- nb_pkts = RTE_MIN(nb_pkts, IAVF_VPMD_RX_MAX_BURST);
-
/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP */
nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_VPMD_DESCS_PER_LOOP);
@@ -627,11 +626,6 @@ _recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
return nb_pkts_recd;
}
-/* Notice:
- * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- * numbers of DD bits
- */
static inline uint16_t
_recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq,
struct rte_mbuf **rx_pkts,
@@ -688,9 +682,6 @@ _recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq,
const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
0x0000000200000002LL);
- /* nb_pkts shall be less equal than IAVF_VPMD_RX_MAX_BURST */
- nb_pkts = RTE_MIN(nb_pkts, IAVF_VPMD_RX_MAX_BURST);
-
/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP */
nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_VPMD_DESCS_PER_LOOP);
@@ -921,11 +912,6 @@ _recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq,
return nb_pkts_recd;
}
-/* Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- * numbers of DD bits
- */
uint16_t
iavf_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts)
@@ -933,11 +919,6 @@ iavf_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}
-/* Notice:
- * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- * numbers of DD bits
- */
uint16_t
iavf_recv_pkts_vec_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts)
@@ -945,15 +926,12 @@ iavf_recv_pkts_vec_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
return _recv_raw_pkts_vec_flex_rxd(rx_queue, rx_pkts, nb_pkts, NULL);
}
-/* vPMD receive routine that reassembles scattered packets
- * Notice:
- * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- * numbers of DD bits
+/**
+ * vPMD receive routine that reassembles scattered packets
*/
-uint16_t
-iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
- uint16_t nb_pkts)
+static uint16_t
+iavf_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
{
struct iavf_rx_queue *rxq = rx_queue;
uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
@@ -986,16 +964,39 @@ iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
&split_flags[i]);
}
-/* vPMD receive routine that reassembles scattered packets for flex RxD
- * Notice:
- * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
- * numbers of DD bits
+/**
+ * vPMD receive routine that reassembles scattered packets.
*/
uint16_t
-iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
- struct rte_mbuf **rx_pkts,
- uint16_t nb_pkts)
+iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ uint16_t retval = 0;
+
+ while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
+ uint16_t burst;
+
+ burst = iavf_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ IAVF_VPMD_RX_MAX_BURST);
+ retval += burst;
+ nb_pkts -= burst;
+ if (burst < IAVF_VPMD_RX_MAX_BURST)
+ return retval;
+ }
+
+ return retval + iavf_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ nb_pkts);
+}
+
+/**
+ * vPMD receive routine that reassembles scattered packets for flex RxD
+ */
+static uint16_t
+iavf_recv_scattered_burst_vec_flex_rxd(void *rx_queue,
+ struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
{
struct iavf_rx_queue *rxq = rx_queue;
uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
@@ -1028,6 +1029,32 @@ iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
&split_flags[i]);
}
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
+ struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ uint16_t retval = 0;
+
+ while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
+ uint16_t burst;
+
+ burst = iavf_recv_scattered_burst_vec_flex_rxd(rx_queue,
+ rx_pkts + retval,
+ IAVF_VPMD_RX_MAX_BURST);
+ retval += burst;
+ nb_pkts -= burst;
+ if (burst < IAVF_VPMD_RX_MAX_BURST)
+ return retval;
+ }
+
+ return retval + iavf_recv_scattered_burst_vec_flex_rxd(rx_queue,
+ rx_pkts + retval, nb_pkts);
+}
+
static inline void
vtx1(volatile struct iavf_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags)
{
--
2.20.1
^ permalink raw reply [flat|nested] 14+ messages in thread
* [dpdk-dev] [PATCH v2 5/5] net/fm10k: maximize vector rx burst for fm10k
2020-08-27 10:10 [dpdk-dev] [PATCH v2 0/5] maximize vector rx burst for PMDs Jeff Guo
` (3 preceding siblings ...)
2020-08-27 10:10 ` [dpdk-dev] [PATCH v2 4/5] net/iavf: maximize vector rx burst for iavf Jeff Guo
@ 2020-08-27 10:10 ` Jeff Guo
2020-08-27 12:38 ` [dpdk-dev] [PATCH v2 0/5] maximize vector rx burst for PMDs Morten Brørup
5 siblings, 0 replies; 14+ messages in thread
From: Jeff Guo @ 2020-08-27 10:10 UTC (permalink / raw)
To: qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu
Cc: bruce.richardson, dev, jia.guo, helin.zhang, mb, ferruh.yigit, barbette
The limitation of burst size in vector rx was removed, since it should
retrieve as many received packets as possible. And also the scattered
receive path should use a wrapper function to achieve the goal of
burst maximizing.
Signed-off-by: Jeff Guo <jia.guo@intel.com>
---
drivers/net/fm10k/fm10k_rxtx_vec.c | 39 +++++++++++++++++++++++-------
1 file changed, 30 insertions(+), 9 deletions(-)
diff --git a/drivers/net/fm10k/fm10k_rxtx_vec.c b/drivers/net/fm10k/fm10k_rxtx_vec.c
index eff3933b5..8d413b542 100644
--- a/drivers/net/fm10k/fm10k_rxtx_vec.c
+++ b/drivers/net/fm10k/fm10k_rxtx_vec.c
@@ -645,25 +645,20 @@ fm10k_reassemble_packets(struct fm10k_rx_queue *rxq,
return pkt_idx;
}
-/*
+/**
* vPMD receive routine that reassembles scattered packets
*
* Notice:
* - don't support ol_flags for rss and csum err
- * - nb_pkts > RTE_FM10K_MAX_RX_BURST, only scan RTE_FM10K_MAX_RX_BURST
- * numbers of DD bit
*/
-uint16_t
-fm10k_recv_scattered_pkts_vec(void *rx_queue,
- struct rte_mbuf **rx_pkts,
- uint16_t nb_pkts)
+static uint16_t
+fm10k_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
{
struct fm10k_rx_queue *rxq = rx_queue;
uint8_t split_flags[RTE_FM10K_MAX_RX_BURST] = {0};
unsigned i = 0;
- /* Split_flags only can support max of RTE_FM10K_MAX_RX_BURST */
- nb_pkts = RTE_MIN(nb_pkts, RTE_FM10K_MAX_RX_BURST);
/* get some new buffers */
uint16_t nb_bufs = fm10k_recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
split_flags);
@@ -691,6 +686,32 @@ fm10k_recv_scattered_pkts_vec(void *rx_queue,
&split_flags[i]);
}
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+fm10k_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ uint16_t retval = 0;
+
+ while (nb_pkts > RTE_FM10K_MAX_RX_BURST) {
+ uint16_t burst;
+
+ burst = fm10k_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ RTE_FM10K_MAX_RX_BURST);
+ retval += burst;
+ nb_pkts -= burst;
+ if (burst < RTE_FM10K_MAX_RX_BURST)
+ return retval;
+ }
+
+ return retval + fm10k_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ nb_pkts);
+}
+
static const struct fm10k_txq_ops vec_txq_ops = {
.reset = fm10k_reset_tx_queue,
};
--
2.20.1
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [dpdk-dev] [PATCH v2 0/5] maximize vector rx burst for PMDs
2020-08-27 10:10 [dpdk-dev] [PATCH v2 0/5] maximize vector rx burst for PMDs Jeff Guo
` (4 preceding siblings ...)
2020-08-27 10:10 ` [dpdk-dev] [PATCH v2 5/5] net/fm10k: maximize vector rx burst for fm10k Jeff Guo
@ 2020-08-27 12:38 ` Morten Brørup
2020-08-28 2:06 ` Wang, Haiyue
5 siblings, 1 reply; 14+ messages in thread
From: Morten Brørup @ 2020-08-27 12:38 UTC (permalink / raw)
To: Jeff Guo, qiming.yang, beilei.xing, wei.zhao1, qi.z.zhang, jingjing.wu
Cc: bruce.richardson, dev, helin.zhang, ferruh.yigit, barbette
> From: Jeff Guo [mailto:jia.guo@intel.com]
> Sent: Thursday, August 27, 2020 12:10 PM
>
> The limitation of burst size in vector rx was removed, since it should
> retrieve as much received packets as possible. And also the scattered
> receive path should use a wrapper function to achieve the goal of
> burst maximizing.
>
> This patch set aims to maximize vector rx burst for for
> ixgbe/i40e/ice/iavf/fm10k PMDs.
>
> v2->v1:
> 1:add fm10k driver case
> 2:refine some doc
>
I now noticed that the vector functions also do:
nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
I am not sure about this, but if I read it correctly, calling rte_eth_rx_burst() with nb_pkts = 33 (not 32) would only return 32 packets, even if more packets are available. (I assume that RTE_I40E_DESCS_PER_LOOP is 32.) In this case, I guess that you need to read the remainder of the requested packets using the non-vector function in order to support any nb_pkts value.
That is, unless the rte_eth_rx_burst() API is extended with requirements on nb_pkts, as discussed in the other thread:
http://inbox.dpdk.org/dev/20200827114117.GD569@bricha3-MOBL.ger.corp.intel.com/T/#mc8051e9022d6aeb20c51c5a226b2274d3d6d4266
Med venlig hilsen / kind regards
- Morten Brørup
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [dpdk-dev] [PATCH v2 0/5] maximize vector rx burst for PMDs
2020-08-27 12:38 ` [dpdk-dev] [PATCH v2 0/5] maximize vector rx burst for PMDs Morten Brørup
@ 2020-08-28 2:06 ` Wang, Haiyue
2020-08-28 6:39 ` Jeff Guo
0 siblings, 1 reply; 14+ messages in thread
From: Wang, Haiyue @ 2020-08-28 2:06 UTC (permalink / raw)
To: Morten Brørup, Guo, Jia, Yang, Qiming, Xing, Beilei, Zhao1,
Wei, Zhang, Qi Z, Wu, Jingjing
Cc: Richardson, Bruce, dev, Zhang, Helin, Yigit, Ferruh, barbette
> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Morten Brørup
> Sent: Thursday, August 27, 2020 20:38
> To: Guo, Jia <jia.guo@intel.com>; Yang, Qiming <qiming.yang@intel.com>; Xing, Beilei
> <beilei.xing@intel.com>; Zhao1, Wei <wei.zhao1@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Wu,
> Jingjing <jingjing.wu@intel.com>
> Cc: Richardson, Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin <helin.zhang@intel.com>;
> Yigit, Ferruh <ferruh.yigit@intel.com>; barbette@kth.se
> Subject: Re: [dpdk-dev] [PATCH v2 0/5] maximize vector rx burst for PMDs
>
> > From: Jeff Guo [mailto:jia.guo@intel.com]
> > Sent: Thursday, August 27, 2020 12:10 PM
> >
> > The limitation of burst size in vector rx was removed, since it should
> > retrieve as much received packets as possible. And also the scattered
> > receive path should use a wrapper function to achieve the goal of
> > burst maximizing.
> >
> > This patch set aims to maximize vector rx burst for for
> > ixgbe/i40e/ice/iavf/fm10k PMDs.
> >
> > v2->v1:
> > 1:add fm10k driver case
> > 2:refine some doc
> >
>
> I now noticed that the vector functions also does:
> nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
>
> I am not sure about this, but if I read it correctly, calling rte_eth_rx_burst() with nb_pkts = 33
> (not 32) would only return 32 packets, even if more packets are available. (I assume that
> RTE_I40E_DESCS_PER_LOOP is 32.) In this case, I guess that you need to read the remaining of the
> requested packets using the non-vector function in order to support any nb_pkts value.
>
This is a design requirement of the vector instruction handling, e.g. for avx2: #define ICE_DESCS_PER_LOOP_AVX 8;
if you look deep into the real loop handling, you will see why RTE_ALIGN_FLOOR is done. ;-)
_ice_recv_raw_pkts_vec_avx2:
for (i = 0, received = 0; i < nb_pkts; i += ICE_DESCS_PER_LOOP_AVX, rxdp += ICE_DESCS_PER_LOOP_AVX)
Maybe it is worth telling the PMD to stop using the vector path by calling rte_eth_rx_queue_setup with the
burst size the application wants.
> That is, unless the rte_eth_rx_burst() API is extended with requirements to nb_pkts, as discussed in
> the other thread:
> http://inbox.dpdk.org/dev/20200827114117.GD569@bricha3-
> MOBL.ger.corp.intel.com/T/#mc8051e9022d6aeb20c51c5a226b2274d3d6d4266
>
>
> Med venlig hilsen / kind regards
> - Morten Brørup
>
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [dpdk-dev] [PATCH v2 0/5] maximize vector rx burst for PMDs
2020-08-28 2:06 ` Wang, Haiyue
@ 2020-08-28 6:39 ` Jeff Guo
2020-08-28 11:45 ` Morten Brørup
2020-08-28 20:30 ` Stephen Hemminger
0 siblings, 2 replies; 14+ messages in thread
From: Jeff Guo @ 2020-08-28 6:39 UTC (permalink / raw)
To: Wang, Haiyue, Morten Brørup, Yang, Qiming, Xing, Beilei,
Zhao1, Wei, Zhang, Qi Z, Wu, Jingjing
Cc: Richardson, Bruce, dev, Zhang, Helin, Yigit, Ferruh, barbette
On 8/28/2020 10:06 AM, Wang, Haiyue wrote:
>> -----Original Message-----
>> From: dev <dev-bounces@dpdk.org> On Behalf Of Morten Brørup
>> Sent: Thursday, August 27, 2020 20:38
>> To: Guo, Jia <jia.guo@intel.com>; Yang, Qiming <qiming.yang@intel.com>; Xing, Beilei
>> <beilei.xing@intel.com>; Zhao1, Wei <wei.zhao1@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Wu,
>> Jingjing <jingjing.wu@intel.com>
>> Cc: Richardson, Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin <helin.zhang@intel.com>;
>> Yigit, Ferruh <ferruh.yigit@intel.com>; barbette@kth.se
>> Subject: Re: [dpdk-dev] [PATCH v2 0/5] maximize vector rx burst for PMDs
>>
>>> From: Jeff Guo [mailto:jia.guo@intel.com]
>>> Sent: Thursday, August 27, 2020 12:10 PM
>>>
>>> The limitation of burst size in vector rx was removed, since it should
>>> retrieve as much received packets as possible. And also the scattered
>>> receive path should use a wrapper function to achieve the goal of
>>> burst maximizing.
>>>
>>> This patch set aims to maximize vector rx burst for for
>>> ixgbe/i40e/ice/iavf/fm10k PMDs.
>>>
>>> v2->v1:
>>> 1:add fm10k driver case
>>> 2:refine some doc
>>>
>> I now noticed that the vector functions also does:
>> nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
>>
>> I am not sure about this, but if I read it correctly, calling rte_eth_rx_burst() with nb_pkts = 33
>> (not 32) would only return 32 packets, even if more packets are available. (I assume that
>> RTE_I40E_DESCS_PER_LOOP is 32.) In this case, I guess that you need to read the remaining of the
>> requested packets using the non-vector function in order to support any nb_pkts value.
>>
> This is vector instruction handling design requirement, like for avx2: #define ICE_DESCS_PER_LOOP_AVX 8,
> if deep into the real loop handling, you will get the point why doing RTE_ALIGN_FLOOR. ;-)
>
> _ice_recv_raw_pkts_vec_avx2:
> for (i = 0, received = 0; i < nb_pkts; i += ICE_DESCS_PER_LOOP_AVX, rxdp += ICE_DESCS_PER_LOOP_AVX)
>
> Maybe it is worth to tell PMD to stop using vector by calling rte_eth_rx_queue_setup with the application
> burst size it wants.
>
>> That is, unless the rte_eth_rx_burst() API is extended with requirements to nb_pkts, as discussed in
>> the other thread:
>> http://inbox.dpdk.org/dev/20200827114117.GD569@bricha3-
>> MOBL.ger.corp.intel.com/T/#mc8051e9022d6aeb20c51c5a226b2274d3d6d4266
I agree with what Haiyue said above. Having gone through the discussion on the thread,
I think the vector path was definitely born in the spirit of DPDK: each
driver can keep its performance based on
its intrinsic requirements and define what the specific "max" is. The path
option could be given to the app, which could choose one statically when setting up the queue
or dynamically, but that is outside the driver's scope.
Documentation could be added to the API if needed, for the benefit of users.
>>
>> Med venlig hilsen / kind regards
>> - Morten Brørup
>>
>>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [dpdk-dev] [PATCH v2 0/5] maximize vector rx burst for PMDs
2020-08-28 6:39 ` Jeff Guo
@ 2020-08-28 11:45 ` Morten Brørup
2020-08-28 20:30 ` Stephen Hemminger
1 sibling, 0 replies; 14+ messages in thread
From: Morten Brørup @ 2020-08-28 11:45 UTC (permalink / raw)
To: Jeff Guo, Wang, Haiyue, Yang, Qiming, Xing, Beilei, Zhao1, Wei,
Zhang, Qi Z, Wu, Jingjing
Cc: Richardson, Bruce, dev, Zhang, Helin, Yigit, Ferruh, barbette
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jeff Guo
> Sent: Friday, August 28, 2020 8:40 AM
>
>
> On 8/28/2020 10:06 AM, Wang, Haiyue wrote:
> >> -----Original Message-----
> >> From: dev <dev-bounces@dpdk.org> On Behalf Of Morten Brørup
> >> Sent: Thursday, August 27, 2020 20:38
> >> To: Guo, Jia <jia.guo@intel.com>; Yang, Qiming
> <qiming.yang@intel.com>; Xing, Beilei
> >> <beilei.xing@intel.com>; Zhao1, Wei <wei.zhao1@intel.com>; Zhang, Qi
> Z <qi.z.zhang@intel.com>; Wu,
> >> Jingjing <jingjing.wu@intel.com>
> >> Cc: Richardson, Bruce <bruce.richardson@intel.com>; dev@dpdk.org;
> Zhang, Helin <helin.zhang@intel.com>;
> >> Yigit, Ferruh <ferruh.yigit@intel.com>; barbette@kth.se
> >> Subject: Re: [dpdk-dev] [PATCH v2 0/5] maximize vector rx burst for
> PMDs
> >>
> >>> From: Jeff Guo [mailto:jia.guo@intel.com]
> >>> Sent: Thursday, August 27, 2020 12:10 PM
> >>>
> >>> The limitation of burst size in vector rx was removed, since it
> should
> >>> retrieve as much received packets as possible. And also the
> scattered
> >>> receive path should use a wrapper function to achieve the goal of
> >>> burst maximizing.
> >>>
> >>> This patch set aims to maximize vector rx burst for for
> >>> ixgbe/i40e/ice/iavf/fm10k PMDs.
> >>>
> >>> v2->v1:
> >>> 1:add fm10k driver case
> >>> 2:refine some doc
> >>>
> >> I now noticed that the vector functions also does:
> >> nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
> >>
> >> I am not sure about this, but if I read it correctly, calling
> rte_eth_rx_burst() with nb_pkts = 33
> >> (not 32) would only return 32 packets, even if more packets are
> available. (I assume that
> >> RTE_I40E_DESCS_PER_LOOP is 32.) In this case, I guess that you need
> to read the remaining of the
> >> requested packets using the non-vector function in order to support
> any nb_pkts value.
> >>
> > This is vector instruction handling design requirement, like for
> avx2: #define ICE_DESCS_PER_LOOP_AVX 8,
> > if deep into the real loop handling, you will get the point why doing
> RTE_ALIGN_FLOOR. ;-)
> >
> > _ice_recv_raw_pkts_vec_avx2:
> > for (i = 0, received = 0; i < nb_pkts; i += ICE_DESCS_PER_LOOP_AVX,
> rxdp += ICE_DESCS_PER_LOOP_AVX)
> >
> > Maybe it is worth to tell PMD to stop using vector by calling
> rte_eth_rx_queue_setup with the application
> > burst size it wants.
> >
> >> That is, unless the rte_eth_rx_burst() API is extended with
> requirements to nb_pkts, as discussed in
> >> the other thread:
> >> http://inbox.dpdk.org/dev/20200827114117.GD569@bricha3-
> >> MOBL.ger.corp.intel.com/T/#mc8051e9022d6aeb20c51c5a226b2274d3d6d4266
>
>
> Agree with above haiyue said, and go through the discuss on the thread,
> i think vector path was born definitely for the spirit of dpdk, each
> driver could keep the performance base on
>
> the instinct requirement and define what is the specific "max", the
> path
> option could give to app, it could static choose one when set up queue
> or dynamic but not the driver scope,
>
> document could be add to AVI if need for benefit user.
>
Based on the discussion in the other thread, I think a minimum requirement on nb_pkts will be accepted.
On that note, for the series:
Acked-by: Morten Brørup <mb@smartsharesystems.com>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [dpdk-dev] [PATCH v2 0/5] maximize vector rx burst for PMDs
2020-08-28 6:39 ` Jeff Guo
2020-08-28 11:45 ` Morten Brørup
@ 2020-08-28 20:30 ` Stephen Hemminger
2020-08-31 14:27 ` Wang, Haiyue
1 sibling, 1 reply; 14+ messages in thread
From: Stephen Hemminger @ 2020-08-28 20:30 UTC (permalink / raw)
To: Jeff Guo
Cc: Wang, Haiyue, Morten Brørup, Yang, Qiming, Xing, Beilei,
Zhao1, Wei, Zhang, Qi Z, Wu, Jingjing, Richardson, Bruce, dev,
Zhang, Helin, Yigit, Ferruh, barbette
On Fri, 28 Aug 2020 14:39:33 +0800
Jeff Guo <jia.guo@intel.com> wrote:
> >> I am not sure about this, but if I read it correctly, calling rte_eth_rx_burst() with nb_pkts = 33
> >> (not 32) would only return 32 packets, even if more packets are available. (I assume that
> >> RTE_I40E_DESCS_PER_LOOP is 32.) In this case, I guess that you need to read the remaining of the
> >> requested packets using the non-vector function in order to support any nb_pkts value.
> >>
> > This is vector instruction handling design requirement, like for avx2: #define ICE_DESCS_PER_LOOP_AVX 8,
> > if deep into the real loop handling, you will get the point why doing RTE_ALIGN_FLOOR. ;-)
> >
> > _ice_recv_raw_pkts_vec_avx2:
> > for (i = 0, received = 0; i < nb_pkts; i += ICE_DESCS_PER_LOOP_AVX, rxdp += ICE_DESCS_PER_LOOP_AVX)
> >
> > Maybe it is worth to tell PMD to stop using vector by calling rte_eth_rx_queue_setup with the application
> > burst size it wants.
There is no need for yet another nerd knob in DPDK.
The driver must accept any burst size > 0.
If it wants to optimize for certain multiples, then it can do so in its burst handler.
Like any optimized checksum calculator does.
Let the CPU branch predictor do its job and find the best path through the
driver code.
Something like:
uint16_t
my_driver_recv_burst(void *prxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
{
if (nb_pkts == 32)
return my_driver_recv_burst32(prxq, rx_pkts, nb_pkts);
...
else
return my_driver_recv_burstN(prxq, rx_pkts, nb_pkts);
...
}
You could repeatedly call the burst32 if passed a large nb_pkts.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [dpdk-dev] [PATCH v2 0/5] maximize vector rx burst for PMDs
2020-08-28 20:30 ` Stephen Hemminger
@ 2020-08-31 14:27 ` Wang, Haiyue
0 siblings, 0 replies; 14+ messages in thread
From: Wang, Haiyue @ 2020-08-31 14:27 UTC (permalink / raw)
To: Stephen Hemminger, Guo, Jia
Cc: Morten Brørup, Yang, Qiming, Xing, Beilei, Zhao1, Wei,
Zhang, Qi Z, Wu, Jingjing, Richardson, Bruce, dev, Zhang, Helin,
Yigit, Ferruh, barbette
> -----Original Message-----
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: Saturday, August 29, 2020 04:31
> To: Guo, Jia <jia.guo@intel.com>
> Cc: Wang, Haiyue <haiyue.wang@intel.com>; Morten Brørup <mb@smartsharesystems.com>; Yang, Qiming
> <qiming.yang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Zhao1, Wei <wei.zhao1@intel.com>; Zhang,
> Qi Z <qi.z.zhang@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin <helin.zhang@intel.com>; Yigit, Ferruh
> <ferruh.yigit@intel.com>; barbette@kth.se
> Subject: Re: [dpdk-dev] [PATCH v2 0/5] maximize vector rx burst for PMDs
>
> On Fri, 28 Aug 2020 14:39:33 +0800
> Jeff Guo <jia.guo@intel.com> wrote:
>
> > >> I am not sure about this, but if I read it correctly, calling rte_eth_rx_burst() with nb_pkts =
> 33
> > >> (not 32) would only return 32 packets, even if more packets are available. (I assume that
> > >> RTE_I40E_DESCS_PER_LOOP is 32.) In this case, I guess that you need to read the remaining of the
> > >> requested packets using the non-vector function in order to support any nb_pkts value.
> > >>
> > > This is vector instruction handling design requirement, like for avx2: #define
> ICE_DESCS_PER_LOOP_AVX 8,
> > > if deep into the real loop handling, you will get the point why doing RTE_ALIGN_FLOOR. ;-)
> > >
> > > _ice_recv_raw_pkts_vec_avx2:
> > > for (i = 0, received = 0; i < nb_pkts; i += ICE_DESCS_PER_LOOP_AVX, rxdp += ICE_DESCS_PER_LOOP_AVX)
> > >
> > > Maybe it is worth to tell PMD to stop using vector by calling rte_eth_rx_queue_setup with the
> application
> > > burst size it wants.
>
> There is no need for yet another nerd knob in DPDK.
>
> The driver must accept any burst size > 0.
> If it wants to optimize for certain multiples, then it can do so in its burst handler.
> Like any optimized checksum calculator does.
I think if people care about performance, then PMDs have already tried every possible method, like
the burst size, so there is no need to invent their own burst handlers. ;-)
At least, the burst size is an optimization hint for performance.
^ permalink raw reply [flat|nested] 14+ messages in thread