DPDK patches and discussions
* [PATCH] net/iavf: remove extra copy step in Rx bulk path
@ 2022-03-24 22:11 Kathleen Capella
  2022-03-28 17:31 ` Kathleen Capella
  2022-04-20 12:47 ` Zhang, Qi Z
  0 siblings, 2 replies; 3+ messages in thread
From: Kathleen Capella @ 2022-03-24 22:11 UTC (permalink / raw)
  To: Jingjing Wu, Beilei Xing
  Cc: dev, nd, honnappa.nagarahalli, dharmik.thakkar, Kathleen Capella

In the Rx bulk path, packets taken from the HW ring are first copied
into the stage data structure and only later copied from the stage into
the rx_pkts array. For the packets requested immediately by the
receiving function, this two-step copy adds overhead that is not
necessary.

Instead, put the requested number of packets directly into the rx_pkts
array and stage only the excess packets. On N1SDP with 1 core/port,
l3fwd saw up to a 4% performance improvement. On x86, no difference in
performance was observed.

Signed-off-by: Kathleen Capella <kathleen.capella@arm.com>
Suggested-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
---
 drivers/net/iavf/iavf_rxtx.c | 74 ++++++++++++++++++++++++------------
 1 file changed, 49 insertions(+), 25 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 16e8d021f9..245dd225fd 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -1813,7 +1813,9 @@ iavf_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 #define IAVF_LOOK_AHEAD 8
 static inline int
-iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
+iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq,
+			    struct rte_mbuf **rx_pkts,
+			    uint16_t nb_pkts)
 {
 	volatile union iavf_rx_flex_desc *rxdp;
 	struct rte_mbuf **rxep;
@@ -1822,6 +1824,7 @@ iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
 	uint16_t pkt_len;
 	int32_t s[IAVF_LOOK_AHEAD], var, nb_dd;
 	int32_t i, j, nb_rx = 0;
+	int32_t nb_staged = 0;
 	uint64_t pkt_flags;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
@@ -1867,8 +1870,6 @@ iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
 #endif
 		}
 
-		nb_rx += nb_dd;
-
 		/* Translate descriptor info to mbuf parameters */
 		for (j = 0; j < nb_dd; j++) {
 			IAVF_DUMP_RX_DESC(rxq, &rxdp[j],
@@ -1892,24 +1893,34 @@ iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
 			pkt_flags = iavf_flex_rxd_error_to_pkt_flags(stat_err0);
 
 			mb->ol_flags |= pkt_flags;
-		}
 
-		for (j = 0; j < IAVF_LOOK_AHEAD; j++)
-			rxq->rx_stage[i + j] = rxep[j];
+			/* Put up to nb_pkts directly into buffers */
+			if ((i + j) < nb_pkts) {
+				rx_pkts[i + j] = rxep[j];
+				nb_rx++;
+			} else {
+				/* Stage excess pkts received */
+				rxq->rx_stage[nb_staged] = rxep[j];
+				nb_staged++;
+			}
+		}
 
 		if (nb_dd != IAVF_LOOK_AHEAD)
 			break;
 	}
 
+	/* Update rxq->rx_nb_avail to reflect number of staged pkts */
+	rxq->rx_nb_avail = nb_staged;
+
 	/* Clear software ring entries */
-	for (i = 0; i < nb_rx; i++)
+	for (i = 0; i < (nb_rx + nb_staged); i++)
 		rxq->sw_ring[rxq->rx_tail + i] = NULL;
 
 	return nb_rx;
 }
 
 static inline int
-iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
+iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 {
 	volatile union iavf_rx_desc *rxdp;
 	struct rte_mbuf **rxep;
@@ -1919,6 +1930,7 @@ iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
 	uint32_t rx_status;
 	int32_t s[IAVF_LOOK_AHEAD], var, nb_dd;
 	int32_t i, j, nb_rx = 0;
+	int32_t nb_staged = 0;
 	uint64_t pkt_flags;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
@@ -1970,8 +1982,6 @@ iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
 #endif
 		}
 
-		nb_rx += nb_dd;
-
 		/* Translate descriptor info to mbuf parameters */
 		for (j = 0; j < nb_dd; j++) {
 			IAVF_DUMP_RX_DESC(rxq, &rxdp[j],
@@ -2000,17 +2010,26 @@ iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
 				pkt_flags |= iavf_rxd_build_fdir(&rxdp[j], mb);
 
 			mb->ol_flags |= pkt_flags;
-		}
 
-		for (j = 0; j < IAVF_LOOK_AHEAD; j++)
-			rxq->rx_stage[i + j] = rxep[j];
+			/* Put up to nb_pkts directly into buffers */
+			if ((i + j) < nb_pkts) {
+				rx_pkts[i + j] = rxep[j];
+				nb_rx++;
+			} else { /* Stage excess pkts received */
+				rxq->rx_stage[nb_staged] = rxep[j];
+				nb_staged++;
+			}
+		}
 
 		if (nb_dd != IAVF_LOOK_AHEAD)
 			break;
 	}
 
+	/* Update rxq->rx_nb_avail to reflect number of staged pkts */
+	rxq->rx_nb_avail = nb_staged;
+
 	/* Clear software ring entries */
-	for (i = 0; i < nb_rx; i++)
+	for (i = 0; i < (nb_rx + nb_staged); i++)
 		rxq->sw_ring[rxq->rx_tail + i] = NULL;
 
 	return nb_rx;
@@ -2098,23 +2117,31 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		return iavf_rx_fill_from_stage(rxq, rx_pkts, nb_pkts);
 
 	if (rxq->rxdid >= IAVF_RXDID_FLEX_NIC && rxq->rxdid <= IAVF_RXDID_LAST)
-		nb_rx = (uint16_t)iavf_rx_scan_hw_ring_flex_rxd(rxq);
+		nb_rx = (uint16_t)iavf_rx_scan_hw_ring_flex_rxd(rxq, rx_pkts, nb_pkts);
 	else
-		nb_rx = (uint16_t)iavf_rx_scan_hw_ring(rxq);
+		nb_rx = (uint16_t)iavf_rx_scan_hw_ring(rxq, rx_pkts, nb_pkts);
+
 	rxq->rx_next_avail = 0;
-	rxq->rx_nb_avail = nb_rx;
-	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_rx);
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_rx + rxq->rx_nb_avail);
 
 	if (rxq->rx_tail > rxq->rx_free_trigger) {
 		if (iavf_rx_alloc_bufs(rxq) != 0) {
-			uint16_t i, j;
+			uint16_t i, j, nb_staged;
 
 			/* TODO: count rx_mbuf_alloc_failed here */
 
+			nb_staged = rxq->rx_nb_avail;
 			rxq->rx_nb_avail = 0;
-			rxq->rx_tail = (uint16_t)(rxq->rx_tail - nb_rx);
-			for (i = 0, j = rxq->rx_tail; i < nb_rx; i++, j++)
+
+			rxq->rx_tail = (uint16_t)(rxq->rx_tail - (nb_rx + nb_staged));
+			for (i = 0, j = rxq->rx_tail; i < nb_rx; i++, j++) {
+				rxq->sw_ring[j] = rx_pkts[i];
+				rx_pkts[i] = NULL;
+			}
+			for (i = 0, j = rxq->rx_tail + nb_rx; i < nb_staged; i++, j++) {
 				rxq->sw_ring[j] = rxq->rx_stage[i];
+				rx_pkts[i] = NULL;
+			}
 
 			return 0;
 		}
@@ -2127,10 +2154,7 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		   rxq->port_id, rxq->queue_id,
 		   rxq->rx_tail, nb_rx);
 
-	if (rxq->rx_nb_avail)
-		return iavf_rx_fill_from_stage(rxq, rx_pkts, nb_pkts);
-
-	return 0;
+	return nb_rx;
 }
 
 static uint16_t
-- 
2.31.1
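
One detail of the rx_recv_pkts() hunk worth spelling out: rx_tail now
advances by the sum of the directly delivered and the staged packets,
so the mbuf-allocation-failure path must hand both groups back to the
software ring before rewinding the tail. A rough sketch of that
bookkeeping follows (hypothetical names, not the driver's actual code):

#include <stdint.h>

/*
 * Illustration of the failure-path rollback only. Packets already
 * placed in rx_pkts and packets held in the stage array are both
 * returned to the software ring, and the tail is rewound so the same
 * descriptors can be rescanned on a later call.
 */
static void
demo_rollback(void **sw_ring, uint16_t *tail, void **rx_pkts,
	      uint16_t nb_rx, void **stage, uint16_t nb_staged)
{
	uint16_t i;

	*tail = (uint16_t)(*tail - (nb_rx + nb_staged));
	for (i = 0; i < nb_rx; i++) {
		sw_ring[*tail + i] = rx_pkts[i];   /* return direct pkts */
		rx_pkts[i] = NULL;
	}
	for (i = 0; i < nb_staged; i++)
		sw_ring[*tail + nb_rx + i] = stage[i];   /* return staged pkts */
}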



* RE: [PATCH] net/iavf: remove extra copy step in Rx bulk path
  2022-03-24 22:11 [PATCH] net/iavf: remove extra copy step in Rx bulk path Kathleen Capella
@ 2022-03-28 17:31 ` Kathleen Capella
  2022-04-20 12:47 ` Zhang, Qi Z
  1 sibling, 0 replies; 3+ messages in thread
From: Kathleen Capella @ 2022-03-28 17:31 UTC (permalink / raw)
  To: Kathleen Capella, Jingjing Wu, Beilei Xing
  Cc: dev, nd, Honnappa Nagarahalli, Dharmik Thakkar

The ci/iol-broadcom-Functional failure reported on this patch appears to be unrelated to the change.


* RE: [PATCH] net/iavf: remove extra copy step in Rx bulk path
  2022-03-24 22:11 [PATCH] net/iavf: remove extra copy step in Rx bulk path Kathleen Capella
  2022-03-28 17:31 ` Kathleen Capella
@ 2022-04-20 12:47 ` Zhang, Qi Z
  1 sibling, 0 replies; 3+ messages in thread
From: Zhang, Qi Z @ 2022-04-20 12:47 UTC (permalink / raw)
  To: Kathleen Capella, Wu, Jingjing, Xing, Beilei
  Cc: dev, nd, honnappa.nagarahalli, dharmik.thakkar

Acked-by: Qi Zhang <qi.z.zhang@intel.com>

Applied to dpdk-next-net-intel.

Thanks
Qi

