DPDK patches and discussions
* [PATCH] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
@ 2024-01-25  2:42 longli
  2024-01-26  0:29 ` Stephen Hemminger
  2024-01-30  1:13 ` [Patch v2] " longli
  0 siblings, 2 replies; 25+ messages in thread
From: longli @ 2024-01-25  2:42 UTC (permalink / raw)
  To: Ferruh Yigit, Andrew Rybchenko; +Cc: dev, Long Li

From: Long Li <longli@microsoft.com>

Instead of allocating mbufs one by one during RX, use
rte_pktmbuf_alloc_bulk() to allocate them in a batch.

Signed-off-by: Long Li <longli@microsoft.com>
---
 drivers/net/mana/rx.c | 67 +++++++++++++++++++++++++++----------------
 1 file changed, 43 insertions(+), 24 deletions(-)

diff --git a/drivers/net/mana/rx.c b/drivers/net/mana/rx.c
index acad5e26cd..400a4e52f4 100644
--- a/drivers/net/mana/rx.c
+++ b/drivers/net/mana/rx.c
@@ -2,6 +2,7 @@
  * Copyright 2022 Microsoft Corporation
  */
 #include <ethdev_driver.h>
+#include <rte_malloc.h>
 
 #include <infiniband/verbs.h>
 #include <infiniband/manadv.h>
@@ -59,9 +60,8 @@ mana_rq_ring_doorbell(struct mana_rxq *rxq)
 }
 
 static int
-mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
+mana_post_rx_wqe(struct mana_rxq *rxq, struct rte_mbuf *mbuf)
 {
-	struct rte_mbuf *mbuf = NULL;
 	struct gdma_sgl_element sgl[1];
 	struct gdma_work_request request;
 	uint32_t wqe_size_in_bu;
@@ -69,12 +69,6 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
 	int ret;
 	struct mana_mr_cache *mr;
 
-	mbuf = rte_pktmbuf_alloc(rxq->mp);
-	if (!mbuf) {
-		rxq->stats.nombuf++;
-		return -ENOMEM;
-	}
-
 	mr = mana_alloc_pmd_mr(&rxq->mr_btree, priv, mbuf);
 	if (!mr) {
 		DP_LOG(ERR, "failed to register RX MR");
@@ -121,19 +115,31 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
  * Post work requests for a Rx queue.
  */
 static int
-mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
+mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq, uint32_t count)
 {
 	int ret;
 	uint32_t i;
+	struct rte_mbuf **mbufs;
+
+	mbufs = rte_calloc("mana_rx_mbufs", count, sizeof(struct rte_mbuf *), 0);
+	if (!mbufs)
+		return -ENOMEM;
+
+	ret = rte_pktmbuf_alloc_bulk(rxq->mp, mbufs, count);
+	if (ret) {
+		DP_LOG(ERR, "failed to allocate mbufs for RX");
+		rxq->stats.nombuf += count;
+		goto fail;
+	}
 
 #ifdef RTE_ARCH_32
 	rxq->wqe_cnt_to_short_db = 0;
 #endif
-	for (i = 0; i < rxq->num_desc; i++) {
-		ret = mana_alloc_and_post_rx_wqe(rxq);
+	for (i = 0; i < count; i++) {
+		ret = mana_post_rx_wqe(rxq, mbufs[i]);
 		if (ret) {
 			DP_LOG(ERR, "failed to post RX ret = %d", ret);
-			return ret;
+			goto fail;
 		}
 
 #ifdef RTE_ARCH_32
@@ -146,6 +152,8 @@ mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
 
 	mana_rq_ring_doorbell(rxq);
 
+fail:
+	rte_free(mbufs);
 	return ret;
 }
 
@@ -404,7 +412,9 @@ mana_start_rx_queues(struct rte_eth_dev *dev)
 	}
 
 	for (i = 0; i < priv->num_queues; i++) {
-		ret = mana_alloc_and_post_rx_wqes(dev->data->rx_queues[i]);
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+
+		ret = mana_alloc_and_post_rx_wqes(rxq, rxq->num_desc);
 		if (ret)
 			goto fail;
 	}
@@ -423,7 +433,7 @@ uint16_t
 mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	uint16_t pkt_received = 0;
-	uint16_t wqe_posted = 0;
+	uint16_t wqe_consumed = 0;
 	struct mana_rxq *rxq = dpdk_rxq;
 	struct mana_priv *priv = rxq->priv;
 	struct rte_mbuf *mbuf;
@@ -535,18 +545,23 @@ mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 
 		rxq->gdma_rq.tail += desc->wqe_size_in_bu;
 
-		/* Consume this request and post another request */
-		ret = mana_alloc_and_post_rx_wqe(rxq);
-		if (ret) {
-			DP_LOG(ERR, "failed to post rx wqe ret=%d", ret);
-			break;
-		}
-
-		wqe_posted++;
+		/* Record the number of RX WQEs we need to post to replenish
+		 * the consumed RX requests
+		 */
+		wqe_consumed++;
 		if (pkt_received == pkts_n)
 			break;
 
 #ifdef RTE_ARCH_32
+		/* Always post WQE as soon as it's consumed for short DB */
+		ret = mana_alloc_and_post_rx_wqes(rxq, wqe_consumed);
+		if (ret) {
+			DRV_LOG(ERR, "failed to post %d WQEs, ret %d",
+				wqe_consumed, ret);
+			return pkt_received;
+		}
+		wqe_consumed = 0;
+
 		/* Ring short doorbell if approaching the wqe increment
 		 * limit.
 		 */
@@ -569,8 +584,12 @@ mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		goto repoll;
 	}
 
-	if (wqe_posted)
-		mana_rq_ring_doorbell(rxq);
+	if (wqe_consumed) {
+		ret = mana_alloc_and_post_rx_wqes(rxq, wqe_consumed);
+		if (ret)
+			DRV_LOG(ERR, "failed to post %d WQEs, ret %d",
+				wqe_consumed, ret);
+	}
 
 	return pkt_received;
 }
-- 
2.25.1



* Re: [PATCH] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
  2024-01-25  2:42 [PATCH] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs longli
@ 2024-01-26  0:29 ` Stephen Hemminger
  2024-01-26  1:13   ` Long Li
  2024-01-30  1:13 ` [Patch v2] " longli
  1 sibling, 1 reply; 25+ messages in thread
From: Stephen Hemminger @ 2024-01-26  0:29 UTC (permalink / raw)
  To: longli; +Cc: Ferruh Yigit, Andrew Rybchenko, dev, Long Li

On Wed, 24 Jan 2024 18:42:42 -0800
longli@linuxonhyperv.com wrote:

> +	struct rte_mbuf **mbufs;
> +
> +	mbufs = rte_calloc("mana_rx_mbufs", count, sizeof(struct rte_mbuf *), 0);
> +	if (!mbufs)
> +		return -ENOMEM;

Looks good; you might want to make this NUMA-aware in a future version.
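
For example, a minimal sketch, assuming the pointer array should sit on
the same socket as the mbuf pool:

	mbufs = rte_calloc_socket("mana_rx_mbufs", count,
				  sizeof(struct rte_mbuf *), 0,
				  rxq->mp->socket_id);
	if (!mbufs)
		return -ENOMEM;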

Reviewed-by: Stephen Hemminger <stephen@networkplumber.org>


* RE: [PATCH] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
  2024-01-26  0:29 ` Stephen Hemminger
@ 2024-01-26  1:13   ` Long Li
  0 siblings, 0 replies; 25+ messages in thread
From: Long Li @ 2024-01-26  1:13 UTC (permalink / raw)
  To: stephen, longli; +Cc: Ferruh Yigit, Andrew Rybchenko, dev

> Subject: Re: [PATCH] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX
> WQEs
> 
> On Wed, 24 Jan 2024 18:42:42 -0800
> longli@linuxonhyperv.com wrote:
> 
> > +	struct rte_mbuf **mbufs;
> > +
> > +	mbufs = rte_calloc("mana_rx_mbufs", count, sizeof(struct rte_mbuf *), 0);
> > +	if (!mbufs)
> > +		return -ENOMEM;
> 
> Looks good; you might want to make this NUMA-aware in a future version.
> 
> Reviewed-by: Stephen Hemminger <stephen@networkplumber.org>

Thank you!

I'm sending v2 to fix this.

There are a couple of other places in MANA doing memory allocation that should be NUMA-aware. I will send another patch to fix them.

Long


* [Patch v2] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
  2024-01-25  2:42 [PATCH] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs longli
  2024-01-26  0:29 ` Stephen Hemminger
@ 2024-01-30  1:13 ` longli
  2024-01-30 10:19   ` Ferruh Yigit
  2024-02-01  3:45   ` [Patch v3] " longli
  1 sibling, 2 replies; 25+ messages in thread
From: longli @ 2024-01-30  1:13 UTC (permalink / raw)
  To: Ferruh Yigit, Andrew Rybchenko; +Cc: dev, Long Li

From: Long Li <longli@microsoft.com>

Instead of allocating mbufs one by one during RX, use rte_pktmbuf_alloc_bulk()
to allocate them in a batch.

Signed-off-by: Long Li <longli@microsoft.com>
---
Change in v2:
use rte_calloc_socket() in place of rte_calloc()

 drivers/net/mana/rx.c | 68 ++++++++++++++++++++++++++++---------------
 1 file changed, 44 insertions(+), 24 deletions(-)

diff --git a/drivers/net/mana/rx.c b/drivers/net/mana/rx.c
index acad5e26cd..b011bf3ea1 100644
--- a/drivers/net/mana/rx.c
+++ b/drivers/net/mana/rx.c
@@ -2,6 +2,7 @@
  * Copyright 2022 Microsoft Corporation
  */
 #include <ethdev_driver.h>
+#include <rte_malloc.h>
 
 #include <infiniband/verbs.h>
 #include <infiniband/manadv.h>
@@ -59,9 +60,8 @@ mana_rq_ring_doorbell(struct mana_rxq *rxq)
 }
 
 static int
-mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
+mana_post_rx_wqe(struct mana_rxq *rxq, struct rte_mbuf *mbuf)
 {
-	struct rte_mbuf *mbuf = NULL;
 	struct gdma_sgl_element sgl[1];
 	struct gdma_work_request request;
 	uint32_t wqe_size_in_bu;
@@ -69,12 +69,6 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
 	int ret;
 	struct mana_mr_cache *mr;
 
-	mbuf = rte_pktmbuf_alloc(rxq->mp);
-	if (!mbuf) {
-		rxq->stats.nombuf++;
-		return -ENOMEM;
-	}
-
 	mr = mana_alloc_pmd_mr(&rxq->mr_btree, priv, mbuf);
 	if (!mr) {
 		DP_LOG(ERR, "failed to register RX MR");
@@ -121,19 +115,32 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
  * Post work requests for a Rx queue.
  */
 static int
-mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
+mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq, uint32_t count)
 {
 	int ret;
 	uint32_t i;
+	struct rte_mbuf **mbufs;
+
+	mbufs = rte_calloc_socket("mana_rx_mbufs", count, sizeof(struct rte_mbuf *),
+				  0, rxq->mp->socket_id);
+	if (!mbufs)
+		return -ENOMEM;
+
+	ret = rte_pktmbuf_alloc_bulk(rxq->mp, mbufs, count);
+	if (ret) {
+		DP_LOG(ERR, "failed to allocate mbufs for RX");
+		rxq->stats.nombuf += count;
+		goto fail;
+	}
 
 #ifdef RTE_ARCH_32
 	rxq->wqe_cnt_to_short_db = 0;
 #endif
-	for (i = 0; i < rxq->num_desc; i++) {
-		ret = mana_alloc_and_post_rx_wqe(rxq);
+	for (i = 0; i < count; i++) {
+		ret = mana_post_rx_wqe(rxq, mbufs[i]);
 		if (ret) {
 			DP_LOG(ERR, "failed to post RX ret = %d", ret);
-			return ret;
+			goto fail;
 		}
 
 #ifdef RTE_ARCH_32
@@ -146,6 +153,8 @@ mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
 
 	mana_rq_ring_doorbell(rxq);
 
+fail:
+	rte_free(mbufs);
 	return ret;
 }
 
@@ -404,7 +413,9 @@ mana_start_rx_queues(struct rte_eth_dev *dev)
 	}
 
 	for (i = 0; i < priv->num_queues; i++) {
-		ret = mana_alloc_and_post_rx_wqes(dev->data->rx_queues[i]);
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+
+		ret = mana_alloc_and_post_rx_wqes(rxq, rxq->num_desc);
 		if (ret)
 			goto fail;
 	}
@@ -423,7 +434,7 @@ uint16_t
 mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	uint16_t pkt_received = 0;
-	uint16_t wqe_posted = 0;
+	uint16_t wqe_consumed = 0;
 	struct mana_rxq *rxq = dpdk_rxq;
 	struct mana_priv *priv = rxq->priv;
 	struct rte_mbuf *mbuf;
@@ -535,18 +546,23 @@ mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 
 		rxq->gdma_rq.tail += desc->wqe_size_in_bu;
 
-		/* Consume this request and post another request */
-		ret = mana_alloc_and_post_rx_wqe(rxq);
-		if (ret) {
-			DP_LOG(ERR, "failed to post rx wqe ret=%d", ret);
-			break;
-		}
-
-		wqe_posted++;
+		/* Record the number of RX WQEs we need to post to replenish
+		 * the consumed RX requests
+		 */
+		wqe_consumed++;
 		if (pkt_received == pkts_n)
 			break;
 
 #ifdef RTE_ARCH_32
+		/* Always post WQE as soon as it's consumed for short DB */
+		ret = mana_alloc_and_post_rx_wqes(rxq, wqe_consumed);
+		if (ret) {
+			DRV_LOG(ERR, "failed to post %d WQEs, ret %d",
+				wqe_consumed, ret);
+			return pkt_received;
+		}
+		wqe_consumed = 0;
+
 		/* Ring short doorbell if approaching the wqe increment
 		 * limit.
 		 */
@@ -569,8 +585,12 @@ mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		goto repoll;
 	}
 
-	if (wqe_posted)
-		mana_rq_ring_doorbell(rxq);
+	if (wqe_consumed) {
+		ret = mana_alloc_and_post_rx_wqes(rxq, wqe_consumed);
+		if (ret)
+			DRV_LOG(ERR, "failed to post %d WQEs, ret %d",
+				wqe_consumed, ret);
+	}
 
 	return pkt_received;
 }
-- 
2.17.1



* Re: [Patch v2] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
  2024-01-30  1:13 ` [Patch v2] " longli
@ 2024-01-30 10:19   ` Ferruh Yigit
  2024-01-30 16:43     ` Stephen Hemminger
  2024-01-30 21:30     ` Long Li
  2024-02-01  3:45   ` [Patch v3] " longli
  1 sibling, 2 replies; 25+ messages in thread
From: Ferruh Yigit @ 2024-01-30 10:19 UTC (permalink / raw)
  To: longli, Andrew Rybchenko; +Cc: dev

On 1/30/2024 1:13 AM, longli@linuxonhyperv.com wrote:
> From: Long Li <longli@microsoft.com>
> 
> Instead of allocating mbufs one by one during RX, use rte_pktmbuf_alloc_bulk()
> to allocate them in a batch.
> 
> Signed-off-by: Long Li <longli@microsoft.com>
>

Can you please quantify the performance improvement (as a percentage)?
This clarifies the impact of the modification.

<...>

> @@ -121,19 +115,32 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
>   * Post work requests for a Rx queue.
>   */
>  static int
> -mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
> +mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq, uint32_t count)
>  {
>  	int ret;
>  	uint32_t i;
> +	struct rte_mbuf **mbufs;
> +
> +	mbufs = rte_calloc_socket("mana_rx_mbufs", count, sizeof(struct rte_mbuf *),
> +				  0, rxq->mp->socket_id);
> +	if (!mbufs)
> +		return -ENOMEM;
>

'mbufs' is temporary storage for the allocated mbuf pointers, so why not
allocate it from the stack instead? That can be faster and easier to manage:
"struct rte_mbuf *mbufs[count]"


> +
> +	ret = rte_pktmbuf_alloc_bulk(rxq->mp, mbufs, count);
> +	if (ret) {
> +		DP_LOG(ERR, "failed to allocate mbufs for RX");
> +		rxq->stats.nombuf += count;
> +		goto fail;
> +	}
>  
>  #ifdef RTE_ARCH_32
>  	rxq->wqe_cnt_to_short_db = 0;
>  #endif
> -	for (i = 0; i < rxq->num_desc; i++) {
> -		ret = mana_alloc_and_post_rx_wqe(rxq);
> +	for (i = 0; i < count; i++) {
> +		ret = mana_post_rx_wqe(rxq, mbufs[i]);
>  		if (ret) {
>  			DP_LOG(ERR, "failed to post RX ret = %d", ret);
> -			return ret;
> +			goto fail;
>

This may leak memory. There are allocated mbufs; if we exit the loop here
and free the 'mbufs' variable, how will the remaining mbufs be freed?




* Re: [Patch v2] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
  2024-01-30 10:19   ` Ferruh Yigit
@ 2024-01-30 16:43     ` Stephen Hemminger
  2024-01-30 18:05       ` Tyler Retzlaff
  2024-01-30 22:42       ` Ferruh Yigit
  2024-01-30 21:30     ` Long Li
  1 sibling, 2 replies; 25+ messages in thread
From: Stephen Hemminger @ 2024-01-30 16:43 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: longli, Andrew Rybchenko, dev

On Tue, 30 Jan 2024 10:19:32 +0000
Ferruh Yigit <ferruh.yigit@amd.com> wrote:

> > -mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
> > +mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq, uint32_t count)
> >  {
> >  	int ret;
> >  	uint32_t i;
> > +	struct rte_mbuf **mbufs;
> > +
> > +	mbufs = rte_calloc_socket("mana_rx_mbufs", count, sizeof(struct rte_mbuf *),
> > +				  0, rxq->mp->socket_id);
> > +	if (!mbufs)
> > +		return -ENOMEM;
> >  
> 
> 'mbufs' is temporary storage for the allocated mbuf pointers, so why not
> allocate it from the stack instead? That can be faster and easier to manage:
> "struct rte_mbuf *mbufs[count]"

That would introduce a variable-length array.
VLAs should be removed; they are not supported on Windows and many
security tools flag them. The problem is that they make the code brittle
if count gets huge.

But certainly regular calloc() or alloca() would work here.
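
For example, a quick sketch with plain calloc(), assuming the array only
needs to live for the duration of the function:

	struct rte_mbuf **mbufs = calloc(count, sizeof(*mbufs));

	if (!mbufs)
		return -ENOMEM;
	/* ... allocate mbufs and post the WQEs ... */
	free(mbufs);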


* Re: [Patch v2] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
  2024-01-30 16:43     ` Stephen Hemminger
@ 2024-01-30 18:05       ` Tyler Retzlaff
  2024-01-30 22:42       ` Ferruh Yigit
  1 sibling, 0 replies; 25+ messages in thread
From: Tyler Retzlaff @ 2024-01-30 18:05 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Ferruh Yigit, longli, Andrew Rybchenko, dev

On Tue, Jan 30, 2024 at 08:43:52AM -0800, Stephen Hemminger wrote:
> On Tue, 30 Jan 2024 10:19:32 +0000
> Ferruh Yigit <ferruh.yigit@amd.com> wrote:
> 
> > > -mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
> > > +mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq, uint32_t count)
> > >  {
> > >  	int ret;
> > >  	uint32_t i;
> > > +	struct rte_mbuf **mbufs;
> > > +
> > > +	mbufs = rte_calloc_socket("mana_rx_mbufs", count, sizeof(struct rte_mbuf *),
> > > +				  0, rxq->mp->socket_id);
> > > +	if (!mbufs)
> > > +		return -ENOMEM;
> > >  
> > 
> > 'mbufs' is temporary storage for the allocated mbuf pointers, so why not
> > allocate it from the stack instead? That can be faster and easier to manage:
> > "struct rte_mbuf *mbufs[count]"
> 
> That would introduce a variable-length array.
> VLAs should be removed; they are not supported on Windows and many
> security tools flag them. The problem is that they make the code brittle
> if count gets huge.

+1

> 
> But certainly regular calloc() or alloca() would work here.


* RE: [Patch v2] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
  2024-01-30 10:19   ` Ferruh Yigit
  2024-01-30 16:43     ` Stephen Hemminger
@ 2024-01-30 21:30     ` Long Li
  2024-01-30 22:34       ` Ferruh Yigit
  1 sibling, 1 reply; 25+ messages in thread
From: Long Li @ 2024-01-30 21:30 UTC (permalink / raw)
  To: Ferruh Yigit, Andrew Rybchenko; +Cc: dev

> Can you please quantify the performance improvement (as a percentage)? This
> clarifies the impact of the modification.

I didn't see any meaningful performance improvements in benchmarks. However, this should save CPU cycles and reduce potential locking conflicts in real-world applications.

Using batch allocation was one of Stephen Hemminger's review comments during the initial driver submission. I promised to fix it at the time; sorry it took a while to submit this patch.
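
The saving comes mainly from doing one mempool operation per burst
instead of one per packet; roughly:

	/* before: one mempool get per descriptor */
	for (i = 0; i < count; i++)
		mbuf = rte_pktmbuf_alloc(rxq->mp);

	/* after: a single bulk get for the whole batch */
	ret = rte_pktmbuf_alloc_bulk(rxq->mp, mbufs, count);
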

> 
> <...>
> 
> > @@ -121,19 +115,32 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq
> *rxq)
> >   * Post work requests for a Rx queue.
> >   */
> >  static int
> > -mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
> > +mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq, uint32_t count)
> >  {
> >  	int ret;
> >  	uint32_t i;
> > +	struct rte_mbuf **mbufs;
> > +
> > +	mbufs = rte_calloc_socket("mana_rx_mbufs", count, sizeof(struct
> rte_mbuf *),
> > +				  0, rxq->mp->socket_id);
> > +	if (!mbufs)
> > +		return -ENOMEM;
> >
> 
> 'mbufs' is temporary storage for the allocated mbuf pointers, so why not allocate
> it from the stack instead? That can be faster and easier to manage:
> "struct rte_mbuf *mbufs[count]"
> 
> 
> > +
> > +	ret = rte_pktmbuf_alloc_bulk(rxq->mp, mbufs, count);
> > +	if (ret) {
> > +		DP_LOG(ERR, "failed to allocate mbufs for RX");
> > +		rxq->stats.nombuf += count;
> > +		goto fail;
> > +	}
> >
> >  #ifdef RTE_ARCH_32
> >  	rxq->wqe_cnt_to_short_db = 0;
> >  #endif
> > -	for (i = 0; i < rxq->num_desc; i++) {
> > -		ret = mana_alloc_and_post_rx_wqe(rxq);
> > +	for (i = 0; i < count; i++) {
> > +		ret = mana_post_rx_wqe(rxq, mbufs[i]);
> >  		if (ret) {
> >  			DP_LOG(ERR, "failed to post RX ret = %d", ret);
> > -			return ret;
> > +			goto fail;
> >
> 
> This may leak memory. There are allocated mbufs; if we exit the loop here and
> free the 'mbufs' variable, how will the remaining mbufs be freed?

Mbufs are always freed after fail:

fail:
        rte_free(mbufs);

> 



* Re: [Patch v2] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
  2024-01-30 21:30     ` Long Li
@ 2024-01-30 22:34       ` Ferruh Yigit
  2024-01-30 22:36         ` Long Li
  0 siblings, 1 reply; 25+ messages in thread
From: Ferruh Yigit @ 2024-01-30 22:34 UTC (permalink / raw)
  To: Long Li, Andrew Rybchenko; +Cc: dev

On 1/30/2024 9:30 PM, Long Li wrote:
>> Can you please quantify the performance improvement (as a percentage)?
>> This clarifies the impact of the modification.
> 
> I didn't see any meaningful performance improvements in benchmarks. However, this should save CPU cycles and reduce potential locking conflicts in real-world applications.
> 
> Using batch allocation was one of Stephen Hemminger's review comments during the initial driver submission. I promised to fix it at the time; sorry it took a while to submit this patch.
> 

That is OK; using bulk alloc is a reasonable approach. Can you please
document the impact (performance increase) in the commit log?

>>
>> <...>
>>
>>> @@ -121,19 +115,32 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq
>> *rxq)
>>>   * Post work requests for a Rx queue.
>>>   */
>>>  static int
>>> -mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
>>> +mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq, uint32_t count)
>>>  {
>>>  	int ret;
>>>  	uint32_t i;
>>> +	struct rte_mbuf **mbufs;
>>> +
>>> +	mbufs = rte_calloc_socket("mana_rx_mbufs", count, sizeof(struct
>> rte_mbuf *),
>>> +				  0, rxq->mp->socket_id);
>>> +	if (!mbufs)
>>> +		return -ENOMEM;
>>>
>>
>> 'mbufs' is temporary storage for the allocated mbuf pointers, so why not allocate
>> it from the stack instead? That can be faster and easier to manage:
>> "struct rte_mbuf *mbufs[count]"
>>
>>
>>> +
>>> +	ret = rte_pktmbuf_alloc_bulk(rxq->mp, mbufs, count);
>>> +	if (ret) {
>>> +		DP_LOG(ERR, "failed to allocate mbufs for RX");
>>> +		rxq->stats.nombuf += count;
>>> +		goto fail;
>>> +	}
>>>
>>>  #ifdef RTE_ARCH_32
>>>  	rxq->wqe_cnt_to_short_db = 0;
>>>  #endif
>>> -	for (i = 0; i < rxq->num_desc; i++) {
>>> -		ret = mana_alloc_and_post_rx_wqe(rxq);
>>> +	for (i = 0; i < count; i++) {
>>> +		ret = mana_post_rx_wqe(rxq, mbufs[i]);
>>>  		if (ret) {
>>>  			DP_LOG(ERR, "failed to post RX ret = %d", ret);
>>> -			return ret;
>>> +			goto fail;
>>>
>>
>> This may leak memory. There are allocated mbufs; if we exit the loop here and
>> free the 'mbufs' variable, how will the remaining mbufs be freed?
> 
> Mbufs are always freed after fail:
> 
> fail:
>         rte_free(mbufs);
> 

Nope, I am not talking about the 'mbufs' variable; I am talking about the
mbuf pointers stored in the 'mbufs' array, which are allocated by
'rte_pktmbuf_alloc_bulk()'.



* RE: [Patch v2] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
  2024-01-30 22:34       ` Ferruh Yigit
@ 2024-01-30 22:36         ` Long Li
  0 siblings, 0 replies; 25+ messages in thread
From: Long Li @ 2024-01-30 22:36 UTC (permalink / raw)
  To: Ferruh Yigit, Andrew Rybchenko; +Cc: dev

> Subject: Re: [Patch v2] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX
> WQEs
> 
> On 1/30/2024 9:30 PM, Long Li wrote:
> >> Can you please quantify the performance improvement (as a percentage)?
> >> This clarifies the impact of the modification.
> >
> > I didn't see any meaningful performance improvements in benchmarks. However,
> > this should save CPU cycles and reduce potential locking conflicts in
> > real-world applications.
> >
> > Using batch allocation was one of Stephen Hemminger's review comments during
> > the initial driver submission. I promised to fix it at the time; sorry it
> > took a while to submit this patch.
> >
> 
> That is OK; using bulk alloc is a reasonable approach. Can you please document
> the impact (performance increase) in the commit log?

Will do that.

> 
> >>
> >> <...>
> >>
> >>> @@ -121,19 +115,32 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq
> >> *rxq)
> >>>   * Post work requests for a Rx queue.
> >>>   */
> >>>  static int
> >>> -mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
> >>> +mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq, uint32_t count)
> >>>  {
> >>>  	int ret;
> >>>  	uint32_t i;
> >>> +	struct rte_mbuf **mbufs;
> >>> +
> >>> +	mbufs = rte_calloc_socket("mana_rx_mbufs", count, sizeof(struct
> >> rte_mbuf *),
> >>> +				  0, rxq->mp->socket_id);
> >>> +	if (!mbufs)
> >>> +		return -ENOMEM;
> >>>
> >>
> >> 'mbufs' is temporary storage for the allocated mbuf pointers, so why not
> >> allocate it from the stack instead? That can be faster and easier to manage:
> >> "struct rte_mbuf *mbufs[count]"
> >>
> >>
> >>> +
> >>> +	ret = rte_pktmbuf_alloc_bulk(rxq->mp, mbufs, count);
> >>> +	if (ret) {
> >>> +		DP_LOG(ERR, "failed to allocate mbufs for RX");
> >>> +		rxq->stats.nombuf += count;
> >>> +		goto fail;
> >>> +	}
> >>>
> >>>  #ifdef RTE_ARCH_32
> >>>  	rxq->wqe_cnt_to_short_db = 0;
> >>>  #endif
> >>> -	for (i = 0; i < rxq->num_desc; i++) {
> >>> -		ret = mana_alloc_and_post_rx_wqe(rxq);
> >>> +	for (i = 0; i < count; i++) {
> >>> +		ret = mana_post_rx_wqe(rxq, mbufs[i]);
> >>>  		if (ret) {
> >>>  			DP_LOG(ERR, "failed to post RX ret = %d", ret);
> >>> -			return ret;
> >>> +			goto fail;
> >>>
> >>
> >> This may leak memory. There are allocated mbufs; if we exit the loop
> >> here and free the 'mbufs' variable, how will the remaining mbufs be freed?
> >
> > Mbufs are always freed after fail:
> >
> > fail:
> >         rte_free(mbufs);
> >
> 
> Nope, I am not talking about the 'mbufs' variable; I am talking about the
> mbuf pointers stored in the 'mbufs' array, which are allocated by
> 'rte_pktmbuf_alloc_bulk()'.

You are right, I'm sending v3 to fix those.

Long



* Re: [Patch v2] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
  2024-01-30 16:43     ` Stephen Hemminger
  2024-01-30 18:05       ` Tyler Retzlaff
@ 2024-01-30 22:42       ` Ferruh Yigit
  2024-02-01  3:55         ` Long Li
  1 sibling, 1 reply; 25+ messages in thread
From: Ferruh Yigit @ 2024-01-30 22:42 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: longli, Andrew Rybchenko, dev

On 1/30/2024 4:43 PM, Stephen Hemminger wrote:
> On Tue, 30 Jan 2024 10:19:32 +0000
> Ferruh Yigit <ferruh.yigit@amd.com> wrote:
> 
>>> -mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
>>> +mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq, uint32_t count)
>>>  {
>>>  	int ret;
>>>  	uint32_t i;
>>> +	struct rte_mbuf **mbufs;
>>> +
>>> +	mbufs = rte_calloc_socket("mana_rx_mbufs", count, sizeof(struct rte_mbuf *),
>>> +				  0, rxq->mp->socket_id);
>>> +	if (!mbufs)
>>> +		return -ENOMEM;
>>>  
>>
>> 'mbufs' is temporary storage for the allocated mbuf pointers, so why not
>> allocate it from the stack instead? That can be faster and easier to manage:
>> "struct rte_mbuf *mbufs[count]"
> 
> That would introduce a variable-length array.
> VLAs should be removed; they are not supported on Windows and many
> security tools flag them. The problem is that they make the code brittle
> if count gets huge.
> 
> But certainly regular calloc() or alloca() would work here.
>

Most of the existing bulk alloc code already uses VLAs, but I can see
the problem that they are not supported on Windows.

As this mbuf pointer array is short-lived within the function, and we
are in the fast path, I think the continuous alloc and free can be
avoided.

One option is to define a fixed-size, big-enough array, which requires
an additional loop for the cases where 'count' is bigger than the array
size,

or an array can be allocated at driver init in device-specific data, as
we know it will be needed continuously in the datapath, and it can be
freed during device close()/uninit().

I think a fixed-size array on the stack is easier and can be preferred.


* [Patch v3] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
  2024-01-30  1:13 ` [Patch v2] " longli
  2024-01-30 10:19   ` Ferruh Yigit
@ 2024-02-01  3:45   ` longli
  2024-02-01 16:16     ` Tyler Retzlaff
  2024-02-02  1:19     ` [Patch v4] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX mbufs longli
  1 sibling, 2 replies; 25+ messages in thread
From: longli @ 2024-02-01  3:45 UTC (permalink / raw)
  To: Ferruh Yigit, Andrew Rybchenko; +Cc: dev, Long Li

From: Long Li <longli@microsoft.com>

Instead of allocating mbufs one by one during RX, use
rte_pktmbuf_alloc_bulk() to allocate them in a batch.

There are no measurable performance improvements in benchmarks. However,
this patch should save CPU cycles and reduce potential locking
conflicts in real-world applications.

Signed-off-by: Long Li <longli@microsoft.com>
---
Change in v2:
use rte_calloc_socket() in place of rte_calloc()

v3:
add more comments explaining the benefit of doing alloc_bulk.
free mbufs that failed to post

 drivers/net/mana/rx.c | 74 +++++++++++++++++++++++++++++--------------
 1 file changed, 50 insertions(+), 24 deletions(-)

diff --git a/drivers/net/mana/rx.c b/drivers/net/mana/rx.c
index acad5e26cd..6112db2219 100644
--- a/drivers/net/mana/rx.c
+++ b/drivers/net/mana/rx.c
@@ -2,6 +2,7 @@
  * Copyright 2022 Microsoft Corporation
  */
 #include <ethdev_driver.h>
+#include <rte_malloc.h>
 
 #include <infiniband/verbs.h>
 #include <infiniband/manadv.h>
@@ -59,9 +60,8 @@ mana_rq_ring_doorbell(struct mana_rxq *rxq)
 }
 
 static int
-mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
+mana_post_rx_wqe(struct mana_rxq *rxq, struct rte_mbuf *mbuf)
 {
-	struct rte_mbuf *mbuf = NULL;
 	struct gdma_sgl_element sgl[1];
 	struct gdma_work_request request;
 	uint32_t wqe_size_in_bu;
@@ -69,12 +69,6 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
 	int ret;
 	struct mana_mr_cache *mr;
 
-	mbuf = rte_pktmbuf_alloc(rxq->mp);
-	if (!mbuf) {
-		rxq->stats.nombuf++;
-		return -ENOMEM;
-	}
-
 	mr = mana_alloc_pmd_mr(&rxq->mr_btree, priv, mbuf);
 	if (!mr) {
 		DP_LOG(ERR, "failed to register RX MR");
@@ -121,19 +115,32 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
  * Post work requests for a Rx queue.
  */
 static int
-mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
+mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq, uint32_t count)
 {
 	int ret;
 	uint32_t i;
+	struct rte_mbuf **mbufs;
+
+	mbufs = rte_calloc_socket("mana_rx_mbufs", count, sizeof(struct rte_mbuf *),
+				  0, rxq->mp->socket_id);
+	if (!mbufs)
+		return -ENOMEM;
+
+	ret = rte_pktmbuf_alloc_bulk(rxq->mp, mbufs, count);
+	if (ret) {
+		DP_LOG(ERR, "failed to allocate mbufs for RX");
+		rxq->stats.nombuf += count;
+		goto fail;
+	}
 
 #ifdef RTE_ARCH_32
 	rxq->wqe_cnt_to_short_db = 0;
 #endif
-	for (i = 0; i < rxq->num_desc; i++) {
-		ret = mana_alloc_and_post_rx_wqe(rxq);
+	for (i = 0; i < count; i++) {
+		ret = mana_post_rx_wqe(rxq, mbufs[i]);
 		if (ret) {
 			DP_LOG(ERR, "failed to post RX ret = %d", ret);
-			return ret;
+			break;
 		}
 
 #ifdef RTE_ARCH_32
@@ -144,8 +151,16 @@ mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
 #endif
 	}
 
+	/* Free the remaining mbufs that are not posted */
+	while (i < count) {
+		rte_pktmbuf_free(mbufs[i]);
+		i++;
+	}
+
 	mana_rq_ring_doorbell(rxq);
 
+fail:
+	rte_free(mbufs);
 	return ret;
 }
 
@@ -404,7 +419,9 @@ mana_start_rx_queues(struct rte_eth_dev *dev)
 	}
 
 	for (i = 0; i < priv->num_queues; i++) {
-		ret = mana_alloc_and_post_rx_wqes(dev->data->rx_queues[i]);
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+
+		ret = mana_alloc_and_post_rx_wqes(rxq, rxq->num_desc);
 		if (ret)
 			goto fail;
 	}
@@ -423,7 +440,7 @@ uint16_t
 mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	uint16_t pkt_received = 0;
-	uint16_t wqe_posted = 0;
+	uint16_t wqe_consumed = 0;
 	struct mana_rxq *rxq = dpdk_rxq;
 	struct mana_priv *priv = rxq->priv;
 	struct rte_mbuf *mbuf;
@@ -535,18 +552,23 @@ mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 
 		rxq->gdma_rq.tail += desc->wqe_size_in_bu;
 
-		/* Consume this request and post another request */
-		ret = mana_alloc_and_post_rx_wqe(rxq);
-		if (ret) {
-			DP_LOG(ERR, "failed to post rx wqe ret=%d", ret);
-			break;
-		}
-
-		wqe_posted++;
+		/* Record the number of RX WQEs we need to post to replenish
+		 * the consumed RX requests
+		 */
+		wqe_consumed++;
 		if (pkt_received == pkts_n)
 			break;
 
 #ifdef RTE_ARCH_32
+		/* Always post WQE as soon as it's consumed for short DB */
+		ret = mana_alloc_and_post_rx_wqes(rxq, wqe_consumed);
+		if (ret) {
+			DRV_LOG(ERR, "failed to post %d WQEs, ret %d",
+				wqe_consumed, ret);
+			return pkt_received;
+		}
+		wqe_consumed = 0;
+
 		/* Ring short doorbell if approaching the wqe increment
 		 * limit.
 		 */
@@ -569,8 +591,12 @@ mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		goto repoll;
 	}
 
-	if (wqe_posted)
-		mana_rq_ring_doorbell(rxq);
+	if (wqe_consumed) {
+		ret = mana_alloc_and_post_rx_wqes(rxq, wqe_consumed);
+		if (ret)
+			DRV_LOG(ERR, "failed to post %d WQEs, ret %d",
+				wqe_consumed, ret);
+	}
 
 	return pkt_received;
 }
-- 
2.17.1



* RE: [Patch v2] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
  2024-01-30 22:42       ` Ferruh Yigit
@ 2024-02-01  3:55         ` Long Li
  2024-02-01 10:52           ` Ferruh Yigit
  2024-02-01 16:33           ` Tyler Retzlaff
  0 siblings, 2 replies; 25+ messages in thread
From: Long Li @ 2024-02-01  3:55 UTC (permalink / raw)
  To: Ferruh Yigit, stephen; +Cc: Andrew Rybchenko, dev

> >> 'mbufs' is temporary storage for the allocated mbuf pointers, so why not
> >> allocate it from the stack instead? That can be faster and easier to manage:
> >> "struct rte_mbuf *mbufs[count]"
> >
> > That would introduce a variable-length array.
> > VLAs should be removed; they are not supported on Windows and many
> > security tools flag them. The problem is that they make the code
> > brittle if count gets huge.
> >
> > But certainly regular calloc() or alloca() would work here.
> >
> 
> Most of the existing bulk alloc code already uses VLAs, but I can see the
> problem that they are not supported on Windows.
> 
> As this mbuf pointer array is short-lived within the function, and we are in
> the fast path, I think the continuous alloc and free can be avoided.
> 
> One option is to define a fixed-size, big-enough array, which requires an
> additional loop for the cases where 'count' is bigger than the array size,
> 
> or an array can be allocated at driver init in device-specific data, as we
> know it will be needed continuously in the datapath, and it can be freed
> during device close()/uninit().
> 
> I think a fixed-size array on the stack is easier and can be preferred.

I sent a v3 of the patch, still using alloc().

I found two problems with using a fixed array:
1. the array size needs to be determined in advance. I don't know what a good number should be. If too big, some entries may be wasted (and may make a bigger mess of the CPU cache); if too small, it ends up doing multiple allocations, which is the problem this patch is trying to solve.
2. it makes the code slightly more complex, but I think 1 is the main problem.

I think another approach is to use a VLA by default, but use alloc() for Windows.

Long


* Re: [Patch v2] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
  2024-02-01  3:55         ` Long Li
@ 2024-02-01 10:52           ` Ferruh Yigit
  2024-02-02  1:21             ` Long Li
  2024-02-01 16:33           ` Tyler Retzlaff
  1 sibling, 1 reply; 25+ messages in thread
From: Ferruh Yigit @ 2024-02-01 10:52 UTC (permalink / raw)
  To: Long Li, stephen; +Cc: Andrew Rybchenko, dev

On 2/1/2024 3:55 AM, Long Li wrote:
>>>> 'mbufs' is temporary storage for the allocated mbuf pointers, so why not
>>>> allocate it from the stack instead? That can be faster and easier to manage:
>>>> "struct rte_mbuf *mbufs[count]"
>>>
>>> That would introduce a variable-length array.
>>> VLAs should be removed; they are not supported on Windows and many
>>> security tools flag them. The problem is that they make the code
>>> brittle if count gets huge.
>>>
>>> But certainly regular calloc() or alloca() would work here.
>>>
>>
>> Most of the existing bulk alloc code already uses VLAs, but I can see the
>> problem that they are not supported on Windows.
>>
>> As this mbuf pointer array is short-lived within the function, and we are in
>> the fast path, I think the continuous alloc and free can be avoided.
>>
>> One option is to define a fixed-size, big-enough array, which requires an
>> additional loop for the cases where 'count' is bigger than the array size,
>>
>> or an array can be allocated at driver init in device-specific data, as we
>> know it will be needed continuously in the datapath, and it can be freed
>> during device close()/uninit().
>>
>> I think a fixed-size array on the stack is easier and can be preferred.
> 
> I sent a v3 of the patch, still using alloc().
> 
> I found two problems with using a fixed array:
> 1. the array size needs to be determined in advance. I don't know what a good number should be. If too big, some entries may be wasted (and may make a bigger mess of the CPU cache); if too small, it ends up doing multiple allocations, which is the problem this patch is trying to solve.
>

I think the default burst size of 32 can be used like below:

struct rte_mbuf *mbufs[32];

loop: //use do {} while(); if you prefer
n = RTE_MIN(32u, count);
rte_pktmbuf_alloc_bulk(rxq->mp, mbufs, n);
for (i = 0; i < n; i++)
	mana_post_rx_wqe(rxq, mbufs[i]);
count -= n;
if (count > 0) goto loop;


This additional loop doesn't make the code very complex (I think no more
than the additional alloc() & free()) and it doesn't waste memory.
I suggest doing a performance measurement with the above change; it may
increase performance.
Afterwards, if you insist on going with the original code, we can do it.


> 2. it makes the code slightly more complex, but I think 1 is the main problem.
> 
> I think another approach is to use a VLA by default, but use alloc() for Windows.
> 
> Long



* Re: [Patch v3] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
  2024-02-01  3:45   ` [Patch v3] " longli
@ 2024-02-01 16:16     ` Tyler Retzlaff
  2024-02-01 19:41       ` Long Li
  2024-02-02  1:19     ` [Patch v4] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX mbufs longli
  1 sibling, 1 reply; 25+ messages in thread
From: Tyler Retzlaff @ 2024-02-01 16:16 UTC (permalink / raw)
  To: longli; +Cc: Ferruh Yigit, Andrew Rybchenko, dev

On Wed, Jan 31, 2024 at 07:45:50PM -0800, longli@linuxonhyperv.com wrote:
> From: Long Li <longli@microsoft.com>
> 
> Instead of allocating mbufs one by one during RX, use
> rte_pktmbuf_alloc_bulk() to allocate them in a batch.
> 
> There are no measurable performance improvements in benchmarks. However,
> this patch should save CPU cycles and reduce potential locking
> conflicts in real-world applications.
> 
> Signed-off-by: Long Li <longli@microsoft.com>
> ---
> Change in v2:
> use rte_calloc_socket() in place of rte_calloc()
> 
> v3:
> add more comments explaining the benefit of doing alloc_bulk.
> free mbufs that failed to post
> 
>  drivers/net/mana/rx.c | 74 +++++++++++++++++++++++++++++--------------
>  1 file changed, 50 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/net/mana/rx.c b/drivers/net/mana/rx.c
> index acad5e26cd..6112db2219 100644
> --- a/drivers/net/mana/rx.c
> +++ b/drivers/net/mana/rx.c
> @@ -2,6 +2,7 @@
>   * Copyright 2022 Microsoft Corporation
>   */
>  #include <ethdev_driver.h>
> +#include <rte_malloc.h>
>  
>  #include <infiniband/verbs.h>
>  #include <infiniband/manadv.h>
> @@ -59,9 +60,8 @@ mana_rq_ring_doorbell(struct mana_rxq *rxq)
>  }
>  
>  static int
> -mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
> +mana_post_rx_wqe(struct mana_rxq *rxq, struct rte_mbuf *mbuf)
>  {
> -	struct rte_mbuf *mbuf = NULL;
>  	struct gdma_sgl_element sgl[1];
>  	struct gdma_work_request request;
>  	uint32_t wqe_size_in_bu;
> @@ -69,12 +69,6 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
>  	int ret;
>  	struct mana_mr_cache *mr;
>  
> -	mbuf = rte_pktmbuf_alloc(rxq->mp);
> -	if (!mbuf) {
> -		rxq->stats.nombuf++;
> -		return -ENOMEM;
> -	}
> -
>  	mr = mana_alloc_pmd_mr(&rxq->mr_btree, priv, mbuf);
>  	if (!mr) {
>  		DP_LOG(ERR, "failed to register RX MR");
> @@ -121,19 +115,32 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
>   * Post work requests for a Rx queue.
>   */
>  static int
> -mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
> +mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq, uint32_t count)
>  {
>  	int ret;
>  	uint32_t i;
> +	struct rte_mbuf **mbufs;
> +
> +	mbufs = rte_calloc_socket("mana_rx_mbufs", count, sizeof(struct rte_mbuf *),
> +				  0, rxq->mp->socket_id);
> +	if (!mbufs)
> +		return -ENOMEM;
> +
> +	ret = rte_pktmbuf_alloc_bulk(rxq->mp, mbufs, count);
> +	if (ret) {
> +		DP_LOG(ERR, "failed to allocate mbufs for RX");
> +		rxq->stats.nombuf += count;
> +		goto fail;
> +	}
>  
>  #ifdef RTE_ARCH_32
>  	rxq->wqe_cnt_to_short_db = 0;
>  #endif
> -	for (i = 0; i < rxq->num_desc; i++) {
> -		ret = mana_alloc_and_post_rx_wqe(rxq);
> +	for (i = 0; i < count; i++) {
> +		ret = mana_post_rx_wqe(rxq, mbufs[i]);
>  		if (ret) {
>  			DP_LOG(ERR, "failed to post RX ret = %d", ret);
> -			return ret;
> +			break;
>  		}
>  
>  #ifdef RTE_ARCH_32
> @@ -144,8 +151,16 @@ mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
>  #endif
>  	}
>  
> +	/* Free the remaining mbufs that are not posted */
> +	while (i < count) {
> +		rte_pktmbuf_free(mbufs[i]);
> +		i++;
> +	}

there is also rte_pktmbuf_free_bulk() that could be used. probably won't
make any material difference to perf though so just an fyi.
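
e.g. the tail loop could collapse to a single call, assuming 'i' is left
at the first mbuf that was not posted:

	/* free the mbufs that were allocated but not posted */
	rte_pktmbuf_free_bulk(&mbufs[i], count - i);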



* Re: [Patch v2] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
  2024-02-01  3:55         ` Long Li
  2024-02-01 10:52           ` Ferruh Yigit
@ 2024-02-01 16:33           ` Tyler Retzlaff
  2024-02-02  1:22             ` Long Li
  1 sibling, 1 reply; 25+ messages in thread
From: Tyler Retzlaff @ 2024-02-01 16:33 UTC (permalink / raw)
  To: Long Li; +Cc: Ferruh Yigit, stephen, Andrew Rybchenko, dev

On Thu, Feb 01, 2024 at 03:55:55AM +0000, Long Li wrote:
> > >> 'mbufs' is temporary storage for the allocated mbuf pointers, so why not
> > >> allocate it from the stack instead? That can be faster and easier to manage:
> > >> "struct rte_mbuf *mbufs[count]"
> > >
> > > That would introduce a variable-length array.
> > > VLAs should be removed; they are not supported on Windows and many
> > > security tools flag them. The problem is that they make the code
> > > brittle if count gets huge.
> > >
> > > But certainly regular calloc() or alloca() would work here.
> > >
> > 
> > Most of the existing bulk alloc code already uses VLAs, but I can see the
> > problem that they are not supported on Windows.
> > 
> > As this mbuf pointer array is short-lived within the function, and we are in
> > the fast path, I think the continuous alloc and free can be avoided.
> > 
> > One option is to define a fixed-size, big-enough array, which requires an
> > additional loop for the cases where 'count' is bigger than the array size,
> > 
> > or an array can be allocated at driver init in device-specific data, as we
> > know it will be needed continuously in the datapath, and it can be freed
> > during device close()/uninit().
> > 
> > I think a fixed-size array on the stack is easier and can be preferred.
> 
> I sent a v3 of the patch, still using alloc().
> 
> I found two problems with using a fixed array:
> 1. the array size needs to be determined in advance. I don't know what a good number should be. If too big, some entries may be wasted (and may make a bigger mess of the CPU cache); if too small, it ends up doing multiple allocations, which is the problem this patch is trying to solve.
> 2. it makes the code slightly more complex, but I think 1 is the main problem.
> 
> I think another approach is to use a VLA by default, but use alloc() for Windows.

a few thoughts on VLAs you may consider. not to be regarded as a strong
objection.

indications are that standard C will gradually phase out VLAs because
they're generally accepted as having been a bad idea. that said
compilers that implement them will probably keep them forever.

VLAs generate a lot of code relative to just using a more permanent
allocation. may not show up in your performance tests but you also may
not want it on your hotpath either.

mana doesn't currently support windows; are there plans to support
windows? if never, then i suppose VLAs can be used since all the
toolchains you care about have them. though it does raise the bar and
cause more work later (a refactor, with regression risk) should you
change your mind and choose to port to windows.

accepting the use of VLAs anywhere in dpdk prohibits general
checkpatches and/or compiling with compiler options that detect and flag
their inclusion as a part of the CI without having to add exclusion
logic for drivers that are allowed to use them.

> 
> Long


* RE: [Patch v3] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
  2024-02-01 16:16     ` Tyler Retzlaff
@ 2024-02-01 19:41       ` Long Li
  0 siblings, 0 replies; 25+ messages in thread
From: Long Li @ 2024-02-01 19:41 UTC (permalink / raw)
  To: Tyler Retzlaff; +Cc: Ferruh Yigit, Andrew Rybchenko, dev

> > +	/* Free the remaining mbufs that are not posted */
> > +	while (i < count) {
> > +		rte_pktmbuf_free(mbufs[i]);
> > +		i++;
> > +	}
> 
> there is also rte_pktmbuf_free_bulk() that could be used. probably won't make
> any material difference to perf though so just an fyi.

Thank you! Will use rte_pktmbuf_free_bulk().


* [Patch v4] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX mbufs
  2024-02-01  3:45   ` [Patch v3] " longli
  2024-02-01 16:16     ` Tyler Retzlaff
@ 2024-02-02  1:19     ` longli
  2024-02-02 16:24       ` Stephen Hemminger
                         ` (2 more replies)
  1 sibling, 3 replies; 25+ messages in thread
From: longli @ 2024-02-02  1:19 UTC (permalink / raw)
  To: Ferruh Yigit, Andrew Rybchenko; +Cc: dev, Long Li

From: Long Li <longli@microsoft.com>

Instead of allocating mbufs one by one during RX, use
rte_pktmbuf_alloc_bulk() to allocate them in a batch.

With this patch, there are no measurable performance improvements in
benchmarks. However, this patch should save CPU cycles and reduce
potential locking conflicts in real-world applications.

Signed-off-by: Long Li <longli@microsoft.com>
---
Change in v2:
use rte_calloc_socket() in place of rte_calloc()

v3:
add more comments explaining the benefit of doing alloc_bulk.
free mbufs that failed to post

v4:
replace rte_calloc_socket() with a fixed array on the stack

 drivers/net/mana/rx.c | 76 ++++++++++++++++++++++++++++---------------
 1 file changed, 50 insertions(+), 26 deletions(-)

diff --git a/drivers/net/mana/rx.c b/drivers/net/mana/rx.c
index acad5e26cd..4fc0f789d8 100644
--- a/drivers/net/mana/rx.c
+++ b/drivers/net/mana/rx.c
@@ -59,9 +59,8 @@ mana_rq_ring_doorbell(struct mana_rxq *rxq)
 }
 
 static int
-mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
+mana_post_rx_wqe(struct mana_rxq *rxq, struct rte_mbuf *mbuf)
 {
-	struct rte_mbuf *mbuf = NULL;
 	struct gdma_sgl_element sgl[1];
 	struct gdma_work_request request;
 	uint32_t wqe_size_in_bu;
@@ -69,12 +68,6 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
 	int ret;
 	struct mana_mr_cache *mr;
 
-	mbuf = rte_pktmbuf_alloc(rxq->mp);
-	if (!mbuf) {
-		rxq->stats.nombuf++;
-		return -ENOMEM;
-	}
-
 	mr = mana_alloc_pmd_mr(&rxq->mr_btree, priv, mbuf);
 	if (!mr) {
 		DP_LOG(ERR, "failed to register RX MR");
@@ -120,20 +113,33 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
 /*
  * Post work requests for a Rx queue.
  */
+#define MANA_MBUF_BULK 32u
 static int
-mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
+mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq, uint32_t count)
 {
 	int ret;
-	uint32_t i;
+	uint32_t i, batch_count;
+	struct rte_mbuf *mbufs[MANA_MBUF_BULK];
+
+more_mbufs:
+	batch_count = RTE_MIN(count, MANA_MBUF_BULK);
+	ret = rte_pktmbuf_alloc_bulk(rxq->mp, mbufs, batch_count);
+	if (ret) {
+		DP_LOG(ERR, "failed to allocate mbufs for RX");
+		rxq->stats.nombuf += count;
+
+		/* Bail out to ring doorbell for posted packets */
+		goto out;
+	}
 
 #ifdef RTE_ARCH_32
 	rxq->wqe_cnt_to_short_db = 0;
 #endif
-	for (i = 0; i < rxq->num_desc; i++) {
-		ret = mana_alloc_and_post_rx_wqe(rxq);
+	for (i = 0; i < batch_count; i++) {
+		ret = mana_post_rx_wqe(rxq, mbufs[i]);
 		if (ret) {
 			DP_LOG(ERR, "failed to post RX ret = %d", ret);
-			return ret;
+			break;
 		}
 
 #ifdef RTE_ARCH_32
@@ -144,8 +150,15 @@ mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
 #endif
 	}
 
-	mana_rq_ring_doorbell(rxq);
+	/* Free the remaining mbufs that are not posted */
+	rte_pktmbuf_free_bulk(&mbufs[i], batch_count - i);
+
+	count -= batch_count;
+	if (count > 0)
+		goto more_mbufs;
 
+out:
+	mana_rq_ring_doorbell(rxq);
 	return ret;
 }
 
@@ -404,7 +417,9 @@ mana_start_rx_queues(struct rte_eth_dev *dev)
 	}
 
 	for (i = 0; i < priv->num_queues; i++) {
-		ret = mana_alloc_and_post_rx_wqes(dev->data->rx_queues[i]);
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+
+		ret = mana_alloc_and_post_rx_wqes(rxq, rxq->num_desc);
 		if (ret)
 			goto fail;
 	}
@@ -423,7 +438,7 @@ uint16_t
 mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	uint16_t pkt_received = 0;
-	uint16_t wqe_posted = 0;
+	uint16_t wqe_consumed = 0;
 	struct mana_rxq *rxq = dpdk_rxq;
 	struct mana_priv *priv = rxq->priv;
 	struct rte_mbuf *mbuf;
@@ -535,18 +550,23 @@ mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 
 		rxq->gdma_rq.tail += desc->wqe_size_in_bu;
 
-		/* Consume this request and post another request */
-		ret = mana_alloc_and_post_rx_wqe(rxq);
-		if (ret) {
-			DP_LOG(ERR, "failed to post rx wqe ret=%d", ret);
-			break;
-		}
-
-		wqe_posted++;
+		/* Record the number of RX WQEs we need to post to replenish
+		 * the consumed RX requests
+		 */
+		wqe_consumed++;
 		if (pkt_received == pkts_n)
 			break;
 
 #ifdef RTE_ARCH_32
+		/* Always post WQE as soon as it's consumed for short DB */
+		ret = mana_alloc_and_post_rx_wqes(rxq, wqe_consumed);
+		if (ret) {
+			DRV_LOG(ERR, "failed to post %d WQEs, ret %d",
+				wqe_consumed, ret);
+			return pkt_received;
+		}
+		wqe_consumed = 0;
+
 		/* Ring short doorbell if approaching the wqe increment
 		 * limit.
 		 */
@@ -569,8 +589,12 @@ mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		goto repoll;
 	}
 
-	if (wqe_posted)
-		mana_rq_ring_doorbell(rxq);
+	if (wqe_consumed) {
+		ret = mana_alloc_and_post_rx_wqes(rxq, wqe_consumed);
+		if (ret)
+			DRV_LOG(ERR, "failed to post %d WQEs, ret %d",
+				wqe_consumed, ret);
+	}
 
 	return pkt_received;
 }
-- 
2.17.1



* RE: [Patch v2] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
  2024-02-01 10:52           ` Ferruh Yigit
@ 2024-02-02  1:21             ` Long Li
  0 siblings, 0 replies; 25+ messages in thread
From: Long Li @ 2024-02-02  1:21 UTC (permalink / raw)
  To: Ferruh Yigit, stephen; +Cc: Andrew Rybchenko, dev

> I think the default burst size of 32 can be used like below:
> 
> struct rte_mbuf *mbufs[32];
> 
> loop: //use do {} while(); if you prefer
> n = RTE_MIN(32u, count);
> rte_pktmbuf_alloc_bulk(rxq->mp, mbufs, n);
> for (i = 0; i < n; i++)
> 	mana_post_rx_wqe(rxq, mbufs[i]);
> count -= n;
> if (count > 0) goto loop;
> 
> 
> This additional loop doesn't make the code very complex (I think no more
> than the additional alloc() & free()) and it doesn't waste memory.
> I suggest doing a performance measurement with the above change; it may
> increase performance. Afterwards, if you insist on going with the original
> code, we can do it.
> 

I submitted v4 with your suggestions. The code doesn't end up looking very messy. I measured the same performance with and without the patch.

Thanks,

Long


* RE: [Patch v2] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
  2024-02-01 16:33           ` Tyler Retzlaff
@ 2024-02-02  1:22             ` Long Li
  0 siblings, 0 replies; 25+ messages in thread
From: Long Li @ 2024-02-02  1:22 UTC (permalink / raw)
  To: Tyler Retzlaff; +Cc: Ferruh Yigit, stephen, Andrew Rybchenko, dev


> > I think another approach is to use VLA by default, but for Windows use alloc().
> 
> a few thoughts on VLAs you may consider. not to be regarded as a strong
> objection.
> 
> indications are that standard C will gradually phase out VLAs because they're
> generally accepted as having been a bad idea. that said compilers that implement
> them will probably keep them forever.
> 
> VLAs generate a lot of code relative to just using a more permanent allocation.
> may not show up in your performance tests but you also may not want it on your
> hotpath either.
> 
> mana doesn't currently support windows, are there plans to support windows? if
> never then i suppose VLAs can be used since all the toolchains you care about
> have them. though it does raise the bar, cause more work, later refactor, carry
> regression risk should you change your mind and choose to port to windows.
> 
> accepting the use of VLAs anywhere in dpdk prohibits general checkpatches
> and/or compiling with compiler options that detect and flag their inclusion as a
> part of the CI without having to add exclusion logic for drivers that are allowed to
> use them.
> 

I agree we need to keep the code consistent. I submitted v4 using a fixed array.

Thanks,

Long


* Re: [Patch v4] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX mbufs
  2024-02-02  1:19     ` [Patch v4] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX mbufs longli
@ 2024-02-02 16:24       ` Stephen Hemminger
  2024-02-06 18:06       ` Ferruh Yigit
  2024-02-09  0:02       ` [Patch v5] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs longli
  2 siblings, 0 replies; 25+ messages in thread
From: Stephen Hemminger @ 2024-02-02 16:24 UTC (permalink / raw)
  To: longli; +Cc: longli, Ferruh Yigit, Andrew Rybchenko, dev

On Thu,  1 Feb 2024 17:19:21 -0800
longli@linuxonhyperv.com wrote:

> From: Long Li <longli@microsoft.com>
> 
> Instead of allocating mbufs one by one during RX, use
> rte_pktmbuf_alloc_bulk() to allocate them in a batch.
> 
> With this patch, there are no measurable performance improvements in
> benchmarks. However, this patch should save CPU cycles and reduce
> potential locking conflicts in real-world applications.
> 
> Signed-off-by: Long Li <longli@microsoft.com>

Acked-by: Stephen Hemminger <stephen@networkplumber.org>


* Re: [Patch v4] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX mbufs
  2024-02-02  1:19     ` [Patch v4] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX mbufs longli
  2024-02-02 16:24       ` Stephen Hemminger
@ 2024-02-06 18:06       ` Ferruh Yigit
  2024-02-07  4:50         ` Long Li
  2024-02-09  0:02       ` [Patch v5] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs longli
  2 siblings, 1 reply; 25+ messages in thread
From: Ferruh Yigit @ 2024-02-06 18:06 UTC (permalink / raw)
  To: longli, Andrew Rybchenko; +Cc: dev, Stephen Hemminger

On 2/2/2024 1:19 AM, longli@linuxonhyperv.com wrote:
> From: Long Li <longli@microsoft.com>
> 
> Instead of allocating mbufs one by one during RX, use
> rte_pktmbuf_alloc_bulk() to allocate them in a batch.
> 
> With this patch, there are no measurable performance improvements in
> benchmarks. However, this patch should save CPU cycles and reduce
> potential locking conflicts in real-world applications.
> 
> Signed-off-by: Long Li <longli@microsoft.com>

<...>

> @@ -120,20 +113,33 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
>  /*
>   * Post work requests for a Rx queue.
>   */
> +#define MANA_MBUF_BULK 32u
>  static int
> -mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
> +mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq, uint32_t count)
>  {
>  	int ret;
> -	uint32_t i;
> +	uint32_t i, batch_count;
> +	struct rte_mbuf *mbufs[MANA_MBUF_BULK];
> +
> +more_mbufs:
> +	batch_count = RTE_MIN(count, MANA_MBUF_BULK);
> +	ret = rte_pktmbuf_alloc_bulk(rxq->mp, mbufs, batch_count);
> +	if (ret) {
> +		DP_LOG(ERR, "failed to allocate mbufs for RX");
> +		rxq->stats.nombuf += count;
> +
> +		/* Bail out to ring doorbell for posted packets */
> +		goto out;
> +	}
>  
>  #ifdef RTE_ARCH_32
>  	rxq->wqe_cnt_to_short_db = 0;
>  #endif
> -	for (i = 0; i < rxq->num_desc; i++) {
> -		ret = mana_alloc_and_post_rx_wqe(rxq);
> +	for (i = 0; i < batch_count; i++) {
> +		ret = mana_post_rx_wqe(rxq, mbufs[i]);
>  		if (ret) {
>  			DP_LOG(ERR, "failed to post RX ret = %d", ret);
> -			return ret;
> +			break;
>

Hi Long,

Assume that "count > MANA_MBUF_BULK", and in the first iteration of
the loop 'mana_post_rx_wqe()' fails, but in the second iteration it
succeeds; this will cause the function to return success in spite of
the failure in the first iteration.

As some mbufs were posted to the Rx queue, it may be OK to consider the
above case a success, but since fewer than 'count' were posted this may
be misleading. I just want to double-check whether this is done intentionally.
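
A minimal sketch of one way to keep the failure visible, using a
hypothetical 'first_err' local that the function would return at the end:

	int first_err = 0;

	for (i = 0; i < batch_count; i++) {
		ret = mana_post_rx_wqe(rxq, mbufs[i]);
		if (ret) {
			DP_LOG(ERR, "failed to post RX ret = %d", ret);
			/* remember the error so a later successful
			 * batch cannot overwrite it
			 */
			if (!first_err)
				first_err = ret;
			break;
		}
	}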


With the VLA limitation the code becomes more complex, and if there is
no performance benefit to allocating the 'mbufs' array from the stack,
you may prefer to switch back to allocating dynamic memory; up to you.




* RE: [Patch v4] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX mbufs
  2024-02-06 18:06       ` Ferruh Yigit
@ 2024-02-07  4:50         ` Long Li
  0 siblings, 0 replies; 25+ messages in thread
From: Long Li @ 2024-02-07  4:50 UTC (permalink / raw)
  To: Ferruh Yigit, Andrew Rybchenko; +Cc: dev, stephen

> Hi Long,
> 
> Assume that "count > MANA_MBUF_BULK" and 'mana_post_rx_wqe()' fails in the
> first iteration of the loop but succeeds in the second; the function will
> then return success in spite of the failure in the first iteration.
> 
> Since the mbufs from the successful iterations are posted to the Rx queue,
> it may be OK to treat the above case as success, but fewer than 'count'
> WQEs end up posted, so the return value may be misleading.
> I just want to double check whether this is intentional.

You are correct that the first iteration may fail while the second succeeds.

I'm sending v5 to fail the queue when this happens. This is fatal and we should stop the queue.

Thanks,

Long

> 
> 
> With the limitation on VLAs the code becomes more complex, and if there is
> no performance benefit to allocating the 'mbufs' array on the stack, you
> may prefer to switch back to allocating dynamic memory; up to you.
> 


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [Patch v5] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
  2024-02-02  1:19     ` [Patch v4] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX mbufs longli
  2024-02-02 16:24       ` Stephen Hemminger
  2024-02-06 18:06       ` Ferruh Yigit
@ 2024-02-09  0:02       ` longli
  2024-02-09 17:46         ` Ferruh Yigit
  2 siblings, 1 reply; 25+ messages in thread
From: longli @ 2024-02-09  0:02 UTC (permalink / raw)
  To: Ferruh Yigit, Andrew Rybchenko; +Cc: dev, Long Li

From: Long Li <longli@microsoft.com>

Instead of allocating mbufs one by one during RX, use
rte_pktmbuf_alloc_bulk() to allocate them in a batch.

With this patch, there are no measurable performance improvements in
benchmarks. However, this patch should save CPU cycles and reduce
potential lock contention in real-world applications.

Signed-off-by: Long Li <longli@microsoft.com>
---
Change in v2:
use rte_calloc_socket() in place of rte_calloc()

v3:
add more comments explaining the benefit of doing alloc_bulk.
free mbufs that failed to post

v4:
replace rte_calloc_socket() with a fixed array on the stack

v5:
fail the function if posting any WQE to the queue fails

 drivers/net/mana/rx.c | 76 ++++++++++++++++++++++++++++---------------
 1 file changed, 50 insertions(+), 26 deletions(-)

diff --git a/drivers/net/mana/rx.c b/drivers/net/mana/rx.c
index acad5e26cd..16e647baf5 100644
--- a/drivers/net/mana/rx.c
+++ b/drivers/net/mana/rx.c
@@ -59,9 +59,8 @@ mana_rq_ring_doorbell(struct mana_rxq *rxq)
 }
 
 static int
-mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
+mana_post_rx_wqe(struct mana_rxq *rxq, struct rte_mbuf *mbuf)
 {
-	struct rte_mbuf *mbuf = NULL;
 	struct gdma_sgl_element sgl[1];
 	struct gdma_work_request request;
 	uint32_t wqe_size_in_bu;
@@ -69,12 +68,6 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
 	int ret;
 	struct mana_mr_cache *mr;
 
-	mbuf = rte_pktmbuf_alloc(rxq->mp);
-	if (!mbuf) {
-		rxq->stats.nombuf++;
-		return -ENOMEM;
-	}
-
 	mr = mana_alloc_pmd_mr(&rxq->mr_btree, priv, mbuf);
 	if (!mr) {
 		DP_LOG(ERR, "failed to register RX MR");
@@ -120,20 +113,36 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
 /*
  * Post work requests for a Rx queue.
  */
+#define MANA_MBUF_BULK 32u
 static int
-mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
+mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq, uint32_t count)
 {
 	int ret;
-	uint32_t i;
+	uint32_t i, batch_count;
+	struct rte_mbuf *mbufs[MANA_MBUF_BULK];
+
+more_mbufs:
+	batch_count = RTE_MIN(count, MANA_MBUF_BULK);
+	ret = rte_pktmbuf_alloc_bulk(rxq->mp, mbufs, batch_count);
+	if (ret) {
+		DP_LOG(ERR, "failed to allocate mbufs for RX");
+		rxq->stats.nombuf += count;
+
+		/* Bail out to ring doorbell for posted packets */
+		goto out;
+	}
 
 #ifdef RTE_ARCH_32
 	rxq->wqe_cnt_to_short_db = 0;
 #endif
-	for (i = 0; i < rxq->num_desc; i++) {
-		ret = mana_alloc_and_post_rx_wqe(rxq);
+	for (i = 0; i < batch_count; i++) {
+		ret = mana_post_rx_wqe(rxq, mbufs[i]);
 		if (ret) {
 			DP_LOG(ERR, "failed to post RX ret = %d", ret);
-			return ret;
+
+			/* Free the remaining mbufs that are not posted */
+			rte_pktmbuf_free_bulk(&mbufs[i], batch_count - i);
+			goto out;
 		}
 
 #ifdef RTE_ARCH_32
@@ -144,8 +153,12 @@ mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
 #endif
 	}
 
-	mana_rq_ring_doorbell(rxq);
+	count -= batch_count;
+	if (count > 0)
+		goto more_mbufs;
 
+out:
+	mana_rq_ring_doorbell(rxq);
 	return ret;
 }
 
@@ -404,7 +417,9 @@ mana_start_rx_queues(struct rte_eth_dev *dev)
 	}
 
 	for (i = 0; i < priv->num_queues; i++) {
-		ret = mana_alloc_and_post_rx_wqes(dev->data->rx_queues[i]);
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+
+		ret = mana_alloc_and_post_rx_wqes(rxq, rxq->num_desc);
 		if (ret)
 			goto fail;
 	}
@@ -423,7 +438,7 @@ uint16_t
 mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	uint16_t pkt_received = 0;
-	uint16_t wqe_posted = 0;
+	uint16_t wqe_consumed = 0;
 	struct mana_rxq *rxq = dpdk_rxq;
 	struct mana_priv *priv = rxq->priv;
 	struct rte_mbuf *mbuf;
@@ -535,18 +550,23 @@ mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 
 		rxq->gdma_rq.tail += desc->wqe_size_in_bu;
 
-		/* Consume this request and post another request */
-		ret = mana_alloc_and_post_rx_wqe(rxq);
-		if (ret) {
-			DP_LOG(ERR, "failed to post rx wqe ret=%d", ret);
-			break;
-		}
-
-		wqe_posted++;
+		/* Record the number of RX WQEs we need to post to replenish
+		 * consumed RX requests
+		 */
+		wqe_consumed++;
 		if (pkt_received == pkts_n)
 			break;
 
 #ifdef RTE_ARCH_32
+		/* Always post WQE as soon as it's consumed for short DB */
+		ret = mana_alloc_and_post_rx_wqes(rxq, wqe_consumed);
+		if (ret) {
+			DRV_LOG(ERR, "failed to post %d WQEs, ret %d",
+				wqe_consumed, ret);
+			return pkt_received;
+		}
+		wqe_consumed = 0;
+
 		/* Ring short doorbell if approaching the wqe increment
 		 * limit.
 		 */
@@ -569,8 +589,12 @@ mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		goto repoll;
 	}
 
-	if (wqe_posted)
-		mana_rq_ring_doorbell(rxq);
+	if (wqe_consumed) {
+		ret = mana_alloc_and_post_rx_wqes(rxq, wqe_consumed);
+		if (ret)
+			DRV_LOG(ERR, "failed to post %d WQEs, ret %d",
+				wqe_consumed, ret);
+	}
 
 	return pkt_received;
 }
-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread
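
Stripped of driver detail, the v5 refill logic reads roughly as below.
This is a sketch, not the applied code: post_one() is a hypothetical
stand-in for mana_post_rx_wqe(), the doorbell is reduced to a comment,
and the nombuf statistics update is omitted:

	#include <rte_common.h>
	#include <rte_mbuf.h>

	#define MANA_MBUF_BULK 32u

	/* Hypothetical stand-in for mana_post_rx_wqe(). */
	static int
	post_one(struct rte_mbuf *mbuf)
	{
		(void)mbuf;
		return 0;
	}

	static int
	refill_v5_sketch(struct rte_mempool *mp, uint32_t count)
	{
		struct rte_mbuf *mbufs[MANA_MBUF_BULK];
		uint32_t i, batch;
		int ret = 0;

		while (count > 0) {
			batch = RTE_MIN(count, MANA_MBUF_BULK);

			/* All-or-nothing: on failure no mbufs were taken. */
			ret = rte_pktmbuf_alloc_bulk(mp, mbufs, batch);
			if (ret)
				break; /* still ring doorbell for posted WQEs */

			for (i = 0; i < batch; i++) {
				ret = post_one(mbufs[i]);
				if (ret) {
					/* Free the mbufs never posted. */
					rte_pktmbuf_free_bulk(&mbufs[i],
							      batch - i);
					goto out;
				}
			}
			count -= batch;
		}
	out:
		/* mana_rq_ring_doorbell(rxq) goes here in the driver. */
		return ret;
	}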

* Re: [Patch v5] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
  2024-02-09  0:02       ` [Patch v5] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs longli
@ 2024-02-09 17:46         ` Ferruh Yigit
  0 siblings, 0 replies; 25+ messages in thread
From: Ferruh Yigit @ 2024-02-09 17:46 UTC (permalink / raw)
  To: longli, Andrew Rybchenko; +Cc: dev

On 2/9/2024 12:02 AM, longli@linuxonhyperv.com wrote:
> From: Long Li <longli@microsoft.com>
> 
> Instead of allocating mbufs one by one during RX, use
> rte_pktmbuf_alloc_bulk() to allocate them in a batch.
> 
> With this patch, there are no measurable performance improvements in
> benchmarks. However, this patch should improve CPU cycles and reduce
> potential locking conflicts in real-world applications.
> 
> Signed-off-by: Long Li <longli@microsoft.com>
>

Applied to dpdk-next-net/main, thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2024-02-09 17:46 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-25  2:42 [PATCH] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs longli
2024-01-26  0:29 ` Stephen Hemminger
2024-01-26  1:13   ` Long Li
2024-01-30  1:13 ` [Patch v2] " longli
2024-01-30 10:19   ` Ferruh Yigit
2024-01-30 16:43     ` Stephen Hemminger
2024-01-30 18:05       ` Tyler Retzlaff
2024-01-30 22:42       ` Ferruh Yigit
2024-02-01  3:55         ` Long Li
2024-02-01 10:52           ` Ferruh Yigit
2024-02-02  1:21             ` Long Li
2024-02-01 16:33           ` Tyler Retzlaff
2024-02-02  1:22             ` Long Li
2024-01-30 21:30     ` Long Li
2024-01-30 22:34       ` Ferruh Yigit
2024-01-30 22:36         ` Long Li
2024-02-01  3:45   ` [Patch v3] " longli
2024-02-01 16:16     ` Tyler Retzlaff
2024-02-01 19:41       ` Long Li
2024-02-02  1:19     ` [Patch v4] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX mbufs longli
2024-02-02 16:24       ` Stephen Hemminger
2024-02-06 18:06       ` Ferruh Yigit
2024-02-07  4:50         ` Long Li
2024-02-09  0:02       ` [Patch v5] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs longli
2024-02-09 17:46         ` Ferruh Yigit
