* [PATCH] net/i40e: Fast release optimizations
@ 2025-06-24 6:12 Morten Brørup
2025-06-25 10:47 ` Bruce Richardson
2025-06-30 11:40 ` Konstantin Ananyev
0 siblings, 2 replies; 6+ messages in thread
From: Morten Brørup @ 2025-06-24 6:12 UTC (permalink / raw)
To: Bruce Richardson, dev; +Cc: Morten Brørup
When fast releasing mbufs, the mbufs are not accessed, so do not prefetch
them.
This saves a mbuf load operation for each fast released TX mbuf.
When fast release of mbufs is enabled for a TX queue, cache the mbuf
mempool pointer in the TX queue structure.
This saves one mbuf load operation for each burst of fast released TX
mbufs.
The txep->mbuf pointer is not used after the mbuf has been freed, so do
not reset the pointer.
This saves a txep store operation for each TX mbuf freed.
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
drivers/net/intel/common/tx.h | 5 +++
.../i40e/i40e_recycle_mbufs_vec_common.c | 4 +-
drivers/net/intel/i40e/i40e_rxtx.c | 39 ++++++++++---------
3 files changed, 28 insertions(+), 20 deletions(-)
diff --git a/drivers/net/intel/common/tx.h b/drivers/net/intel/common/tx.h
index b0a68bae44..54c9b845f7 100644
--- a/drivers/net/intel/common/tx.h
+++ b/drivers/net/intel/common/tx.h
@@ -62,6 +62,11 @@ struct ci_tx_queue {
uint16_t tx_next_dd;
uint16_t tx_next_rs;
uint64_t offloads;
+ /* Mempool pointer for fast release of mbufs.
+ * NULL if disabled, UINTPTR_MAX if enabled and not yet known.
+ * Initialized at first use.
+ */
+ struct rte_mempool *fast_free_mp;
uint64_t mbuf_errors;
rte_iova_t tx_ring_dma; /* TX ring DMA address */
bool tx_deferred_start; /* don't start this queue in dev start */
diff --git a/drivers/net/intel/i40e/i40e_recycle_mbufs_vec_common.c b/drivers/net/intel/i40e/i40e_recycle_mbufs_vec_common.c
index 2875c578af..a46605cee9 100644
--- a/drivers/net/intel/i40e/i40e_recycle_mbufs_vec_common.c
+++ b/drivers/net/intel/i40e/i40e_recycle_mbufs_vec_common.c
@@ -106,7 +106,9 @@ i40e_recycle_tx_mbufs_reuse_vec(void *tx_queue,
if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
/* Avoid txq contains buffers from unexpected mempool. */
if (unlikely(recycle_rxq_info->mp
- != txep[0].mbuf->pool))
+ != (likely(txq->fast_free_mp != (void *)UINTPTR_MAX) ?
+ txq->fast_free_mp :
+ (txq->fast_free_mp = txep[0].mbuf->pool))))
return 0;
/* Directly put mbufs from Tx to Rx. */
diff --git a/drivers/net/intel/i40e/i40e_rxtx.c b/drivers/net/intel/i40e/i40e_rxtx.c
index c3ff2e05c3..679c1340b8 100644
--- a/drivers/net/intel/i40e/i40e_rxtx.c
+++ b/drivers/net/intel/i40e/i40e_rxtx.c
@@ -1332,7 +1332,7 @@ static __rte_always_inline int
i40e_tx_free_bufs(struct ci_tx_queue *txq)
{
struct ci_tx_entry *txep;
- uint16_t tx_rs_thresh = txq->tx_rs_thresh;
+ const uint16_t tx_rs_thresh = txq->tx_rs_thresh;
uint16_t i = 0, j = 0;
struct rte_mbuf *free[RTE_I40E_TX_MAX_FREE_BUF_SZ];
const uint16_t k = RTE_ALIGN_FLOOR(tx_rs_thresh, RTE_I40E_TX_MAX_FREE_BUF_SZ);
@@ -1345,41 +1345,40 @@ i40e_tx_free_bufs(struct ci_tx_queue *txq)
txep = &txq->sw_ring[txq->tx_next_dd - (tx_rs_thresh - 1)];
- for (i = 0; i < tx_rs_thresh; i++)
- rte_prefetch0((txep + i)->mbuf);
-
if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
+ struct rte_mempool * const fast_free_mp =
+ likely(txq->fast_free_mp != (void *)UINTPTR_MAX) ?
+ txq->fast_free_mp :
+ (txq->fast_free_mp = txep[0].mbuf->pool);
+
if (k) {
for (j = 0; j != k; j += RTE_I40E_TX_MAX_FREE_BUF_SZ) {
- for (i = 0; i < RTE_I40E_TX_MAX_FREE_BUF_SZ; ++i, ++txep) {
+ for (i = 0; i < RTE_I40E_TX_MAX_FREE_BUF_SZ; ++i, ++txep)
free[i] = txep->mbuf;
- txep->mbuf = NULL;
- }
- rte_mempool_put_bulk(free[0]->pool, (void **)free,
+ rte_mempool_put_bulk(fast_free_mp, (void **)free,
RTE_I40E_TX_MAX_FREE_BUF_SZ);
}
}
if (m) {
- for (i = 0; i < m; ++i, ++txep) {
+ for (i = 0; i < m; ++i, ++txep)
free[i] = txep->mbuf;
- txep->mbuf = NULL;
- }
- rte_mempool_put_bulk(free[0]->pool, (void **)free, m);
+ rte_mempool_put_bulk(fast_free_mp, (void **)free, m);
}
} else {
- for (i = 0; i < txq->tx_rs_thresh; ++i, ++txep) {
+ for (i = 0; i < tx_rs_thresh; i++)
+ rte_prefetch0((txep + i)->mbuf);
+
+ for (i = 0; i < tx_rs_thresh; ++i, ++txep)
rte_pktmbuf_free_seg(txep->mbuf);
- txep->mbuf = NULL;
- }
}
- txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
- txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + tx_rs_thresh);
+ txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + tx_rs_thresh);
if (txq->tx_next_dd >= txq->nb_tx_desc)
- txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+ txq->tx_next_dd = (uint16_t)(tx_rs_thresh - 1);
- return txq->tx_rs_thresh;
+ return tx_rs_thresh;
}
/* Populate 4 descriptors with data from 4 mbufs */
@@ -2546,6 +2545,8 @@ i40e_dev_tx_queue_setup(struct rte_eth_dev *dev,
txq->reg_idx = reg_idx;
txq->port_id = dev->data->port_id;
txq->offloads = offloads;
+ txq->fast_free_mp = offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE ?
+ (void *)UINTPTR_MAX : NULL;
txq->i40e_vsi = vsi;
txq->tx_deferred_start = tx_conf->tx_deferred_start;
--
2.43.0
* Re: [PATCH] net/i40e: Fast release optimizations
2025-06-24 6:12 [PATCH] net/i40e: Fast release optimizations Morten Brørup
@ 2025-06-25 10:47 ` Bruce Richardson
2025-06-25 11:15 ` Morten Brørup
2025-06-30 11:40 ` Konstantin Ananyev
1 sibling, 1 reply; 6+ messages in thread
From: Bruce Richardson @ 2025-06-25 10:47 UTC (permalink / raw)
To: Morten Brørup; +Cc: dev
On Tue, Jun 24, 2025 at 06:12:38AM +0000, Morten Brørup wrote:
> When fast releasing mbufs, the mbufs are not accessed, so do not prefetch
> them.
> This saves a mbuf load operation for each fast released TX mbuf.
>
> When fast release of mbufs is enabled for a TX queue, cache the mbuf
> mempool pointer in the TX queue structure.
> This saves one mbuf load operation for each burst of fast released TX
> mbufs.
>
> The txep->mbuf pointer is not used after the mbuf has been freed, so do
> not reset the pointer.
> This saves a txep store operation for each TX mbuf freed.
>
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
> drivers/net/intel/common/tx.h | 5 +++
> .../i40e/i40e_recycle_mbufs_vec_common.c | 4 +-
> drivers/net/intel/i40e/i40e_rxtx.c | 39 ++++++++++---------
> 3 files changed, 28 insertions(+), 20 deletions(-)
>
Thanks, Morten. This optimization probably applies in other places in our
drivers too. I'll look at this patch - and where else it can apply - for
25.11, since we are nearing the end of the 25.07 release cycle.
/Bruce
* RE: [PATCH] net/i40e: Fast release optimizations
2025-06-25 10:47 ` Bruce Richardson
@ 2025-06-25 11:15 ` Morten Brørup
0 siblings, 0 replies; 6+ messages in thread
From: Morten Brørup @ 2025-06-25 11:15 UTC (permalink / raw)
To: Bruce Richardson; +Cc: dev
> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> Sent: Wednesday, 25 June 2025 12.48
>
> On Tue, Jun 24, 2025 at 06:12:38AM +0000, Morten Brørup wrote:
> > When fast releasing mbufs, the mbufs are not accessed, so do not
> prefetch
> > them.
> > This saves a mbuf load operation for each fast released TX mbuf.
> >
> > When fast release of mbufs is enabled for a TX queue, cache the mbuf
> > mempool pointer in the TX queue structure.
> > This saves one mbuf load operation for each burst of fast released TX
> > mbufs.
> >
> > The txep->mbuf pointer is not used after the mbuf has been freed, so
> do
> > not reset the pointer.
> > This saves a txep store operation for each TX mbuf freed.
> >
> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > ---
> > drivers/net/intel/common/tx.h | 5 +++
> > .../i40e/i40e_recycle_mbufs_vec_common.c | 4 +-
> > drivers/net/intel/i40e/i40e_rxtx.c | 39 ++++++++++--------
> -
> > 3 files changed, 28 insertions(+), 20 deletions(-)
> >
> Thanks, Morten. This optimization probably applies other places in our
> drivers too. I'll look at this patch - and where else it can apply - for
> 25.11, since we are nearing the end of the 25.07 release cycle.
Agreed. Also about waiting until 25.11.
Note when reviewing...
I also considered replacing:
if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
with:
if (txq->fast_free_mp != NULL)
However, I assume txq->offloads is hot in the cache, while txq->fast_free_mp is only relevant if using fast free, and thus not as hot.
So I chose not to.
* RE: [PATCH] net/i40e: Fast release optimizations
2025-06-24 6:12 [PATCH] net/i40e: Fast release optimizations Morten Brørup
2025-06-25 10:47 ` Bruce Richardson
@ 2025-06-30 11:40 ` Konstantin Ananyev
2025-06-30 13:45 ` Morten Brørup
1 sibling, 1 reply; 6+ messages in thread
From: Konstantin Ananyev @ 2025-06-30 11:40 UTC (permalink / raw)
To: Morten Brørup, Bruce Richardson, dev
> When fast releasing mbufs, the mbufs are not accessed, so do not prefetch
> them.
> This saves a mbuf load operation for each fast released TX mbuf.
>
> When fast release of mbufs is enabled for a TX queue, cache the mbuf
> mempool pointer in the TX queue structure.
> This saves one mbuf load operation for each burst of fast released TX
> mbufs.
>
> The txep->mbuf pointer is not used after the mbuf has been freed, so do
> not reset the pointer.
> This saves a txep store operation for each TX mbuf freed.
>
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
> drivers/net/intel/common/tx.h | 5 +++
> .../i40e/i40e_recycle_mbufs_vec_common.c | 4 +-
> drivers/net/intel/i40e/i40e_rxtx.c | 39 ++++++++++---------
> 3 files changed, 28 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/net/intel/common/tx.h b/drivers/net/intel/common/tx.h
> index b0a68bae44..54c9b845f7 100644
> --- a/drivers/net/intel/common/tx.h
> +++ b/drivers/net/intel/common/tx.h
> @@ -62,6 +62,11 @@ struct ci_tx_queue {
> uint16_t tx_next_dd;
> uint16_t tx_next_rs;
> uint64_t offloads;
> + /* Mempool pointer for fast release of mbufs.
> + * NULL if disabled, UINTPTR_MAX if enabled and not yet known.
> + * Initialized at first use.
> + */
> + struct rte_mempool *fast_free_mp;
> uint64_t mbuf_errors;
> rte_iova_t tx_ring_dma; /* TX ring DMA address */
> bool tx_deferred_start; /* don't start this queue in dev start */
> diff --git a/drivers/net/intel/i40e/i40e_recycle_mbufs_vec_common.c b/drivers/net/intel/i40e/i40e_recycle_mbufs_vec_common.c
> index 2875c578af..a46605cee9 100644
> --- a/drivers/net/intel/i40e/i40e_recycle_mbufs_vec_common.c
> +++ b/drivers/net/intel/i40e/i40e_recycle_mbufs_vec_common.c
> @@ -106,7 +106,9 @@ i40e_recycle_tx_mbufs_reuse_vec(void *tx_queue,
> if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
> /* Avoid txq contains buffers from unexpected mempool. */
> if (unlikely(recycle_rxq_info->mp
> - != txep[0].mbuf->pool))
> + != (likely(txq->fast_free_mp != (void *)UINTPTR_MAX) ?
> + txq->fast_free_mp :
> + (txq->fast_free_mp = txep[0].mbuf->pool))))
> return 0;
>
> /* Directly put mbufs from Tx to Rx. */
> diff --git a/drivers/net/intel/i40e/i40e_rxtx.c b/drivers/net/intel/i40e/i40e_rxtx.c
> index c3ff2e05c3..679c1340b8 100644
> --- a/drivers/net/intel/i40e/i40e_rxtx.c
> +++ b/drivers/net/intel/i40e/i40e_rxtx.c
> @@ -1332,7 +1332,7 @@ static __rte_always_inline int
> i40e_tx_free_bufs(struct ci_tx_queue *txq)
> {
> struct ci_tx_entry *txep;
> - uint16_t tx_rs_thresh = txq->tx_rs_thresh;
> + const uint16_t tx_rs_thresh = txq->tx_rs_thresh;
> uint16_t i = 0, j = 0;
> struct rte_mbuf *free[RTE_I40E_TX_MAX_FREE_BUF_SZ];
> const uint16_t k = RTE_ALIGN_FLOOR(tx_rs_thresh, RTE_I40E_TX_MAX_FREE_BUF_SZ);
> @@ -1345,41 +1345,40 @@ i40e_tx_free_bufs(struct ci_tx_queue *txq)
>
> txep = &txq->sw_ring[txq->tx_next_dd - (tx_rs_thresh - 1)];
>
> - for (i = 0; i < tx_rs_thresh; i++)
> - rte_prefetch0((txep + i)->mbuf);
> -
> if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
> + struct rte_mempool * const fast_free_mp =
> + likely(txq->fast_free_mp != (void *)UINTPTR_MAX) ?
> + txq->fast_free_mp :
> + (txq->fast_free_mp = txep[0].mbuf->pool);
> +
Nice idea.
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
Just as a suggestion for further improvement:
can we update (& check) txq->fast_free_mp not at tx_free_bufs() time,
but when we fill txep[] and fill txd[] based on mbuf values?
In theory it should allow us to remove the check above.
Also, again in theory, it opens an opportunity (with some extra effort) to use a
similar optimization (rte_mempool_put_bulk()) even for cases when
RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE is not set.
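(For illustration, a rough and untested sketch of that idea - the helper name and its placement are hypothetical, not part of the patch - capturing the pool pointer while the tx_burst() path fills txep[]:)

/* Hypothetical helper, called for each mbuf as txep[] is filled.
 * It remembers the pool of the first mbuf seen on a fast-free queue,
 * so i40e_tx_free_bufs() could use txq->fast_free_mp directly,
 * without the UINTPTR_MAX check above.
 */
static __rte_always_inline void
ci_tx_capture_fast_free_mp(struct ci_tx_queue *txq, const struct rte_mbuf *mb)
{
	if (txq->fast_free_mp == (void *)UINTPTR_MAX)
		txq->fast_free_mp = mb->pool;
}

This would move the branch out of the free path, at the cost of touching txq->fast_free_mp in the hotter transmit path.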
> if (k) {
> for (j = 0; j != k; j += RTE_I40E_TX_MAX_FREE_BUF_SZ) {
> - for (i = 0; i < RTE_I40E_TX_MAX_FREE_BUF_SZ; ++i, ++txep) {
> + for (i = 0; i < RTE_I40E_TX_MAX_FREE_BUF_SZ; ++i, ++txep)
> free[i] = txep->mbuf;
> - txep->mbuf = NULL;
> - }
> - rte_mempool_put_bulk(free[0]->pool, (void **)free,
> + rte_mempool_put_bulk(fast_free_mp, (void **)free,
> RTE_I40E_TX_MAX_FREE_BUF_SZ);
> }
> }
>
> if (m) {
> - for (i = 0; i < m; ++i, ++txep) {
> + for (i = 0; i < m; ++i, ++txep)
> free[i] = txep->mbuf;
> - txep->mbuf = NULL;
> - }
> - rte_mempool_put_bulk(free[0]->pool, (void **)free, m);
> + rte_mempool_put_bulk(fast_free_mp, (void **)free, m);
> }
> } else {
> - for (i = 0; i < txq->tx_rs_thresh; ++i, ++txep) {
> + for (i = 0; i < tx_rs_thresh; i++)
> + rte_prefetch0((txep + i)->mbuf);
> +
> + for (i = 0; i < tx_rs_thresh; ++i, ++txep)
> rte_pktmbuf_free_seg(txep->mbuf);
> - txep->mbuf = NULL;
> - }
> }
>
> - txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
> - txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
> + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + tx_rs_thresh);
> + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + tx_rs_thresh);
> if (txq->tx_next_dd >= txq->nb_tx_desc)
> - txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
> + txq->tx_next_dd = (uint16_t)(tx_rs_thresh - 1);
>
> - return txq->tx_rs_thresh;
> + return tx_rs_thresh;
> }
>
> /* Populate 4 descriptors with data from 4 mbufs */
> @@ -2546,6 +2545,8 @@ i40e_dev_tx_queue_setup(struct rte_eth_dev *dev,
> txq->reg_idx = reg_idx;
> txq->port_id = dev->data->port_id;
> txq->offloads = offloads;
> + txq->fast_free_mp = offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE ?
> + (void *)UINTPTR_MAX : NULL;
> txq->i40e_vsi = vsi;
> txq->tx_deferred_start = tx_conf->tx_deferred_start;
>
> --
> 2.43.0
* RE: [PATCH] net/i40e: Fast release optimizations
2025-06-30 11:40 ` Konstantin Ananyev
@ 2025-06-30 13:45 ` Morten Brørup
2025-06-30 16:06 ` Morten Brørup
0 siblings, 1 reply; 6+ messages in thread
From: Morten Brørup @ 2025-06-30 13:45 UTC (permalink / raw)
To: Konstantin Ananyev, Bruce Richardson, dev
> From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> Sent: Monday, 30 June 2025 13.41
>
> > When fast releasing mbufs, the mbufs are not accessed, so do not prefetch
> > them.
> > This saves a mbuf load operation for each fast released TX mbuf.
> >
> > When fast release of mbufs is enabled for a TX queue, cache the mbuf
> > mempool pointer in the TX queue structure.
> > This saves one mbuf load operation for each burst of fast released TX
> > mbufs.
> >
> > The txep->mbuf pointer is not used after the mbuf has been freed, so do
> > not reset the pointer.
> > This saves a txep store operation for each TX mbuf freed.
> >
> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > ---
> > drivers/net/intel/common/tx.h | 5 +++
> > .../i40e/i40e_recycle_mbufs_vec_common.c | 4 +-
> > drivers/net/intel/i40e/i40e_rxtx.c | 39 ++++++++++---------
> > 3 files changed, 28 insertions(+), 20 deletions(-)
> >
> > diff --git a/drivers/net/intel/common/tx.h b/drivers/net/intel/common/tx.h
> > index b0a68bae44..54c9b845f7 100644
> > --- a/drivers/net/intel/common/tx.h
> > +++ b/drivers/net/intel/common/tx.h
> > @@ -62,6 +62,11 @@ struct ci_tx_queue {
> > uint16_t tx_next_dd;
> > uint16_t tx_next_rs;
> > uint64_t offloads;
> > + /* Mempool pointer for fast release of mbufs.
> > + * NULL if disabled, UINTPTR_MAX if enabled and not yet known.
> > + * Initialized at first use.
> > + */
> > + struct rte_mempool *fast_free_mp;
> > uint64_t mbuf_errors;
> > rte_iova_t tx_ring_dma; /* TX ring DMA address */
> > bool tx_deferred_start; /* don't start this queue in dev start */
> > diff --git a/drivers/net/intel/i40e/i40e_recycle_mbufs_vec_common.c
> b/drivers/net/intel/i40e/i40e_recycle_mbufs_vec_common.c
> > index 2875c578af..a46605cee9 100644
> > --- a/drivers/net/intel/i40e/i40e_recycle_mbufs_vec_common.c
> > +++ b/drivers/net/intel/i40e/i40e_recycle_mbufs_vec_common.c
> > @@ -106,7 +106,9 @@ i40e_recycle_tx_mbufs_reuse_vec(void *tx_queue,
> > if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
> > /* Avoid txq contains buffers from unexpected mempool. */
> > if (unlikely(recycle_rxq_info->mp
> > - != txep[0].mbuf->pool))
> > + != (likely(txq->fast_free_mp != (void *)UINTPTR_MAX)
> ?
> > + txq->fast_free_mp :
> > + (txq->fast_free_mp = txep[0].mbuf->pool))))
> > return 0;
> >
> > /* Directly put mbufs from Tx to Rx. */
> > diff --git a/drivers/net/intel/i40e/i40e_rxtx.c
> b/drivers/net/intel/i40e/i40e_rxtx.c
> > index c3ff2e05c3..679c1340b8 100644
> > --- a/drivers/net/intel/i40e/i40e_rxtx.c
> > +++ b/drivers/net/intel/i40e/i40e_rxtx.c
> > @@ -1332,7 +1332,7 @@ static __rte_always_inline int
> > i40e_tx_free_bufs(struct ci_tx_queue *txq)
> > {
> > struct ci_tx_entry *txep;
> > - uint16_t tx_rs_thresh = txq->tx_rs_thresh;
> > + const uint16_t tx_rs_thresh = txq->tx_rs_thresh;
> > uint16_t i = 0, j = 0;
> > struct rte_mbuf *free[RTE_I40E_TX_MAX_FREE_BUF_SZ];
> > const uint16_t k = RTE_ALIGN_FLOOR(tx_rs_thresh,
> RTE_I40E_TX_MAX_FREE_BUF_SZ);
> > @@ -1345,41 +1345,40 @@ i40e_tx_free_bufs(struct ci_tx_queue *txq)
> >
> > txep = &txq->sw_ring[txq->tx_next_dd - (tx_rs_thresh - 1)];
> >
> > - for (i = 0; i < tx_rs_thresh; i++)
> > - rte_prefetch0((txep + i)->mbuf);
> > -
> > if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
> > + struct rte_mempool * const fast_free_mp =
> > + likely(txq->fast_free_mp != (void *)UINTPTR_MAX) ?
> > + txq->fast_free_mp :
> > + (txq->fast_free_mp = txep[0].mbuf->pool);
> > +
>
> Nice idea.
> Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
>
> Just as a suggestion for further improvement:
> can we update (& check) txq->fast_free_mp not at tx_free_bufs() time,
> but when we fill txep[] and fill txd[] based on mbuf values?
> In theory it should allow us to remove the check above.
> Also, again in theory, it opens an opportunity (with some extra effort) to use
> a similar optimization (rte_mempool_put_bulk()) even for cases when
> RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE is not set.
Checking the TX mbufs to determine if rte_mempool_put_bulk() can be used even when the RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE offload flag is not set...
That would require that the tx_burst() function checks all the TX mbufs for the RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE requirements to set the txq->fast_free_mp pointer.
And if the requirements are not met, it must never set the txq->fast_free_mp pointer again. Otherwise, some previously transmitted mbufs, waiting to be freed, might not meet the RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE requirements.
Relying on the explicit RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE offload flag (here or elsewhere) only requires one check (of the offload flag and/or the mempool pointer (whichever is hotter in cache)) per burst of packets.
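(A minimal, untested sketch of what such a per-mbuf check could look like - the function name and the exact requirement tests are illustrative assumptions, not proposed code:)

/* Hypothetical per-mbuf check in the tx_burst() path.
 * Once any mbuf fails the fast-free requirements, fast free is
 * latched off (NULL) for the queue, because previously transmitted
 * mbufs still waiting to be freed might not meet the requirements.
 */
static __rte_always_inline void
ci_tx_update_fast_free_mp(struct ci_tx_queue *txq, const struct rte_mbuf *mb)
{
	if (txq->fast_free_mp == NULL)
		return; /* Already latched off for this queue. */

	if (rte_mbuf_refcnt_read(mb) != 1 || mb->next != NULL ||
			RTE_MBUF_CLONED(mb) || RTE_MBUF_HAS_EXTBUF(mb))
		txq->fast_free_mp = NULL; /* Requirements not met. */
	else if (txq->fast_free_mp == (void *)UINTPTR_MAX)
		txq->fast_free_mp = mb->pool; /* First mbuf: remember its pool. */
	else if (txq->fast_free_mp != mb->pool)
		txq->fast_free_mp = NULL; /* Mixed mempools: latch off. */
}

Compared to the single flag check per burst, this adds work for every transmitted mbuf, which is the trade-off described above.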
PS: The drivers really should be using the new rte_mbuf_raw_free_bulk() instead of rte_mempool_put_bulk(), so the freed mbufs are sanity checked in RTE_LIBRTE_MBUF_DEBUG mode. But such changes belong in another patch series.
>
>
> > if (k) {
> > for (j = 0; j != k; j += RTE_I40E_TX_MAX_FREE_BUF_SZ) {
> > - for (i = 0; i < RTE_I40E_TX_MAX_FREE_BUF_SZ; ++i,
> ++txep) {
> > + for (i = 0; i < RTE_I40E_TX_MAX_FREE_BUF_SZ; ++i,
> ++txep)
> > free[i] = txep->mbuf;
> > - txep->mbuf = NULL;
> > - }
> > - rte_mempool_put_bulk(free[0]->pool, (void **)free,
> > + rte_mempool_put_bulk(fast_free_mp, (void **)free,
> > RTE_I40E_TX_MAX_FREE_BUF_SZ);
> > }
> > }
> >
> > if (m) {
> > - for (i = 0; i < m; ++i, ++txep) {
> > + for (i = 0; i < m; ++i, ++txep)
> > free[i] = txep->mbuf;
> > - txep->mbuf = NULL;
> > - }
> > - rte_mempool_put_bulk(free[0]->pool, (void **)free, m);
> > + rte_mempool_put_bulk(fast_free_mp, (void **)free, m);
> > }
> > } else {
> > - for (i = 0; i < txq->tx_rs_thresh; ++i, ++txep) {
> > + for (i = 0; i < tx_rs_thresh; i++)
> > + rte_prefetch0((txep + i)->mbuf);
> > +
> > + for (i = 0; i < tx_rs_thresh; ++i, ++txep)
> > rte_pktmbuf_free_seg(txep->mbuf);
> > - txep->mbuf = NULL;
> > - }
> > }
> >
> > - txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
> > - txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
> > + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + tx_rs_thresh);
> > + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + tx_rs_thresh);
> > if (txq->tx_next_dd >= txq->nb_tx_desc)
> > - txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
> > + txq->tx_next_dd = (uint16_t)(tx_rs_thresh - 1);
> >
> > - return txq->tx_rs_thresh;
> > + return tx_rs_thresh;
> > }
> >
> > /* Populate 4 descriptors with data from 4 mbufs */
> > @@ -2546,6 +2545,8 @@ i40e_dev_tx_queue_setup(struct rte_eth_dev *dev,
> > txq->reg_idx = reg_idx;
> > txq->port_id = dev->data->port_id;
> > txq->offloads = offloads;
> > + txq->fast_free_mp = offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE ?
> > + (void *)UINTPTR_MAX : NULL;
> > txq->i40e_vsi = vsi;
> > txq->tx_deferred_start = tx_conf->tx_deferred_start;
> >
> > --
> > 2.43.0
* RE: [PATCH] net/i40e: Fast release optimizations
2025-06-30 13:45 ` Morten Brørup
@ 2025-06-30 16:06 ` Morten Brørup
0 siblings, 0 replies; 6+ messages in thread
From: Morten Brørup @ 2025-06-30 16:06 UTC (permalink / raw)
To: Konstantin Ananyev, Bruce Richardson, dev
> From: Morten Brørup [mailto:mb@smartsharesystems.com]
> Sent: Monday, 30 June 2025 15.46
>
> > From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> > Sent: Monday, 30 June 2025 13.41
> >
> > > When fast releasing mbufs, the mbufs are not accessed, so do not
> prefetch
> > > them.
> > > This saves a mbuf load operation for each fast released TX mbuf.
> > >
> > > When fast release of mbufs is enabled for a TX queue, cache the mbuf
> > > mempool pointer in the TX queue structure.
> > > This saves one mbuf load operation for each burst of fast released
> TX
> > > mbufs.
> > >
> > > The txep->mbuf pointer is not used after the mbuf has been freed, so
> do
> > > not reset the pointer.
> > > This saves a txep store operation for each TX mbuf freed.
> > >
> > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > > ---
> > > drivers/net/intel/common/tx.h | 5 +++
> > > .../i40e/i40e_recycle_mbufs_vec_common.c | 4 +-
> > > drivers/net/intel/i40e/i40e_rxtx.c | 39 ++++++++++------
> ---
> > > 3 files changed, 28 insertions(+), 20 deletions(-)
> > >
> > > diff --git a/drivers/net/intel/common/tx.h
> b/drivers/net/intel/common/tx.h
> > > index b0a68bae44..54c9b845f7 100644
> > > --- a/drivers/net/intel/common/tx.h
> > > +++ b/drivers/net/intel/common/tx.h
> > > @@ -62,6 +62,11 @@ struct ci_tx_queue {
> > > uint16_t tx_next_dd;
> > > uint16_t tx_next_rs;
> > > uint64_t offloads;
> > > + /* Mempool pointer for fast release of mbufs.
> > > + * NULL if disabled, UINTPTR_MAX if enabled and not yet known.
> > > + * Initialized at first use.
> > > + */
> > > + struct rte_mempool *fast_free_mp;
> > > uint64_t mbuf_errors;
> > > rte_iova_t tx_ring_dma; /* TX ring DMA address */
> > > bool tx_deferred_start; /* don't start this queue in dev start */
> > > diff --git a/drivers/net/intel/i40e/i40e_recycle_mbufs_vec_common.c
> > b/drivers/net/intel/i40e/i40e_recycle_mbufs_vec_common.c
> > > index 2875c578af..a46605cee9 100644
> > > --- a/drivers/net/intel/i40e/i40e_recycle_mbufs_vec_common.c
> > > +++ b/drivers/net/intel/i40e/i40e_recycle_mbufs_vec_common.c
> > > @@ -106,7 +106,9 @@ i40e_recycle_tx_mbufs_reuse_vec(void *tx_queue,
> > > if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
> > > /* Avoid txq contains buffers from unexpected mempool. */
> > > if (unlikely(recycle_rxq_info->mp
> > > - != txep[0].mbuf->pool))
> > > + != (likely(txq->fast_free_mp != (void
> *)UINTPTR_MAX)
> > ?
> > > + txq->fast_free_mp :
> > > + (txq->fast_free_mp = txep[0].mbuf->pool))))
> > > return 0;
> > >
> > > /* Directly put mbufs from Tx to Rx. */
> > > diff --git a/drivers/net/intel/i40e/i40e_rxtx.c
> > b/drivers/net/intel/i40e/i40e_rxtx.c
> > > index c3ff2e05c3..679c1340b8 100644
> > > --- a/drivers/net/intel/i40e/i40e_rxtx.c
> > > +++ b/drivers/net/intel/i40e/i40e_rxtx.c
> > > @@ -1332,7 +1332,7 @@ static __rte_always_inline int
> > > i40e_tx_free_bufs(struct ci_tx_queue *txq)
> > > {
> > > struct ci_tx_entry *txep;
> > > - uint16_t tx_rs_thresh = txq->tx_rs_thresh;
> > > + const uint16_t tx_rs_thresh = txq->tx_rs_thresh;
> > > uint16_t i = 0, j = 0;
> > > struct rte_mbuf *free[RTE_I40E_TX_MAX_FREE_BUF_SZ];
> > > const uint16_t k = RTE_ALIGN_FLOOR(tx_rs_thresh,
> > RTE_I40E_TX_MAX_FREE_BUF_SZ);
> > > @@ -1345,41 +1345,40 @@ i40e_tx_free_bufs(struct ci_tx_queue *txq)
> > >
> > > txep = &txq->sw_ring[txq->tx_next_dd - (tx_rs_thresh - 1)];
> > >
> > > - for (i = 0; i < tx_rs_thresh; i++)
> > > - rte_prefetch0((txep + i)->mbuf);
> > > -
> > > if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
> > > + struct rte_mempool * const fast_free_mp =
> > > + likely(txq->fast_free_mp != (void
> *)UINTPTR_MAX) ?
> > > + txq->fast_free_mp :
> > > + (txq->fast_free_mp = txep[0].mbuf->pool);
> > > +
Speaking about optimizations, these once-in-a-lifetime initializations of txq->fast_free_mp are the perfect candidates for a new superlikely() macro in <rte_branch_prediction.h>, which BTW is not only about branch prediction, but also about letting the compiler optimize the likely code path, e.g. by moving unlikely code away from it, thereby reducing the instruction cache pressure:
#define superlikely(x) __builtin_expect_with_probability(!!(x), 1, 1.0)
#define superunlikely(x) __builtin_expect_with_probability(!!(x), 0, 1.0)
if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
+ struct rte_mempool * const fast_free_mp =
+ superlikely(txq->fast_free_mp != (void *)UINTPTR_MAX) ?
+ txq->fast_free_mp :
+ (txq->fast_free_mp = txep[0].mbuf->pool);
> >
> > Nice idea.
> > Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
> >
> > Just as a suggestion for further improvement:
> > can we update (& check) txq->fast_free_mp not at tx_free_bufs() time,
> > but when we fill txep[] and fill txd[] based on mbuf values?
> > In theory it should allow us to remove the check above.
> > Also, again in theory, it opens an opportunity (with some extra effort) to use
> > a similar optimization (rte_mempool_put_bulk()) even for cases when
> > RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE is not set.
>
> Checking the TX mbufs to determine if rte_mempool_put_bulk() can be used
> even when the RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE offload flag is not
> set...
>
> That would require that the tx_burst() function checks all the TX mbufs
> for the RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE requirements to set the txq-
> >fast_free_mp pointer.
> And if the requirements are not met, it must never set the txq-
> >fast_free_mp pointer again. Otherwise, some previously transmitted
> mbufs, waiting to be freed, might not meet the
> RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE requirements.
>
> Relying on the explicit RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE offload flag
> (here or elsewhere) only requires one check (of the offload flag and/or
> the mempool pointer (whichever is hotter in cache)) per burst of
> packets.
>
> PS: The drivers really should be using the new rte_mbuf_raw_free_bulk()
> instead of rte_mempool_put_bulk(), so the freed mbufs are sanity checked
> in RTE_LIBRTE_MBUF_DEBUG mode. But such changes belong in another patch
> series.
>
> >
> >
> > > if (k) {
> > > for (j = 0; j != k; j += RTE_I40E_TX_MAX_FREE_BUF_SZ)
> {
> > > - for (i = 0; i < RTE_I40E_TX_MAX_FREE_BUF_SZ;
> ++i,
> > ++txep) {
> > > + for (i = 0; i < RTE_I40E_TX_MAX_FREE_BUF_SZ;
> ++i,
> > ++txep)
> > > free[i] = txep->mbuf;
> > > - txep->mbuf = NULL;
> > > - }
> > > - rte_mempool_put_bulk(free[0]->pool, (void
> **)free,
> > > + rte_mempool_put_bulk(fast_free_mp, (void
> **)free,
> > > RTE_I40E_TX_MAX_FREE_BUF_SZ);
> > > }
> > > }
> > >
> > > if (m) {
> > > - for (i = 0; i < m; ++i, ++txep) {
> > > + for (i = 0; i < m; ++i, ++txep)
> > > free[i] = txep->mbuf;
> > > - txep->mbuf = NULL;
> > > - }
> > > - rte_mempool_put_bulk(free[0]->pool, (void **)free,
> m);
> > > + rte_mempool_put_bulk(fast_free_mp, (void **)free, m);
> > > }
> > > } else {
> > > - for (i = 0; i < txq->tx_rs_thresh; ++i, ++txep) {
> > > + for (i = 0; i < tx_rs_thresh; i++)
> > > + rte_prefetch0((txep + i)->mbuf);
> > > +
> > > + for (i = 0; i < tx_rs_thresh; ++i, ++txep)
> > > rte_pktmbuf_free_seg(txep->mbuf);
> > > - txep->mbuf = NULL;
> > > - }
> > > }
> > >
> > > - txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
> > > - txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
> > > + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + tx_rs_thresh);
> > > + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + tx_rs_thresh);
> > > if (txq->tx_next_dd >= txq->nb_tx_desc)
> > > - txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
> > > + txq->tx_next_dd = (uint16_t)(tx_rs_thresh - 1);
> > >
> > > - return txq->tx_rs_thresh;
> > > + return tx_rs_thresh;
> > > }
> > >
> > > /* Populate 4 descriptors with data from 4 mbufs */
> > > @@ -2546,6 +2545,8 @@ i40e_dev_tx_queue_setup(struct rte_eth_dev
> *dev,
> > > txq->reg_idx = reg_idx;
> > > txq->port_id = dev->data->port_id;
> > > txq->offloads = offloads;
> > > + txq->fast_free_mp = offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE ?
> > > + (void *)UINTPTR_MAX : NULL;
> > > txq->i40e_vsi = vsi;
> > > txq->tx_deferred_start = tx_conf->tx_deferred_start;
> > >
> > > --
> > > 2.43.0
Thread overview (6 messages):
2025-06-24 6:12 [PATCH] net/i40e: Fast release optimizations Morten Brørup
2025-06-25 10:47 ` Bruce Richardson
2025-06-25 11:15 ` Morten Brørup
2025-06-30 11:40 ` Konstantin Ananyev
2025-06-30 13:45 ` Morten Brørup
2025-06-30 16:06 ` Morten Brørup