DPDK patches and discussions
 help / color / mirror / Atom feed
* [PATCH] mbuf: optimize segment prefree
@ 2025-08-27 21:35 Morten Brørup
  2025-08-27 23:17 ` Stephen Hemminger
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Morten Brørup @ 2025-08-27 21:35 UTC (permalink / raw)
  To: dev; +Cc: Morten Brørup

Refactored rte_pktmbuf_prefree_seg() for both performance and readability.

With the optimized RTE_MBUF_DIRECT() macro, the common likely code path
now fits within one instruction cache line on x86-64 when built with GCC.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/mbuf/rte_mbuf.h      | 52 ++++++++++++++++------------------------
 lib/mbuf/rte_mbuf_core.h |  8 +++++++
 2 files changed, 28 insertions(+), 32 deletions(-)

diff --git a/lib/mbuf/rte_mbuf.h b/lib/mbuf/rte_mbuf.h
index 06ab7502a5..f4a348597f 100644
--- a/lib/mbuf/rte_mbuf.h
+++ b/lib/mbuf/rte_mbuf.h
@@ -31,6 +31,7 @@
  * http://www.kohala.com/start/tcpipiv2.html
  */
 
+#include <stdbool.h>
 #include <stdint.h>
 
 #include <rte_common.h>
@@ -1423,44 +1424,31 @@ static inline int __rte_pktmbuf_pinned_extbuf_decref(struct rte_mbuf *m)
 static __rte_always_inline struct rte_mbuf *
 rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
 {
-	__rte_mbuf_sanity_check(m, 0);
-
-	if (likely(rte_mbuf_refcnt_read(m) == 1)) {
-
-		if (!RTE_MBUF_DIRECT(m)) {
-			rte_pktmbuf_detach(m);
-			if (RTE_MBUF_HAS_EXTBUF(m) &&
-			    RTE_MBUF_HAS_PINNED_EXTBUF(m) &&
-			    __rte_pktmbuf_pinned_extbuf_decref(m))
-				return NULL;
-		}
-
-		if (m->next != NULL)
-			m->next = NULL;
-		if (m->nb_segs != 1)
-			m->nb_segs = 1;
+	bool refcnt_not_one;
 
-		return m;
+	__rte_mbuf_sanity_check(m, 0);
 
-	} else if (__rte_mbuf_refcnt_update(m, -1) == 0) {
+	refcnt_not_one = unlikely(rte_mbuf_refcnt_read(m) != 1);
+	if (refcnt_not_one &&
+		__rte_mbuf_refcnt_update(m, -1) != 0)
+		return NULL;
 
-		if (!RTE_MBUF_DIRECT(m)) {
-			rte_pktmbuf_detach(m);
-			if (RTE_MBUF_HAS_EXTBUF(m) &&
-			    RTE_MBUF_HAS_PINNED_EXTBUF(m) &&
-			    __rte_pktmbuf_pinned_extbuf_decref(m))
-				return NULL;
-		}
+	if (unlikely(!RTE_MBUF_DIRECT(m))) {
+		rte_pktmbuf_detach(m);
+		if (RTE_MBUF_HAS_EXTBUF(m) &&
+			RTE_MBUF_HAS_PINNED_EXTBUF(m) &&
+			__rte_pktmbuf_pinned_extbuf_decref(m))
+			return NULL;
+	}
 
-		if (m->next != NULL)
-			m->next = NULL;
-		if (m->nb_segs != 1)
-			m->nb_segs = 1;
+	if (refcnt_not_one)
 		rte_mbuf_refcnt_set(m, 1);
+	if (m->nb_segs != 1)
+		m->nb_segs = 1;
+	if (m->next != NULL)
+		m->next = NULL;
 
-		return m;
-	}
-	return NULL;
+	return m;
 }
 
 /**
diff --git a/lib/mbuf/rte_mbuf_core.h b/lib/mbuf/rte_mbuf_core.h
index a0df265b5d..a5242274d7 100644
--- a/lib/mbuf/rte_mbuf_core.h
+++ b/lib/mbuf/rte_mbuf_core.h
@@ -715,6 +715,14 @@ struct rte_mbuf_ext_shared_info {
 #define RTE_MBUF_DIRECT(mb) \
 	(!((mb)->ol_flags & (RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL)))
 
+/* GCC only optimizes single-bit MSB tests this way, so do it by hand with multi-bit. */
+#if defined(RTE_TOOLCHAIN_GCC) && defined(RTE_ARCH_X86)
+#undef RTE_MBUF_DIRECT
+#define RTE_MBUF_DIRECT(mb) \
+	(!(((const uint8_t *)(mb))[offsetof(struct rte_mbuf, ol_flags) + 7] & \
+	(uint8_t)((RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL) >> 56)))
+#endif
+
 /** Uninitialized or unspecified port. */
 #define RTE_MBUF_PORT_INVALID UINT16_MAX
 /** For backwards compatibility. */
-- 
2.43.0


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] mbuf: optimize segment prefree
  2025-08-27 21:35 [PATCH] mbuf: optimize segment prefree Morten Brørup
@ 2025-08-27 23:17 ` Stephen Hemminger
  2025-10-06 17:46   ` Wathsala Vithanage
  2025-10-06 14:49 ` Morten Brørup
  2025-10-20 12:02 ` [PATCH v2] " Morten Brørup
  2 siblings, 1 reply; 7+ messages in thread
From: Stephen Hemminger @ 2025-08-27 23:17 UTC (permalink / raw)
  To: Morten Brørup; +Cc: dev

On Wed, 27 Aug 2025 21:35:34 +0000
Morten Brørup <mb@smartsharesystems.com> wrote:

>  
> +/* GCC only optimizes single-bit MSB tests this way, so do it by hand with multi-bit. */
> +#if defined(RTE_TOOLCHAIN_GCC) && defined(RTE_ARCH_X86)
> +#undef RTE_MBUF_DIRECT
> +#define RTE_MBUF_DIRECT(mb) \
> +	(!(((const uint8_t *)(mb))[offsetof(struct rte_mbuf, ol_flags) + 7] & \
> +	(uint8_t)((RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL) >> 56)))
> +#endif

Complex enough expression that I would prefer this be an inline function
with some more temporary variables and more comments.
Like the magic 7 for mask??

^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [PATCH] mbuf: optimize segment prefree
  2025-08-27 21:35 [PATCH] mbuf: optimize segment prefree Morten Brørup
  2025-08-27 23:17 ` Stephen Hemminger
@ 2025-10-06 14:49 ` Morten Brørup
  2025-10-20 12:02 ` [PATCH v2] " Morten Brørup
  2 siblings, 0 replies; 7+ messages in thread
From: Morten Brørup @ 2025-10-06 14:49 UTC (permalink / raw)
  To: dev

PING for review.

So far, I have only received feedback from Stephen, who noted that the GCC-optimized RTE_MBUF_DIRECT() macro is complex and could be an inline function instead.

Venlig hilsen / Kind regards,
-Morten Brørup


> -----Original Message-----
> From: Morten Brørup [mailto:mb@smartsharesystems.com]
> Sent: Wednesday, 27 August 2025 23.36
> To: dev@dpdk.org
> Cc: Morten Brørup
> Subject: [PATCH] mbuf: optimize segment prefree
> 
> Refactored rte_pktmbuf_prefree_seg() for both performance and
> readability.
> 
> With the optimized RTE_MBUF_DIRECT() macro, the common likely code path
> now fits within one instruction cache line on x86-64 when built with
> GCC.
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
>  lib/mbuf/rte_mbuf.h      | 52 ++++++++++++++++------------------------
>  lib/mbuf/rte_mbuf_core.h |  8 +++++++
>  2 files changed, 28 insertions(+), 32 deletions(-)
> 
> diff --git a/lib/mbuf/rte_mbuf.h b/lib/mbuf/rte_mbuf.h
> index 06ab7502a5..f4a348597f 100644
> --- a/lib/mbuf/rte_mbuf.h
> +++ b/lib/mbuf/rte_mbuf.h
> @@ -31,6 +31,7 @@
>   * http://www.kohala.com/start/tcpipiv2.html
>   */
> 
> +#include <stdbool.h>
>  #include <stdint.h>
> 
>  #include <rte_common.h>
> @@ -1423,44 +1424,31 @@ static inline int
> __rte_pktmbuf_pinned_extbuf_decref(struct rte_mbuf *m)
>  static __rte_always_inline struct rte_mbuf *
>  rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
>  {
> -	__rte_mbuf_sanity_check(m, 0);
> -
> -	if (likely(rte_mbuf_refcnt_read(m) == 1)) {
> -
> -		if (!RTE_MBUF_DIRECT(m)) {
> -			rte_pktmbuf_detach(m);
> -			if (RTE_MBUF_HAS_EXTBUF(m) &&
> -			    RTE_MBUF_HAS_PINNED_EXTBUF(m) &&
> -			    __rte_pktmbuf_pinned_extbuf_decref(m))
> -				return NULL;
> -		}
> -
> -		if (m->next != NULL)
> -			m->next = NULL;
> -		if (m->nb_segs != 1)
> -			m->nb_segs = 1;
> +	bool refcnt_not_one;
> 
> -		return m;
> +	__rte_mbuf_sanity_check(m, 0);
> 
> -	} else if (__rte_mbuf_refcnt_update(m, -1) == 0) {
> +	refcnt_not_one = unlikely(rte_mbuf_refcnt_read(m) != 1);
> +	if (refcnt_not_one &&
> +		__rte_mbuf_refcnt_update(m, -1) != 0)
> +		return NULL;
> 
> -		if (!RTE_MBUF_DIRECT(m)) {
> -			rte_pktmbuf_detach(m);
> -			if (RTE_MBUF_HAS_EXTBUF(m) &&
> -			    RTE_MBUF_HAS_PINNED_EXTBUF(m) &&
> -			    __rte_pktmbuf_pinned_extbuf_decref(m))
> -				return NULL;
> -		}
> +	if (unlikely(!RTE_MBUF_DIRECT(m))) {
> +		rte_pktmbuf_detach(m);
> +		if (RTE_MBUF_HAS_EXTBUF(m) &&
> +			RTE_MBUF_HAS_PINNED_EXTBUF(m) &&
> +			__rte_pktmbuf_pinned_extbuf_decref(m))
> +			return NULL;
> +	}
> 
> -		if (m->next != NULL)
> -			m->next = NULL;
> -		if (m->nb_segs != 1)
> -			m->nb_segs = 1;
> +	if (refcnt_not_one)
>  		rte_mbuf_refcnt_set(m, 1);
> +	if (m->nb_segs != 1)
> +		m->nb_segs = 1;
> +	if (m->next != NULL)
> +		m->next = NULL;
> 
> -		return m;
> -	}
> -	return NULL;
> +	return m;
>  }
> 
>  /**
> diff --git a/lib/mbuf/rte_mbuf_core.h b/lib/mbuf/rte_mbuf_core.h
> index a0df265b5d..a5242274d7 100644
> --- a/lib/mbuf/rte_mbuf_core.h
> +++ b/lib/mbuf/rte_mbuf_core.h
> @@ -715,6 +715,14 @@ struct rte_mbuf_ext_shared_info {
>  #define RTE_MBUF_DIRECT(mb) \
>  	(!((mb)->ol_flags & (RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL)))
> 
> +/* GCC only optimizes single-bit MSB tests this way, so do it by hand
> with multi-bit. */
> +#if defined(RTE_TOOLCHAIN_GCC) && defined(RTE_ARCH_X86)
> +#undef RTE_MBUF_DIRECT
> +#define RTE_MBUF_DIRECT(mb) \
> +	(!(((const uint8_t *)(mb))[offsetof(struct rte_mbuf, ol_flags) +
> 7] & \
> +	(uint8_t)((RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL) >> 56)))
> +#endif
> +
>  /** Uninitialized or unspecified port. */
>  #define RTE_MBUF_PORT_INVALID UINT16_MAX
>  /** For backwards compatibility. */
> --
> 2.43.0


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] mbuf: optimize segment prefree
  2025-08-27 23:17 ` Stephen Hemminger
@ 2025-10-06 17:46   ` Wathsala Vithanage
  2025-10-06 18:26     ` Morten Brørup
  0 siblings, 1 reply; 7+ messages in thread
From: Wathsala Vithanage @ 2025-10-06 17:46 UTC (permalink / raw)
  To: dev


On 8/27/25 18:17, Stephen Hemminger wrote:
> On Wed, 27 Aug 2025 21:35:34 +0000
> Morten Brørup <mb@smartsharesystems.com> wrote:
>
>>   
>> +/* GCC only optimizes single-bit MSB tests this way, so do it by hand with multi-bit. */
>> +#if defined(RTE_TOOLCHAIN_GCC) && defined(RTE_ARCH_X86)
>> +#undef RTE_MBUF_DIRECT
>> +#define RTE_MBUF_DIRECT(mb) \
>> +	(!(((const uint8_t *)(mb))[offsetof(struct rte_mbuf, ol_flags) + 7] & \
>> +	(uint8_t)((RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL) >> 56)))
>> +#endif
> Complex enough expression that I would prefer this be an inline function
> with some more temporary variables and more comments.
> Like the magic 7 for mask??

+1

--wathsala


^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [PATCH] mbuf: optimize segment prefree
  2025-10-06 17:46   ` Wathsala Vithanage
@ 2025-10-06 18:26     ` Morten Brørup
  0 siblings, 0 replies; 7+ messages in thread
From: Morten Brørup @ 2025-10-06 18:26 UTC (permalink / raw)
  To: Wathsala Vithanage, stephen; +Cc: dev

> From: Wathsala Vithanage [mailto:wathsala.vithanage@arm.com]
> Sent: Monday, 6 October 2025 19.46
> 
> On 8/27/25 18:17, Stephen Hemminger wrote:
> > On Wed, 27 Aug 2025 21:35:34 +0000
> > Morten Brørup <mb@smartsharesystems.com> wrote:
> >
> >>
> >> +/* GCC only optimizes single-bit MSB tests this way, so do it by
> hand with multi-bit. */
> >> +#if defined(RTE_TOOLCHAIN_GCC) && defined(RTE_ARCH_X86)
> >> +#undef RTE_MBUF_DIRECT
> >> +#define RTE_MBUF_DIRECT(mb) \
> >> +	(!(((const uint8_t *)(mb))[offsetof(struct rte_mbuf, ol_flags) +
> 7] & \
> >> +	(uint8_t)((RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL) >> 56)))
> >> +#endif
> > Complex enough expression that I would prefer this be an inline
> function
> > with some more temporary variables and more comments.
> > Like the magic 7 for mask??
> 
> +1

So, instead of overriding the macro definition in the GCC exception case, you prefer something like:
#if defined(RTE_TOOLCHAIN_GCC) && defined(RTE_ARCH_X86)
static __rte_always_inline bool
RTE_MBUF_DIRECT(const struct rte_mbuf * const mb)
{...}
#else
#define RTE_MBUF_DIRECT(mb) \
	(!((mb)->ol_flags & (RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL)))
#endif

which would give us a function with a macro-like (upper case) name.

How about I just add a more detailed description to this macro?
After all, formally, it's an exception to the simple default macro.


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v2] mbuf: optimize segment prefree
  2025-08-27 21:35 [PATCH] mbuf: optimize segment prefree Morten Brørup
  2025-08-27 23:17 ` Stephen Hemminger
  2025-10-06 14:49 ` Morten Brørup
@ 2025-10-20 12:02 ` Morten Brørup
  2025-10-20 14:24   ` Konstantin Ananyev
  2 siblings, 1 reply; 7+ messages in thread
From: Morten Brørup @ 2025-10-20 12:02 UTC (permalink / raw)
  To: dev, Stephen Hemminger, Wathsala Vithanage; +Cc: Morten Brørup

Refactored rte_pktmbuf_prefree_seg() for both performance and readability.

With the optimized RTE_MBUF_DIRECT() macro, the common likely code path
now fits within one instruction cache line on x86-64 when built with GCC.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v2:
* Fixed typo in commit description.
* Fixed indentation.
* Added detailed description to the optimized RTE_MBUF_DIRECT() macro.
  (Stephen Hemminger)
* Added static_assert() to verify that the optimized RTE_MBUF_DIRECT()
  macro is valid, specifically that the tested bits are in the MSB of the
  64-bit field.
---
 lib/mbuf/rte_mbuf.h      | 51 +++++++++++++++-------------------------
 lib/mbuf/rte_mbuf_core.h | 27 +++++++++++++++++++++
 2 files changed, 46 insertions(+), 32 deletions(-)

diff --git a/lib/mbuf/rte_mbuf.h b/lib/mbuf/rte_mbuf.h
index 3df22125de..2004391f57 100644
--- a/lib/mbuf/rte_mbuf.h
+++ b/lib/mbuf/rte_mbuf.h
@@ -31,6 +31,7 @@
  * http://www.kohala.com/start/tcpipiv2.html
  */
 
+#include <stdbool.h>
 #include <stdint.h>
 
 #include <rte_common.h>
@@ -1458,44 +1459,30 @@ static inline int __rte_pktmbuf_pinned_extbuf_decref(struct rte_mbuf *m)
 static __rte_always_inline struct rte_mbuf *
 rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
 {
-	__rte_mbuf_sanity_check(m, 0);
-
-	if (likely(rte_mbuf_refcnt_read(m) == 1)) {
-
-		if (!RTE_MBUF_DIRECT(m)) {
-			rte_pktmbuf_detach(m);
-			if (RTE_MBUF_HAS_EXTBUF(m) &&
-			    RTE_MBUF_HAS_PINNED_EXTBUF(m) &&
-			    __rte_pktmbuf_pinned_extbuf_decref(m))
-				return NULL;
-		}
-
-		if (m->next != NULL)
-			m->next = NULL;
-		if (m->nb_segs != 1)
-			m->nb_segs = 1;
+	bool refcnt_not_one;
 
-		return m;
+	__rte_mbuf_sanity_check(m, 0);
 
-	} else if (__rte_mbuf_refcnt_update(m, -1) == 0) {
+	refcnt_not_one = unlikely(rte_mbuf_refcnt_read(m) != 1);
+	if (refcnt_not_one && __rte_mbuf_refcnt_update(m, -1) != 0)
+		return NULL;
 
-		if (!RTE_MBUF_DIRECT(m)) {
-			rte_pktmbuf_detach(m);
-			if (RTE_MBUF_HAS_EXTBUF(m) &&
-			    RTE_MBUF_HAS_PINNED_EXTBUF(m) &&
-			    __rte_pktmbuf_pinned_extbuf_decref(m))
-				return NULL;
-		}
+	if (unlikely(!RTE_MBUF_DIRECT(m))) {
+		rte_pktmbuf_detach(m);
+		if (RTE_MBUF_HAS_EXTBUF(m) &&
+				RTE_MBUF_HAS_PINNED_EXTBUF(m) &&
+				__rte_pktmbuf_pinned_extbuf_decref(m))
+			return NULL;
+	}
 
-		if (m->next != NULL)
-			m->next = NULL;
-		if (m->nb_segs != 1)
-			m->nb_segs = 1;
+	if (refcnt_not_one)
 		rte_mbuf_refcnt_set(m, 1);
+	if (m->nb_segs != 1)
+		m->nb_segs = 1;
+	if (m->next != NULL)
+		m->next = NULL;
 
-		return m;
-	}
-	return NULL;
+	return m;
 }
 
 /**
diff --git a/lib/mbuf/rte_mbuf_core.h b/lib/mbuf/rte_mbuf_core.h
index a0df265b5d..41f40e1967 100644
--- a/lib/mbuf/rte_mbuf_core.h
+++ b/lib/mbuf/rte_mbuf_core.h
@@ -715,6 +715,33 @@ struct rte_mbuf_ext_shared_info {
 #define RTE_MBUF_DIRECT(mb) \
 	(!((mb)->ol_flags & (RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL)))
 
+#if defined(RTE_TOOLCHAIN_GCC) && defined(RTE_ARCH_X86)
+/* Optimization for code size.
+ * GCC only optimizes single-bit MSB tests this way, so we do it by hand with multi-bit.
+ *
+ * The flags RTE_MBUF_F_INDIRECT and RTE_MBUF_F_EXTERNAL are both in the MSB of the
+ * 64-bit ol_flags field, so we only compare this one byte instead of all 64 bits.
+ * On little endian architecture, the MSB of a 64-bit integer is at byte offset 7.
+ *
+ * Note: Tested using GCC version 16.0.0 20251019 (experimental).
+ *
+ * Without this optimization, GCC generates 17 bytes of instructions:
+ *      movabs rax,0x6000000000000000       // 10 bytes
+ *      and    rax,QWORD PTR [rdi+0x18]     // 4 bytes
+ *      sete   al                           // 3 bytes
+ * With this optimization, GCC generates only 7 bytes of instructions:
+ *      test   BYTE PTR [rdi+0x1f],0x60     // 4 bytes
+ *      sete   al                           // 3 bytes
+ */
+#undef RTE_MBUF_DIRECT
+#define RTE_MBUF_DIRECT(mb) \
+	(!(((const uint8_t *)(mb))[offsetof(struct rte_mbuf, ol_flags) + 7] & \
+	(uint8_t)((RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL) >> (7 * 8))))
+static_assert(((RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL) >> (7 * 8)) << (7 * 8) ==
+	(RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL),
+	"RTE_MBUF_F_INDIRECT and/or RTE_MBUF_F_EXTERNAL are not in MSB.");
+#endif
+
 /** Uninitialized or unspecified port. */
 #define RTE_MBUF_PORT_INVALID UINT16_MAX
 /** For backwards compatibility. */
-- 
2.43.0


^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [PATCH v2] mbuf: optimize segment prefree
  2025-10-20 12:02 ` [PATCH v2] " Morten Brørup
@ 2025-10-20 14:24   ` Konstantin Ananyev
  0 siblings, 0 replies; 7+ messages in thread
From: Konstantin Ananyev @ 2025-10-20 14:24 UTC (permalink / raw)
  To: Morten Brørup, dev, Stephen Hemminger, Wathsala Vithanage


> 
> Refactored rte_pktmbuf_prefree_seg() for both performance and readability.
> 
> With the optimized RTE_MBUF_DIRECT() macro, the common likely code path
> now fits within one instruction cache line on x86-64 when built with GCC.
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
> v2:
> * Fixed typo in commit description.
> * Fixed indentation.
> * Added detailed description to the optimized RTE_MBUF_DIRECT() macro.
>   (Stephen Hemminger)
> * Added static_assert() to verify that the optimized RTE_MBUF_DIRECT()
>   macro is valid, specifically that the tested bits are in the MSB of the
>   64-bit field.
> ---
>  lib/mbuf/rte_mbuf.h      | 51 +++++++++++++++-------------------------
>  lib/mbuf/rte_mbuf_core.h | 27 +++++++++++++++++++++
>  2 files changed, 46 insertions(+), 32 deletions(-)
> 
> diff --git a/lib/mbuf/rte_mbuf.h b/lib/mbuf/rte_mbuf.h
> index 3df22125de..2004391f57 100644
> --- a/lib/mbuf/rte_mbuf.h
> +++ b/lib/mbuf/rte_mbuf.h
> @@ -31,6 +31,7 @@
>   * http://www.kohala.com/start/tcpipiv2.html
>   */
> 
> +#include <stdbool.h>
>  #include <stdint.h>
> 
>  #include <rte_common.h>
> @@ -1458,44 +1459,30 @@ static inline int
> __rte_pktmbuf_pinned_extbuf_decref(struct rte_mbuf *m)
>  static __rte_always_inline struct rte_mbuf *
>  rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
>  {
> -	__rte_mbuf_sanity_check(m, 0);
> -
> -	if (likely(rte_mbuf_refcnt_read(m) == 1)) {
> -
> -		if (!RTE_MBUF_DIRECT(m)) {
> -			rte_pktmbuf_detach(m);
> -			if (RTE_MBUF_HAS_EXTBUF(m) &&
> -			    RTE_MBUF_HAS_PINNED_EXTBUF(m) &&
> -			    __rte_pktmbuf_pinned_extbuf_decref(m))
> -				return NULL;
> -		}
> -
> -		if (m->next != NULL)
> -			m->next = NULL;
> -		if (m->nb_segs != 1)
> -			m->nb_segs = 1;
> +	bool refcnt_not_one;
> 
> -		return m;
> +	__rte_mbuf_sanity_check(m, 0);
> 
> -	} else if (__rte_mbuf_refcnt_update(m, -1) == 0) {
> +	refcnt_not_one = unlikely(rte_mbuf_refcnt_read(m) != 1);
> +	if (refcnt_not_one && __rte_mbuf_refcnt_update(m, -1) != 0)
> +		return NULL;
> 
> -		if (!RTE_MBUF_DIRECT(m)) {
> -			rte_pktmbuf_detach(m);
> -			if (RTE_MBUF_HAS_EXTBUF(m) &&
> -			    RTE_MBUF_HAS_PINNED_EXTBUF(m) &&
> -			    __rte_pktmbuf_pinned_extbuf_decref(m))
> -				return NULL;
> -		}
> +	if (unlikely(!RTE_MBUF_DIRECT(m))) {
> +		rte_pktmbuf_detach(m);
> +		if (RTE_MBUF_HAS_EXTBUF(m) &&
> +				RTE_MBUF_HAS_PINNED_EXTBUF(m) &&
> +				__rte_pktmbuf_pinned_extbuf_decref(m))
> +			return NULL;
> +	}
> 
> -		if (m->next != NULL)
> -			m->next = NULL;
> -		if (m->nb_segs != 1)
> -			m->nb_segs = 1;
> +	if (refcnt_not_one)
>  		rte_mbuf_refcnt_set(m, 1);
> +	if (m->nb_segs != 1)
> +		m->nb_segs = 1;
> +	if (m->next != NULL)
> +		m->next = NULL;
> 
> -		return m;
> -	}
> -	return NULL;
> +	return m;
>  }
> 
>  /**
> diff --git a/lib/mbuf/rte_mbuf_core.h b/lib/mbuf/rte_mbuf_core.h
> index a0df265b5d..41f40e1967 100644
> --- a/lib/mbuf/rte_mbuf_core.h
> +++ b/lib/mbuf/rte_mbuf_core.h
> @@ -715,6 +715,33 @@ struct rte_mbuf_ext_shared_info {
>  #define RTE_MBUF_DIRECT(mb) \
>  	(!((mb)->ol_flags & (RTE_MBUF_F_INDIRECT |
> RTE_MBUF_F_EXTERNAL)))
> 
> +#if defined(RTE_TOOLCHAIN_GCC) && defined(RTE_ARCH_X86)
> +/* Optimization for code size.
> + * GCC only optimizes single-bit MSB tests this way, so we do it by hand with
> multi-bit.
> + *
> + * The flags RTE_MBUF_F_INDIRECT and RTE_MBUF_F_EXTERNAL are both in
> the MSB of the
> + * 64-bit ol_flags field, so we only compare this one byte instead of all 64 bits.
> + * On little endian architecture, the MSB of a 64-bit integer is at byte offset 7.
> + *
> + * Note: Tested using GCC version 16.0.0 20251019 (experimental).
> + *
> + * Without this optimization, GCC generates 17 bytes of instructions:
> + *      movabs rax,0x6000000000000000       // 10 bytes
> + *      and    rax,QWORD PTR [rdi+0x18]     // 4 bytes
> + *      sete   al                           // 3 bytes
> + * With this optimization, GCC generates only 7 bytes of instructions:
> + *      test   BYTE PTR [rdi+0x1f],0x60     // 4 bytes
> + *      sete   al                           // 3 bytes
> + */
> +#undef RTE_MBUF_DIRECT
> +#define RTE_MBUF_DIRECT(mb) \
> +	(!(((const uint8_t *)(mb))[offsetof(struct rte_mbuf, ol_flags) + 7] & \
> +	(uint8_t)((RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL) >> (7 *
> 8))))
> +static_assert(((RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL) >> (7 * 8))
> << (7 * 8) ==
> +	(RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL),
> +	"RTE_MBUF_F_INDIRECT and/or RTE_MBUF_F_EXTERNAL are not in
> MSB.");
> +#endif
> +
>  /** Uninitialized or unspecified port. */
>  #define RTE_MBUF_PORT_INVALID UINT16_MAX
>  /** For backwards compatibility. */
> --

Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>

> 2.43.0


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2025-10-20 14:24 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-08-27 21:35 [PATCH] mbuf: optimize segment prefree Morten Brørup
2025-08-27 23:17 ` Stephen Hemminger
2025-10-06 17:46   ` Wathsala Vithanage
2025-10-06 18:26     ` Morten Brørup
2025-10-06 14:49 ` Morten Brørup
2025-10-20 12:02 ` [PATCH v2] " Morten Brørup
2025-10-20 14:24   ` Konstantin Ananyev

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).