DPDK patches and discussions
 help / color / mirror / Atom feed
* [RFC] eal: provide option to use compiler memcpy instead of RTE
@ 2024-05-27 11:11 Mattias Rönnblom
  2024-05-28  7:43 ` [RFC v2] " Mattias Rönnblom
  2024-05-29 21:56 ` [RFC] " Stephen Hemminger
  0 siblings, 2 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-05-27 11:11 UTC (permalink / raw)
  To: dev; +Cc: hofors, Morten Brørup, Stephen Hemminger, Mattias Rönnblom

Provide build option to have functions in <rte_memcpy.h> delegate to
the standard compiler/libc memcpy(), instead of using the various
traditional, handcrafted, per-architecture rte_memcpy()
implementations.

A new meson build option 'use_cc_memcpy' is added. The default is
true. It's not obvious what should be the default, but compiler
memcpy() is enabled by default in this RFC so any tests run with this
patch use the new approach.

One purpose of this RFC is to make it easy to evaluate the costs and
benefits of a switch.

Only ARM and x86 are implemented.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 config/meson.build                   |  1 +
 lib/eal/arm/include/rte_memcpy.h     | 10 +++++
 lib/eal/include/generic/rte_memcpy.h | 62 ++++++++++++++++++++++++----
 lib/eal/x86/include/meson.build      |  6 ++-
 lib/eal/x86/include/rte_memcpy.h     | 11 ++++-
 meson_options.txt                    |  2 +
 6 files changed, 83 insertions(+), 9 deletions(-)

diff --git a/config/meson.build b/config/meson.build
index 8c8b019c25..456056628e 100644
--- a/config/meson.build
+++ b/config/meson.build
@@ -353,6 +353,7 @@ endforeach
 # set other values pulled from the build options
 dpdk_conf.set('RTE_MAX_ETHPORTS', get_option('max_ethports'))
 dpdk_conf.set('RTE_LIBEAL_USE_HPET', get_option('use_hpet'))
+dpdk_conf.set('RTE_USE_CC_MEMCPY', get_option('use_cc_memcpy'))
 dpdk_conf.set('RTE_ENABLE_STDATOMIC', get_option('enable_stdatomic'))
 dpdk_conf.set('RTE_ENABLE_TRACE_FP', get_option('enable_trace_fp'))
 dpdk_conf.set('RTE_PKTMBUF_HEADROOM', get_option('pkt_mbuf_headroom'))
diff --git a/lib/eal/arm/include/rte_memcpy.h b/lib/eal/arm/include/rte_memcpy.h
index 47dea9a8cc..e8aff722df 100644
--- a/lib/eal/arm/include/rte_memcpy.h
+++ b/lib/eal/arm/include/rte_memcpy.h
@@ -5,10 +5,20 @@
 #ifndef _RTE_MEMCPY_ARM_H_
 #define _RTE_MEMCPY_ARM_H_
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #ifdef RTE_ARCH_64
 #include <rte_memcpy_64.h>
 #else
 #include <rte_memcpy_32.h>
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_ARM_H_ */
diff --git a/lib/eal/include/generic/rte_memcpy.h b/lib/eal/include/generic/rte_memcpy.h
index e7f0f8eaa9..f2f66f372d 100644
--- a/lib/eal/include/generic/rte_memcpy.h
+++ b/lib/eal/include/generic/rte_memcpy.h
@@ -5,12 +5,20 @@
 #ifndef _RTE_MEMCPY_H_
 #define _RTE_MEMCPY_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /**
  * @file
  *
  * Functions for vectorised implementation of memcpy().
  */
 
+#include <stdint.h>
+#include <string.h>
+#include <rte_vect.h>
+
 /**
  * Copy 16 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -35,8 +43,6 @@ rte_mov16(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src);
 
-#ifdef __DOXYGEN__
-
 /**
  * Copy 48 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -49,8 +55,6 @@ rte_mov32(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov48(uint8_t *dst, const uint8_t *src);
 
-#endif /* __DOXYGEN__ */
-
 /**
  * Copy 64 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -87,8 +91,6 @@ rte_mov128(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src);
 
-#ifdef __DOXYGEN__
-
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
@@ -111,6 +113,52 @@ rte_mov256(uint8_t *dst, const uint8_t *src);
 static void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
-#endif /* __DOXYGEN__ */
+#ifdef RTE_USE_CC_MEMCPY
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 16);
+}
+
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 32);
+}
+
+static inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 48);
+}
+
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 64);
+}
+
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 128);
+}
+
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 256);
+}
+
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+	return memcpy(dst, src, n);
+}
+#endif /* RTE_USE_CC_MEMCPY */
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif /* _RTE_MEMCPY_H_ */
diff --git a/lib/eal/x86/include/meson.build b/lib/eal/x86/include/meson.build
index 52d2f8e969..cf851df60d 100644
--- a/lib/eal/x86/include/meson.build
+++ b/lib/eal/x86/include/meson.build
@@ -7,7 +7,6 @@ arch_headers = files(
         'rte_cpuflags.h',
         'rte_cycles.h',
         'rte_io.h',
-        'rte_memcpy.h',
         'rte_pause.h',
         'rte_power_intrinsics.h',
         'rte_prefetch.h',
@@ -16,6 +15,11 @@ arch_headers = files(
         'rte_spinlock.h',
         'rte_vect.h',
 )
+
+if not get_option('use_cc_memcpy')
+        arch_headers += 'rte_memcpy.h'
+endif
+
 arch_indirect_headers = files(
         'rte_atomic_32.h',
         'rte_atomic_64.h',
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 72a92290e0..c5ba74d2ed 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -11,12 +11,19 @@
  * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
  */
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #include <stdio.h>
 #include <stdint.h>
 #include <string.h>
 #include <rte_vect.h>
 #include <rte_common.h>
-#include <rte_config.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -878,4 +885,6 @@ rte_memcpy(void *dst, const void *src, size_t n)
 }
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_X86_64_H_ */
diff --git a/meson_options.txt b/meson_options.txt
index e49b2fc089..263b0e7882 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -60,3 +60,5 @@ option('tests', type: 'boolean', value: true, description:
        'build unit tests')
 option('use_hpet', type: 'boolean', value: false, description:
        'use HPET timer in EAL')
+option('use_cc_memcpy', type: 'boolean', value: true, description:
+       'Have rte_memcpy() delegate to compiler/libc memcpy() instead of using custom implementation.')
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [RFC v2] eal: provide option to use compiler memcpy instead of RTE
  2024-05-27 11:11 [RFC] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
@ 2024-05-28  7:43 ` Mattias Rönnblom
  2024-05-28  8:19   ` Mattias Rönnblom
                     ` (2 more replies)
  2024-05-29 21:56 ` [RFC] " Stephen Hemminger
  1 sibling, 3 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-05-28  7:43 UTC (permalink / raw)
  To: dev; +Cc: hofors, Morten Brørup, Stephen Hemminger, Mattias Rönnblom

Provide build option to have functions in <rte_memcpy.h> delegate to
the standard compiler/libc memcpy(), instead of using the various
traditional, handcrafted, per-architecture rte_memcpy()
implementations.

A new meson build option 'use_cc_memcpy' is added. The default is
true. It's not obvious what should be the default, but compiler
memcpy() is enabled by default in this RFC so any tests run with this
patch use the new approach.

One purpose of this RFC is to make it easy to evaluate the costs and
benefits of a switch.

Only Loongarch, ARM and x86 are implemented. Only x86 is tested.

RFC v2:
 * Fix bug where rte_memcpy.h was not installed on x86.
 * Made attempt to make Loongarch compile.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 config/meson.build                     |  1 +
 lib/eal/arm/include/rte_memcpy.h       | 10 +++++
 lib/eal/include/generic/rte_memcpy.h   | 62 +++++++++++++++++++++++---
 lib/eal/loongarch/include/rte_memcpy.h | 52 ++-------------------
 lib/eal/x86/include/meson.build        |  1 +
 lib/eal/x86/include/rte_memcpy.h       | 11 ++++-
 meson_options.txt                      |  2 +
 7 files changed, 82 insertions(+), 57 deletions(-)

diff --git a/config/meson.build b/config/meson.build
index 8c8b019c25..456056628e 100644
--- a/config/meson.build
+++ b/config/meson.build
@@ -353,6 +353,7 @@ endforeach
 # set other values pulled from the build options
 dpdk_conf.set('RTE_MAX_ETHPORTS', get_option('max_ethports'))
 dpdk_conf.set('RTE_LIBEAL_USE_HPET', get_option('use_hpet'))
+dpdk_conf.set('RTE_USE_CC_MEMCPY', get_option('use_cc_memcpy'))
 dpdk_conf.set('RTE_ENABLE_STDATOMIC', get_option('enable_stdatomic'))
 dpdk_conf.set('RTE_ENABLE_TRACE_FP', get_option('enable_trace_fp'))
 dpdk_conf.set('RTE_PKTMBUF_HEADROOM', get_option('pkt_mbuf_headroom'))
diff --git a/lib/eal/arm/include/rte_memcpy.h b/lib/eal/arm/include/rte_memcpy.h
index 47dea9a8cc..e8aff722df 100644
--- a/lib/eal/arm/include/rte_memcpy.h
+++ b/lib/eal/arm/include/rte_memcpy.h
@@ -5,10 +5,20 @@
 #ifndef _RTE_MEMCPY_ARM_H_
 #define _RTE_MEMCPY_ARM_H_
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #ifdef RTE_ARCH_64
 #include <rte_memcpy_64.h>
 #else
 #include <rte_memcpy_32.h>
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_ARM_H_ */
diff --git a/lib/eal/include/generic/rte_memcpy.h b/lib/eal/include/generic/rte_memcpy.h
index e7f0f8eaa9..f2f66f372d 100644
--- a/lib/eal/include/generic/rte_memcpy.h
+++ b/lib/eal/include/generic/rte_memcpy.h
@@ -5,12 +5,20 @@
 #ifndef _RTE_MEMCPY_H_
 #define _RTE_MEMCPY_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /**
  * @file
  *
  * Functions for vectorised implementation of memcpy().
  */
 
+#include <stdint.h>
+#include <string.h>
+#include <rte_vect.h>
+
 /**
  * Copy 16 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -35,8 +43,6 @@ rte_mov16(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src);
 
-#ifdef __DOXYGEN__
-
 /**
  * Copy 48 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -49,8 +55,6 @@ rte_mov32(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov48(uint8_t *dst, const uint8_t *src);
 
-#endif /* __DOXYGEN__ */
-
 /**
  * Copy 64 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -87,8 +91,6 @@ rte_mov128(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src);
 
-#ifdef __DOXYGEN__
-
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
@@ -111,6 +113,52 @@ rte_mov256(uint8_t *dst, const uint8_t *src);
 static void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
-#endif /* __DOXYGEN__ */
+#ifdef RTE_USE_CC_MEMCPY
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 16);
+}
+
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 32);
+}
+
+static inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 48);
+}
+
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 64);
+}
+
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 128);
+}
+
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 256);
+}
+
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+	return memcpy(dst, src, n);
+}
+#endif /* RTE_USE_CC_MEMCPY */
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif /* _RTE_MEMCPY_H_ */
diff --git a/lib/eal/loongarch/include/rte_memcpy.h b/lib/eal/loongarch/include/rte_memcpy.h
index 22578d40f4..159420d3b7 100644
--- a/lib/eal/loongarch/include/rte_memcpy.h
+++ b/lib/eal/loongarch/include/rte_memcpy.h
@@ -5,57 +5,11 @@
 #ifndef RTE_MEMCPY_LOONGARCH_H
 #define RTE_MEMCPY_LOONGARCH_H
 
-#include <stdint.h>
-#include <string.h>
+#include "rte_config.h"
 
-#include "rte_common.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
+#ifndef RTE_USE_CC_MEMCPY
+#define RTE_USE_CC_MEMCPY
 
 #include "generic/rte_memcpy.h"
 
-static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 16);
-}
-
-static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 32);
-}
-
-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 48);
-}
-
-static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 64);
-}
-
-static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 128);
-}
-
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 256);
-}
-
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
-
-#ifdef __cplusplus
-}
-#endif
-
 #endif /* RTE_MEMCPY_LOONGARCH_H */
diff --git a/lib/eal/x86/include/meson.build b/lib/eal/x86/include/meson.build
index 52d2f8e969..09c2fe2485 100644
--- a/lib/eal/x86/include/meson.build
+++ b/lib/eal/x86/include/meson.build
@@ -16,6 +16,7 @@ arch_headers = files(
         'rte_spinlock.h',
         'rte_vect.h',
 )
+
 arch_indirect_headers = files(
         'rte_atomic_32.h',
         'rte_atomic_64.h',
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 72a92290e0..c5ba74d2ed 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -11,12 +11,19 @@
  * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
  */
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #include <stdio.h>
 #include <stdint.h>
 #include <string.h>
 #include <rte_vect.h>
 #include <rte_common.h>
-#include <rte_config.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -878,4 +885,6 @@ rte_memcpy(void *dst, const void *src, size_t n)
 }
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_X86_64_H_ */
diff --git a/meson_options.txt b/meson_options.txt
index e49b2fc089..263b0e7882 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -60,3 +60,5 @@ option('tests', type: 'boolean', value: true, description:
        'build unit tests')
 option('use_hpet', type: 'boolean', value: false, description:
        'use HPET timer in EAL')
+option('use_cc_memcpy', type: 'boolean', value: true, description:
+       'Have rte_memcpy() delegate to compiler/libc memcpy() instead of using custom implementation.')
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [RFC v2] eal: provide option to use compiler memcpy instead of RTE
  2024-05-28  7:43 ` [RFC v2] " Mattias Rönnblom
@ 2024-05-28  8:19   ` Mattias Rönnblom
  2024-05-28  8:27     ` Bruce Richardson
  2024-05-28 14:59     ` Stephen Hemminger
  2024-05-28  8:20   ` Bruce Richardson
  2024-06-02 12:39   ` [RFC v3 0/5] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2 siblings, 2 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-05-28  8:19 UTC (permalink / raw)
  To: Mattias Rönnblom, dev; +Cc: Morten Brørup, Stephen Hemminger

On 2024-05-28 09:43, Mattias Rönnblom wrote:
> Provide build option to have functions in <rte_memcpy.h> delegate to
> the standard compiler/libc memcpy(), instead of using the various
> traditional, handcrafted, per-architecture rte_memcpy()
> implementations.
> 
> A new meson build option 'use_cc_memcpy' is added. The default is
> true. It's not obvious what should be the default, but compiler
> memcpy() is enabled by default in this RFC so any tests run with this
> patch use the new approach.
> 
> One purpose of this RFC is to make it easy to evaluate the costs and
> benefits of a switch.
> 

I've tested this patch some with DSW micro benchmarks, and the result is 
a 2.5% reduction of the DSW+testapp overhead with cc/libc memcpy. GCC 11.4.

We've also run a characteristic test suite of a large, real-world app. 
Here, we saw no effect. GCC 10.5.

x86_64 in both cases (Skylake and Raptor Lake).

Last time we did the same, there was a noticeable performance 
degradation in both the above cases.

This is not a lot of data points, but I think we should consider 
making the custom RTE memcpy() implementations optional in the next 
release, and if no-one complains, remove the implementations in the 
release after that.

(Whether or not [or how long] to keep the wrapper API is another question.)

<snip>

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [RFC v2] eal: provide option to use compiler memcpy instead of RTE
  2024-05-28  7:43 ` [RFC v2] " Mattias Rönnblom
  2024-05-28  8:19   ` Mattias Rönnblom
@ 2024-05-28  8:20   ` Bruce Richardson
  2024-06-02 12:39   ` [RFC v3 0/5] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2 siblings, 0 replies; 128+ messages in thread
From: Bruce Richardson @ 2024-05-28  8:20 UTC (permalink / raw)
  To: Mattias Rönnblom; +Cc: dev, hofors, Morten Brørup, Stephen Hemminger

On Tue, May 28, 2024 at 09:43:54AM +0200, Mattias Rönnblom wrote:
> Provide build option to have functions in <rte_memcpy.h> delegate to
> the standard compiler/libc memcpy(), instead of using the various
> traditional, handcrafted, per-architecture rte_memcpy()
> implementations.
> 
> A new meson build option 'use_cc_memcpy' is added. The default is
> true. It's not obvious what should be the default, but compiler
> memcpy() is enabled by default in this RFC so any tests run with this
> patch use the new approach.
> 
> One purpose of this RFC is to make it easy to evaluate the costs and
> benefits of a switch.
> 
> Only Loongarch, ARM and x86 is implemented. Only x86 is tested.
> 
> RFC v2:
>  * Fix bug where rte_memcpy.h was not installed on x86.
>  * Made attempt to make Loongarch compile.
> 
> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> ---
>  config/meson.build                     |  1 +
>  lib/eal/arm/include/rte_memcpy.h       | 10 +++++
>  lib/eal/include/generic/rte_memcpy.h   | 62 +++++++++++++++++++++++---
>  lib/eal/loongarch/include/rte_memcpy.h | 52 ++-------------------
>  lib/eal/x86/include/meson.build        |  1 +
>  lib/eal/x86/include/rte_memcpy.h       | 11 ++++-
>  meson_options.txt                      |  2 +
>  7 files changed, 82 insertions(+), 57 deletions(-)
> 

I really support the long-term goal here of eliminating the need for us to
maintain our own memcpy. This looks a good idea to see how things perform.
If we do decide to take this patch, having the default be regular memcpy
should help with static analysis and other tooling, which would be aware of
memcpy but not rte_memcpy.

/Bruce

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [RFC v2] eal: provide option to use compiler memcpy instead of RTE
  2024-05-28  8:19   ` Mattias Rönnblom
@ 2024-05-28  8:27     ` Bruce Richardson
  2024-05-28  8:59       ` Mattias Rönnblom
  2024-05-28 14:59     ` Stephen Hemminger
  1 sibling, 1 reply; 128+ messages in thread
From: Bruce Richardson @ 2024-05-28  8:27 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: Mattias Rönnblom, dev, Morten Brørup, Stephen Hemminger

On Tue, May 28, 2024 at 10:19:15AM +0200, Mattias Rönnblom wrote:
> On 2024-05-28 09:43, Mattias Rönnblom wrote:
> > Provide build option to have functions in <rte_memcpy.h> delegate to
> > the standard compiler/libc memcpy(), instead of using the various
> > traditional, handcrafted, per-architecture rte_memcpy()
> > implementations.
> > 
> > A new meson build option 'use_cc_memcpy' is added. The default is
> > true. It's not obvious what should be the default, but compiler
> > memcpy() is enabled by default in this RFC so any tests run with this
> > patch use the new approach.
> > 
> > One purpose of this RFC is to make it easy to evaluate the costs and
> > benefits of a switch.
> > 
> 
> I've tested this patch some with DSW micro benchmarks, and the result is a
> 2.5% reduction of the DSW+testapp overhead with cc/libc memcpy. GCC 11.4.
> 
> We've also run characteristic test suite of a large, real world app. Here,
> we saw no effect. GCC 10.5.
> 
> x86_64 in both cases (Skylake and Raptor Lake).
> 
> Last time we did the same, there were a noticeable performance degradation
> in both the above cases.
> 
> This is not a lot of data points, but I think it we should consider making
> the custom RTE memcpy() implementations optional in the next release, and if
> no-one complains, remove the implementations in the next release.
> 
> (Whether or not [or how long] to keep the wrapper API is another question.)
> 
> <snip>

The other instance I've heard mention of in the past is virtio/vhost, which
used to have a speedup from the custom memcpy.

My own thinking on these cases is that, for targeted settings like these,
we should look to have local memcpy functions written - taking account of
the specifics of each usecase. For virtio/vhost for example, we can have
assumptions around host buffer alignment, and we also can be pretty
confident we are copying to another CPU. For DSW, or other eventdev cases,
we would only be looking at copies of multiples of 16, with guaranteed
8-byte alignment on both source and destination. Writing efficient copy fns
for specific scenarios can be faster and more effective than trying to
write a general, optimized in all cases, memcpy. It also discourages the
use of non-libc memcpy except where really necessary.

Naturally, if we find there are a lot of cases where use of libc memcpy
slows us down, we will want to keep a general rte_memcpy. However, I'd hope
the slowdown cases are very few.

/Bruce

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [RFC v2] eal: provide option to use compiler memcpy instead of RTE
  2024-05-28  8:27     ` Bruce Richardson
@ 2024-05-28  8:59       ` Mattias Rönnblom
  2024-05-28  9:07         ` Morten Brørup
  0 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-05-28  8:59 UTC (permalink / raw)
  To: Bruce Richardson
  Cc: Mattias Rönnblom, dev, Morten Brørup, Stephen Hemminger

On 2024-05-28 10:27, Bruce Richardson wrote:
> On Tue, May 28, 2024 at 10:19:15AM +0200, Mattias Rönnblom wrote:
>> On 2024-05-28 09:43, Mattias Rönnblom wrote:
>>> Provide build option to have functions in <rte_memcpy.h> delegate to
>>> the standard compiler/libc memcpy(), instead of using the various
>>> traditional, handcrafted, per-architecture rte_memcpy()
>>> implementations.
>>>
>>> A new meson build option 'use_cc_memcpy' is added. The default is
>>> true. It's not obvious what should be the default, but compiler
>>> memcpy() is enabled by default in this RFC so any tests run with this
>>> patch use the new approach.
>>>
>>> One purpose of this RFC is to make it easy to evaluate the costs and
>>> benefits of a switch.
>>>
>>
>> I've tested this patch some with DSW micro benchmarks, and the result is a
>> 2.5% reduction of the DSW+testapp overhead with cc/libc memcpy. GCC 11.4.
>>
>> We've also run characteristic test suite of a large, real world app. Here,
>> we saw no effect. GCC 10.5.
>>
>> x86_64 in both cases (Skylake and Raptor Lake).
>>
>> Last time we did the same, there were a noticeable performance degradation
>> in both the above cases.
>>
>> This is not a lot of data points, but I think it we should consider making
>> the custom RTE memcpy() implementations optional in the next release, and if
>> no-one complains, remove the implementations in the next release.
>>
>> (Whether or not [or how long] to keep the wrapper API is another question.)
>>
>> <snip>
> 
> The other instance I've heard mention of in the past is virtio/vhost, which
> used to have a speedup from the custom memcpy.
> 
> My own thinking on these cases, is that for targetted settings like these,
> we should look to have local memcpy functions written - taking account of
> the specifics of each usecase. For virtio/vhost for example, we can have
> assumptions around host buffer alignment, and we also can be pretty
> confident we are copying to another CPU. For DSW, or other eventdev cases,
> we would only be looking at copies of multiples of 16, with guaranteed
> 8-byte alignment on both source and destination. Writing efficient copy fns

In such cases, you should first try to tell the compiler that it's safe 
to assume that the pointers have a certain alignment.

/*
 * Baseline variant: copy a fixed 256-byte block from src to dst with a
 * plain memcpy(). No alignment hint is given for either pointer.
 */
void copy256(void *dst, const void *src)
{
     enum { COPY256_LEN = 256 };

     (void)memcpy(dst, src, COPY256_LEN);
}

/*
 * Alignment-hinted variant: the same fixed 256-byte copy, but both
 * pointers are passed through __builtin_assume_aligned(ptr, 32) first,
 * so the compiler is told it may assume 32-byte alignment for dst and
 * src. NOTE(review): callers are presumably required to pass pointers
 * that really are 32-byte aligned - confirm against the compiler docs.
 */
void copy256_a(void *dst, const void *src)
{
     memcpy(__builtin_assume_aligned(dst, 32),
            __builtin_assume_aligned(src, 32),
            256);
}

The first will generate loads/stores without alignment restrictions, 
while the latter will use things like vmovdqa or vmovaps.

(I doubt there's much of a performance difference though, if any at all.)

> for specific scenarios can be faster and more effective than trying to
> write a general, optimized in all cases, memcpy. It also discourages the
> use of non-libc memcpy except where really necessary.
> 
> Naturally, if we find there are a lot of cases where use of libc memcpy
> slows us down, we will want to keep a general rte_memcpy. However, I'd hope
> the slowdown cases are very few.
> 
> /Bruce

^ permalink raw reply	[flat|nested] 128+ messages in thread

* RE: [RFC v2] eal: provide option to use compiler memcpy instead of RTE
  2024-05-28  8:59       ` Mattias Rönnblom
@ 2024-05-28  9:07         ` Morten Brørup
  2024-05-28 16:17           ` Mattias Rönnblom
  0 siblings, 1 reply; 128+ messages in thread
From: Morten Brørup @ 2024-05-28  9:07 UTC (permalink / raw)
  To: Mattias Rönnblom, Bruce Richardson
  Cc: Mattias Rönnblom, dev, Stephen Hemminger

> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> Sent: Tuesday, 28 May 2024 11.00
> 
> On 2024-05-28 10:27, Bruce Richardson wrote:
> > On Tue, May 28, 2024 at 10:19:15AM +0200, Mattias Rönnblom wrote:
> >> On 2024-05-28 09:43, Mattias Rönnblom wrote:
> >>> Provide build option to have functions in <rte_memcpy.h> delegate to
> >>> the standard compiler/libc memcpy(), instead of using the various
> >>> traditional, handcrafted, per-architecture rte_memcpy()
> >>> implementations.
> >>>
> >>> A new meson build option 'use_cc_memcpy' is added. The default is
> >>> true. It's not obvious what should be the default, but compiler
> >>> memcpy() is enabled by default in this RFC so any tests run with this
> >>> patch use the new approach.
> >>>
> >>> One purpose of this RFC is to make it easy to evaluate the costs and
> >>> benefits of a switch.
> >>>
> >>
> >> I've tested this patch some with DSW micro benchmarks, and the result is a
> >> 2.5% reduction of the DSW+testapp overhead with cc/libc memcpy. GCC 11.4.
> >>
> >> We've also run characteristic test suite of a large, real world app. Here,
> >> we saw no effect. GCC 10.5.
> >>
> >> x86_64 in both cases (Skylake and Raptor Lake).
> >>
> >> Last time we did the same, there were a noticeable performance degradation
> >> in both the above cases.

Mattias, which compiler was that?

As previously mentioned in another thread, I'm worried about memcpy performance with older compilers.
DPDK officially supports GCC 4.9 and clang 3.4 [1].
I don't think degrading performance when using supported compilers is considered acceptable.

Alternatively, we could change the DPDK compiler policy from "supported" to "works with (but might not perform optimally)".

[1]: https://doc.dpdk.org/guides-21.11/linux_gsg/sys_reqs.html#compilation-of-the-dpdk

> >>
> >> This is not a lot of data points, but I think it we should consider making
> >> the custom RTE memcpy() implementations optional in the next release, and
> if
> >> no-one complains, remove the implementations in the next release.
> >>
> >> (Whether or not [or how long] to keep the wrapper API is another question.)
> >>
> >> <snip>
> >
> > The other instance I've heard mention of in the past is virtio/vhost, which
> > used to have a speedup from the custom memcpy.
> >
> > My own thinking on these cases, is that for targetted settings like these,
> > we should look to have local memcpy functions written - taking account of
> > the specifics of each usecase. For virtio/vhost for example, we can have
> > assumptions around host buffer alignment, and we also can be pretty
> > confident we are copying to another CPU. For DSW, or other eventdev cases,
> > we would only be looking at copies of multiples of 16, with guaranteed
> > 8-byte alignment on both source and destination. Writing efficient copy fns
> 
> In such cases, you should first try to tell the compiler that it's safe
> to assume that the pointers have a certain alignment.
> 
> void copy256(void *dst, const void *src)
> {
>      memcpy(dst, src, 256);
> }
> 
> void copy256_a(void *dst, const void *src)
> {
>      void *dst_a = __builtin_assume_aligned(dst, 32);
>      const void *src_a = __builtin_assume_aligned(src, 32);
>      memcpy(dst_a, src_a, 256);
> }
> 
> The first will generate loads/stores without alignment restrictions,
> while the latter will use things like vmovdqa or vmovaps.
> 
> (I doubt there's much of a performance difference though, if any at all.)

Interesting.

> 
> > for specific scenarios can be faster and more effective than trying to
> > write a general, optimized in all cases, memcpy. It also discourages the
> > use of non-libc memcpy except where really necessary.

Good idea, Bruce.
I have previously worked on an optimized memcpy, where information about alignment, multiples, non-temporal source/destination, etc. is passed as flags to the function [2]. But it turned into too much work, so I never finished it.

If we start with local memcpy functions optimized for each specific use case, we still have the option of consolidating them into a common rte_memcpy function later. It will also reveal which flags/features such a common function needs to support.

[2]: https://inbox.dpdk.org/dev/20221010064600.16495-1-mb@smartsharesystems.com/

> >
> > Naturally, if we find there are a lot of cases where use of libc memcpy
> > slows us down, we will want to keep a general rte_memcpy. However, I'd hope
> > the slowdown cases are very few.
> >
> > /Bruce

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [RFC v2] eal: provide option to use compiler memcpy instead of RTE
  2024-05-28  8:19   ` Mattias Rönnblom
  2024-05-28  8:27     ` Bruce Richardson
@ 2024-05-28 14:59     ` Stephen Hemminger
  2024-05-28 15:09       ` Bruce Richardson
  2024-05-28 16:03       ` Mattias Rönnblom
  1 sibling, 2 replies; 128+ messages in thread
From: Stephen Hemminger @ 2024-05-28 14:59 UTC (permalink / raw)
  To: Mattias Rönnblom; +Cc: Mattias Rönnblom, dev, Morten Brørup

On Tue, 28 May 2024 10:19:15 +0200
Mattias Rönnblom <hofors@lysator.liu.se> wrote:

> >   
> 
> I've tested this patch some with DSW micro benchmarks, and the result is 
> a 2.5% reduction of the DSW+testapp overhead with cc/libc memcpy. GCC 11.4.
> 
> We've also run characteristic test suite of a large, real world app. 
> Here, we saw no effect. GCC 10.5.
> 
> x86_64 in both cases (Skylake and Raptor Lake).
> 
> Last time we did the same, there were a noticeable performance 
> degradation in both the above cases.
> 
> This is not a lot of data points, but I think it we should consider 
> making the custom RTE memcpy() implementations optional in the next 
> release, and if no-one complains, remove the implementations in the next 
> release.

Let's go further.

1. Announce that rte_memcpy will be marked deprecated in 24.11 release

2. In 24.11 do a global replace of rte_memcpy on the tree.
   And mark rte_memcpy as deprecated.

3. In 25.11 it can go away.

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [RFC v2] eal: provide option to use compiler memcpy instead of RTE
  2024-05-28 14:59     ` Stephen Hemminger
@ 2024-05-28 15:09       ` Bruce Richardson
  2024-05-31  5:19         ` Mattias Rönnblom
  2024-05-28 16:03       ` Mattias Rönnblom
  1 sibling, 1 reply; 128+ messages in thread
From: Bruce Richardson @ 2024-05-28 15:09 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Mattias Rönnblom, Mattias Rönnblom, dev, Morten Brørup

On Tue, May 28, 2024 at 07:59:36AM -0700, Stephen Hemminger wrote:
> On Tue, 28 May 2024 10:19:15 +0200
> Mattias Rönnblom <hofors@lysator.liu.se> wrote:
> 
> > >   
> > 
> > I've tested this patch some with DSW micro benchmarks, and the result is 
> > a 2.5% reduction of the DSW+testapp overhead with cc/libc memcpy. GCC 11.4.
> > 
> > We've also run characteristic test suite of a large, real world app. 
> > Here, we saw no effect. GCC 10.5.
> > 
> > x86_64 in both cases (Skylake and Raptor Lake).
> > 
> > Last time we did the same, there were a noticeable performance 
> > degradation in both the above cases.
> > 
> > This is not a lot of data points, but I think it we should consider 
> > making the custom RTE memcpy() implementations optional in the next 
> > release, and if no-one complains, remove the implementations in the next 
> > release.
> 
> Lets go farther.
> 
> 1. Announce that rte_memcpy will be marked deprecated in 24.11 release
> 
> 2. In 24.11 do a global replace of rte_memcpy on the tree.
>    And mark rte_memcpy as deprecated.
> 
> 3. In 25.11 it can go away.

While I'd like us to be able to do so, I believe that to be premature. We
need to see where/if there are regressions first, and see about fixing
them.

/Bruce

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [RFC v2] eal: provide option to use compiler memcpy instead of RTE
  2024-05-28 14:59     ` Stephen Hemminger
  2024-05-28 15:09       ` Bruce Richardson
@ 2024-05-28 16:03       ` Mattias Rönnblom
  2024-05-29 21:55         ` Stephen Hemminger
  1 sibling, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-05-28 16:03 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Mattias Rönnblom, dev, Morten Brørup

On 2024-05-28 16:59, Stephen Hemminger wrote:
> On Tue, 28 May 2024 10:19:15 +0200
> Mattias Rönnblom <hofors@lysator.liu.se> wrote:
> 
>>>    
>>
>> I've tested this patch some with DSW micro benchmarks, and the result is
>> a 2.5% reduction of the DSW+testapp overhead with cc/libc memcpy. GCC 11.4.
>>
>> We've also run characteristic test suite of a large, real world app.
>> Here, we saw no effect. GCC 10.5.
>>
>> x86_64 in both cases (Skylake and Raptor Lake).
>>
>> Last time we did the same, there were a noticeable performance
>> degradation in both the above cases.
>>
>> This is not a lot of data points, but I think it we should consider
>> making the custom RTE memcpy() implementations optional in the next
>> release, and if no-one complains, remove the implementations in the next
>> release.
> 
> Lets go farther.
> 
> 1. Announce that rte_memcpy will be marked deprecated in 24.11 release
> 
> 2. In 24.11 do a global replace of rte_memcpy on the tree.
>     And mark rte_memcpy as deprecated.
> 
> 3. In 25.11 it can go away.

If/when rte_memcpy.h is just a tiny memcpy() wrapper, the maintenance 
burden is pretty much eliminated.

Keeping it around will allow for older applications to compile against 
newer DPDK version.

You can always discourage its use in the API documentation.

Also, hopefully, some day, we will have a non-temporal memcpy(), and 
those functions need a home.

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [RFC v2] eal: provide option to use compiler memcpy instead of RTE
  2024-05-28  9:07         ` Morten Brørup
@ 2024-05-28 16:17           ` Mattias Rönnblom
  0 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-05-28 16:17 UTC (permalink / raw)
  To: Morten Brørup, Bruce Richardson
  Cc: Mattias Rönnblom, dev, Stephen Hemminger

On 2024-05-28 11:07, Morten Brørup wrote:
>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
>> Sent: Tuesday, 28 May 2024 11.00
>>
>> On 2024-05-28 10:27, Bruce Richardson wrote:
>>> On Tue, May 28, 2024 at 10:19:15AM +0200, Mattias Rönnblom wrote:
>>>> On 2024-05-28 09:43, Mattias Rönnblom wrote:
>>>>> Provide build option to have functions in <rte_memcpy.h> delegate to
>>>>> the standard compiler/libc memcpy(), instead of using the various
>>>>> traditional, handcrafted, per-architecture rte_memcpy()
>>>>> implementations.
>>>>>
>>>>> A new meson build option 'use_cc_memcpy' is added. The default is
>>>>> true. It's not obvious what should be the default, but compiler
>>>>> memcpy() is enabled by default in this RFC so any tests run with this
>>>>> patch use the new approach.
>>>>>
>>>>> One purpose of this RFC is to make it easy to evaluate the costs and
>>>>> benefits of a switch.
>>>>>
>>>>
>>>> I've tested this patch some with DSW micro benchmarks, and the result is a
>>>> 2.5% reduction of the DSW+testapp overhead with cc/libc memcpy. GCC 11.4.
>>>>
>>>> We've also run characteristic test suite of a large, real world app. Here,
>>>> we saw no effect. GCC 10.5.
>>>>
>>>> x86_64 in both cases (Skylake and Raptor Lake).
>>>>
>>>> Last time we did the same, there were a noticeable performance degradation
>>>> in both the above cases.
> 
> Mattias, which compiler was that?
> 

GCC 9, I think.

Not only the compiler changed between those two test runs.

It would be interesting to see some ARM data points as well.

> As previously mentioned in another thread, I'm worried about memcpy performance with older compilers.
> DPDK officially supports GCC 4.9 and clang 3.4 [1].
> I don't think degrading performance when using supported compilers is considered acceptable.
> 
> Alternatively, we could change the DPDK compiler policy from "supported" to "works with (but might not perform optimally)".
> 

GCC 4.9 is ten years old.

If you are using an old compiler, odds are you don't really care too 
much about squeezing out max performance, considering how much better 
code generation is in newer compilers.

That said, we obviously don't want to cause large performance 
regressions for no good reason, even for old compilers.

> [1]: https://doc.dpdk.org/guides-21.11/linux_gsg/sys_reqs.html#compilation-of-the-dpdk
> 
>>>>
>>>> This is not a lot of data points, but I think it we should consider making
>>>> the custom RTE memcpy() implementations optional in the next release, and
>> if
>>>> no-one complains, remove the implementations in the next release.
>>>>
>>>> (Whether or not [or how long] to keep the wrapper API is another question.)
>>>>
>>>> <snip>
>>>
>>> The other instance I've heard mention of in the past is virtio/vhost, which
>>> used to have a speedup from the custom memcpy.
>>>
>>> My own thinking on these cases, is that for targetted settings like these,
>>> we should look to have local memcpy functions written - taking account of
>>> the specifics of each usecase. For virtio/vhost for example, we can have
>>> assumptions around host buffer alignment, and we also can be pretty
>>> confident we are copying to another CPU. For DSW, or other eventdev cases,
>>> we would only be looking at copies of multiples of 16, with guaranteed
>>> 8-byte alignment on both source and destination. Writing efficient copy fns
>>
>> In such cases, you should first try to tell the compiler that it's safe
>> to assume that the pointers have a certain alignment.
>>
>> void copy256(void *dst, const void *src)
>> {
>>       memcpy(dst, src, 256);
>> }
>>
>> void copy256_a(void *dst, const void *src)
>> {
>>       void *dst_a = __builtin_assume_aligned(dst, 32);
>>       const void *src_a = __builtin_assume_aligned(src, 32);
>>       memcpy(dst_a, src_a, 256);
>> }
>>
>> The first will generate loads/stores without alignment restrictions,
>> while the latter will use things like vmovdqa or vmovaps.
>>
>> (I doubt there's much of a performance difference though, if any at all.)
> 
> Interesting.
> 
>>
>>> for specific scenarios can be faster and more effective than trying to
>>> write a general, optimized in all cases, memcpy. It also discourages the
>>> use of non-libc memcpy except where really necessary.
> 
> Good idea, Bruce.
> I have previously worked on an optimized memcpy, where information about alignment, multiples, non-temporal source/destination, etc. is passed as flags to the function [2]. But it turned into too much work, so I never finished it.
> 
> If we start with local memcpy functions optimized for each specific use case, we still have the option of consolidating them into a common rte_memcpy function later. It will also reveal which flags/features such a common function needs to support.
> 
> [2]: https://inbox.dpdk.org/dev/20221010064600.16495-1-mb@smartsharesystems.com/
> 
>>>
>>> Naturally, if we find there are a lot of cases where use of libc memcpy
>>> slows us down, we will want to keep a general rte_memcpy. However, I'd hope
>>> the slowdown cases are very few.
>>>
>>> /Bruce

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [RFC v2] eal: provide option to use compiler memcpy instead of RTE
  2024-05-28 16:03       ` Mattias Rönnblom
@ 2024-05-29 21:55         ` Stephen Hemminger
  0 siblings, 0 replies; 128+ messages in thread
From: Stephen Hemminger @ 2024-05-29 21:55 UTC (permalink / raw)
  To: Mattias Rönnblom; +Cc: Mattias Rönnblom, dev, Morten Brørup

On Tue, 28 May 2024 18:03:27 +0200
Mattias Rönnblom <hofors@lysator.liu.se> wrote:

> > Lets go farther.
> > 
> > 1. Announce that rte_memcpy will be marked deprecated in 24.11 release
> > 
> > 2. In 24.11 do a global replace of rte_memcpy on the tree.
> >     And mark rte_memcpy as deprecated.
> > 
> > 3. In 25.11 it can go away.  
> 
> If/when rte_memcpy.h is just a tiny memcpy() wrapper, the maintenance 
> burden is pretty much eliminated.
> 
> Keeping it around will allow for older applications to compile against 
> newer DPDK version.
> 
> You can always discourage its use in the API documentation.
> 
> Also, hopefully, some day, we will have a non-temporal memcpy(), and 
> those functions needs a home.

I was thinking a wrapper but have it marked __rte_deprecated so that
warnings result and people are motivated to fix.

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [RFC] eal: provide option to use compiler memcpy instead of RTE
  2024-05-27 11:11 [RFC] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
  2024-05-28  7:43 ` [RFC v2] " Mattias Rönnblom
@ 2024-05-29 21:56 ` Stephen Hemminger
  2024-06-02 11:30   ` Mattias Rönnblom
  1 sibling, 1 reply; 128+ messages in thread
From: Stephen Hemminger @ 2024-05-29 21:56 UTC (permalink / raw)
  To: Mattias Rönnblom; +Cc: dev, hofors, Morten Brørup

On Mon, 27 May 2024 13:11:51 +0200
Mattias Rönnblom <mattias.ronnblom@ericsson.com> wrote:

> #ifdef RTE_USE_CC_MEMCPY
> +static inline void
> +rte_mov16(uint8_t *dst, const uint8_t *src)
> +{
> +	memcpy(dst, src, 16);
> +}
> +
> +static inline void
> +rte_mov32(uint8_t *dst, const uint8_t *src)
> +{
> +	memcpy(dst, src, 32);
> +}
> +
> +static inline void
> +rte_mov48(uint8_t *dst, const uint8_t *src)
> +{
> +	memcpy(dst, src, 48);
> +}
> +
> +static inline void
> +rte_mov64(uint8_t *dst, const uint8_t *src)
> +{
> +	memcpy(dst, src, 64);
> +}
> +
> +static inline void
> +rte_mov128(uint8_t *dst, const uint8_t *src)
> +{
> +	memcpy(dst, src, 128);
> +}
> +
> +static inline void
> +rte_mov256(uint8_t *dst, const uint8_t *src)
> +{
> +	memcpy(dst, src, 256);
> +}
> +
> +static inline void *
> +rte_memcpy(void *dst, const void *src, size_t n)
> +{
> +	return memcpy(dst, src, n);
> +}
> +#endif /* RTE_USE_CC_MEMCPY */
> +
> +#ifdef __cplusplus
> +}
> +#endif

You may need to make these macros to fully engage the checking
options of GCC, fortify, coverity etc. Not sure if all the tools
are smart enough to see through an inline.

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [RFC v2] eal: provide option to use compiler memcpy instead of RTE
  2024-05-28 15:09       ` Bruce Richardson
@ 2024-05-31  5:19         ` Mattias Rönnblom
  2024-05-31 16:50           ` Stephen Hemminger
  0 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-05-31  5:19 UTC (permalink / raw)
  To: Bruce Richardson, Stephen Hemminger
  Cc: Mattias Rönnblom, dev, Morten Brørup

On 2024-05-28 17:09, Bruce Richardson wrote:
> On Tue, May 28, 2024 at 07:59:36AM -0700, Stephen Hemminger wrote:
>> On Tue, 28 May 2024 10:19:15 +0200
>> Mattias Rönnblom <hofors@lysator.liu.se> wrote:
>>
>>>>    
>>>
>>> I've tested this patch some with DSW micro benchmarks, and the result is
>>> a 2.5% reduction of the DSW+testapp overhead with cc/libc memcpy. GCC 11.4.
>>>
>>> We've also run characteristic test suite of a large, real world app.
>>> Here, we saw no effect. GCC 10.5.
>>>
>>> x86_64 in both cases (Skylake and Raptor Lake).
>>>
>>> Last time we did the same, there were a noticeable performance
>>> degradation in both the above cases.
>>>
>>> This is not a lot of data points, but I think it we should consider
>>> making the custom RTE memcpy() implementations optional in the next
>>> release, and if no-one complains, remove the implementations in the next
>>> release.
>>
>> Lets go farther.
>>
>> 1. Announce that rte_memcpy will be marked deprecated in 24.11 release
>>
>> 2. In 24.11 do a global replace of rte_memcpy on the tree.
>>     And mark rte_memcpy as deprecated.
>>
>> 3. In 25.11 it can go away.
> 
> While I'd like us to be able to do so, I believe that to be premature. We
> need to see where/if there are regressions first, and see about fixing
> them.
> 
> /Bruce

Should I turn this RFC into a PATCH?

Is use_cc_memcpy a good name for the configuration parameter?


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [RFC v2] eal: provide option to use compiler memcpy instead of RTE
  2024-05-31  5:19         ` Mattias Rönnblom
@ 2024-05-31 16:50           ` Stephen Hemminger
  2024-06-02 11:33             ` Mattias Rönnblom
  0 siblings, 1 reply; 128+ messages in thread
From: Stephen Hemminger @ 2024-05-31 16:50 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: Bruce Richardson, Mattias Rönnblom, dev, Morten Brørup

On Fri, 31 May 2024 07:19:41 +0200
Mattias Rönnblom <hofors@lysator.liu.se> wrote:

> On 2024-05-28 17:09, Bruce Richardson wrote:
> > On Tue, May 28, 2024 at 07:59:36AM -0700, Stephen Hemminger wrote:  
> >> On Tue, 28 May 2024 10:19:15 +0200
> >> Mattias Rönnblom <hofors@lysator.liu.se> wrote:
> >>  
> >>>>      
> >>>
> >>> I've tested this patch some with DSW micro benchmarks, and the result is
> >>> a 2.5% reduction of the DSW+testapp overhead with cc/libc memcpy. GCC 11.4.
> >>>
> >>> We've also run characteristic test suite of a large, real world app.
> >>> Here, we saw no effect. GCC 10.5.
> >>>
> >>> x86_64 in both cases (Skylake and Raptor Lake).
> >>>
> >>> Last time we did the same, there were a noticeable performance
> >>> degradation in both the above cases.
> >>>
> >>> This is not a lot of data points, but I think it we should consider
> >>> making the custom RTE memcpy() implementations optional in the next
> >>> release, and if no-one complains, remove the implementations in the next
> >>> release.  
> >>
> >> Lets go farther.
> >>
> >> 1. Announce that rte_memcpy will be marked deprecated in 24.11 release
> >>
> >> 2. In 24.11 do a global replace of rte_memcpy on the tree.
> >>     And mark rte_memcpy as deprecated.
> >>
> >> 3. In 25.11 it can go away.  
> > 
> > While I'd like us to be able to do so, I believe that to be premature. We
> > need to see where/if there are regressions first, and see about fixing
> > them.
> > 
> > /Bruce  
> 
> Should I turn this RFC into a PATCH?
> 
> Is use_cc_memcpy a good name for the configuration parameter?
> 

I did a slightly more direct test and found a couple of things:
   1. The ENA driver is redefining memcpy as rte_memcpy; this should be removed and should have
      been blocked during code review.
   2. A couple of drivers are implicitly expecting SIMD vector routines to be available.
      This works because rte_memcpy.h includes rte_vect.h.  The fix is to have these
      places include rte_vect.h


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [RFC] eal: provide option to use compiler memcpy instead of RTE
  2024-05-29 21:56 ` [RFC] " Stephen Hemminger
@ 2024-06-02 11:30   ` Mattias Rönnblom
  0 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-02 11:30 UTC (permalink / raw)
  To: Stephen Hemminger, Mattias Rönnblom; +Cc: dev, Morten Brørup

On 2024-05-29 23:56, Stephen Hemminger wrote:
> On Mon, 27 May 2024 13:11:51 +0200
> Mattias Rönnblom <mattias.ronnblom@ericsson.com> wrote:
> 
>> #ifdef RTE_USE_CC_MEMCPY
>> +static inline void
>> +rte_mov16(uint8_t *dst, const uint8_t *src)
>> +{
>> +	memcpy(dst, src, 16);
>> +}
>> +
>> +static inline void
>> +rte_mov32(uint8_t *dst, const uint8_t *src)
>> +{
>> +	memcpy(dst, src, 32);
>> +}
>> +
>> +static inline void
>> +rte_mov48(uint8_t *dst, const uint8_t *src)
>> +{
>> +	memcpy(dst, src, 48);
>> +}
>> +
>> +static inline void
>> +rte_mov64(uint8_t *dst, const uint8_t *src)
>> +{
>> +	memcpy(dst, src, 64);
>> +}
>> +
>> +static inline void
>> +rte_mov128(uint8_t *dst, const uint8_t *src)
>> +{
>> +	memcpy(dst, src, 128);
>> +}
>> +
>> +static inline void
>> +rte_mov256(uint8_t *dst, const uint8_t *src)
>> +{
>> +	memcpy(dst, src, 256);
>> +}
>> +
>> +static inline void *
>> +rte_memcpy(void *dst, const void *src, size_t n)
>> +{
>> +	return memcpy(dst, src, n);
>> +}
>> +#endif /* RTE_USE_CC_MEMCPY */
>> +
>> +#ifdef __cplusplus
>> +}
>> +#endif
> 
> You may need to make these macros to fully engage the checking
> options of GCC, fortify, coverity etc. Not sure if all the tools
> are smart enough to see through an inline.

At least GCC is, provided you compile with optimization enabled. That 
goes for both overlapping memcpy() warning and static buffer overruns.

clang doesn't warn about overlapping memcpy() and fails to follow 
function calls, even with optimization enabled, seemingly. Same for ICX.

Static analysis tools that can't beat the compiler seem like they would 
be of limited use.

With macros you'll lose the type checking, which in the rte_memcpy() 
case doesn't matter, but in the rte_mov*() case it does.

I'm sure it would break something to change this to macros, although it 
would be good if clang would also generate a warning (for application 
code using <rte_memcpy.h>).

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [RFC v2] eal: provide option to use compiler memcpy instead of RTE
  2024-05-31 16:50           ` Stephen Hemminger
@ 2024-06-02 11:33             ` Mattias Rönnblom
  0 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-02 11:33 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Bruce Richardson, Mattias Rönnblom, dev, Morten Brørup

On 2024-05-31 18:50, Stephen Hemminger wrote:
> On Fri, 31 May 2024 07:19:41 +0200
> Mattias Rönnblom <hofors@lysator.liu.se> wrote:
> 
>> On 2024-05-28 17:09, Bruce Richardson wrote:
>>> On Tue, May 28, 2024 at 07:59:36AM -0700, Stephen Hemminger wrote:
>>>> On Tue, 28 May 2024 10:19:15 +0200
>>>> Mattias Rönnblom <hofors@lysator.liu.se> wrote:
>>>>   
>>>>>>       
>>>>>
>>>>> I've tested this patch some with DSW micro benchmarks, and the result is
>>>>> a 2.5% reduction of the DSW+testapp overhead with cc/libc memcpy. GCC 11.4.
>>>>>
>>>>> We've also run characteristic test suite of a large, real world app.
>>>>> Here, we saw no effect. GCC 10.5.
>>>>>
>>>>> x86_64 in both cases (Skylake and Raptor Lake).
>>>>>
>>>>> Last time we did the same, there were a noticeable performance
>>>>> degradation in both the above cases.
>>>>>
>>>>> This is not a lot of data points, but I think it we should consider
>>>>> making the custom RTE memcpy() implementations optional in the next
>>>>> release, and if no-one complains, remove the implementations in the next
>>>>> release.
>>>>
>>>> Lets go farther.
>>>>
>>>> 1. Announce that rte_memcpy will be marked deprecated in 24.11 release
>>>>
>>>> 2. In 24.11 do a global replace of rte_memcpy on the tree.
>>>>      And mark rte_memcpy as deprecated.
>>>>
>>>> 3. In 25.11 it can go away.
>>>
>>> While I'd like us to be able to do so, I believe that to be premature. We
>>> need to see where/if there are regressions first, and see about fixing
>>> them.
>>>
>>> /Bruce
>>
>> Should I turn this RFC into a PATCH?
>>
>> Is use_cc_memcpy a good name for the configuration parameter?
>>
> 
> I did a slightly more direct test and found a couple of things:
>     1. Ena driver is redefining memcpy as rte_memcpy, this should be removed and should have
>        been blocked during code review.

Wouldn't that hack continue to work? Provided rte_memcpy() is a 
function, and the <rte_memcpy.h> header is included prior to the memcpy 
redefinition.

>     2. A couple of drivers are implicitly expecting simd vector routines to be available.
>        This works because rte_memcpy.h includes rte_vect.h.  The fix is to have these
>        places include rte_vect.h
> 

I noticed this as well. I'll add patches for those drivers.

^ permalink raw reply	[flat|nested] 128+ messages in thread

* [RFC v3 0/5] Optionally have rte_memcpy delegate to compiler memcpy
  2024-05-28  7:43 ` [RFC v2] " Mattias Rönnblom
  2024-05-28  8:19   ` Mattias Rönnblom
  2024-05-28  8:20   ` Bruce Richardson
@ 2024-06-02 12:39   ` Mattias Rönnblom
  2024-06-02 12:39     ` [RFC v3 1/5] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
                       ` (4 more replies)
  2 siblings, 5 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-02 12:39 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Mattias Rönnblom

This patch set makes DPDK library, driver, and application code use the
compiler/libc memcpy() by default when functions in <rte_memcpy.h> are
invoked.

The various custom DPDK rte_memcpy() implementations may be retained
by means of a build-time option.

This patch set only makes a difference on x86, PPC and ARM. Loongarch
and RISCV already used compiler/libc memcpy().

Mattias Rönnblom (5):
  event/dlb2: include headers for vector and memory copy APIs
  net/octeon_ep: properly include vector API header file
  distributor: properly include vector API header file
  fib: properly include vector API header file
  eal: provide option to use compiler memcpy instead of RTE

 config/meson.build                     |  1 +
 drivers/event/dlb2/dlb2.c              |  2 +
 drivers/net/octeon_ep/otx_ep_ethdev.c  |  2 +
 lib/distributor/rte_distributor.c      |  1 +
 lib/eal/arm/include/rte_memcpy.h       | 10 +++++
 lib/eal/include/generic/rte_memcpy.h   | 61 +++++++++++++++++++++++---
 lib/eal/loongarch/include/rte_memcpy.h | 53 ++--------------------
 lib/eal/ppc/include/rte_memcpy.h       | 10 +++++
 lib/eal/riscv/include/rte_memcpy.h     | 53 ++--------------------
 lib/eal/x86/include/meson.build        |  1 +
 lib/eal/x86/include/rte_memcpy.h       | 11 ++++-
 lib/fib/trie.c                         |  1 +
 meson_options.txt                      |  2 +
 13 files changed, 102 insertions(+), 106 deletions(-)

-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [RFC v3 1/5] event/dlb2: include headers for vector and memory copy APIs
  2024-06-02 12:39   ` [RFC v3 0/5] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
@ 2024-06-02 12:39     ` Mattias Rönnblom
  2024-06-05  6:49       ` [PATCH 0/5] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2024-06-02 12:39     ` [RFC v3 2/5] net/octeon_ep: properly include vector API header file Mattias Rönnblom
                       ` (3 subsequent siblings)
  4 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-02 12:39 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Mattias Rönnblom

The DLB2 PMD depended on <rte_vect.h> being included as a side-effect
of <rte_memcpy.h> being included.

In addition, DLB2 used rte_memcpy() but did not include <rte_memcpy.h>,
but rather depended on other include files to do so.

This patch addresses both of those issues.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 drivers/event/dlb2/dlb2.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 0b91f03956..19f90b8f8d 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -25,11 +25,13 @@
 #include <rte_kvargs.h>
 #include <rte_log.h>
 #include <rte_malloc.h>
+#include <rte_memcpy.h>
 #include <rte_mbuf.h>
 #include <rte_power_intrinsics.h>
 #include <rte_prefetch.h>
 #include <rte_ring.h>
 #include <rte_string_fns.h>
+#include <rte_vect.h>
 
 #include "dlb2_priv.h"
 #include "dlb2_iface.h"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [RFC v3 2/5] net/octeon_ep: properly include vector API header file
  2024-06-02 12:39   ` [RFC v3 0/5] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2024-06-02 12:39     ` [RFC v3 1/5] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
@ 2024-06-02 12:39     ` Mattias Rönnblom
  2024-06-02 12:39     ` [RFC v3 3/5] distributor: " Mattias Rönnblom
                       ` (2 subsequent siblings)
  4 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-02 12:39 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Mattias Rönnblom

The octeon_ep driver relied on <rte_vect.h>, but failed to provide a
direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 drivers/net/octeon_ep/otx_ep_ethdev.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c
index 46211361a0..b069216629 100644
--- a/drivers/net/octeon_ep/otx_ep_ethdev.c
+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c
@@ -5,6 +5,8 @@
 #include <inttypes.h>
 #include <ethdev_pci.h>
 
+#include <rte_vect.h>
+
 #include "otx_ep_common.h"
 #include "otx_ep_vf.h"
 #include "otx2_ep_vf.h"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [RFC v3 3/5] distributor: properly include vector API header file
  2024-06-02 12:39   ` [RFC v3 0/5] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2024-06-02 12:39     ` [RFC v3 1/5] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
  2024-06-02 12:39     ` [RFC v3 2/5] net/octeon_ep: properly include vector API header file Mattias Rönnblom
@ 2024-06-02 12:39     ` Mattias Rönnblom
  2024-06-02 12:39     ` [RFC v3 4/5] fib: " Mattias Rönnblom
  2024-06-02 12:39     ` [RFC v3 5/5] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
  4 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-02 12:39 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Mattias Rönnblom

The distributor library relied on <rte_vect.h>, but failed to provide
a direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 lib/distributor/rte_distributor.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/distributor/rte_distributor.c b/lib/distributor/rte_distributor.c
index e58727cdc2..1389efc03f 100644
--- a/lib/distributor/rte_distributor.c
+++ b/lib/distributor/rte_distributor.c
@@ -15,6 +15,7 @@
 #include <rte_eal_memconfig.h>
 #include <rte_pause.h>
 #include <rte_tailq.h>
+#include <rte_vect.h>
 
 #include "rte_distributor.h"
 #include "rte_distributor_single.h"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [RFC v3 4/5] fib: properly include vector API header file
  2024-06-02 12:39   ` [RFC v3 0/5] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                       ` (2 preceding siblings ...)
  2024-06-02 12:39     ` [RFC v3 3/5] distributor: " Mattias Rönnblom
@ 2024-06-02 12:39     ` Mattias Rönnblom
  2024-06-02 12:39     ` [RFC v3 5/5] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
  4 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-02 12:39 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Mattias Rönnblom

The trie implementation of the fib library relied on <rte_vect.h>, but
failed to provide a direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 lib/fib/trie.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/fib/trie.c b/lib/fib/trie.c
index 09470e7287..74db8863df 100644
--- a/lib/fib/trie.c
+++ b/lib/fib/trie.c
@@ -9,6 +9,7 @@
 #include <rte_debug.h>
 #include <rte_malloc.h>
 #include <rte_errno.h>
+#include <rte_vect.h>
 
 #include <rte_rib6.h>
 #include <rte_fib6.h>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [RFC v3 5/5] eal: provide option to use compiler memcpy instead of RTE
  2024-06-02 12:39   ` [RFC v3 0/5] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                       ` (3 preceding siblings ...)
  2024-06-02 12:39     ` [RFC v3 4/5] fib: " Mattias Rönnblom
@ 2024-06-02 12:39     ` Mattias Rönnblom
  2024-06-02 20:58       ` Morten Brørup
  4 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-02 12:39 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Mattias Rönnblom

Provide build option to have functions in <rte_memcpy.h> delegate to
the standard compiler/libc memcpy(), instead of using the various
custom DPDK, handcrafted, per-architecture rte_memcpy()
implementations.

A new meson build option 'use_cc_memcpy' is added. By default,
the compiler/libc memcpy() is used.

The performance benefits of the custom DPDK rte_memcpy()
implementations have been diminishing with every compiler release, and
with current toolchains the usage of a custom memcpy() implementation
may even be a liability.

An additional benefit of this change is that compilers and static
analysis tools have an easier time detecting incorrect usage of
memcpy() (e.g., buffer overruns, or overlapping source and destination
buffers).

This patch makes DPDK and DPDK applications using <rte_memcpy.h> use
compiler/libc memcpy() by default, but leaves an option to stay on the
custom DPDK implementations, should that prove beneficial for certain
applications or architectures.

RFC v3:
 o Fix missing #endif on loongarch.
 o PPC and RISCV now implemented, meaning all architectures are supported.
 o Unnecessary <rte_vect.h> include is removed from <rte_memcpy.h>.

RFC v2:
 * Fix bug where rte_memcpy.h was not installed on x86.
 * Made attempt to make Loongarch compile.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 config/meson.build                     |  1 +
 lib/eal/arm/include/rte_memcpy.h       | 10 +++++
 lib/eal/include/generic/rte_memcpy.h   | 61 +++++++++++++++++++++++---
 lib/eal/loongarch/include/rte_memcpy.h | 53 ++--------------------
 lib/eal/ppc/include/rte_memcpy.h       | 10 +++++
 lib/eal/riscv/include/rte_memcpy.h     | 53 ++--------------------
 lib/eal/x86/include/meson.build        |  1 +
 lib/eal/x86/include/rte_memcpy.h       | 11 ++++-
 meson_options.txt                      |  2 +
 9 files changed, 96 insertions(+), 106 deletions(-)

diff --git a/config/meson.build b/config/meson.build
index 8c8b019c25..456056628e 100644
--- a/config/meson.build
+++ b/config/meson.build
@@ -353,6 +353,7 @@ endforeach
 # set other values pulled from the build options
 dpdk_conf.set('RTE_MAX_ETHPORTS', get_option('max_ethports'))
 dpdk_conf.set('RTE_LIBEAL_USE_HPET', get_option('use_hpet'))
+dpdk_conf.set('RTE_USE_CC_MEMCPY', get_option('use_cc_memcpy'))
 dpdk_conf.set('RTE_ENABLE_STDATOMIC', get_option('enable_stdatomic'))
 dpdk_conf.set('RTE_ENABLE_TRACE_FP', get_option('enable_trace_fp'))
 dpdk_conf.set('RTE_PKTMBUF_HEADROOM', get_option('pkt_mbuf_headroom'))
diff --git a/lib/eal/arm/include/rte_memcpy.h b/lib/eal/arm/include/rte_memcpy.h
index 47dea9a8cc..e8aff722df 100644
--- a/lib/eal/arm/include/rte_memcpy.h
+++ b/lib/eal/arm/include/rte_memcpy.h
@@ -5,10 +5,20 @@
 #ifndef _RTE_MEMCPY_ARM_H_
 #define _RTE_MEMCPY_ARM_H_
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #ifdef RTE_ARCH_64
 #include <rte_memcpy_64.h>
 #else
 #include <rte_memcpy_32.h>
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_ARM_H_ */
diff --git a/lib/eal/include/generic/rte_memcpy.h b/lib/eal/include/generic/rte_memcpy.h
index e7f0f8eaa9..cae06117fb 100644
--- a/lib/eal/include/generic/rte_memcpy.h
+++ b/lib/eal/include/generic/rte_memcpy.h
@@ -5,12 +5,19 @@
 #ifndef _RTE_MEMCPY_H_
 #define _RTE_MEMCPY_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /**
  * @file
  *
  * Functions for vectorised implementation of memcpy().
  */
 
+#include <stdint.h>
+#include <string.h>
+
 /**
  * Copy 16 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -35,8 +42,6 @@ rte_mov16(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src);
 
-#ifdef __DOXYGEN__
-
 /**
  * Copy 48 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -49,8 +54,6 @@ rte_mov32(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov48(uint8_t *dst, const uint8_t *src);
 
-#endif /* __DOXYGEN__ */
-
 /**
  * Copy 64 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -87,8 +90,6 @@ rte_mov128(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src);
 
-#ifdef __DOXYGEN__
-
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
@@ -111,6 +112,52 @@ rte_mov256(uint8_t *dst, const uint8_t *src);
 static void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
-#endif /* __DOXYGEN__ */
+#ifdef RTE_USE_CC_MEMCPY
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 16);
+}
+
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 32);
+}
+
+static inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 48);
+}
+
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 64);
+}
+
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 128);
+}
+
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 256);
+}
+
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+	return memcpy(dst, src, n);
+}
+#endif /* RTE_USE_CC_MEMCPY */
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif /* _RTE_MEMCPY_H_ */
diff --git a/lib/eal/loongarch/include/rte_memcpy.h b/lib/eal/loongarch/include/rte_memcpy.h
index 22578d40f4..344b4416b5 100644
--- a/lib/eal/loongarch/include/rte_memcpy.h
+++ b/lib/eal/loongarch/include/rte_memcpy.h
@@ -5,57 +5,12 @@
 #ifndef RTE_MEMCPY_LOONGARCH_H
 #define RTE_MEMCPY_LOONGARCH_H
 
-#include <stdint.h>
-#include <string.h>
+#include <rte_config.h>
 
-#include "rte_common.h"
-
-#ifdef __cplusplus
-extern "C" {
+#ifndef RTE_USE_CC_MEMCPY
+#define RTE_USE_CC_MEMCPY
 #endif
 
-#include "generic/rte_memcpy.h"
-
-static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 16);
-}
-
-static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 32);
-}
-
-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 48);
-}
-
-static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 64);
-}
-
-static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 128);
-}
-
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 256);
-}
-
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
-
-#ifdef __cplusplus
-}
-#endif
+#include <generic/rte_memcpy.h>
 
 #endif /* RTE_MEMCPY_LOONGARCH_H */
diff --git a/lib/eal/ppc/include/rte_memcpy.h b/lib/eal/ppc/include/rte_memcpy.h
index 6f388c0234..645fd83986 100644
--- a/lib/eal/ppc/include/rte_memcpy.h
+++ b/lib/eal/ppc/include/rte_memcpy.h
@@ -6,6 +6,14 @@
 #ifndef _RTE_MEMCPY_PPC_64_H_
 #define _RTE_MEMCPY_PPC_64_H_
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #include <stdint.h>
 #include <string.h>
 
@@ -215,4 +223,6 @@ rte_memcpy_func(void *dst, const void *src, size_t n)
 }
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_PPC_64_H_ */
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index e34f19396e..4acdc4af5f 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -7,57 +7,12 @@
 #ifndef RTE_MEMCPY_RISCV_H
 #define RTE_MEMCPY_RISCV_H
 
-#include <stdint.h>
-#include <string.h>
+#include <rte_config.h>
 
-#include "rte_common.h"
-
-#ifdef __cplusplus
-extern "C" {
+#ifndef RTE_USE_CC_MEMCPY
+#define RTE_USE_CC_MEMCPY
 #endif
 
-#include "generic/rte_memcpy.h"
-
-static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 16);
-}
-
-static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 32);
-}
-
-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 48);
-}
-
-static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 64);
-}
-
-static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 128);
-}
-
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 256);
-}
-
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
-
-#ifdef __cplusplus
-}
-#endif
+#include <generic/rte_memcpy.h>
 
 #endif /* RTE_MEMCPY_RISCV_H */
diff --git a/lib/eal/x86/include/meson.build b/lib/eal/x86/include/meson.build
index 52d2f8e969..09c2fe2485 100644
--- a/lib/eal/x86/include/meson.build
+++ b/lib/eal/x86/include/meson.build
@@ -16,6 +16,7 @@ arch_headers = files(
         'rte_spinlock.h',
         'rte_vect.h',
 )
+
 arch_indirect_headers = files(
         'rte_atomic_32.h',
         'rte_atomic_64.h',
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 72a92290e0..c5ba74d2ed 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -11,12 +11,19 @@
  * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
  */
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #include <stdio.h>
 #include <stdint.h>
 #include <string.h>
 #include <rte_vect.h>
 #include <rte_common.h>
-#include <rte_config.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -878,4 +885,6 @@ rte_memcpy(void *dst, const void *src, size_t n)
 }
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_X86_64_H_ */
diff --git a/meson_options.txt b/meson_options.txt
index e49b2fc089..263b0e7882 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -60,3 +60,5 @@ option('tests', type: 'boolean', value: true, description:
        'build unit tests')
 option('use_hpet', type: 'boolean', value: false, description:
        'use HPET timer in EAL')
+option('use_cc_memcpy', type: 'boolean', value: true, description:
+       'Have rte_memcpy() delegate to compiler/libc memcpy() instead of using custom implementation.')
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* RE: [RFC v3 5/5] eal: provide option to use compiler memcpy instead of RTE
  2024-06-02 12:39     ` [RFC v3 5/5] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
@ 2024-06-02 20:58       ` Morten Brørup
  2024-06-03 17:04         ` Mattias Rönnblom
  0 siblings, 1 reply; 128+ messages in thread
From: Morten Brørup @ 2024-06-02 20:58 UTC (permalink / raw)
  To: Mattias Rönnblom, dev
  Cc: hofors, Stephen Hemminger, Abdullah Sevincer, Pavan Nikhilesh,
	David Hunt, Vladimir Medvedkin

> From: Mattias Rönnblom [mailto:mattias.ronnblom@ericsson.com]
> Sent: Sunday, 2 June 2024 14.39
> 
> Provide build option to have functions in <rte_memcpy.h> delegate to
> the standard compiler/libc memcpy(), instead of using the various
> custom DPDK, handcrafted, per-architecture rte_memcpy()
> implementations.
> 
> A new meson build option 'use_cc_memcpy' is added. By default,
> the compiler/libc memcpy() is used.
> 
> The performance benefits of the custom DPDK rte_memcpy()
> implementations have been diminishing with every compiler release, and
> with current toolchains the usage of a custom memcpy() implementation
> may even be a liability.
> 
> An additional benefit of this change is that compilers and static
> analysis tools have an easier time detecting incorrect usage of
> memcpy() (e.g., buffer overruns, or overlapping source and destination
> buffers).
> 
> This patch makes DPDK and DPDK applications using <rte_memcpy.h> use
> compiler/libc memcpy() by default, but leaves an option to stay on the
> custom DPDK implementations, would that prove beneficial for certain
> applications or architectures.
> 
> RFC v3:
>  o Fix missing #endif on loongarch.
>  o PPC and RISCV now implemented, meaning all architectures are
> supported.
>  o Unnecessary <rte_vect.h> include is removed from <rte_memcpy.h>.
> 
> RFC v2:
>  * Fix bug where rte_memcpy.h was not installed on x86.
>  * Made attempt to make Loongarch compile.
> 
> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> ---

We should keep pushing DPDK forward and cleaning up old cruft along the way.

The memcpy discussion has convinced me that:
1. This change is a good idea, and
2. Mainstream compilers are sufficiently mature to do it now.

So, for the series,
Acked-by: Morten Brørup <mb@smartsharesystems.com>

>  static inline void
>  rte_mov32(uint8_t *dst, const uint8_t *src);

While at it, would it be somehow beneficial to change these from uint8_t* to void* or char* (keeping const where relevant)?


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [RFC v3 5/5] eal: provide option to use compiler memcpy instead of RTE
  2024-06-02 20:58       ` Morten Brørup
@ 2024-06-03 17:04         ` Mattias Rönnblom
  2024-06-03 17:08           ` Stephen Hemminger
  0 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-03 17:04 UTC (permalink / raw)
  To: Morten Brørup, Mattias Rönnblom, dev
  Cc: Stephen Hemminger, Abdullah Sevincer, Pavan Nikhilesh,
	David Hunt, Vladimir Medvedkin

On 2024-06-02 22:58, Morten Brørup wrote:
>> From: Mattias Rönnblom [mailto:mattias.ronnblom@ericsson.com]
>> Sent: Sunday, 2 June 2024 14.39
>>
>> Provide build option to have functions in <rte_memcpy.h> delegate to
>> the standard compiler/libc memcpy(), instead of using the various
>> custom DPDK, handcrafted, per-architecture rte_memcpy()
>> implementations.
>>
>> A new meson build option 'use_cc_memcpy' is added. By default,
>> the compiler/libc memcpy() is used.
>>
>> The performance benefits of the custom DPDK rte_memcpy()
>> implementations have been diminishing with every compiler release, and
>> with current toolchains the usage of a custom memcpy() implementation
>> may even be a liability.
>>
>> An additional benefit of this change is that compilers and static
>> analysis tools have an easier time detecting incorrect usage of
>> memcpy() (e.g., buffer overruns, or overlapping source and destination
>> buffers).
>>
>> This patch makes DPDK and DPDK applications using <rte_memcpy.h> use
>> compiler/libc memcpy() by default, but leaves an option to stay on the
>> custom DPDK implementations, would that prove beneficial for certain
>> applications or architectures.
>>
>> RFC v3:
>>   o Fix missing #endif on loongarch.
>>   o PPC and RISCV now implemented, meaning all architectures are
>> supported.
>>   o Unnecessary <rte_vect.h> include is removed from <rte_memcpy.h>.
>>
>> RFC v2:
>>   * Fix bug where rte_memcpy.h was not installed on x86.
>>   * Made attempt to make Loongarch compile.
>>
>> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
>> ---
> 
> We should keep pushing DPDK forward and cleaning up old cruft along the way.
> 
> The memcpy discussion has convinced me that:
> 1. This change is a good idea, and
> 2. Mainstream compilers are sufficiently mature to do it now.
> 
> So, for the series,
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> 
>>   static inline void
>>   rte_mov32(uint8_t *dst, const uint8_t *src);
> 
> While at it, would it be somehow beneficial to change these from uint8_t* to void* or char* (keeping const where relevant)?
> 

Something to consider before doing such a change would be if it may 
cause any strict aliasing issue for existing users.

If we should break the API, I think we are better off removing 
rte_mov*() altogether.

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [RFC v3 5/5] eal: provide option to use compiler memcpy instead of RTE
  2024-06-03 17:04         ` Mattias Rönnblom
@ 2024-06-03 17:08           ` Stephen Hemminger
  0 siblings, 0 replies; 128+ messages in thread
From: Stephen Hemminger @ 2024-06-03 17:08 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: Morten Brørup, Mattias Rönnblom, dev,
	Abdullah Sevincer, Pavan Nikhilesh, David Hunt,
	Vladimir Medvedkin

On Mon, 3 Jun 2024 19:04:49 +0200
Mattias Rönnblom <hofors@lysator.liu.se> wrote:

> >>   static inline void
> >>   rte_mov32(uint8_t *dst, const uint8_t *src);  
> > 
> > While at it, would it be somehow beneficial to change these from uint8_t* to void* or char* (keeping const where relevant)?
> >   
> 
> Something to consider before doing such a change would be if it may 
> cause any strict aliasing issue for existing users.
> 
> If we should break the API, I think we are better off removing 
> rte_mov*() altogether.

Why modify soon-to-be-dead code?
These should be marked as deprecated in 24.11

^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH 0/5] Optionally have rte_memcpy delegate to compiler memcpy
  2024-06-02 12:39     ` [RFC v3 1/5] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
@ 2024-06-05  6:49       ` Mattias Rönnblom
  2024-06-05  6:49         ` [PATCH 1/5] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
                           ` (5 more replies)
  0 siblings, 6 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-05  6:49 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Mattias Rönnblom

This patch set makes DPDK library, driver, and application code use the
compiler/libc memcpy() by default when functions in <rte_memcpy.h> are
invoked.

The various custom DPDK rte_memcpy() implementations may be retained
by means of a build-time option.

This patch set only makes a difference on x86, PPC and ARM. Loongarch
and RISCV already used compiler/libc memcpy().

Mattias Rönnblom (5):
  event/dlb2: include headers for vector and memory copy APIs
  net/octeon_ep: properly include vector API header file
  distributor: properly include vector API header file
  fib: properly include vector API header file
  eal: provide option to use compiler memcpy instead of RTE

 config/meson.build                     |  1 +
 doc/guides/rel_notes/release_24_07.rst | 21 +++++++++
 drivers/event/dlb2/dlb2.c              |  2 +
 drivers/net/octeon_ep/otx_ep_ethdev.c  |  2 +
 lib/distributor/rte_distributor.c      |  1 +
 lib/eal/arm/include/rte_memcpy.h       | 10 +++++
 lib/eal/include/generic/rte_memcpy.h   | 61 +++++++++++++++++++++++---
 lib/eal/loongarch/include/rte_memcpy.h | 53 ++--------------------
 lib/eal/ppc/include/rte_memcpy.h       | 10 +++++
 lib/eal/riscv/include/rte_memcpy.h     | 53 ++--------------------
 lib/eal/x86/include/meson.build        |  1 +
 lib/eal/x86/include/rte_memcpy.h       | 11 ++++-
 lib/fib/trie.c                         |  1 +
 meson_options.txt                      |  2 +
 14 files changed, 123 insertions(+), 106 deletions(-)

-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH 1/5] event/dlb2: include headers for vector and memory copy APIs
  2024-06-05  6:49       ` [PATCH 0/5] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
@ 2024-06-05  6:49         ` Mattias Rönnblom
  2024-06-05  6:49         ` [PATCH 2/5] net/octeon_ep: properly include vector API header file Mattias Rönnblom
                           ` (4 subsequent siblings)
  5 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-05  6:49 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Mattias Rönnblom

The DLB2 PMD depended on <rte_vect.h> being included as a side-effect
of <rte_memcpy.h> being included.

In addition, DLB2 used rte_memcpy() but did not include <rte_memcpy.h>,
but rather depended on other include files to do so.

This patch addresses both of those issues.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 drivers/event/dlb2/dlb2.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 0b91f03956..19f90b8f8d 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -25,11 +25,13 @@
 #include <rte_kvargs.h>
 #include <rte_log.h>
 #include <rte_malloc.h>
+#include <rte_memcpy.h>
 #include <rte_mbuf.h>
 #include <rte_power_intrinsics.h>
 #include <rte_prefetch.h>
 #include <rte_ring.h>
 #include <rte_string_fns.h>
+#include <rte_vect.h>
 
 #include "dlb2_priv.h"
 #include "dlb2_iface.h"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH 2/5] net/octeon_ep: properly include vector API header file
  2024-06-05  6:49       ` [PATCH 0/5] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2024-06-05  6:49         ` [PATCH 1/5] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
@ 2024-06-05  6:49         ` Mattias Rönnblom
  2024-06-05  6:49         ` [PATCH 3/5] distributor: " Mattias Rönnblom
                           ` (3 subsequent siblings)
  5 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-05  6:49 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Mattias Rönnblom

The octeon_ep driver relied on <rte_vect.h>, but failed to provide a
direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 drivers/net/octeon_ep/otx_ep_ethdev.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c
index 46211361a0..b069216629 100644
--- a/drivers/net/octeon_ep/otx_ep_ethdev.c
+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c
@@ -5,6 +5,8 @@
 #include <inttypes.h>
 #include <ethdev_pci.h>
 
+#include <rte_vect.h>
+
 #include "otx_ep_common.h"
 #include "otx_ep_vf.h"
 #include "otx2_ep_vf.h"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH 3/5] distributor: properly include vector API header file
  2024-06-05  6:49       ` [PATCH 0/5] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2024-06-05  6:49         ` [PATCH 1/5] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
  2024-06-05  6:49         ` [PATCH 2/5] net/octeon_ep: properly include vector API header file Mattias Rönnblom
@ 2024-06-05  6:49         ` Mattias Rönnblom
  2024-06-10 14:27           ` Tyler Retzlaff
  2024-06-05  6:49         ` [PATCH 4/5] fib: " Mattias Rönnblom
                           ` (2 subsequent siblings)
  5 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-05  6:49 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Mattias Rönnblom

The distributor library relied on <rte_vect.h>, but failed to provide
a direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 lib/distributor/rte_distributor.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/distributor/rte_distributor.c b/lib/distributor/rte_distributor.c
index e58727cdc2..1389efc03f 100644
--- a/lib/distributor/rte_distributor.c
+++ b/lib/distributor/rte_distributor.c
@@ -15,6 +15,7 @@
 #include <rte_eal_memconfig.h>
 #include <rte_pause.h>
 #include <rte_tailq.h>
+#include <rte_vect.h>
 
 #include "rte_distributor.h"
 #include "rte_distributor_single.h"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH 4/5] fib: properly include vector API header file
  2024-06-05  6:49       ` [PATCH 0/5] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                           ` (2 preceding siblings ...)
  2024-06-05  6:49         ` [PATCH 3/5] distributor: " Mattias Rönnblom
@ 2024-06-05  6:49         ` Mattias Rönnblom
  2024-06-10 14:28           ` Tyler Retzlaff
  2024-06-05  6:49         ` [PATCH 5/5] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
  2024-06-20  7:24         ` [PATCH v2 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  5 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-05  6:49 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Mattias Rönnblom

The trie implementation of the fib library relied on <rte_vect.h>, but
failed to provide a direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 lib/fib/trie.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/fib/trie.c b/lib/fib/trie.c
index 09470e7287..74db8863df 100644
--- a/lib/fib/trie.c
+++ b/lib/fib/trie.c
@@ -9,6 +9,7 @@
 #include <rte_debug.h>
 #include <rte_malloc.h>
 #include <rte_errno.h>
+#include <rte_vect.h>
 
 #include <rte_rib6.h>
 #include <rte_fib6.h>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH 5/5] eal: provide option to use compiler memcpy instead of RTE
  2024-06-05  6:49       ` [PATCH 0/5] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                           ` (3 preceding siblings ...)
  2024-06-05  6:49         ` [PATCH 4/5] fib: " Mattias Rönnblom
@ 2024-06-05  6:49         ` Mattias Rönnblom
  2024-06-20  7:24         ` [PATCH v2 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  5 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-05  6:49 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Mattias Rönnblom

Provide build option to have functions in <rte_memcpy.h> delegate to
the standard compiler/libc memcpy(), instead of using the various
custom DPDK, handcrafted, per-architecture rte_memcpy()
implementations.

A new meson build option 'use_cc_memcpy' is added. By default,
the compiler/libc memcpy() is used.

The performance benefits of the custom DPDK rte_memcpy()
implementations have been diminishing with every compiler release, and
with current toolchains the use of a custom memcpy() implementation
may even be a liability.

This patch leaves an option to stay on the custom DPDK implementations,
should that prove beneficial for certain applications or architectures.

An additional benefit of this change is that compilers and static
analysis tools have an easier time detecting incorrect usage of
rte_memcpy() (e.g., buffer overruns, or overlapping source and
destination buffers).

PATCH:
 o Add entry in release notes.
 o Update meson help text.

RFC v3:
 o Fix missing #endif on loongarch.
 o PPC and RISCV now implemented, meaning all architectures are supported.
 o Unnecessary <rte_vect.h> include is removed from <rte_memcpy.h>.

RFC v2:
 * Fix bug where rte_memcpy.h was not installed on x86.
 * Made attempt to make Loongarch compile.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
 config/meson.build                     |  1 +
 doc/guides/rel_notes/release_24_07.rst | 21 +++++++++
 lib/eal/arm/include/rte_memcpy.h       | 10 +++++
 lib/eal/include/generic/rte_memcpy.h   | 61 +++++++++++++++++++++++---
 lib/eal/loongarch/include/rte_memcpy.h | 53 ++--------------------
 lib/eal/ppc/include/rte_memcpy.h       | 10 +++++
 lib/eal/riscv/include/rte_memcpy.h     | 53 ++--------------------
 lib/eal/x86/include/meson.build        |  1 +
 lib/eal/x86/include/rte_memcpy.h       | 11 ++++-
 meson_options.txt                      |  2 +
 10 files changed, 117 insertions(+), 106 deletions(-)

diff --git a/config/meson.build b/config/meson.build
index 8c8b019c25..456056628e 100644
--- a/config/meson.build
+++ b/config/meson.build
@@ -353,6 +353,7 @@ endforeach
 # set other values pulled from the build options
 dpdk_conf.set('RTE_MAX_ETHPORTS', get_option('max_ethports'))
 dpdk_conf.set('RTE_LIBEAL_USE_HPET', get_option('use_hpet'))
+dpdk_conf.set('RTE_USE_CC_MEMCPY', get_option('use_cc_memcpy'))
 dpdk_conf.set('RTE_ENABLE_STDATOMIC', get_option('enable_stdatomic'))
 dpdk_conf.set('RTE_ENABLE_TRACE_FP', get_option('enable_trace_fp'))
 dpdk_conf.set('RTE_PKTMBUF_HEADROOM', get_option('pkt_mbuf_headroom'))
diff --git a/doc/guides/rel_notes/release_24_07.rst b/doc/guides/rel_notes/release_24_07.rst
index a69f24cf99..4b6eafa86e 100644
--- a/doc/guides/rel_notes/release_24_07.rst
+++ b/doc/guides/rel_notes/release_24_07.rst
@@ -24,6 +24,27 @@ DPDK Release 24.07
 New Features
 ------------
 
+* **Compiler memcpy replaces custom DPDK implementation.**
+
+  The memory copy functions of ``<rte_memcpy.h>`` now delegate to the
+  standard memcpy() function, implemented by the compiler and the C
+  runtime (e.g., libc).
+
+  In this release of DPDK, the handcrafted, per-architecture memory
+  copy implementations are still available, and may be reactivated by
+  setting the new ``use_cc_memcpy`` build option to false.
+
+  The performance benefits of the custom DPDK rte_memcpy()
+  implementations have been diminishing with every new compiler
+  release, and with current toolchains the use of a custom memcpy()
+  implementation may even result in worse performance than the
+  standard memcpy().
+
+  An additional benefit of this change is that compilers and static
+  analysis tools have an easier time detecting incorrect usage of
+  rte_memcpy() (e.g., buffer overruns, or overlapping source and
+  destination buffers).
+
 .. This section should contain new features added in this release.
    Sample format:
 
diff --git a/lib/eal/arm/include/rte_memcpy.h b/lib/eal/arm/include/rte_memcpy.h
index 47dea9a8cc..e8aff722df 100644
--- a/lib/eal/arm/include/rte_memcpy.h
+++ b/lib/eal/arm/include/rte_memcpy.h
@@ -5,10 +5,20 @@
 #ifndef _RTE_MEMCPY_ARM_H_
 #define _RTE_MEMCPY_ARM_H_
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #ifdef RTE_ARCH_64
 #include <rte_memcpy_64.h>
 #else
 #include <rte_memcpy_32.h>
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_ARM_H_ */
diff --git a/lib/eal/include/generic/rte_memcpy.h b/lib/eal/include/generic/rte_memcpy.h
index e7f0f8eaa9..cae06117fb 100644
--- a/lib/eal/include/generic/rte_memcpy.h
+++ b/lib/eal/include/generic/rte_memcpy.h
@@ -5,12 +5,19 @@
 #ifndef _RTE_MEMCPY_H_
 #define _RTE_MEMCPY_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /**
  * @file
  *
  * Functions for vectorised implementation of memcpy().
  */
 
+#include <stdint.h>
+#include <string.h>
+
 /**
  * Copy 16 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -35,8 +42,6 @@ rte_mov16(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src);
 
-#ifdef __DOXYGEN__
-
 /**
  * Copy 48 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -49,8 +54,6 @@ rte_mov32(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov48(uint8_t *dst, const uint8_t *src);
 
-#endif /* __DOXYGEN__ */
-
 /**
  * Copy 64 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -87,8 +90,6 @@ rte_mov128(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src);
 
-#ifdef __DOXYGEN__
-
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
@@ -111,6 +112,52 @@ rte_mov256(uint8_t *dst, const uint8_t *src);
 static void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
-#endif /* __DOXYGEN__ */
+#ifdef RTE_USE_CC_MEMCPY
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 16);
+}
+
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 32);
+}
+
+static inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 48);
+}
+
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 64);
+}
+
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 128);
+}
+
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 256);
+}
+
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+	return memcpy(dst, src, n);
+}
+#endif /* RTE_USE_CC_MEMCPY */
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif /* _RTE_MEMCPY_H_ */
diff --git a/lib/eal/loongarch/include/rte_memcpy.h b/lib/eal/loongarch/include/rte_memcpy.h
index 22578d40f4..344b4416b5 100644
--- a/lib/eal/loongarch/include/rte_memcpy.h
+++ b/lib/eal/loongarch/include/rte_memcpy.h
@@ -5,57 +5,12 @@
 #ifndef RTE_MEMCPY_LOONGARCH_H
 #define RTE_MEMCPY_LOONGARCH_H
 
-#include <stdint.h>
-#include <string.h>
+#include <rte_config.h>
 
-#include "rte_common.h"
-
-#ifdef __cplusplus
-extern "C" {
+#ifndef RTE_USE_CC_MEMCPY
+#define RTE_USE_CC_MEMCPY
 #endif
 
-#include "generic/rte_memcpy.h"
-
-static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 16);
-}
-
-static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 32);
-}
-
-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 48);
-}
-
-static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 64);
-}
-
-static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 128);
-}
-
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 256);
-}
-
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
-
-#ifdef __cplusplus
-}
-#endif
+#include <generic/rte_memcpy.h>
 
 #endif /* RTE_MEMCPY_LOONGARCH_H */
diff --git a/lib/eal/ppc/include/rte_memcpy.h b/lib/eal/ppc/include/rte_memcpy.h
index 6f388c0234..645fd83986 100644
--- a/lib/eal/ppc/include/rte_memcpy.h
+++ b/lib/eal/ppc/include/rte_memcpy.h
@@ -6,6 +6,14 @@
 #ifndef _RTE_MEMCPY_PPC_64_H_
 #define _RTE_MEMCPY_PPC_64_H_
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #include <stdint.h>
 #include <string.h>
 
@@ -215,4 +223,6 @@ rte_memcpy_func(void *dst, const void *src, size_t n)
 }
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_PPC_64_H_ */
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index e34f19396e..4acdc4af5f 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -7,57 +7,12 @@
 #ifndef RTE_MEMCPY_RISCV_H
 #define RTE_MEMCPY_RISCV_H
 
-#include <stdint.h>
-#include <string.h>
+#include <rte_config.h>
 
-#include "rte_common.h"
-
-#ifdef __cplusplus
-extern "C" {
+#ifndef RTE_USE_CC_MEMCPY
+#define RTE_USE_CC_MEMCPY
 #endif
 
-#include "generic/rte_memcpy.h"
-
-static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 16);
-}
-
-static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 32);
-}
-
-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 48);
-}
-
-static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 64);
-}
-
-static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 128);
-}
-
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 256);
-}
-
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
-
-#ifdef __cplusplus
-}
-#endif
+#include <generic/rte_memcpy.h>
 
 #endif /* RTE_MEMCPY_RISCV_H */
diff --git a/lib/eal/x86/include/meson.build b/lib/eal/x86/include/meson.build
index 52d2f8e969..09c2fe2485 100644
--- a/lib/eal/x86/include/meson.build
+++ b/lib/eal/x86/include/meson.build
@@ -16,6 +16,7 @@ arch_headers = files(
         'rte_spinlock.h',
         'rte_vect.h',
 )
+
 arch_indirect_headers = files(
         'rte_atomic_32.h',
         'rte_atomic_64.h',
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 72a92290e0..c5ba74d2ed 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -11,12 +11,19 @@
  * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
  */
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #include <stdio.h>
 #include <stdint.h>
 #include <string.h>
 #include <rte_vect.h>
 #include <rte_common.h>
-#include <rte_config.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -878,4 +885,6 @@ rte_memcpy(void *dst, const void *src, size_t n)
 }
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_X86_64_H_ */
diff --git a/meson_options.txt b/meson_options.txt
index e49b2fc089..06f544b631 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -60,3 +60,5 @@ option('tests', type: 'boolean', value: true, description:
        'build unit tests')
 option('use_hpet', type: 'boolean', value: false, description:
        'use HPET timer in EAL')
+option('use_cc_memcpy', type: 'boolean', value: true, description:
+       'Have the functions of <rte_memcpy.h> delegate to compiler/libc memcpy() instead of using custom implementation.')
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH 3/5] distributor: properly include vector API header file
  2024-06-05  6:49         ` [PATCH 3/5] distributor: " Mattias Rönnblom
@ 2024-06-10 14:27           ` Tyler Retzlaff
  0 siblings, 0 replies; 128+ messages in thread
From: Tyler Retzlaff @ 2024-06-10 14:27 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: dev, hofors, Morten Brørup, Stephen Hemminger,
	Abdullah Sevincer, Pavan Nikhilesh, David Hunt,
	Vladimir Medvedkin

On Wed, Jun 05, 2024 at 08:49:43AM +0200, Mattias Rönnblom wrote:
> The distributor library relied on <rte_vect.h>, but failed to provide
> a direct include of this file.
> 
> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> ---
Acked-by: Tyler Retzlaff <roretzla@linux.microsoft.com>


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH 4/5] fib: properly include vector API header file
  2024-06-05  6:49         ` [PATCH 4/5] fib: " Mattias Rönnblom
@ 2024-06-10 14:28           ` Tyler Retzlaff
  0 siblings, 0 replies; 128+ messages in thread
From: Tyler Retzlaff @ 2024-06-10 14:28 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: dev, hofors, Morten Brørup, Stephen Hemminger,
	Abdullah Sevincer, Pavan Nikhilesh, David Hunt,
	Vladimir Medvedkin

On Wed, Jun 05, 2024 at 08:49:44AM +0200, Mattias Rönnblom wrote:
> The trie implementation of the fib library relied on <rte_vect.h>, but
> failed to provide a direct include of this file.
> 
> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> ---
Acked-by: Tyler Retzlaff <roretzla@linux.microsoft.com>


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v2 0/6] Optionally have rte_memcpy delegate to compiler memcpy
  2024-06-05  6:49       ` [PATCH 0/5] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                           ` (4 preceding siblings ...)
  2024-06-05  6:49         ` [PATCH 5/5] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
@ 2024-06-20  7:24         ` Mattias Rönnblom
  2024-06-20  7:24           ` [PATCH v2 1/6] net/fm10k: add missing intrinsic include Mattias Rönnblom
                             ` (5 more replies)
  5 siblings, 6 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20  7:24 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Mattias Rönnblom

This patch set makes DPDK library, driver, and application code use the
compiler/libc memcpy() by default when functions in <rte_memcpy.h> are
invoked.

The various custom DPDK rte_memcpy() implementations may be retained
by means of a build-time option.

This patch set only makes a difference on x86, PPC and ARM. Loongarch
and RISCV already used compiler/libc memcpy().

This patch set includes a number of fixes in drivers and libraries
which erroneously relied on <rte_memcpy.h> including other header
files (e.g., <rte_vect.h>) required by its implementation.

Mattias Rönnblom (6):
  net/fm10k: add missing intrinsic include
  event/dlb2: include headers for vector and memory copy APIs
  net/octeon_ep: properly include vector API header file
  distributor: properly include vector API header file
  fib: properly include vector API header file
  eal: provide option to use compiler memcpy instead of RTE

 config/meson.build                     |  1 +
 doc/guides/rel_notes/release_24_07.rst | 21 +++++++++
 drivers/event/dlb2/dlb2.c              |  2 +
 drivers/net/fm10k/fm10k_rxtx_vec.c     |  1 +
 drivers/net/octeon_ep/otx_ep_ethdev.c  |  2 +
 lib/distributor/rte_distributor.c      |  1 +
 lib/eal/arm/include/rte_memcpy.h       | 10 +++++
 lib/eal/include/generic/rte_memcpy.h   | 61 +++++++++++++++++++++++---
 lib/eal/loongarch/include/rte_memcpy.h | 53 ++--------------------
 lib/eal/ppc/include/rte_memcpy.h       | 10 +++++
 lib/eal/riscv/include/rte_memcpy.h     | 53 ++--------------------
 lib/eal/x86/include/meson.build        |  1 +
 lib/eal/x86/include/rte_memcpy.h       | 11 ++++-
 lib/fib/trie.c                         |  1 +
 meson_options.txt                      |  2 +
 15 files changed, 124 insertions(+), 106 deletions(-)

-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v2 1/6] net/fm10k: add missing intrinsic include
  2024-06-20  7:24         ` [PATCH v2 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
@ 2024-06-20  7:24           ` Mattias Rönnblom
  2024-06-20  9:02             ` Bruce Richardson
                               ` (2 more replies)
  2024-06-20  7:24           ` [PATCH v2 2/6] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
                             ` (4 subsequent siblings)
  5 siblings, 3 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20  7:24 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Mattias Rönnblom

Add missing <emmintrin.h> include, to get the _mm_cvtsi128_si64
prototype.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 drivers/net/fm10k/fm10k_rxtx_vec.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/fm10k/fm10k_rxtx_vec.c b/drivers/net/fm10k/fm10k_rxtx_vec.c
index 2b6914b1da..d417b31bbb 100644
--- a/drivers/net/fm10k/fm10k_rxtx_vec.c
+++ b/drivers/net/fm10k/fm10k_rxtx_vec.c
@@ -10,6 +10,7 @@
 #include "base/fm10k_type.h"
 
 #include <tmmintrin.h>
+#include <emmintrin.h>
 
 #ifndef __INTEL_COMPILER
 #pragma GCC diagnostic ignored "-Wcast-qual"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v2 2/6] event/dlb2: include headers for vector and memory copy APIs
  2024-06-20  7:24         ` [PATCH v2 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2024-06-20  7:24           ` [PATCH v2 1/6] net/fm10k: add missing intrinsic include Mattias Rönnblom
@ 2024-06-20  7:24           ` Mattias Rönnblom
  2024-06-20  9:03             ` Bruce Richardson
  2024-06-20  7:24           ` [PATCH v2 3/6] net/octeon_ep: properly include vector API header file Mattias Rönnblom
                             ` (3 subsequent siblings)
  5 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20  7:24 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Mattias Rönnblom

The DLB2 PMD depended on <rte_vect.h> being included as a side-effect
of <rte_memcpy.h> being included.

In addition, DLB2 used rte_memcpy() but did not include <rte_memcpy.h>,
but rather depended on other include files to do so.

This patch addresses both of those issues.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 drivers/event/dlb2/dlb2.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 0b91f03956..19f90b8f8d 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -25,11 +25,13 @@
 #include <rte_kvargs.h>
 #include <rte_log.h>
 #include <rte_malloc.h>
+#include <rte_memcpy.h>
 #include <rte_mbuf.h>
 #include <rte_power_intrinsics.h>
 #include <rte_prefetch.h>
 #include <rte_ring.h>
 #include <rte_string_fns.h>
+#include <rte_vect.h>
 
 #include "dlb2_priv.h"
 #include "dlb2_iface.h"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v2 3/6] net/octeon_ep: properly include vector API header file
  2024-06-20  7:24         ` [PATCH v2 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2024-06-20  7:24           ` [PATCH v2 1/6] net/fm10k: add missing intrinsic include Mattias Rönnblom
  2024-06-20  7:24           ` [PATCH v2 2/6] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
@ 2024-06-20  7:24           ` Mattias Rönnblom
  2024-06-20 14:43             ` Stephen Hemminger
  2024-06-20  7:24           ` [PATCH v2 4/6] distributor: " Mattias Rönnblom
                             ` (2 subsequent siblings)
  5 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20  7:24 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Mattias Rönnblom

The octeon_ep driver relied on <rte_vect.h>, but failed to provide a
direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 drivers/net/octeon_ep/otx_ep_ethdev.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c
index 46211361a0..b069216629 100644
--- a/drivers/net/octeon_ep/otx_ep_ethdev.c
+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c
@@ -5,6 +5,8 @@
 #include <inttypes.h>
 #include <ethdev_pci.h>
 
+#include <rte_vect.h>
+
 #include "otx_ep_common.h"
 #include "otx_ep_vf.h"
 #include "otx2_ep_vf.h"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v2 4/6] distributor: properly include vector API header file
  2024-06-20  7:24         ` [PATCH v2 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                             ` (2 preceding siblings ...)
  2024-06-20  7:24           ` [PATCH v2 3/6] net/octeon_ep: properly include vector API header file Mattias Rönnblom
@ 2024-06-20  7:24           ` Mattias Rönnblom
  2024-06-20  9:13             ` Bruce Richardson
  2024-06-20  7:24           ` [PATCH v2 5/6] fib: " Mattias Rönnblom
  2024-06-20  7:24           ` [PATCH v2 6/6] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
  5 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20  7:24 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Mattias Rönnblom

The distributor library relied on <rte_vect.h>, but failed to provide
a direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 lib/distributor/rte_distributor.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/distributor/rte_distributor.c b/lib/distributor/rte_distributor.c
index e58727cdc2..1389efc03f 100644
--- a/lib/distributor/rte_distributor.c
+++ b/lib/distributor/rte_distributor.c
@@ -15,6 +15,7 @@
 #include <rte_eal_memconfig.h>
 #include <rte_pause.h>
 #include <rte_tailq.h>
+#include <rte_vect.h>
 
 #include "rte_distributor.h"
 #include "rte_distributor_single.h"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v2 5/6] fib: properly include vector API header file
  2024-06-20  7:24         ` [PATCH v2 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                             ` (3 preceding siblings ...)
  2024-06-20  7:24           ` [PATCH v2 4/6] distributor: " Mattias Rönnblom
@ 2024-06-20  7:24           ` Mattias Rönnblom
  2024-06-20  9:14             ` Bruce Richardson
  2024-06-20  7:24           ` [PATCH v2 6/6] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
  5 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20  7:24 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Mattias Rönnblom

The trie implementation of the fib library relied on <rte_vect.h>, but
failed to provide a direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 lib/fib/trie.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/fib/trie.c b/lib/fib/trie.c
index 09470e7287..74db8863df 100644
--- a/lib/fib/trie.c
+++ b/lib/fib/trie.c
@@ -9,6 +9,7 @@
 #include <rte_debug.h>
 #include <rte_malloc.h>
 #include <rte_errno.h>
+#include <rte_vect.h>
 
 #include <rte_rib6.h>
 #include <rte_fib6.h>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v2 6/6] eal: provide option to use compiler memcpy instead of RTE
  2024-06-20  7:24         ` [PATCH v2 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                             ` (4 preceding siblings ...)
  2024-06-20  7:24           ` [PATCH v2 5/6] fib: " Mattias Rönnblom
@ 2024-06-20  7:24           ` Mattias Rönnblom
  5 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20  7:24 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Mattias Rönnblom

Provide build option to have functions in <rte_memcpy.h> delegate to
the standard compiler/libc memcpy(), instead of using the various
custom DPDK, handcrafted, per-architecture rte_memcpy()
implementations.

A new meson build option 'use_cc_memcpy' is added. By default,
the compiler/libc memcpy() is used.

The performance benefits of the custom DPDK rte_memcpy()
implementations have been diminishing with every compiler release, and
with current toolchains the use of a custom memcpy() implementation
may even be a liability.

This patch leaves an option to stay on the custom DPDK implementations,
should that prove beneficial for certain applications or architectures.

An additional benefit of this change is that compilers and static
analysis tools have an easier time detecting incorrect usage of
rte_memcpy() (e.g., buffer overruns, or overlapping source and
destination buffers).

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>

---

PATCH:
 o Add entry in release notes.
 o Update meson help text.

RFC v3:
 o Fix missing #endif on loongarch.
 o PPC and RISCV now implemented, meaning all architectures are supported.
 o Unnecessary <rte_vect.h> include is removed from <rte_memcpy.h>.

RFC v2:
 * Fix bug where rte_memcpy.h was not installed on x86.
 * Made attempt to make Loongarch compile.
---
 config/meson.build                     |  1 +
 doc/guides/rel_notes/release_24_07.rst | 21 +++++++++
 lib/eal/arm/include/rte_memcpy.h       | 10 +++++
 lib/eal/include/generic/rte_memcpy.h   | 61 +++++++++++++++++++++++---
 lib/eal/loongarch/include/rte_memcpy.h | 53 ++--------------------
 lib/eal/ppc/include/rte_memcpy.h       | 10 +++++
 lib/eal/riscv/include/rte_memcpy.h     | 53 ++--------------------
 lib/eal/x86/include/meson.build        |  1 +
 lib/eal/x86/include/rte_memcpy.h       | 11 ++++-
 meson_options.txt                      |  2 +
 10 files changed, 117 insertions(+), 106 deletions(-)

diff --git a/config/meson.build b/config/meson.build
index 8c8b019c25..456056628e 100644
--- a/config/meson.build
+++ b/config/meson.build
@@ -353,6 +353,7 @@ endforeach
 # set other values pulled from the build options
 dpdk_conf.set('RTE_MAX_ETHPORTS', get_option('max_ethports'))
 dpdk_conf.set('RTE_LIBEAL_USE_HPET', get_option('use_hpet'))
+dpdk_conf.set('RTE_USE_CC_MEMCPY', get_option('use_cc_memcpy'))
 dpdk_conf.set('RTE_ENABLE_STDATOMIC', get_option('enable_stdatomic'))
 dpdk_conf.set('RTE_ENABLE_TRACE_FP', get_option('enable_trace_fp'))
 dpdk_conf.set('RTE_PKTMBUF_HEADROOM', get_option('pkt_mbuf_headroom'))
diff --git a/doc/guides/rel_notes/release_24_07.rst b/doc/guides/rel_notes/release_24_07.rst
index a69f24cf99..4b6eafa86e 100644
--- a/doc/guides/rel_notes/release_24_07.rst
+++ b/doc/guides/rel_notes/release_24_07.rst
@@ -24,6 +24,27 @@ DPDK Release 24.07
 New Features
 ------------
 
+* **Compiler memcpy replaces custom DPDK implementation.**
+
+  The memory copy functions of ``<rte_memcpy.h>`` now delegate to the
+  standard memcpy() function, implemented by the compiler and the C
+  runtime (e.g., libc).
+
+  In this release of DPDK, the handcrafted, per-architecture memory
+  copy implementations are still available, and may be reactivated by
+  setting the new ``use_cc_memcpy`` build option to false.
+
+  The performance benefits of the custom DPDK rte_memcpy()
+  implementations have been diminishing with every new compiler
+  release, and with current toolchains the use of a custom memcpy()
+  implementation may even result in worse performance than the
+  standard memcpy().
+
+  An additional benefit of this change is that compilers and static
+  analysis tools have an easier time detecting incorrect usage of
+  rte_memcpy() (e.g., buffer overruns, or overlapping source and
+  destination buffers).
+
 .. This section should contain new features added in this release.
    Sample format:
 
diff --git a/lib/eal/arm/include/rte_memcpy.h b/lib/eal/arm/include/rte_memcpy.h
index 47dea9a8cc..e8aff722df 100644
--- a/lib/eal/arm/include/rte_memcpy.h
+++ b/lib/eal/arm/include/rte_memcpy.h
@@ -5,10 +5,20 @@
 #ifndef _RTE_MEMCPY_ARM_H_
 #define _RTE_MEMCPY_ARM_H_
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #ifdef RTE_ARCH_64
 #include <rte_memcpy_64.h>
 #else
 #include <rte_memcpy_32.h>
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_ARM_H_ */
diff --git a/lib/eal/include/generic/rte_memcpy.h b/lib/eal/include/generic/rte_memcpy.h
index e7f0f8eaa9..cae06117fb 100644
--- a/lib/eal/include/generic/rte_memcpy.h
+++ b/lib/eal/include/generic/rte_memcpy.h
@@ -5,12 +5,19 @@
 #ifndef _RTE_MEMCPY_H_
 #define _RTE_MEMCPY_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /**
  * @file
  *
  * Functions for vectorised implementation of memcpy().
  */
 
+#include <stdint.h>
+#include <string.h>
+
 /**
  * Copy 16 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -35,8 +42,6 @@ rte_mov16(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src);
 
-#ifdef __DOXYGEN__
-
 /**
  * Copy 48 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -49,8 +54,6 @@ rte_mov32(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov48(uint8_t *dst, const uint8_t *src);
 
-#endif /* __DOXYGEN__ */
-
 /**
  * Copy 64 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -87,8 +90,6 @@ rte_mov128(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src);
 
-#ifdef __DOXYGEN__
-
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
@@ -111,6 +112,52 @@ rte_mov256(uint8_t *dst, const uint8_t *src);
 static void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
-#endif /* __DOXYGEN__ */
+#ifdef RTE_USE_CC_MEMCPY
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 16);
+}
+
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 32);
+}
+
+static inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 48);
+}
+
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 64);
+}
+
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 128);
+}
+
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 256);
+}
+
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+	return memcpy(dst, src, n);
+}
+#endif /* RTE_USE_CC_MEMCPY */
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif /* _RTE_MEMCPY_H_ */
diff --git a/lib/eal/loongarch/include/rte_memcpy.h b/lib/eal/loongarch/include/rte_memcpy.h
index 22578d40f4..344b4416b5 100644
--- a/lib/eal/loongarch/include/rte_memcpy.h
+++ b/lib/eal/loongarch/include/rte_memcpy.h
@@ -5,57 +5,12 @@
 #ifndef RTE_MEMCPY_LOONGARCH_H
 #define RTE_MEMCPY_LOONGARCH_H
 
-#include <stdint.h>
-#include <string.h>
+#include <rte_config.h>
 
-#include "rte_common.h"
-
-#ifdef __cplusplus
-extern "C" {
+#ifndef RTE_USE_CC_MEMCPY
+#define RTE_USE_CC_MEMCPY
 #endif
 
-#include "generic/rte_memcpy.h"
-
-static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 16);
-}
-
-static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 32);
-}
-
-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 48);
-}
-
-static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 64);
-}
-
-static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 128);
-}
-
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 256);
-}
-
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
-
-#ifdef __cplusplus
-}
-#endif
+#include <generic/rte_memcpy.h>
 
 #endif /* RTE_MEMCPY_LOONGARCH_H */
diff --git a/lib/eal/ppc/include/rte_memcpy.h b/lib/eal/ppc/include/rte_memcpy.h
index 6f388c0234..645fd83986 100644
--- a/lib/eal/ppc/include/rte_memcpy.h
+++ b/lib/eal/ppc/include/rte_memcpy.h
@@ -6,6 +6,14 @@
 #ifndef _RTE_MEMCPY_PPC_64_H_
 #define _RTE_MEMCPY_PPC_64_H_
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #include <stdint.h>
 #include <string.h>
 
@@ -215,4 +223,6 @@ rte_memcpy_func(void *dst, const void *src, size_t n)
 }
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_PPC_64_H_ */
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index e34f19396e..4acdc4af5f 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -7,57 +7,12 @@
 #ifndef RTE_MEMCPY_RISCV_H
 #define RTE_MEMCPY_RISCV_H
 
-#include <stdint.h>
-#include <string.h>
+#include <rte_config.h>
 
-#include "rte_common.h"
-
-#ifdef __cplusplus
-extern "C" {
+#ifndef RTE_USE_CC_MEMCPY
+#define RTE_USE_CC_MEMCPY
 #endif
 
-#include "generic/rte_memcpy.h"
-
-static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 16);
-}
-
-static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 32);
-}
-
-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 48);
-}
-
-static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 64);
-}
-
-static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 128);
-}
-
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 256);
-}
-
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
-
-#ifdef __cplusplus
-}
-#endif
+#include <generic/rte_memcpy.h>
 
 #endif /* RTE_MEMCPY_RISCV_H */
diff --git a/lib/eal/x86/include/meson.build b/lib/eal/x86/include/meson.build
index 52d2f8e969..09c2fe2485 100644
--- a/lib/eal/x86/include/meson.build
+++ b/lib/eal/x86/include/meson.build
@@ -16,6 +16,7 @@ arch_headers = files(
         'rte_spinlock.h',
         'rte_vect.h',
 )
+
 arch_indirect_headers = files(
         'rte_atomic_32.h',
         'rte_atomic_64.h',
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 72a92290e0..c5ba74d2ed 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -11,12 +11,19 @@
  * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
  */
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #include <stdio.h>
 #include <stdint.h>
 #include <string.h>
 #include <rte_vect.h>
 #include <rte_common.h>
-#include <rte_config.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -878,4 +885,6 @@ rte_memcpy(void *dst, const void *src, size_t n)
 }
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_X86_64_H_ */
diff --git a/meson_options.txt b/meson_options.txt
index e49b2fc089..06f544b631 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -60,3 +60,5 @@ option('tests', type: 'boolean', value: true, description:
        'build unit tests')
 option('use_hpet', type: 'boolean', value: false, description:
        'use HPET timer in EAL')
+option('use_cc_memcpy', type: 'boolean', value: true, description:
+       'Have the functions of <rte_memcpy.h> delegate to compiler/libc memcpy() instead of using custom implementation.')
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v2 1/6] net/fm10k: add missing intrinsic include
  2024-06-20  7:24           ` [PATCH v2 1/6] net/fm10k: add missing intrinsic include Mattias Rönnblom
@ 2024-06-20  9:02             ` Bruce Richardson
  2024-06-20  9:28             ` Bruce Richardson
  2024-06-20 11:50             ` [PATCH v3 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2 siblings, 0 replies; 128+ messages in thread
From: Bruce Richardson @ 2024-06-20  9:02 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: dev, hofors, Morten Brørup, Stephen Hemminger,
	Abdullah Sevincer, Pavan Nikhilesh, David Hunt,
	Vladimir Medvedkin

On Thu, Jun 20, 2024 at 09:24:47AM +0200, Mattias Rönnblom wrote:
> Add missing <emmintrin.h> include, to get the _mm_cvtsi128_si64
> prototype.
> 
> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> ---
Acked-by: Bruce Richardson <bruce.richardson@intel.com>

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v2 2/6] event/dlb2: include headers for vector and memory copy APIs
  2024-06-20  7:24           ` [PATCH v2 2/6] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
@ 2024-06-20  9:03             ` Bruce Richardson
  0 siblings, 0 replies; 128+ messages in thread
From: Bruce Richardson @ 2024-06-20  9:03 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: dev, hofors, Morten Brørup, Stephen Hemminger,
	Abdullah Sevincer, Pavan Nikhilesh, David Hunt,
	Vladimir Medvedkin

On Thu, Jun 20, 2024 at 09:24:48AM +0200, Mattias Rönnblom wrote:
> The DLB2 PMD depended on <rte_vect.h> being included as a side-effect
> of <rte_memcpy.h> being included.
> 
> In addition, DLB2 used rte_memcpy() but did not include <rte_memcpy.h>,
> but rather depended on other include files to do so.
> 
> This patch addresses both of those issues.
> 
> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> ---
Acked-by: Bruce Richardson <bruce.richardson@intel.com>

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v2 4/6] distributor: properly include vector API header file
  2024-06-20  7:24           ` [PATCH v2 4/6] distributor: " Mattias Rönnblom
@ 2024-06-20  9:13             ` Bruce Richardson
  0 siblings, 0 replies; 128+ messages in thread
From: Bruce Richardson @ 2024-06-20  9:13 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: dev, hofors, Morten Brørup, Stephen Hemminger,
	Abdullah Sevincer, Pavan Nikhilesh, David Hunt,
	Vladimir Medvedkin

On Thu, Jun 20, 2024 at 09:24:50AM +0200, Mattias Rönnblom wrote:
> The distributor library relied on <rte_vect.h>, but failed to provide
> a direct include of this file.
> 
> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> ---
Acked-by: Bruce Richardson <bruce.richardson@intel.com>

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v2 5/6] fib: properly include vector API header file
  2024-06-20  7:24           ` [PATCH v2 5/6] fib: " Mattias Rönnblom
@ 2024-06-20  9:14             ` Bruce Richardson
  2024-06-20 14:43               ` Stephen Hemminger
  0 siblings, 1 reply; 128+ messages in thread
From: Bruce Richardson @ 2024-06-20  9:14 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: dev, hofors, Morten Brørup, Stephen Hemminger,
	Abdullah Sevincer, Pavan Nikhilesh, David Hunt,
	Vladimir Medvedkin

On Thu, Jun 20, 2024 at 09:24:51AM +0200, Mattias Rönnblom wrote:
> The trie implementation of the fib library relied on <rte_vect.h>, but
> failed to provide a direct include of this file.
> 
> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> ---
Acked-by: Bruce Richardson <bruce.richardson@intel.com>

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v2 1/6] net/fm10k: add missing intrinsic include
  2024-06-20  7:24           ` [PATCH v2 1/6] net/fm10k: add missing intrinsic include Mattias Rönnblom
  2024-06-20  9:02             ` Bruce Richardson
@ 2024-06-20  9:28             ` Bruce Richardson
  2024-06-20 11:40               ` Mattias Rönnblom
  2024-06-20 11:50             ` [PATCH v3 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2 siblings, 1 reply; 128+ messages in thread
From: Bruce Richardson @ 2024-06-20  9:28 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: dev, hofors, Morten Brørup, Stephen Hemminger,
	Abdullah Sevincer, Pavan Nikhilesh, David Hunt,
	Vladimir Medvedkin

On Thu, Jun 20, 2024 at 09:24:47AM +0200, Mattias Rönnblom wrote:
> Add missing <emmintrin.h> include, to get the _mm_cvtsi128_si64
> prototype.
> 
> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> ---
>  drivers/net/fm10k/fm10k_rxtx_vec.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/net/fm10k/fm10k_rxtx_vec.c b/drivers/net/fm10k/fm10k_rxtx_vec.c
> index 2b6914b1da..d417b31bbb 100644
> --- a/drivers/net/fm10k/fm10k_rxtx_vec.c
> +++ b/drivers/net/fm10k/fm10k_rxtx_vec.c
> @@ -10,6 +10,7 @@
>  #include "base/fm10k_type.h"
>  
>  #include <tmmintrin.h>
> +#include <emmintrin.h>
>  
Beyond my ack of this patch, a small suggestion is to just include
rte_vect.h rather than trying to include specific x86-intrinsics headers.

My ack remains with or without taking on board this suggestion.

/Bruce

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v2 1/6] net/fm10k: add missing intrinsic include
  2024-06-20  9:28             ` Bruce Richardson
@ 2024-06-20 11:40               ` Mattias Rönnblom
  2024-06-20 11:59                 ` Bruce Richardson
  0 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 11:40 UTC (permalink / raw)
  To: Bruce Richardson, Mattias Rönnblom
  Cc: dev, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin

On 2024-06-20 11:28, Bruce Richardson wrote:
> On Thu, Jun 20, 2024 at 09:24:47AM +0200, Mattias Rönnblom wrote:
>> Add missing <emmintrin.h> include, to get the _mm_cvtsi128_si64
>> prototype.
>>
>> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
>> ---
>>   drivers/net/fm10k/fm10k_rxtx_vec.c | 1 +
>>   1 file changed, 1 insertion(+)
>>
>> diff --git a/drivers/net/fm10k/fm10k_rxtx_vec.c b/drivers/net/fm10k/fm10k_rxtx_vec.c
>> index 2b6914b1da..d417b31bbb 100644
>> --- a/drivers/net/fm10k/fm10k_rxtx_vec.c
>> +++ b/drivers/net/fm10k/fm10k_rxtx_vec.c
>> @@ -10,6 +10,7 @@
>>   #include "base/fm10k_type.h"
>>   
>>   #include <tmmintrin.h>
>> +#include <emmintrin.h>
>>   
> Beyond my ack of this patch, a small suggestion is to just include
> rte_vect.h rather than trying to include specific x86-intrinsics headers.
> 
> My ack remains with or without taking on board this suggestion.
> 
> /Bruce

I will do that, and hope it will magically solve the 
_mm_cvtsi128_si64-on-32-bit-x86 issue.

^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v3 0/6] Optionally have rte_memcpy delegate to compiler memcpy
  2024-06-20  7:24           ` [PATCH v2 1/6] net/fm10k: add missing intrinsic include Mattias Rönnblom
  2024-06-20  9:02             ` Bruce Richardson
  2024-06-20  9:28             ` Bruce Richardson
@ 2024-06-20 11:50             ` Mattias Rönnblom
  2024-06-20 11:50               ` [PATCH v3 1/6] net/fm10k: add missing vector API header include Mattias Rönnblom
                                 ` (5 more replies)
  2 siblings, 6 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 11:50 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson, Mattias Rönnblom

This patch set makes DPDK library, driver, and application code use the
compiler/libc memcpy() by default when functions in <rte_memcpy.h> are
invoked.

The various custom DPDK rte_memcpy() implementations may be retained
by means of a build-time option.

This patch set only makes a difference on x86, PPC and ARM. Loongarch
and RISCV already used compiler/libc memcpy().

This patch set includes a number of fixes in drivers and libraries
which erroneously relied on <rte_memcpy.h> including header files
(i.e., <rte_vect.h>) required by its implementation.

Mattias Rönnblom (6):
  net/fm10k: add missing vector API header include
  event/dlb2: include headers for vector and memory copy APIs
  net/octeon_ep: add missing vector API header include
  distributor: add missing vector API header include
  fib: add missing vector API header include
  eal: provide option to use compiler memcpy instead of RTE

 config/meson.build                     |  1 +
 doc/guides/rel_notes/release_24_07.rst | 21 +++++++++
 drivers/event/dlb2/dlb2.c              |  2 +
 drivers/net/fm10k/fm10k_rxtx_vec.c     |  1 +
 drivers/net/octeon_ep/otx_ep_ethdev.c  |  2 +
 lib/distributor/rte_distributor.c      |  1 +
 lib/eal/arm/include/rte_memcpy.h       | 10 +++++
 lib/eal/include/generic/rte_memcpy.h   | 61 +++++++++++++++++++++++---
 lib/eal/loongarch/include/rte_memcpy.h | 53 ++--------------------
 lib/eal/ppc/include/rte_memcpy.h       | 10 +++++
 lib/eal/riscv/include/rte_memcpy.h     | 53 ++--------------------
 lib/eal/x86/include/meson.build        |  1 +
 lib/eal/x86/include/rte_memcpy.h       | 11 ++++-
 lib/fib/trie.c                         |  1 +
 meson_options.txt                      |  2 +
 15 files changed, 124 insertions(+), 106 deletions(-)

-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v3 1/6] net/fm10k: add missing vector API header include
  2024-06-20 11:50             ` [PATCH v3 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
@ 2024-06-20 11:50               ` Mattias Rönnblom
  2024-06-20 12:34                 ` Bruce Richardson
  2024-06-20 17:57                 ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2024-06-20 11:50               ` [PATCH v3 2/6] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
                                 ` (4 subsequent siblings)
  5 siblings, 2 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 11:50 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson, Mattias Rönnblom

The fm10k PMD relied on <rte_vect.h>, but failed to provide a direct
include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/net/fm10k/fm10k_rxtx_vec.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/fm10k/fm10k_rxtx_vec.c b/drivers/net/fm10k/fm10k_rxtx_vec.c
index 2b6914b1da..62119de373 100644
--- a/drivers/net/fm10k/fm10k_rxtx_vec.c
+++ b/drivers/net/fm10k/fm10k_rxtx_vec.c
@@ -6,6 +6,7 @@
 
 #include <ethdev_driver.h>
 #include <rte_common.h>
+#include <rte_vect.h>
 #include "fm10k.h"
 #include "base/fm10k_type.h"
 
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v3 2/6] event/dlb2: include headers for vector and memory copy APIs
  2024-06-20 11:50             ` [PATCH v3 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2024-06-20 11:50               ` [PATCH v3 1/6] net/fm10k: add missing vector API header include Mattias Rönnblom
@ 2024-06-20 11:50               ` Mattias Rönnblom
  2024-06-20 11:50               ` [PATCH v3 3/6] net/octeon_ep: add missing vector API header include Mattias Rönnblom
                                 ` (3 subsequent siblings)
  5 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 11:50 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson, Mattias Rönnblom

The DLB2 PMD depended on <rte_vect.h> being included as a side-effect
of <rte_memcpy.h> being included.

In addition, DLB2 used rte_memcpy() but did not include <rte_memcpy.h>,
but rather depended on other include files to do so.

This patch addresses both of those issues.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/event/dlb2/dlb2.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 0b91f03956..19f90b8f8d 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -25,11 +25,13 @@
 #include <rte_kvargs.h>
 #include <rte_log.h>
 #include <rte_malloc.h>
+#include <rte_memcpy.h>
 #include <rte_mbuf.h>
 #include <rte_power_intrinsics.h>
 #include <rte_prefetch.h>
 #include <rte_ring.h>
 #include <rte_string_fns.h>
+#include <rte_vect.h>
 
 #include "dlb2_priv.h"
 #include "dlb2_iface.h"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v3 3/6] net/octeon_ep: add missing vector API header include
  2024-06-20 11:50             ` [PATCH v3 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2024-06-20 11:50               ` [PATCH v3 1/6] net/fm10k: add missing vector API header include Mattias Rönnblom
  2024-06-20 11:50               ` [PATCH v3 2/6] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
@ 2024-06-20 11:50               ` Mattias Rönnblom
  2024-06-20 11:50               ` [PATCH v3 4/6] distributor: " Mattias Rönnblom
                                 ` (2 subsequent siblings)
  5 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 11:50 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson, Mattias Rönnblom

The octeon_ep driver relied on <rte_vect.h>, but failed to provide a
direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 drivers/net/octeon_ep/otx_ep_ethdev.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c
index 46211361a0..b069216629 100644
--- a/drivers/net/octeon_ep/otx_ep_ethdev.c
+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c
@@ -5,6 +5,8 @@
 #include <inttypes.h>
 #include <ethdev_pci.h>
 
+#include <rte_vect.h>
+
 #include "otx_ep_common.h"
 #include "otx_ep_vf.h"
 #include "otx2_ep_vf.h"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v3 4/6] distributor: add missing vector API header include
  2024-06-20 11:50             ` [PATCH v3 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                 ` (2 preceding siblings ...)
  2024-06-20 11:50               ` [PATCH v3 3/6] net/octeon_ep: add missing vector API header include Mattias Rönnblom
@ 2024-06-20 11:50               ` Mattias Rönnblom
  2024-06-20 11:50               ` [PATCH v3 5/6] fib: " Mattias Rönnblom
  2024-06-20 11:50               ` [PATCH v3 6/6] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
  5 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 11:50 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson, Mattias Rönnblom

The distributor library relied on <rte_vect.h>, but failed to provide
a direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 lib/distributor/rte_distributor.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/distributor/rte_distributor.c b/lib/distributor/rte_distributor.c
index e58727cdc2..1389efc03f 100644
--- a/lib/distributor/rte_distributor.c
+++ b/lib/distributor/rte_distributor.c
@@ -15,6 +15,7 @@
 #include <rte_eal_memconfig.h>
 #include <rte_pause.h>
 #include <rte_tailq.h>
+#include <rte_vect.h>
 
 #include "rte_distributor.h"
 #include "rte_distributor_single.h"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v3 5/6] fib: add missing vector API header include
  2024-06-20 11:50             ` [PATCH v3 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                 ` (3 preceding siblings ...)
  2024-06-20 11:50               ` [PATCH v3 4/6] distributor: " Mattias Rönnblom
@ 2024-06-20 11:50               ` Mattias Rönnblom
  2024-06-20 11:50               ` [PATCH v3 6/6] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
  5 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 11:50 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson, Mattias Rönnblom

The trie implementation of the fib library relied on <rte_vect.h>, but
failed to provide a direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 lib/fib/trie.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/fib/trie.c b/lib/fib/trie.c
index 09470e7287..74db8863df 100644
--- a/lib/fib/trie.c
+++ b/lib/fib/trie.c
@@ -9,6 +9,7 @@
 #include <rte_debug.h>
 #include <rte_malloc.h>
 #include <rte_errno.h>
+#include <rte_vect.h>
 
 #include <rte_rib6.h>
 #include <rte_fib6.h>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v3 6/6] eal: provide option to use compiler memcpy instead of RTE
  2024-06-20 11:50             ` [PATCH v3 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                 ` (4 preceding siblings ...)
  2024-06-20 11:50               ` [PATCH v3 5/6] fib: " Mattias Rönnblom
@ 2024-06-20 11:50               ` Mattias Rönnblom
  5 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 11:50 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson, Mattias Rönnblom

Provide build option to have functions in <rte_memcpy.h> delegate to
the standard compiler/libc memcpy(), instead of using the various
custom DPDK, handcrafted, per-architecture rte_memcpy()
implementations.

A new meson build option 'use_cc_memcpy' is added. By default,
the compiler/libc memcpy() is used.

The performance benefits of the custom DPDK rte_memcpy()
implementations have been diminishing with every compiler release, and
with current toolchains the use of a custom memcpy() implementation
may even be a liability.

This patch leaves an option to stay on the custom DPDK implementations,
should that prove beneficial for certain applications or architectures.

An additional benefit of this change is that compilers and static
analysis tools have an easier time detecting incorrect usage of
rte_memcpy() (e.g., buffer overruns, or overlapping source and
destination buffers).

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>

---

PATCH:
 o Add entry in release notes.
 o Update meson help text.

RFC v3:
 o Fix missing #endif on loongarch.
 o PPC and RISCV now implemented, meaning all architectures are supported.
 o Unnecessary <rte_vect.h> include is removed from <rte_memcpy.h>.

RFC v2:
 * Fix bug where rte_memcpy.h was not installed on x86.
 * Made attempt to make Loongarch compile.
---
 config/meson.build                     |  1 +
 doc/guides/rel_notes/release_24_07.rst | 21 +++++++++
 lib/eal/arm/include/rte_memcpy.h       | 10 +++++
 lib/eal/include/generic/rte_memcpy.h   | 61 +++++++++++++++++++++++---
 lib/eal/loongarch/include/rte_memcpy.h | 53 ++--------------------
 lib/eal/ppc/include/rte_memcpy.h       | 10 +++++
 lib/eal/riscv/include/rte_memcpy.h     | 53 ++--------------------
 lib/eal/x86/include/meson.build        |  1 +
 lib/eal/x86/include/rte_memcpy.h       | 11 ++++-
 meson_options.txt                      |  2 +
 10 files changed, 117 insertions(+), 106 deletions(-)

diff --git a/config/meson.build b/config/meson.build
index 8c8b019c25..456056628e 100644
--- a/config/meson.build
+++ b/config/meson.build
@@ -353,6 +353,7 @@ endforeach
 # set other values pulled from the build options
 dpdk_conf.set('RTE_MAX_ETHPORTS', get_option('max_ethports'))
 dpdk_conf.set('RTE_LIBEAL_USE_HPET', get_option('use_hpet'))
+dpdk_conf.set('RTE_USE_CC_MEMCPY', get_option('use_cc_memcpy'))
 dpdk_conf.set('RTE_ENABLE_STDATOMIC', get_option('enable_stdatomic'))
 dpdk_conf.set('RTE_ENABLE_TRACE_FP', get_option('enable_trace_fp'))
 dpdk_conf.set('RTE_PKTMBUF_HEADROOM', get_option('pkt_mbuf_headroom'))
diff --git a/doc/guides/rel_notes/release_24_07.rst b/doc/guides/rel_notes/release_24_07.rst
index 7c88de381b..ebe0085d8b 100644
--- a/doc/guides/rel_notes/release_24_07.rst
+++ b/doc/guides/rel_notes/release_24_07.rst
@@ -24,6 +24,27 @@ DPDK Release 24.07
 New Features
 ------------
 
+* **Compiler memcpy replaces custom DPDK implementation.**
+
+  The memory copy functions of ``<rte_memcpy.h>`` now delegate to the
+  standard memcpy() function, implemented by the compiler and the C
+  runtime (e.g., libc).
+
+  In this release of DPDK, the handcrafted, per-architecture memory
+  copy implementations are still available, and may be reactivated by
+  setting the new ``use_cc_memcpy`` build option to false.
+
+  The performance benefits of the custom DPDK rte_memcpy()
+  implementations have been diminishing with every new compiler
+  release, and with current toolchains the use of a custom memcpy()
+  implementation may even result in worse performance than the
+  standard memcpy().
+
+  An additional benefit of this change is that compilers and static
+  analysis tools have an easier time detecting incorrect usage of
+  rte_memcpy() (e.g., buffer overruns, or overlapping source and
+  destination buffers).
+
 .. This section should contain new features added in this release.
    Sample format:
 
diff --git a/lib/eal/arm/include/rte_memcpy.h b/lib/eal/arm/include/rte_memcpy.h
index 47dea9a8cc..e8aff722df 100644
--- a/lib/eal/arm/include/rte_memcpy.h
+++ b/lib/eal/arm/include/rte_memcpy.h
@@ -5,10 +5,20 @@
 #ifndef _RTE_MEMCPY_ARM_H_
 #define _RTE_MEMCPY_ARM_H_
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #ifdef RTE_ARCH_64
 #include <rte_memcpy_64.h>
 #else
 #include <rte_memcpy_32.h>
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_ARM_H_ */
diff --git a/lib/eal/include/generic/rte_memcpy.h b/lib/eal/include/generic/rte_memcpy.h
index e7f0f8eaa9..cae06117fb 100644
--- a/lib/eal/include/generic/rte_memcpy.h
+++ b/lib/eal/include/generic/rte_memcpy.h
@@ -5,12 +5,19 @@
 #ifndef _RTE_MEMCPY_H_
 #define _RTE_MEMCPY_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /**
  * @file
  *
  * Functions for vectorised implementation of memcpy().
  */
 
+#include <stdint.h>
+#include <string.h>
+
 /**
  * Copy 16 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -35,8 +42,6 @@ rte_mov16(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src);
 
-#ifdef __DOXYGEN__
-
 /**
  * Copy 48 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -49,8 +54,6 @@ rte_mov32(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov48(uint8_t *dst, const uint8_t *src);
 
-#endif /* __DOXYGEN__ */
-
 /**
  * Copy 64 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -87,8 +90,6 @@ rte_mov128(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src);
 
-#ifdef __DOXYGEN__
-
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
@@ -111,6 +112,52 @@ rte_mov256(uint8_t *dst, const uint8_t *src);
 static void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
-#endif /* __DOXYGEN__ */
+#ifdef RTE_USE_CC_MEMCPY
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 16);
+}
+
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 32);
+}
+
+static inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 48);
+}
+
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 64);
+}
+
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 128);
+}
+
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 256);
+}
+
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+	return memcpy(dst, src, n);
+}
+#endif /* RTE_USE_CC_MEMCPY */
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif /* _RTE_MEMCPY_H_ */
diff --git a/lib/eal/loongarch/include/rte_memcpy.h b/lib/eal/loongarch/include/rte_memcpy.h
index 22578d40f4..344b4416b5 100644
--- a/lib/eal/loongarch/include/rte_memcpy.h
+++ b/lib/eal/loongarch/include/rte_memcpy.h
@@ -5,57 +5,12 @@
 #ifndef RTE_MEMCPY_LOONGARCH_H
 #define RTE_MEMCPY_LOONGARCH_H
 
-#include <stdint.h>
-#include <string.h>
+#include <rte_config.h>
 
-#include "rte_common.h"
-
-#ifdef __cplusplus
-extern "C" {
+#ifndef RTE_USE_CC_MEMCPY
+#define RTE_USE_CC_MEMCPY
 #endif
 
-#include "generic/rte_memcpy.h"
-
-static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 16);
-}
-
-static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 32);
-}
-
-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 48);
-}
-
-static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 64);
-}
-
-static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 128);
-}
-
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 256);
-}
-
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
-
-#ifdef __cplusplus
-}
-#endif
+#include <generic/rte_memcpy.h>
 
 #endif /* RTE_MEMCPY_LOONGARCH_H */
diff --git a/lib/eal/ppc/include/rte_memcpy.h b/lib/eal/ppc/include/rte_memcpy.h
index 6f388c0234..645fd83986 100644
--- a/lib/eal/ppc/include/rte_memcpy.h
+++ b/lib/eal/ppc/include/rte_memcpy.h
@@ -6,6 +6,14 @@
 #ifndef _RTE_MEMCPY_PPC_64_H_
 #define _RTE_MEMCPY_PPC_64_H_
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #include <stdint.h>
 #include <string.h>
 
@@ -215,4 +223,6 @@ rte_memcpy_func(void *dst, const void *src, size_t n)
 }
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_PPC_64_H_ */
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index e34f19396e..4acdc4af5f 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -7,57 +7,12 @@
 #ifndef RTE_MEMCPY_RISCV_H
 #define RTE_MEMCPY_RISCV_H
 
-#include <stdint.h>
-#include <string.h>
+#include <rte_config.h>
 
-#include "rte_common.h"
-
-#ifdef __cplusplus
-extern "C" {
+#ifndef RTE_USE_CC_MEMCPY
+#define RTE_USE_CC_MEMCPY
 #endif
 
-#include "generic/rte_memcpy.h"
-
-static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 16);
-}
-
-static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 32);
-}
-
-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 48);
-}
-
-static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 64);
-}
-
-static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 128);
-}
-
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 256);
-}
-
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
-
-#ifdef __cplusplus
-}
-#endif
+#include <generic/rte_memcpy.h>
 
 #endif /* RTE_MEMCPY_RISCV_H */
diff --git a/lib/eal/x86/include/meson.build b/lib/eal/x86/include/meson.build
index 52d2f8e969..09c2fe2485 100644
--- a/lib/eal/x86/include/meson.build
+++ b/lib/eal/x86/include/meson.build
@@ -16,6 +16,7 @@ arch_headers = files(
         'rte_spinlock.h',
         'rte_vect.h',
 )
+
 arch_indirect_headers = files(
         'rte_atomic_32.h',
         'rte_atomic_64.h',
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 72a92290e0..c5ba74d2ed 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -11,12 +11,19 @@
  * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
  */
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #include <stdio.h>
 #include <stdint.h>
 #include <string.h>
 #include <rte_vect.h>
 #include <rte_common.h>
-#include <rte_config.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -878,4 +885,6 @@ rte_memcpy(void *dst, const void *src, size_t n)
 }
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_X86_64_H_ */
diff --git a/meson_options.txt b/meson_options.txt
index e49b2fc089..06f544b631 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -60,3 +60,5 @@ option('tests', type: 'boolean', value: true, description:
        'build unit tests')
 option('use_hpet', type: 'boolean', value: false, description:
        'use HPET timer in EAL')
+option('use_cc_memcpy', type: 'boolean', value: true, description:
+       'Have the functions of <rte_memcpy.h> delegate to compiler/libc memcpy() instead of using custom implementation.')
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v2 1/6] net/fm10k: add missing intrinsic include
  2024-06-20 11:40               ` Mattias Rönnblom
@ 2024-06-20 11:59                 ` Bruce Richardson
  0 siblings, 0 replies; 128+ messages in thread
From: Bruce Richardson @ 2024-06-20 11:59 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: Mattias Rönnblom, dev, Morten Brørup,
	Stephen Hemminger, Abdullah Sevincer, Pavan Nikhilesh,
	David Hunt, Vladimir Medvedkin

On Thu, Jun 20, 2024 at 01:40:42PM +0200, Mattias Rönnblom wrote:
> On 2024-06-20 11:28, Bruce Richardson wrote:
> > On Thu, Jun 20, 2024 at 09:24:47AM +0200, Mattias Rönnblom wrote:
> > > Add missing <emmintrin.h> include, to get the _mm_cvtsi128_si64
> > > prototype.
> > > 
> > > Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> > > ---
> > >   drivers/net/fm10k/fm10k_rxtx_vec.c | 1 +
> > >   1 file changed, 1 insertion(+)
> > > 
> > > diff --git a/drivers/net/fm10k/fm10k_rxtx_vec.c b/drivers/net/fm10k/fm10k_rxtx_vec.c
> > > index 2b6914b1da..d417b31bbb 100644
> > > --- a/drivers/net/fm10k/fm10k_rxtx_vec.c
> > > +++ b/drivers/net/fm10k/fm10k_rxtx_vec.c
> > > @@ -10,6 +10,7 @@
> > >   #include "base/fm10k_type.h"
> > >   #include <tmmintrin.h>
> > > +#include <emmintrin.h>
> > Beyond my ack of this patch, a small suggestion is to just include
> > rte_vect.h rather than trying to include specific x86-intrinsics headers.
> > 
> > My ack remains with or without taking on board this suggestion.
> > 
> > /Bruce
> 
> I will do that, and hope it will magically solve the
> _mm_cvtsi128_si64-on-32-bit-x86 issue.

I was looking at that, and it does solve it in my testing. There are a lot
of drivers that have just "tmmintrin.h" included. Changing all of those to
rte_vect.h allows 32bit to build with your other changes applied.

/Bruce

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v3 1/6] net/fm10k: add missing vector API header include
  2024-06-20 11:50               ` [PATCH v3 1/6] net/fm10k: add missing vector API header include Mattias Rönnblom
@ 2024-06-20 12:34                 ` Bruce Richardson
  2024-06-20 17:57                 ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  1 sibling, 0 replies; 128+ messages in thread
From: Bruce Richardson @ 2024-06-20 12:34 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: dev, hofors, Morten Brørup, Stephen Hemminger,
	Abdullah Sevincer, Pavan Nikhilesh, David Hunt,
	Vladimir Medvedkin

On Thu, Jun 20, 2024 at 01:50:22PM +0200, Mattias Rönnblom wrote:
> The fm10k PMD relied on <rte_vect.h>, but failed to provide a direct
> include of this file.
> 
> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> ---
>  drivers/net/fm10k/fm10k_rxtx_vec.c | 1 +
>  1 file changed, 1 insertion(+)
> 
To fix 32-bit builds, more than just this driver needs to be fixed. See
https://patches.dpdk.org/project/dpdk/patch/20240620123218.1936250-1-bruce.richardson@intel.com/

Feel free to include this patch in new revisions of your patchset, if it
simplifies things for you.

/Bruce

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v2 5/6] fib: properly include vector API header file
  2024-06-20  9:14             ` Bruce Richardson
@ 2024-06-20 14:43               ` Stephen Hemminger
  0 siblings, 0 replies; 128+ messages in thread
From: Stephen Hemminger @ 2024-06-20 14:43 UTC (permalink / raw)
  To: Bruce Richardson
  Cc: Mattias Rönnblom, dev, hofors, Morten Brørup,
	Abdullah Sevincer, Pavan Nikhilesh, David Hunt,
	Vladimir Medvedkin

On Thu, 20 Jun 2024 10:14:18 +0100
Bruce Richardson <bruce.richardson@intel.com> wrote:

> On Thu, Jun 20, 2024 at 09:24:51AM +0200, Mattias Rönnblom wrote:
> > The trie implementation of the fib library relied on <rte_vect.h>, but
> > failed to provide a direct include of this file.
> > 
> > Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> > ---  
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>

Acked-by: Stephen Hemminger <stephen@networkplumber.org>

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v2 3/6] net/octeon_ep: properly include vector API header file
  2024-06-20  7:24           ` [PATCH v2 3/6] net/octeon_ep: properly include vector API header file Mattias Rönnblom
@ 2024-06-20 14:43             ` Stephen Hemminger
  0 siblings, 0 replies; 128+ messages in thread
From: Stephen Hemminger @ 2024-06-20 14:43 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: dev, hofors, Morten Brørup, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin

On Thu, 20 Jun 2024 09:24:49 +0200
Mattias Rönnblom <mattias.ronnblom@ericsson.com> wrote:

> The octeon_ip driver relied on <rte_vect.h>, but failed to provide a
> direct include of this file.
> 
> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> ---

Acked-by: Stephen Hemminger <stephen@networkplumber.org>

^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy
  2024-06-20 11:50               ` [PATCH v3 1/6] net/fm10k: add missing vector API header include Mattias Rönnblom
  2024-06-20 12:34                 ` Bruce Richardson
@ 2024-06-20 17:57                 ` Mattias Rönnblom
  2024-06-20 17:57                   ` [PATCH v4 01/13] net/i40e: add missing vector API header include Mattias Rönnblom
                                     ` (15 more replies)
  1 sibling, 16 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 17:57 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson, Mattias Rönnblom

This patch set makes DPDK library, driver, and application code use the
compiler/libc memcpy() by default when functions in <rte_memcpy.h> are
invoked.

The various custom DPDK rte_memcpy() implementations may be retained
by means of a build-time option.

This patch set only makes a difference on x86, PPC and ARM. Loongarch
and RISCV already used compiler/libc memcpy().

This patch set includes a number of fixes in drivers and libraries
which erroneously relied on <rte_memcpy.h> including header files
(i.e., <rte_vect.h>) required by its implementation.

Mattias Rönnblom (13):
  net/i40e: add missing vector API header include
  net/iavf: add missing vector API header include
  net/ice: add missing vector API header include
  net/ixgbe: add missing vector API header include
  net/ngbe: add missing vector API header include
  net/txgbe: add missing vector API header include
  net/virtio: add missing vector API header include
  net/fm10k: add missing vector API header include
  event/dlb2: include headers for vector and memory copy APIs
  net/octeon_ep: add missing vector API header include
  distributor: add missing vector API header include
  fib: add missing vector API header include
  eal: provide option to use compiler memcpy instead of RTE

 config/meson.build                          |  1 +
 doc/guides/rel_notes/release_24_07.rst      | 21 +++++++
 drivers/event/dlb2/dlb2.c                   |  2 +
 drivers/net/fm10k/fm10k_rxtx_vec.c          |  3 +-
 drivers/net/i40e/i40e_rxtx_vec_sse.c        |  3 +-
 drivers/net/iavf/iavf_rxtx_vec_sse.c        |  3 +-
 drivers/net/ice/ice_rxtx_vec_sse.c          |  2 +-
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c      |  3 +-
 drivers/net/ngbe/ngbe_rxtx_vec_sse.c        |  3 +-
 drivers/net/octeon_ep/otx_ep_ethdev.c       |  2 +
 drivers/net/txgbe/txgbe_rxtx_vec_sse.c      |  3 +-
 drivers/net/virtio/virtio_rxtx_simple_sse.c |  3 +-
 lib/distributor/rte_distributor.c           |  1 +
 lib/eal/arm/include/rte_memcpy.h            | 10 ++++
 lib/eal/include/generic/rte_memcpy.h        | 61 ++++++++++++++++++---
 lib/eal/loongarch/include/rte_memcpy.h      | 53 ++----------------
 lib/eal/ppc/include/rte_memcpy.h            | 10 ++++
 lib/eal/riscv/include/rte_memcpy.h          | 53 ++----------------
 lib/eal/x86/include/meson.build             |  1 +
 lib/eal/x86/include/rte_memcpy.h            | 11 +++-
 lib/fib/trie.c                              |  1 +
 meson_options.txt                           |  2 +
 22 files changed, 131 insertions(+), 121 deletions(-)

-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v4 01/13] net/i40e: add missing vector API header include
  2024-06-20 17:57                 ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
@ 2024-06-20 17:57                   ` Mattias Rönnblom
  2024-07-24  7:53                     ` [PATCH v5 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2024-06-20 17:57                   ` [PATCH v4 02/13] net/iavf: add missing vector API header include Mattias Rönnblom
                                     ` (14 subsequent siblings)
  15 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 17:57 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson, Mattias Rönnblom

The i40e driver relied on <rte_vect.h>, but failed to provide a direct
include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 drivers/net/i40e/i40e_rxtx_vec_sse.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 2d4480a765..0a0448544f 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -5,6 +5,7 @@
 #include <stdint.h>
 #include <ethdev_driver.h>
 #include <rte_malloc.h>
+#include <rte_vect.h>
 
 #include "base/i40e_prototype.h"
 #include "base/i40e_type.h"
@@ -12,8 +13,6 @@
 #include "i40e_rxtx.h"
 #include "i40e_rxtx_vec_common.h"
 
-#include <tmmintrin.h>
-
 #ifndef __INTEL_COMPILER
 #pragma GCC diagnostic ignored "-Wcast-qual"
 #endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v4 02/13] net/iavf: add missing vector API header include
  2024-06-20 17:57                 ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2024-06-20 17:57                   ` [PATCH v4 01/13] net/i40e: add missing vector API header include Mattias Rönnblom
@ 2024-06-20 17:57                   ` Mattias Rönnblom
  2024-06-20 17:57                   ` [PATCH v4 03/13] net/ice: " Mattias Rönnblom
                                     ` (13 subsequent siblings)
  15 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 17:57 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson, Mattias Rönnblom

The iavf driver relied on <rte_vect.h>, but failed to provide a direct
include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 drivers/net/iavf/iavf_rxtx_vec_sse.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx_vec_sse.c b/drivers/net/iavf/iavf_rxtx_vec_sse.c
index 96f187f511..75270876c1 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_sse.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_sse.c
@@ -5,13 +5,12 @@
 #include <stdint.h>
 #include <ethdev_driver.h>
 #include <rte_malloc.h>
+#include <rte_vect.h>
 
 #include "iavf.h"
 #include "iavf_rxtx.h"
 #include "iavf_rxtx_vec_common.h"
 
-#include <tmmintrin.h>
-
 #ifndef __INTEL_COMPILER
 #pragma GCC diagnostic ignored "-Wcast-qual"
 #endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v4 03/13] net/ice: add missing vector API header include
  2024-06-20 17:57                 ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2024-06-20 17:57                   ` [PATCH v4 01/13] net/i40e: add missing vector API header include Mattias Rönnblom
  2024-06-20 17:57                   ` [PATCH v4 02/13] net/iavf: add missing vector API header include Mattias Rönnblom
@ 2024-06-20 17:57                   ` Mattias Rönnblom
  2024-06-20 17:57                   ` [PATCH v4 04/13] net/ixgbe: " Mattias Rönnblom
                                     ` (12 subsequent siblings)
  15 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 17:57 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson, Mattias Rönnblom

The ice driver relied on <rte_vect.h>, but failed to provide a direct
include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 drivers/net/ice/ice_rxtx_vec_sse.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ice/ice_rxtx_vec_sse.c b/drivers/net/ice/ice_rxtx_vec_sse.c
index 9a1b7e3e51..c01d8ede29 100644
--- a/drivers/net/ice/ice_rxtx_vec_sse.c
+++ b/drivers/net/ice/ice_rxtx_vec_sse.c
@@ -4,7 +4,7 @@
 
 #include "ice_rxtx_vec_common.h"
 
-#include <tmmintrin.h>
+#include <rte_vect.h>
 
 #ifndef __INTEL_COMPILER
 #pragma GCC diagnostic ignored "-Wcast-qual"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v4 04/13] net/ixgbe: add missing vector API header include
  2024-06-20 17:57                 ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                     ` (2 preceding siblings ...)
  2024-06-20 17:57                   ` [PATCH v4 03/13] net/ice: " Mattias Rönnblom
@ 2024-06-20 17:57                   ` Mattias Rönnblom
  2024-06-20 17:57                   ` [PATCH v4 05/13] net/ngbe: " Mattias Rönnblom
                                     ` (11 subsequent siblings)
  15 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 17:57 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson, Mattias Rönnblom

The ixgbe driver relied on <rte_vect.h>, but failed to provide a
direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
index f60808d576..0f93f58745 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
@@ -5,13 +5,12 @@
 #include <stdint.h>
 #include <ethdev_driver.h>
 #include <rte_malloc.h>
+#include <rte_vect.h>
 
 #include "ixgbe_ethdev.h"
 #include "ixgbe_rxtx.h"
 #include "ixgbe_rxtx_vec_common.h"
 
-#include <tmmintrin.h>
-
 #ifndef __INTEL_COMPILER
 #pragma GCC diagnostic ignored "-Wcast-qual"
 #endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v4 05/13] net/ngbe: add missing vector API header include
  2024-06-20 17:57                 ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                     ` (3 preceding siblings ...)
  2024-06-20 17:57                   ` [PATCH v4 04/13] net/ixgbe: " Mattias Rönnblom
@ 2024-06-20 17:57                   ` Mattias Rönnblom
  2024-06-20 17:57                   ` [PATCH v4 06/13] net/txgbe: " Mattias Rönnblom
                                     ` (10 subsequent siblings)
  15 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 17:57 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson, Mattias Rönnblom

The ngbe driver relied on <rte_vect.h>, but failed to provide a direct
include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 drivers/net/ngbe/ngbe_rxtx_vec_sse.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ngbe/ngbe_rxtx_vec_sse.c b/drivers/net/ngbe/ngbe_rxtx_vec_sse.c
index f703d0ea15..80d0bedcdd 100644
--- a/drivers/net/ngbe/ngbe_rxtx_vec_sse.c
+++ b/drivers/net/ngbe/ngbe_rxtx_vec_sse.c
@@ -5,14 +5,13 @@
 
 #include <ethdev_driver.h>
 #include <rte_malloc.h>
+#include <rte_vect.h>
 
 #include "ngbe_type.h"
 #include "ngbe_ethdev.h"
 #include "ngbe_rxtx.h"
 #include "ngbe_rxtx_vec_common.h"
 
-#include <tmmintrin.h>
-
 static inline void
 ngbe_rxq_rearm(struct ngbe_rx_queue *rxq)
 {
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v4 06/13] net/txgbe: add missing vector API header include
  2024-06-20 17:57                 ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                     ` (4 preceding siblings ...)
  2024-06-20 17:57                   ` [PATCH v4 05/13] net/ngbe: " Mattias Rönnblom
@ 2024-06-20 17:57                   ` Mattias Rönnblom
  2024-06-20 17:57                   ` [PATCH v4 07/13] net/virtio: " Mattias Rönnblom
                                     ` (9 subsequent siblings)
  15 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 17:57 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson, Mattias Rönnblom

The txgbe driver relied on <rte_vect.h>, but failed to provide a
direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 drivers/net/txgbe/txgbe_rxtx_vec_sse.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/txgbe/txgbe_rxtx_vec_sse.c b/drivers/net/txgbe/txgbe_rxtx_vec_sse.c
index 12eb4aeef5..d5f60ec92e 100644
--- a/drivers/net/txgbe/txgbe_rxtx_vec_sse.c
+++ b/drivers/net/txgbe/txgbe_rxtx_vec_sse.c
@@ -5,13 +5,12 @@
 
 #include <ethdev_driver.h>
 #include <rte_malloc.h>
+#include <rte_vect.h>
 
 #include "txgbe_ethdev.h"
 #include "txgbe_rxtx.h"
 #include "txgbe_rxtx_vec_common.h"
 
-#include <tmmintrin.h>
-
 static inline void
 txgbe_rxq_rearm(struct txgbe_rx_queue *rxq)
 {
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v4 07/13] net/virtio: add missing vector API header include
  2024-06-20 17:57                 ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                     ` (5 preceding siblings ...)
  2024-06-20 17:57                   ` [PATCH v4 06/13] net/txgbe: " Mattias Rönnblom
@ 2024-06-20 17:57                   ` Mattias Rönnblom
  2024-06-20 17:57                   ` [PATCH v4 08/13] net/fm10k: " Mattias Rönnblom
                                     ` (8 subsequent siblings)
  15 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 17:57 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson, Mattias Rönnblom

The virtio driver relied on <rte_vect.h>, but failed to provide a
direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 drivers/net/virtio/virtio_rxtx_simple_sse.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/virtio/virtio_rxtx_simple_sse.c b/drivers/net/virtio/virtio_rxtx_simple_sse.c
index 6a18741b6d..db84a308e4 100644
--- a/drivers/net/virtio/virtio_rxtx_simple_sse.c
+++ b/drivers/net/virtio/virtio_rxtx_simple_sse.c
@@ -8,8 +8,6 @@
 #include <string.h>
 #include <errno.h>
 
-#include <tmmintrin.h>
-
 #include <rte_byteorder.h>
 #include <rte_branch_prediction.h>
 #include <rte_cycles.h>
@@ -22,6 +20,7 @@
 #include <rte_mbuf.h>
 #include <rte_prefetch.h>
 #include <rte_string_fns.h>
+#include <rte_vect.h>
 
 #include "virtio_rxtx_simple.h"
 
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v4 08/13] net/fm10k: add missing vector API header include
  2024-06-20 17:57                 ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                     ` (6 preceding siblings ...)
  2024-06-20 17:57                   ` [PATCH v4 07/13] net/virtio: " Mattias Rönnblom
@ 2024-06-20 17:57                   ` Mattias Rönnblom
  2024-06-20 17:57                   ` [PATCH v4 09/13] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
                                     ` (7 subsequent siblings)
  15 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 17:57 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson, Mattias Rönnblom

The fm10k PMD relied on <rte_vect.h>, but failed to provide a direct
include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/net/fm10k/fm10k_rxtx_vec.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/fm10k/fm10k_rxtx_vec.c b/drivers/net/fm10k/fm10k_rxtx_vec.c
index 2b6914b1da..6be8822284 100644
--- a/drivers/net/fm10k/fm10k_rxtx_vec.c
+++ b/drivers/net/fm10k/fm10k_rxtx_vec.c
@@ -6,11 +6,10 @@
 
 #include <ethdev_driver.h>
 #include <rte_common.h>
+#include <rte_vect.h>
 #include "fm10k.h"
 #include "base/fm10k_type.h"
 
-#include <tmmintrin.h>
-
 #ifndef __INTEL_COMPILER
 #pragma GCC diagnostic ignored "-Wcast-qual"
 #endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v4 09/13] event/dlb2: include headers for vector and memory copy APIs
  2024-06-20 17:57                 ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                     ` (7 preceding siblings ...)
  2024-06-20 17:57                   ` [PATCH v4 08/13] net/fm10k: " Mattias Rönnblom
@ 2024-06-20 17:57                   ` Mattias Rönnblom
  2024-06-20 17:57                   ` [PATCH v4 10/13] net/octeon_ep: add missing vector API header include Mattias Rönnblom
                                     ` (6 subsequent siblings)
  15 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 17:57 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson, Mattias Rönnblom

The DLB2 PMD depended on <rte_vect.h> being included as a side-effect
of <rte_memcpy.h> being included.

In addition, DLB2 used rte_memcpy() but did not include <rte_memcpy.h>,
but rather depended on other include files to do so.

This patch addresses both of those issues.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/event/dlb2/dlb2.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 0b91f03956..19f90b8f8d 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -25,11 +25,13 @@
 #include <rte_kvargs.h>
 #include <rte_log.h>
 #include <rte_malloc.h>
+#include <rte_memcpy.h>
 #include <rte_mbuf.h>
 #include <rte_power_intrinsics.h>
 #include <rte_prefetch.h>
 #include <rte_ring.h>
 #include <rte_string_fns.h>
+#include <rte_vect.h>
 
 #include "dlb2_priv.h"
 #include "dlb2_iface.h"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v4 10/13] net/octeon_ep: add missing vector API header include
  2024-06-20 17:57                 ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                     ` (8 preceding siblings ...)
  2024-06-20 17:57                   ` [PATCH v4 09/13] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
@ 2024-06-20 17:57                   ` Mattias Rönnblom
  2024-06-20 17:57                   ` [PATCH v4 11/13] distributor: " Mattias Rönnblom
                                     ` (5 subsequent siblings)
  15 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 17:57 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson, Mattias Rönnblom

The octeon_ep driver relied on <rte_vect.h>, but failed to provide a
direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
---
 drivers/net/octeon_ep/otx_ep_ethdev.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c
index 46211361a0..b069216629 100644
--- a/drivers/net/octeon_ep/otx_ep_ethdev.c
+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c
@@ -5,6 +5,8 @@
 #include <inttypes.h>
 #include <ethdev_pci.h>
 
+#include <rte_vect.h>
+
 #include "otx_ep_common.h"
 #include "otx_ep_vf.h"
 #include "otx2_ep_vf.h"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v4 11/13] distributor: add missing vector API header include
  2024-06-20 17:57                 ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                     ` (9 preceding siblings ...)
  2024-06-20 17:57                   ` [PATCH v4 10/13] net/octeon_ep: add missing vector API header include Mattias Rönnblom
@ 2024-06-20 17:57                   ` Mattias Rönnblom
  2024-06-20 17:57                   ` [PATCH v4 12/13] fib: " Mattias Rönnblom
                                     ` (4 subsequent siblings)
  15 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 17:57 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson, Mattias Rönnblom

The distributor library relied on <rte_vect.h>, but failed to provide
a direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 lib/distributor/rte_distributor.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/distributor/rte_distributor.c b/lib/distributor/rte_distributor.c
index e58727cdc2..1389efc03f 100644
--- a/lib/distributor/rte_distributor.c
+++ b/lib/distributor/rte_distributor.c
@@ -15,6 +15,7 @@
 #include <rte_eal_memconfig.h>
 #include <rte_pause.h>
 #include <rte_tailq.h>
+#include <rte_vect.h>
 
 #include "rte_distributor.h"
 #include "rte_distributor_single.h"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v4 12/13] fib: add missing vector API header include
  2024-06-20 17:57                 ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                     ` (10 preceding siblings ...)
  2024-06-20 17:57                   ` [PATCH v4 11/13] distributor: " Mattias Rönnblom
@ 2024-06-20 17:57                   ` Mattias Rönnblom
  2024-06-20 17:57                   ` [PATCH v4 13/13] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
                                     ` (3 subsequent siblings)
  15 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 17:57 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson, Mattias Rönnblom

The trie implementation of the fib library relied on <rte_vect.h>, but
failed to provide a direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/fib/trie.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/fib/trie.c b/lib/fib/trie.c
index 09470e7287..74db8863df 100644
--- a/lib/fib/trie.c
+++ b/lib/fib/trie.c
@@ -9,6 +9,7 @@
 #include <rte_debug.h>
 #include <rte_malloc.h>
 #include <rte_errno.h>
+#include <rte_vect.h>
 
 #include <rte_rib6.h>
 #include <rte_fib6.h>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v4 13/13] eal: provide option to use compiler memcpy instead of RTE
  2024-06-20 17:57                 ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                     ` (11 preceding siblings ...)
  2024-06-20 17:57                   ` [PATCH v4 12/13] fib: " Mattias Rönnblom
@ 2024-06-20 17:57                   ` Mattias Rönnblom
  2024-06-21 15:19                     ` Stephen Hemminger
  2024-06-24 10:05                     ` Thomas Monjalon
  2024-06-20 18:53                   ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Morten Brørup
                                     ` (2 subsequent siblings)
  15 siblings, 2 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-20 17:57 UTC (permalink / raw)
  To: dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson, Mattias Rönnblom

Provide build option to have functions in <rte_memcpy.h> delegate to
the standard compiler/libc memcpy(), instead of using the various
custom DPDK, handcrafted, per-architecture rte_memcpy()
implementations.

A new meson build option 'use_cc_memcpy' is added. By default,
the compiler/libc memcpy() is used.

The performance benefits of the custom DPDK rte_memcpy()
implementations have been diminishing with every compiler release, and
with current toolchains the use of a custom memcpy() implementation
may even be a liability.

This patch leaves an option to stay on the custom DPDK implementations,
should that prove beneficial for certain applications or architectures.

An additional benefit of this change is that compilers and static
analysis tools have an easier time detecting incorrect usage of
rte_memcpy() (e.g., buffer overruns, or overlapping source and
destination buffers).

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>

---

PATCH:
 o Add entry in release notes.
 o Update meson help text.

RFC v3:
 o Fix missing #endif on loongarch.
 o PPC and RISCV now implemented, meaning all architectures are supported.
 o Unnecessary <rte_vect.h> include is removed from <rte_memcpy.h>.

RFC v2:
 * Fix bug where rte_memcpy.h was not installed on x86.
 * Made attempt to make Loongarch compile.
---
 config/meson.build                     |  1 +
 doc/guides/rel_notes/release_24_07.rst | 21 +++++++++
 lib/eal/arm/include/rte_memcpy.h       | 10 +++++
 lib/eal/include/generic/rte_memcpy.h   | 61 +++++++++++++++++++++++---
 lib/eal/loongarch/include/rte_memcpy.h | 53 ++--------------------
 lib/eal/ppc/include/rte_memcpy.h       | 10 +++++
 lib/eal/riscv/include/rte_memcpy.h     | 53 ++--------------------
 lib/eal/x86/include/meson.build        |  1 +
 lib/eal/x86/include/rte_memcpy.h       | 11 ++++-
 meson_options.txt                      |  2 +
 10 files changed, 117 insertions(+), 106 deletions(-)

diff --git a/config/meson.build b/config/meson.build
index 8c8b019c25..456056628e 100644
--- a/config/meson.build
+++ b/config/meson.build
@@ -353,6 +353,7 @@ endforeach
 # set other values pulled from the build options
 dpdk_conf.set('RTE_MAX_ETHPORTS', get_option('max_ethports'))
 dpdk_conf.set('RTE_LIBEAL_USE_HPET', get_option('use_hpet'))
+dpdk_conf.set('RTE_USE_CC_MEMCPY', get_option('use_cc_memcpy'))
 dpdk_conf.set('RTE_ENABLE_STDATOMIC', get_option('enable_stdatomic'))
 dpdk_conf.set('RTE_ENABLE_TRACE_FP', get_option('enable_trace_fp'))
 dpdk_conf.set('RTE_PKTMBUF_HEADROOM', get_option('pkt_mbuf_headroom'))
diff --git a/doc/guides/rel_notes/release_24_07.rst b/doc/guides/rel_notes/release_24_07.rst
index 7c88de381b..ebe0085d8b 100644
--- a/doc/guides/rel_notes/release_24_07.rst
+++ b/doc/guides/rel_notes/release_24_07.rst
@@ -24,6 +24,27 @@ DPDK Release 24.07
 New Features
 ------------
 
+* **Compiler memcpy replaces custom DPDK implementation.**
+
+  The memory copy functions of ``<rte_memcpy.h>`` now delegate to the
+  standard memcpy() function, implemented by the compiler and the C
+  runtime (e.g., libc).
+
+  In this release of DPDK, the handcrafted, per-architecture memory
+  copy implementations are still available, and may be reactivated by
+  setting the new ``use_cc_memcpy`` build option to false.
+
+  The performance benefits of the custom DPDK rte_memcpy()
+  implementations have been diminishing with every new compiler
+  release, and with current toolchains the use of a custom memcpy()
+  implementation may even result in worse performance than the
+  standard memcpy().
+
+  An additional benefit of this change is that compilers and static
+  analysis tools have an easier time detecting incorrect usage of
+  rte_memcpy() (e.g., buffer overruns, or overlapping source and
+  destination buffers).
+
 .. This section should contain new features added in this release.
    Sample format:
 
diff --git a/lib/eal/arm/include/rte_memcpy.h b/lib/eal/arm/include/rte_memcpy.h
index 47dea9a8cc..e8aff722df 100644
--- a/lib/eal/arm/include/rte_memcpy.h
+++ b/lib/eal/arm/include/rte_memcpy.h
@@ -5,10 +5,20 @@
 #ifndef _RTE_MEMCPY_ARM_H_
 #define _RTE_MEMCPY_ARM_H_
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #ifdef RTE_ARCH_64
 #include <rte_memcpy_64.h>
 #else
 #include <rte_memcpy_32.h>
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_ARM_H_ */
diff --git a/lib/eal/include/generic/rte_memcpy.h b/lib/eal/include/generic/rte_memcpy.h
index e7f0f8eaa9..cae06117fb 100644
--- a/lib/eal/include/generic/rte_memcpy.h
+++ b/lib/eal/include/generic/rte_memcpy.h
@@ -5,12 +5,19 @@
 #ifndef _RTE_MEMCPY_H_
 #define _RTE_MEMCPY_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /**
  * @file
  *
  * Functions for vectorised implementation of memcpy().
  */
 
+#include <stdint.h>
+#include <string.h>
+
 /**
  * Copy 16 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -35,8 +42,6 @@ rte_mov16(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src);
 
-#ifdef __DOXYGEN__
-
 /**
  * Copy 48 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -49,8 +54,6 @@ rte_mov32(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov48(uint8_t *dst, const uint8_t *src);
 
-#endif /* __DOXYGEN__ */
-
 /**
  * Copy 64 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -87,8 +90,6 @@ rte_mov128(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src);
 
-#ifdef __DOXYGEN__
-
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
@@ -111,6 +112,52 @@ rte_mov256(uint8_t *dst, const uint8_t *src);
 static void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
-#endif /* __DOXYGEN__ */
+#ifdef RTE_USE_CC_MEMCPY
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 16);
+}
+
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 32);
+}
+
+static inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 48);
+}
+
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 64);
+}
+
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 128);
+}
+
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 256);
+}
+
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+	return memcpy(dst, src, n);
+}
+#endif /* RTE_USE_CC_MEMCPY */
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif /* _RTE_MEMCPY_H_ */
diff --git a/lib/eal/loongarch/include/rte_memcpy.h b/lib/eal/loongarch/include/rte_memcpy.h
index 22578d40f4..344b4416b5 100644
--- a/lib/eal/loongarch/include/rte_memcpy.h
+++ b/lib/eal/loongarch/include/rte_memcpy.h
@@ -5,57 +5,12 @@
 #ifndef RTE_MEMCPY_LOONGARCH_H
 #define RTE_MEMCPY_LOONGARCH_H
 
-#include <stdint.h>
-#include <string.h>
+#include <rte_config.h>
 
-#include "rte_common.h"
-
-#ifdef __cplusplus
-extern "C" {
+#ifndef RTE_USE_CC_MEMCPY
+#define RTE_USE_CC_MEMCPY
 #endif
 
-#include "generic/rte_memcpy.h"
-
-static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 16);
-}
-
-static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 32);
-}
-
-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 48);
-}
-
-static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 64);
-}
-
-static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 128);
-}
-
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 256);
-}
-
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
-
-#ifdef __cplusplus
-}
-#endif
+#include <generic/rte_memcpy.h>
 
 #endif /* RTE_MEMCPY_LOONGARCH_H */
diff --git a/lib/eal/ppc/include/rte_memcpy.h b/lib/eal/ppc/include/rte_memcpy.h
index 6f388c0234..645fd83986 100644
--- a/lib/eal/ppc/include/rte_memcpy.h
+++ b/lib/eal/ppc/include/rte_memcpy.h
@@ -6,6 +6,14 @@
 #ifndef _RTE_MEMCPY_PPC_64_H_
 #define _RTE_MEMCPY_PPC_64_H_
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #include <stdint.h>
 #include <string.h>
 
@@ -215,4 +223,6 @@ rte_memcpy_func(void *dst, const void *src, size_t n)
 }
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_PPC_64_H_ */
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index e34f19396e..4acdc4af5f 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -7,57 +7,12 @@
 #ifndef RTE_MEMCPY_RISCV_H
 #define RTE_MEMCPY_RISCV_H
 
-#include <stdint.h>
-#include <string.h>
+#include <rte_config.h>
 
-#include "rte_common.h"
-
-#ifdef __cplusplus
-extern "C" {
+#ifndef RTE_USE_CC_MEMCPY
+#define RTE_USE_CC_MEMCPY
 #endif
 
-#include "generic/rte_memcpy.h"
-
-static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 16);
-}
-
-static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 32);
-}
-
-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 48);
-}
-
-static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 64);
-}
-
-static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 128);
-}
-
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 256);
-}
-
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
-
-#ifdef __cplusplus
-}
-#endif
+#include <generic/rte_memcpy.h>
 
 #endif /* RTE_MEMCPY_RISCV_H */
diff --git a/lib/eal/x86/include/meson.build b/lib/eal/x86/include/meson.build
index 52d2f8e969..09c2fe2485 100644
--- a/lib/eal/x86/include/meson.build
+++ b/lib/eal/x86/include/meson.build
@@ -16,6 +16,7 @@ arch_headers = files(
         'rte_spinlock.h',
         'rte_vect.h',
 )
+
 arch_indirect_headers = files(
         'rte_atomic_32.h',
         'rte_atomic_64.h',
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 72a92290e0..c5ba74d2ed 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -11,12 +11,19 @@
  * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
  */
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #include <stdio.h>
 #include <stdint.h>
 #include <string.h>
 #include <rte_vect.h>
 #include <rte_common.h>
-#include <rte_config.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -878,4 +885,6 @@ rte_memcpy(void *dst, const void *src, size_t n)
 }
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_X86_64_H_ */
diff --git a/meson_options.txt b/meson_options.txt
index e49b2fc089..06f544b631 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -60,3 +60,5 @@ option('tests', type: 'boolean', value: true, description:
        'build unit tests')
 option('use_hpet', type: 'boolean', value: false, description:
        'use HPET timer in EAL')
+option('use_cc_memcpy', type: 'boolean', value: true, description:
+       'Have the functions of <rte_memcpy.h> delegate to compiler/libc memcpy() instead of using custom implementation.')
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* RE: [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy
  2024-06-20 17:57                 ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                     ` (12 preceding siblings ...)
  2024-06-20 17:57                   ` [PATCH v4 13/13] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
@ 2024-06-20 18:53                   ` Morten Brørup
  2024-06-21  6:56                   ` Mattias Rönnblom
  2024-06-25 15:29                   ` Maxime Coquelin
  15 siblings, 0 replies; 128+ messages in thread
From: Morten Brørup @ 2024-06-20 18:53 UTC (permalink / raw)
  To: Mattias Rönnblom, dev
  Cc: hofors, Stephen Hemminger, Abdullah Sevincer, Pavan Nikhilesh,
	David Hunt, Vladimir Medvedkin, Bruce Richardson

> From: Mattias Rönnblom [mailto:mattias.ronnblom@ericsson.com]
> 
> This patch set make DPDK library, driver, and application code use the
> compiler/libc memcpy() by default when functions in <rte_memcpy.h> are
> invoked.
> 
> The various custom DPDK rte_memcpy() implementations may be retained
> by means of a build-time option.
> 
> This patch set only make a difference on x86, PPC and ARM. Loongarch
> and RISCV already used compiler/libc memcpy().
> 
> This patch set includes a number of fixes in drivers and libraries
> which errornously relied on <rte_memcpy.h> including header files
> (i.e., <rte_vect.h>) required by its implementation.

Series-acked-by: Morten Brørup <mb@smartsharesystems.com>


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy
  2024-06-20 17:57                 ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                     ` (13 preceding siblings ...)
  2024-06-20 18:53                   ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Morten Brørup
@ 2024-06-21  6:56                   ` Mattias Rönnblom
  2024-06-21  7:04                     ` David Marchand
  2024-06-25 15:29                   ` Maxime Coquelin
  15 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-21  6:56 UTC (permalink / raw)
  To: Mattias Rönnblom, dev
  Cc: Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson

It seems like patchwork didn't realize this v4 patch set supersedes v3.

I'm trying to figure out what I did wrong. Must you use the cover 
letter's message id in order for patchwork to recognize the history?

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy
  2024-06-21  6:56                   ` Mattias Rönnblom
@ 2024-06-21  7:04                     ` David Marchand
  2024-06-21  7:35                       ` Mattias Rönnblom
  0 siblings, 1 reply; 128+ messages in thread
From: David Marchand @ 2024-06-21  7:04 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: Mattias Rönnblom, dev, Morten Brørup,
	Stephen Hemminger, Abdullah Sevincer, Pavan Nikhilesh,
	David Hunt, Vladimir Medvedkin, Bruce Richardson

On Fri, Jun 21, 2024 at 8:57 AM Mattias Rönnblom <hofors@lysator.liu.se> wrote:
>
> It seems like patchwork didn't realize this v4 patch set supersedes v3.

There is nothing automatic in patchwork.
Author is responsible for marking as superseded in patchwork.

I usually go around in patchwork in the morning and mark superseded
stuff on my own.
So please wait for the robot David to do the job... :-).

>
> I'm trying to figure out what I did wrong. Must you use the cover
> letter's message id in order for patchwork to recognize the history?
>


-- 
David Marchand


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy
  2024-06-21  7:04                     ` David Marchand
@ 2024-06-21  7:35                       ` Mattias Rönnblom
  2024-06-21  7:41                         ` David Marchand
  0 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-21  7:35 UTC (permalink / raw)
  To: David Marchand
  Cc: Mattias Rönnblom, dev, Morten Brørup,
	Stephen Hemminger, Abdullah Sevincer, Pavan Nikhilesh,
	David Hunt, Vladimir Medvedkin, Bruce Richardson

On 2024-06-21 09:04, David Marchand wrote:
> On Fri, Jun 21, 2024 at 8:57 AM Mattias Rönnblom <hofors@lysator.liu.se> wrote:
>>
>> It seems like patchwork didn't realize this v4 patch set supersedes v3.
> 
> There is nothing automatic in patchwork.

Oh, I see. :)

> Author is responsible for marking as superseded in patchwork.
> 
> I usually go around in patchwork in the morning and mark superseded
> stuff on my own.
> So please wait for the robot David to do the job... :-).
> 

Hmm. Author is responsible, but robot David does the actual work?

Do you prefer authors doing the work?

>>
>> I'm trying to figure out what I did wrong. Must you use the cover
>> letter's message id in order for patchwork to recognize the history?
>>
> 
> 

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy
  2024-06-21  7:35                       ` Mattias Rönnblom
@ 2024-06-21  7:41                         ` David Marchand
  0 siblings, 0 replies; 128+ messages in thread
From: David Marchand @ 2024-06-21  7:41 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: Mattias Rönnblom, dev, Morten Brørup,
	Stephen Hemminger, Abdullah Sevincer, Pavan Nikhilesh,
	David Hunt, Vladimir Medvedkin, Bruce Richardson

On Fri, Jun 21, 2024 at 9:36 AM Mattias Rönnblom <hofors@lysator.liu.se> wrote:
>
> On 2024-06-21 09:04, David Marchand wrote:
> > On Fri, Jun 21, 2024 at 8:57 AM Mattias Rönnblom <hofors@lysator.liu.se> wrote:
> >>
> >> It seems like patchwork didn't realize this v4 patch set supersedes v3.
> >
> > There is nothing automatic in patchwork.
>
> Oh, I see. :)
>
> > Author is responsible for marking as superseded in patchwork.
> >
> > I usually go around in patchwork in the morning and mark superseded
> > stuff on my own.
> > So please wait for the robot David to do the job... :-).
> >
>
> Hmm. Author is responsible, but robot David does the actual work?

I am pretty sure other subtree maintainers do the same for their part
of patchwork.

>
> Do you prefer authors doing the work?

Unfortunately robot David cannot read people's minds (an AI would
most likely be better at it, but I did not invest time in it).
And I also make some errors at this job because it involves clicking in
a web UI (not often, but it does happen).

So yes, I prefer people do this part of the contribution job.
https://doc.dpdk.org/guides/contributing/patches.html#steps-to-getting-your-patch-merged


-- 
David Marchand


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v4 13/13] eal: provide option to use compiler memcpy instead of RTE
  2024-06-20 17:57                   ` [PATCH v4 13/13] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
@ 2024-06-21 15:19                     ` Stephen Hemminger
  2024-06-24 10:05                     ` Thomas Monjalon
  1 sibling, 0 replies; 128+ messages in thread
From: Stephen Hemminger @ 2024-06-21 15:19 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: dev, hofors, Morten Brørup, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson

On Thu, 20 Jun 2024 19:57:31 +0200
Mattias Rönnblom <mattias.ronnblom@ericsson.com> wrote:

> Provide build option to have functions in <rte_memcpy.h> delegate to
> the standard compiler/libc memcpy(), instead of using the various
> custom DPDK, handcrafted, per-architecture rte_memcpy()
> implementations.
> 
> A new meson build option 'use_cc_memcpy' is added. By default,
> the compiler/libc memcpy() is used.
> 
> The performance benefits of the custom DPDK rte_memcpy()
> implementations have been diminishing with every compiler release, and
> with current toolchains the use of a custom memcpy() implementation
> may even be a liability.
> 
> This patch leaves an option to stay on the custom DPDK implementations,
> would that prove beneficial for certain applications or architectures.
> 
> An additional benefit of this change is that compilers and static
> analysis tools have an easier time detecting incorrect usage of
> rte_memcpy() (e.g., buffer overruns, or overlapping source and
> destination buffers).
> 
> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>

Would like to mark rte_memcpy as deprecated in a future release.

Acked-by: Stephen Hemminger <stephen@networkplumber.org>

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v4 13/13] eal: provide option to use compiler memcpy instead of RTE
  2024-06-20 17:57                   ` [PATCH v4 13/13] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
  2024-06-21 15:19                     ` Stephen Hemminger
@ 2024-06-24 10:05                     ` Thomas Monjalon
  2024-06-24 17:56                       ` Mattias Rönnblom
  2024-06-25 13:06                       ` Mattias Rönnblom
  1 sibling, 2 replies; 128+ messages in thread
From: Thomas Monjalon @ 2024-06-24 10:05 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: dev, hofors, Morten Brørup, Stephen Hemminger,
	Abdullah Sevincer, Pavan Nikhilesh, David Hunt,
	Vladimir Medvedkin, Bruce Richardson, anatoly.burakov,
	david.marchand, maxime.coquelin

20/06/2024 19:57, Mattias Rönnblom:
> Provide build option to have functions in <rte_memcpy.h> delegate to
> the standard compiler/libc memcpy(), instead of using the various
> custom DPDK, handcrafted, per-architecture rte_memcpy()
> implementations.
> 
> A new meson build option 'use_cc_memcpy' is added. By default,
> the compiler/libc memcpy() is used.
> 
> The performance benefits of the custom DPDK rte_memcpy()
> implementations have been diminishing with every compiler release, and
> with current toolchains the use of a custom memcpy() implementation
> may even be a liability.
> 
> This patch leaves an option to stay on the custom DPDK implementations,
> would that prove beneficial for certain applications or architectures.
[...]
> --- a/meson_options.txt
> +++ b/meson_options.txt
> +option('use_cc_memcpy', type: 'boolean', value: true, description:
> +       'Have the functions of <rte_memcpy.h> delegate to compiler/libc memcpy() instead of using custom implementation.')

I suppose you propose this change for 24.11?
I would prefer getting the option disabled in 24.07,
so we can run tests during months before enabling it by default.
This period would also help to make sure it is compiling in all cases.
Please could you enable the option in our compilation scripts?



^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v4 13/13] eal: provide option to use compiler memcpy instead of RTE
  2024-06-24 10:05                     ` Thomas Monjalon
@ 2024-06-24 17:56                       ` Mattias Rönnblom
  2024-06-25 13:06                       ` Mattias Rönnblom
  1 sibling, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-24 17:56 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: Mattias Rönnblom, dev, Morten Brørup,
	Stephen Hemminger, Abdullah Sevincer, Pavan Nikhilesh,
	David Hunt, Vladimir Medvedkin, Bruce Richardson,
	anatoly.burakov, david.marchand, maxime.coquelin

On Mon, Jun 24, 2024 at 12:05:53PM +0200, Thomas Monjalon wrote:
> 20/06/2024 19:57, Mattias Rönnblom:
> > Provide build option to have functions in <rte_memcpy.h> delegate to
> > the standard compiler/libc memcpy(), instead of using the various
> > custom DPDK, handcrafted, per-architecture rte_memcpy()
> > implementations.
> > 
> > A new meson build option 'use_cc_memcpy' is added. By default,
> > the compiler/libc memcpy() is used.
> > 
> > The performance benefits of the custom DPDK rte_memcpy()
> > implementations have been diminishing with every compiler release, and
> > with current toolchains the use of a custom memcpy() implementation
> > may even be a liability.
> > 
> > This patch leaves an option to stay on the custom DPDK implementations,
> > would that prove beneficial for certain applications or architectures.
> [...]
> > --- a/meson_options.txt
> > +++ b/meson_options.txt
> > +option('use_cc_memcpy', type: 'boolean', value: true, description:
> > +       'Have the functions of <rte_memcpy.h> delegate to compiler/libc memcpy() instead of using custom implementation.')
> 
> I suppose you propose this change for 24.11?
> I would prefer getting the option disabled in 24.07,
> so we can run tests during months before enabling it by default.

I think I would suggest having it included and enabled by default in
24.07, but that is too risky perhaps.

Having it included in 24.07 and disabled by default would be the
conservative option.

> This period would also help to make sure it is compiling in all cases.
> Please could you enable the option in our compilation scripts?
> 
>

I'll give it a try.

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v4 13/13] eal: provide option to use compiler memcpy instead of RTE
  2024-06-24 10:05                     ` Thomas Monjalon
  2024-06-24 17:56                       ` Mattias Rönnblom
@ 2024-06-25 13:06                       ` Mattias Rönnblom
  2024-06-25 13:34                         ` Thomas Monjalon
  1 sibling, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-25 13:06 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: Mattias Rönnblom, dev, Morten Brørup,
	Stephen Hemminger, Abdullah Sevincer, Pavan Nikhilesh,
	David Hunt, Vladimir Medvedkin, Bruce Richardson,
	anatoly.burakov, david.marchand, maxime.coquelin

On Mon, Jun 24, 2024 at 12:05:53PM +0200, Thomas Monjalon wrote:

<snip>

> Please could you enable the option in our compilation scripts?
>

"Compilation scripts" is test-meson-builds.sh?
Should it be done in the same way as is being done for stdatomic?
Wouldn't it be enough if this is done in CI only?

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v4 13/13] eal: provide option to use compiler memcpy instead of RTE
  2024-06-25 13:06                       ` Mattias Rönnblom
@ 2024-06-25 13:34                         ` Thomas Monjalon
  0 siblings, 0 replies; 128+ messages in thread
From: Thomas Monjalon @ 2024-06-25 13:34 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: Mattias Rönnblom, dev, Morten Brørup,
	Stephen Hemminger, Abdullah Sevincer, Pavan Nikhilesh,
	David Hunt, Vladimir Medvedkin, Bruce Richardson,
	anatoly.burakov, david.marchand, maxime.coquelin

25/06/2024 15:06, Mattias Rönnblom:
> On Mon, Jun 24, 2024 at 12:05:53PM +0200, Thomas Monjalon wrote:
> 
> <snip>
> 
> > Please could you enable the option in our compilation scripts?
> >
> 
> "Compilation scripts" is test-meson-builds.sh?
> Should it be done in the same as is being done for stdatomic?
> Wouldn't it be enough if this is done in CI only?

Should be the same as for stdatomic.
For stdatomic, there are changes in these 3 files:
	- devtools/test-meson-builds.sh
	- .ci/linux-build.sh
	- .github/workflows/build.yml




^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy
  2024-06-20 17:57                 ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                     ` (14 preceding siblings ...)
  2024-06-21  6:56                   ` Mattias Rönnblom
@ 2024-06-25 15:29                   ` Maxime Coquelin
  2024-06-25 15:44                     ` Stephen Hemminger
  2024-06-25 19:27                     ` Mattias Rönnblom
  15 siblings, 2 replies; 128+ messages in thread
From: Maxime Coquelin @ 2024-06-25 15:29 UTC (permalink / raw)
  To: Mattias Rönnblom, dev
  Cc: hofors, Morten Brørup, Stephen Hemminger, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson

Hi Mattias,

On 6/20/24 19:57, Mattias Rönnblom wrote:
> This patch set make DPDK library, driver, and application code use the
> compiler/libc memcpy() by default when functions in <rte_memcpy.h> are
> invoked.
> 
> The various custom DPDK rte_memcpy() implementations may be retained
> by means of a build-time option.
> 
> This patch set only make a difference on x86, PPC and ARM. Loongarch
> and RISCV already used compiler/libc memcpy().

It indeed makes a difference on x86!

Just tested latest main with and without your series on
Intel(R) Xeon(R) Gold 6438N.

The test is a simple IO loop between a Vhost PMD and a Virtio-user PMD:
# dpdk-testpmd -l 4-6   --file-prefix=virtio1 --no-pci --vdev 
'net_virtio_user0,mac=00:01:02:03:04:05,path=./vhost-net,server=1,mrg_rxbuf=1,in_order=1' 
   --single-file-segments -- -i
testpmd> start

# dpdk-testpmd -l 8-10   --file-prefix=vhost1 --no-pci --vdev 
'net_vhost0,iface=vhost-net,client=1'   --single-file-segments -- -i
testpmd> start tx_first 32

Latest main: 14.5Mpps
Latest main + this series: 10Mpps

So for me, it should be disabled by default.

Regards,
Maxime

> This patch set includes a number of fixes in drivers and libraries
> which errornously relied on <rte_memcpy.h> including header files
> (i.e., <rte_vect.h>) required by its implementation.
> 
> Mattias Rönnblom (13):
>    net/i40e: add missing vector API header include
>    net/iavf: add missing vector API header include
>    net/ice: add missing vector API header include
>    net/ixgbe: add missing vector API header include
>    net/ngbe: add missing vector API header include
>    net/txgbe: add missing vector API header include
>    net/virtio: add missing vector API header include
>    net/fm10k: add missing vector API header include
>    event/dlb2: include headers for vector and memory copy APIs
>    net/octeon_ep: add missing vector API header include
>    distributor: add missing vector API header include
>    fib: add missing vector API header include
>    eal: provide option to use compiler memcpy instead of RTE
> 
>   config/meson.build                          |  1 +
>   doc/guides/rel_notes/release_24_07.rst      | 21 +++++++
>   drivers/event/dlb2/dlb2.c                   |  2 +
>   drivers/net/fm10k/fm10k_rxtx_vec.c          |  3 +-
>   drivers/net/i40e/i40e_rxtx_vec_sse.c        |  3 +-
>   drivers/net/iavf/iavf_rxtx_vec_sse.c        |  3 +-
>   drivers/net/ice/ice_rxtx_vec_sse.c          |  2 +-
>   drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c      |  3 +-
>   drivers/net/ngbe/ngbe_rxtx_vec_sse.c        |  3 +-
>   drivers/net/octeon_ep/otx_ep_ethdev.c       |  2 +
>   drivers/net/txgbe/txgbe_rxtx_vec_sse.c      |  3 +-
>   drivers/net/virtio/virtio_rxtx_simple_sse.c |  3 +-
>   lib/distributor/rte_distributor.c           |  1 +
>   lib/eal/arm/include/rte_memcpy.h            | 10 ++++
>   lib/eal/include/generic/rte_memcpy.h        | 61 ++++++++++++++++++---
>   lib/eal/loongarch/include/rte_memcpy.h      | 53 ++----------------
>   lib/eal/ppc/include/rte_memcpy.h            | 10 ++++
>   lib/eal/riscv/include/rte_memcpy.h          | 53 ++----------------
>   lib/eal/x86/include/meson.build             |  1 +
>   lib/eal/x86/include/rte_memcpy.h            | 11 +++-
>   lib/fib/trie.c                              |  1 +
>   meson_options.txt                           |  2 +
>   22 files changed, 131 insertions(+), 121 deletions(-)
> 


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy
  2024-06-25 15:29                   ` Maxime Coquelin
@ 2024-06-25 15:44                     ` Stephen Hemminger
  2024-06-25 19:27                     ` Mattias Rönnblom
  1 sibling, 0 replies; 128+ messages in thread
From: Stephen Hemminger @ 2024-06-25 15:44 UTC (permalink / raw)
  To: Maxime Coquelin
  Cc: Mattias Rönnblom, dev, hofors, Morten Brørup,
	Abdullah Sevincer, Pavan Nikhilesh, David Hunt,
	Vladimir Medvedkin, Bruce Richardson

On Tue, 25 Jun 2024 17:29:35 +0200
Maxime Coquelin <maxime.coquelin@redhat.com> wrote:

> Hi Mattias,
> 
> On 6/20/24 19:57, Mattias Rönnblom wrote:
> > This patch set make DPDK library, driver, and application code use the
> > compiler/libc memcpy() by default when functions in <rte_memcpy.h> are
> > invoked.
> > 
> > The various custom DPDK rte_memcpy() implementations may be retained
> > by means of a build-time option.
> > 
> > This patch set only make a difference on x86, PPC and ARM. Loongarch
> > and RISCV already used compiler/libc memcpy().  
> 
> It indeed makes a difference on x86!
> 
> Just tested latest main with and without your series on
> Intel(R) Xeon(R) Gold 6438N.
> 
> The test is a simple IO loop between a Vhost PMD and a Virtio-user PMD:
> # dpdk-testpmd -l 4-6   --file-prefix=virtio1 --no-pci --vdev 
> 'net_virtio_user0,mac=00:01:02:03:04:05,path=./vhost-net,server=1,mrg_rxbuf=1,in_order=1' 
>    --single-file-segments -- -i
> testpmd> start  
> 
> # dpdk-testpmd -l 8-10   --file-prefix=vhost1 --no-pci --vdev 
> 'net_vhost0,iface=vhost-net,client=1'   --single-file-segments -- -i
> testpmd> start tx_first 32  
> 
> Latest main: 14.5Mpps
> Latest main + this series: 10Mpps
> 
> So for me, it should be disabled by default.
> 
> Regards,
> Maxime

What is the size of the copy being done?
Which compiler version?

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy
  2024-06-25 15:29                   ` Maxime Coquelin
  2024-06-25 15:44                     ` Stephen Hemminger
@ 2024-06-25 19:27                     ` Mattias Rönnblom
  2024-06-26  8:37                       ` Maxime Coquelin
  1 sibling, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-25 19:27 UTC (permalink / raw)
  To: Maxime Coquelin
  Cc: Mattias Rönnblom, dev, Morten Brørup,
	Stephen Hemminger, Abdullah Sevincer, Pavan Nikhilesh,
	David Hunt, Vladimir Medvedkin, Bruce Richardson

On Tue, Jun 25, 2024 at 05:29:35PM +0200, Maxime Coquelin wrote:
> Hi Mattias,
> 
> On 6/20/24 19:57, Mattias Rönnblom wrote:
> > This patch set make DPDK library, driver, and application code use the
> > compiler/libc memcpy() by default when functions in <rte_memcpy.h> are
> > invoked.
> > 
> > The various custom DPDK rte_memcpy() implementations may be retained
> > by means of a build-time option.
> > 
> > This patch set only make a difference on x86, PPC and ARM. Loongarch
> > and RISCV already used compiler/libc memcpy().
> 
> It indeed makes a difference on x86!
>
> Just tested latest main with and without your series on
> Intel(R) Xeon(R) Gold 6438N.
> 
> The test is a simple IO loop between a Vhost PMD and a Virtio-user PMD:
> # dpdk-testpmd -l 4-6   --file-prefix=virtio1 --no-pci --vdev 'net_virtio_user0,mac=00:01:02:03:04:05,path=./vhost-net,server=1,mrg_rxbuf=1,in_order=1'
> --single-file-segments -- -i
> testpmd> start
> 
> # dpdk-testpmd -l 8-10   --file-prefix=vhost1 --no-pci --vdev
> 'net_vhost0,iface=vhost-net,client=1'   --single-file-segments -- -i
> testpmd> start tx_first 32
> 
> Latest main: 14.5Mpps
> Latest main + this series: 10Mpps
>

I ran the above benchmark on my Raptor Lake desktop (locked to 3.2
GHz). GCC 12.3.0.

Core use_cc_memcpy Mpps
E    false         9.5
E    true          9.7
P    false         16.4
P    true          13.5

On the P-cores, there's a significant performance regression, although
not as bad as the one you see on your Sapphire Rapids Xeon. On the
E-cores, there's actually a slight performance gain.

The virtio PMD does not directly invoke rte_memcpy() or anything else
from <rte_memcpy.h>, but rather uses memcpy(), so I'm not sure I
understand what's going on here. Does the virtio driver delegate some
performance-critical task to some module that in turn uses
rte_memcpy()?

> So for me, it should be disabled by default.
> 
> Regards,
> Maxime
> 
> > This patch set includes a number of fixes in drivers and libraries
> > which errornously relied on <rte_memcpy.h> including header files
> > (i.e., <rte_vect.h>) required by its implementation.
> > 
> > Mattias Rönnblom (13):
> >    net/i40e: add missing vector API header include
> >    net/iavf: add missing vector API header include
> >    net/ice: add missing vector API header include
> >    net/ixgbe: add missing vector API header include
> >    net/ngbe: add missing vector API header include
> >    net/txgbe: add missing vector API header include
> >    net/virtio: add missing vector API header include
> >    net/fm10k: add missing vector API header include
> >    event/dlb2: include headers for vector and memory copy APIs
> >    net/octeon_ep: add missing vector API header include
> >    distributor: add missing vector API header include
> >    fib: add missing vector API header include
> >    eal: provide option to use compiler memcpy instead of RTE
> > 
> >   config/meson.build                          |  1 +
> >   doc/guides/rel_notes/release_24_07.rst      | 21 +++++++
> >   drivers/event/dlb2/dlb2.c                   |  2 +
> >   drivers/net/fm10k/fm10k_rxtx_vec.c          |  3 +-
> >   drivers/net/i40e/i40e_rxtx_vec_sse.c        |  3 +-
> >   drivers/net/iavf/iavf_rxtx_vec_sse.c        |  3 +-
> >   drivers/net/ice/ice_rxtx_vec_sse.c          |  2 +-
> >   drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c      |  3 +-
> >   drivers/net/ngbe/ngbe_rxtx_vec_sse.c        |  3 +-
> >   drivers/net/octeon_ep/otx_ep_ethdev.c       |  2 +
> >   drivers/net/txgbe/txgbe_rxtx_vec_sse.c      |  3 +-
> >   drivers/net/virtio/virtio_rxtx_simple_sse.c |  3 +-
> >   lib/distributor/rte_distributor.c           |  1 +
> >   lib/eal/arm/include/rte_memcpy.h            | 10 ++++
> >   lib/eal/include/generic/rte_memcpy.h        | 61 ++++++++++++++++++---
> >   lib/eal/loongarch/include/rte_memcpy.h      | 53 ++----------------
> >   lib/eal/ppc/include/rte_memcpy.h            | 10 ++++
> >   lib/eal/riscv/include/rte_memcpy.h          | 53 ++----------------
> >   lib/eal/x86/include/meson.build             |  1 +
> >   lib/eal/x86/include/rte_memcpy.h            | 11 +++-
> >   lib/fib/trie.c                              |  1 +
> >   meson_options.txt                           |  2 +
> >   22 files changed, 131 insertions(+), 121 deletions(-)
> > 
> 

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy
  2024-06-25 19:27                     ` Mattias Rönnblom
@ 2024-06-26  8:37                       ` Maxime Coquelin
  2024-06-26 14:58                         ` Stephen Hemminger
  0 siblings, 1 reply; 128+ messages in thread
From: Maxime Coquelin @ 2024-06-26  8:37 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: Mattias Rönnblom, dev, Morten Brørup,
	Stephen Hemminger, Abdullah Sevincer, Pavan Nikhilesh,
	David Hunt, Vladimir Medvedkin, Bruce Richardson



On 6/25/24 21:27, Mattias Rönnblom wrote:
> On Tue, Jun 25, 2024 at 05:29:35PM +0200, Maxime Coquelin wrote:
>> Hi Mattias,
>>
>> On 6/20/24 19:57, Mattias Rönnblom wrote:
>>> This patch set make DPDK library, driver, and application code use the
>>> compiler/libc memcpy() by default when functions in <rte_memcpy.h> are
>>> invoked.
>>>
>>> The various custom DPDK rte_memcpy() implementations may be retained
>>> by means of a build-time option.
>>>
>>> This patch set only make a difference on x86, PPC and ARM. Loongarch
>>> and RISCV already used compiler/libc memcpy().
>>
>> It indeed makes a difference on x86!
>>
>> Just tested latest main with and without your series on
>> Intel(R) Xeon(R) Gold 6438N.
>>
>> The test is a simple IO loop between a Vhost PMD and a Virtio-user PMD:
>> # dpdk-testpmd -l 4-6   --file-prefix=virtio1 --no-pci --vdev 'net_virtio_user0,mac=00:01:02:03:04:05,path=./vhost-net,server=1,mrg_rxbuf=1,in_order=1'
>> --single-file-segments -- -i
>> testpmd> start
>>
>> # dpdk-testpmd -l 8-10   --file-prefix=vhost1 --no-pci --vdev
>> 'net_vhost0,iface=vhost-net,client=1'   --single-file-segments -- -i
>> testpmd> start tx_first 32
>>
>> Latest main: 14.5Mpps
>> Latest main + this series: 10Mpps
>>
> 
> I ran the above benchmark on my Raptor Lake desktop (locked to 3,2
> GHz). GCC 12.3.0.
> 
> Core use_cc_memcpy Mpps
> E    false         9.5
> E    true          9.7
> P    false         16.4
> P    true          13.5
> 
> On the P-cores, there's a significant performance regression, although
> not as bad as the one you see on your Sapphire Rapids Xeon. On the
> E-cores, there's actually a slight performance gain.
> 
> The virtio PMD does not directly invoke rte_memcpy() or anything else
> from <rte_memcpy.h>, but rather use memcpy(), so I'm not sure I
> understand what's going on here. Does the virtio driver delegate some
> performance-critical task to some module that in turns uses
> rte_memcpy()?

This is because Vhost is the bottleneck here, not the Virtio driver.
Indeed, the virtqueues' memory belongs to the Virtio driver and the
descriptor buffers are Virtio's mbufs, so not many memcpy() calls are
done there.

Vhost, however, is a heavy memcpy user, as all the descriptor buffers
are copied to/from its mbufs.

>> So for me, it should be disabled by default.
>>
>> Regards,
>> Maxime
>>
>>> This patch set includes a number of fixes in drivers and libraries
>>> which errornously relied on <rte_memcpy.h> including header files
>>> (i.e., <rte_vect.h>) required by its implementation.
>>>
>>> Mattias Rönnblom (13):
>>>     net/i40e: add missing vector API header include
>>>     net/iavf: add missing vector API header include
>>>     net/ice: add missing vector API header include
>>>     net/ixgbe: add missing vector API header include
>>>     net/ngbe: add missing vector API header include
>>>     net/txgbe: add missing vector API header include
>>>     net/virtio: add missing vector API header include
>>>     net/fm10k: add missing vector API header include
>>>     event/dlb2: include headers for vector and memory copy APIs
>>>     net/octeon_ep: add missing vector API header include
>>>     distributor: add missing vector API header include
>>>     fib: add missing vector API header include
>>>     eal: provide option to use compiler memcpy instead of RTE
>>>
>>>    config/meson.build                          |  1 +
>>>    doc/guides/rel_notes/release_24_07.rst      | 21 +++++++
>>>    drivers/event/dlb2/dlb2.c                   |  2 +
>>>    drivers/net/fm10k/fm10k_rxtx_vec.c          |  3 +-
>>>    drivers/net/i40e/i40e_rxtx_vec_sse.c        |  3 +-
>>>    drivers/net/iavf/iavf_rxtx_vec_sse.c        |  3 +-
>>>    drivers/net/ice/ice_rxtx_vec_sse.c          |  2 +-
>>>    drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c      |  3 +-
>>>    drivers/net/ngbe/ngbe_rxtx_vec_sse.c        |  3 +-
>>>    drivers/net/octeon_ep/otx_ep_ethdev.c       |  2 +
>>>    drivers/net/txgbe/txgbe_rxtx_vec_sse.c      |  3 +-
>>>    drivers/net/virtio/virtio_rxtx_simple_sse.c |  3 +-
>>>    lib/distributor/rte_distributor.c           |  1 +
>>>    lib/eal/arm/include/rte_memcpy.h            | 10 ++++
>>>    lib/eal/include/generic/rte_memcpy.h        | 61 ++++++++++++++++++---
>>>    lib/eal/loongarch/include/rte_memcpy.h      | 53 ++----------------
>>>    lib/eal/ppc/include/rte_memcpy.h            | 10 ++++
>>>    lib/eal/riscv/include/rte_memcpy.h          | 53 ++----------------
>>>    lib/eal/x86/include/meson.build             |  1 +
>>>    lib/eal/x86/include/rte_memcpy.h            | 11 +++-
>>>    lib/fib/trie.c                              |  1 +
>>>    meson_options.txt                           |  2 +
>>>    22 files changed, 131 insertions(+), 121 deletions(-)
>>>
>>
> 


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy
  2024-06-26  8:37                       ` Maxime Coquelin
@ 2024-06-26 14:58                         ` Stephen Hemminger
  2024-06-26 15:24                           ` Maxime Coquelin
  0 siblings, 1 reply; 128+ messages in thread
From: Stephen Hemminger @ 2024-06-26 14:58 UTC (permalink / raw)
  To: Maxime Coquelin
  Cc: Mattias Rönnblom, Mattias Rönnblom, dev,
	Morten Brørup, Abdullah Sevincer, Pavan Nikhilesh,
	David Hunt, Vladimir Medvedkin, Bruce Richardson

On Wed, 26 Jun 2024 10:37:31 +0200
Maxime Coquelin <maxime.coquelin@redhat.com> wrote:

> On 6/25/24 21:27, Mattias Rönnblom wrote:
> > On Tue, Jun 25, 2024 at 05:29:35PM +0200, Maxime Coquelin wrote:  
> >> Hi Mattias,
> >>
> >> On 6/20/24 19:57, Mattias Rönnblom wrote:  
> >>> This patch set make DPDK library, driver, and application code use the
> >>> compiler/libc memcpy() by default when functions in <rte_memcpy.h> are
> >>> invoked.
> >>>
> >>> The various custom DPDK rte_memcpy() implementations may be retained
> >>> by means of a build-time option.
> >>>
> >>> This patch set only make a difference on x86, PPC and ARM. Loongarch
> >>> and RISCV already used compiler/libc memcpy().  
> >>
> >> It indeed makes a difference on x86!
> >>
> >> Just tested latest main with and without your series on
> >> Intel(R) Xeon(R) Gold 6438N.
> >>
> >> The test is a simple IO loop between a Vhost PMD and a Virtio-user PMD:
> >> # dpdk-testpmd -l 4-6   --file-prefix=virtio1 --no-pci --vdev 'net_virtio_user0,mac=00:01:02:03:04:05,path=./vhost-net,server=1,mrg_rxbuf=1,in_order=1'
> >> --single-file-segments -- -i  
> >> testpmd> start  
> >>
> >> # dpdk-testpmd -l 8-10   --file-prefix=vhost1 --no-pci --vdev
> >> 'net_vhost0,iface=vhost-net,client=1'   --single-file-segments -- -i  
> >> testpmd> start tx_first 32  
> >>
> >> Latest main: 14.5Mpps
> >> Latest main + this series: 10Mpps
> >>  
> > 
> > I ran the above benchmark on my Raptor Lake desktop (locked to 3,2
> > GHz). GCC 12.3.0.
> > 
> > Core use_cc_memcpy Mpps
> > E    false         9.5
> > E    true          9.7
> > P    false         16.4
> > P    true          13.5
> > 
> > On the P-cores, there's a significant performance regression, although
> > not as bad as the one you see on your Sapphire Rapids Xeon. On the
> > E-cores, there's actually a slight performance gain.
> > 
> > The virtio PMD does not directly invoke rte_memcpy() or anything else
> > from <rte_memcpy.h>, but rather use memcpy(), so I'm not sure I
> > understand what's going on here. Does the virtio driver delegate some
> > performance-critical task to some module that in turns uses
> > rte_memcpy()?  
> 
> This is because Vhost is the bottleneck here, not Virtio driver.
> Indeed, the virtqueues memory belongs to the Virtio driver and the
> descriptors buffers are Virtio's mbufs, so not much memcpy's are done
> there.
> 
> Vhost however, is a heavy memcpy user, as all the descriptors buffers 
> are copied to/from its mbufs.

Would be good to know the size (if small, it is inlining that matters, or
maybe alignment matters), and have test results for multiple compiler versions.
Ideally, feed results back and update GCC and Clang.

DPDK doesn't need to be in the optimized C library space.

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy
  2024-06-26 14:58                         ` Stephen Hemminger
@ 2024-06-26 15:24                           ` Maxime Coquelin
  2024-06-26 18:47                             ` Mattias Rönnblom
  0 siblings, 1 reply; 128+ messages in thread
From: Maxime Coquelin @ 2024-06-26 15:24 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Mattias Rönnblom, Mattias Rönnblom, dev,
	Morten Brørup, Abdullah Sevincer, Pavan Nikhilesh,
	David Hunt, Vladimir Medvedkin, Bruce Richardson



On 6/26/24 16:58, Stephen Hemminger wrote:
> On Wed, 26 Jun 2024 10:37:31 +0200
> Maxime Coquelin <maxime.coquelin@redhat.com> wrote:
> 
>> On 6/25/24 21:27, Mattias Rönnblom wrote:
>>> On Tue, Jun 25, 2024 at 05:29:35PM +0200, Maxime Coquelin wrote:
>>>> Hi Mattias,
>>>>
>>>> On 6/20/24 19:57, Mattias Rönnblom wrote:
>>>>> This patch set make DPDK library, driver, and application code use the
>>>>> compiler/libc memcpy() by default when functions in <rte_memcpy.h> are
>>>>> invoked.
>>>>>
>>>>> The various custom DPDK rte_memcpy() implementations may be retained
>>>>> by means of a build-time option.
>>>>>
>>>>> This patch set only make a difference on x86, PPC and ARM. Loongarch
>>>>> and RISCV already used compiler/libc memcpy().
>>>>
>>>> It indeed makes a difference on x86!
>>>>
>>>> Just tested latest main with and without your series on
>>>> Intel(R) Xeon(R) Gold 6438N.
>>>>
>>>> The test is a simple IO loop between a Vhost PMD and a Virtio-user PMD:
>>>> # dpdk-testpmd -l 4-6   --file-prefix=virtio1 --no-pci --vdev 'net_virtio_user0,mac=00:01:02:03:04:05,path=./vhost-net,server=1,mrg_rxbuf=1,in_order=1'
>>>> --single-file-segments -- -i
>>>> testpmd> start
>>>>
>>>> # dpdk-testpmd -l 8-10   --file-prefix=vhost1 --no-pci --vdev
>>>> 'net_vhost0,iface=vhost-net,client=1'   --single-file-segments -- -i
>>>> testpmd> start tx_first 32
>>>>
>>>> Latest main: 14.5Mpps
>>>> Latest main + this series: 10Mpps
>>>>   
>>>
>>> I ran the above benchmark on my Raptor Lake desktop (locked to 3,2
>>> GHz). GCC 12.3.0.
>>>
>>> Core use_cc_memcpy Mpps
>>> E    false         9.5
>>> E    true          9.7
>>> P    false         16.4
>>> P    true          13.5
>>>
>>> On the P-cores, there's a significant performance regression, although
>>> not as bad as the one you see on your Sapphire Rapids Xeon. On the
>>> E-cores, there's actually a slight performance gain.
>>>
>>> The virtio PMD does not directly invoke rte_memcpy() or anything else
>>> from <rte_memcpy.h>, but rather use memcpy(), so I'm not sure I
>>> understand what's going on here. Does the virtio driver delegate some
>>> performance-critical task to some module that in turns uses
>>> rte_memcpy()?
>>
>> This is because Vhost is the bottleneck here, not Virtio driver.
>> Indeed, the virtqueues memory belongs to the Virtio driver and the
>> descriptors buffers are Virtio's mbufs, so not much memcpy's are done
>> there.
>>
>> Vhost however, is a heavy memcpy user, as all the descriptors buffers
>> are copied to/from its mbufs.
> 
> Would be good to now the size (if small it is inlining that matters, or
> maybe alignment matters), and have test results for multiple compiler versions.
> Ideally, feed results back and update Gcc and Clang.

I was testing with GCC 11 on RHEL-9:
gcc (GCC) 11.4.1 20231218 (Red Hat 11.4.1-3)

I was using the default packet size, 64B.

I don't have time to perform these tests, but if you are willing to do
it I'll be happy to review the results.

> DPDK doesn't need to be in the optimize C library space.

Certainly, but we already have an optimized version, so there is not
much to do on our side right now. Once the C library implementations are
on par, we should definitely use them by default.

Maxime


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy
  2024-06-26 15:24                           ` Maxime Coquelin
@ 2024-06-26 18:47                             ` Mattias Rönnblom
  2024-06-26 20:16                               ` Morten Brørup
  0 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-26 18:47 UTC (permalink / raw)
  To: Maxime Coquelin
  Cc: Stephen Hemminger, Mattias Rönnblom, dev,
	Morten Brørup, Abdullah Sevincer, Pavan Nikhilesh,
	David Hunt, Vladimir Medvedkin, Bruce Richardson

On Wed, Jun 26, 2024 at 05:24:04PM +0200, Maxime Coquelin wrote:
> 
> 
> On 6/26/24 16:58, Stephen Hemminger wrote:
> > On Wed, 26 Jun 2024 10:37:31 +0200
> > Maxime Coquelin <maxime.coquelin@redhat.com> wrote:
> > 
> > > On 6/25/24 21:27, Mattias Rönnblom wrote:
> > > > On Tue, Jun 25, 2024 at 05:29:35PM +0200, Maxime Coquelin wrote:
> > > > > Hi Mattias,
> > > > > 
> > > > > On 6/20/24 19:57, Mattias Rönnblom wrote:
> > > > > > This patch set make DPDK library, driver, and application code use the
> > > > > > compiler/libc memcpy() by default when functions in <rte_memcpy.h> are
> > > > > > invoked.
> > > > > > 
> > > > > > The various custom DPDK rte_memcpy() implementations may be retained
> > > > > > by means of a build-time option.
> > > > > > 
> > > > > > This patch set only make a difference on x86, PPC and ARM. Loongarch
> > > > > > and RISCV already used compiler/libc memcpy().
> > > > > 
> > > > > It indeed makes a difference on x86!
> > > > > 
> > > > > Just tested latest main with and without your series on
> > > > > Intel(R) Xeon(R) Gold 6438N.
> > > > > 
> > > > > The test is a simple IO loop between a Vhost PMD and a Virtio-user PMD:
> > > > > # dpdk-testpmd -l 4-6   --file-prefix=virtio1 --no-pci --vdev 'net_virtio_user0,mac=00:01:02:03:04:05,path=./vhost-net,server=1,mrg_rxbuf=1,in_order=1'
> > > > > --single-file-segments -- -i
> > > > > testpmd> start
> > > > > 
> > > > > # dpdk-testpmd -l 8-10   --file-prefix=vhost1 --no-pci --vdev
> > > > > 'net_vhost0,iface=vhost-net,client=1'   --single-file-segments -- -i
> > > > > testpmd> start tx_first 32
> > > > > 
> > > > > Latest main: 14.5Mpps
> > > > > Latest main + this series: 10Mpps
> > > > 
> > > > I ran the above benchmark on my Raptor Lake desktop (locked to 3,2
> > > > GHz). GCC 12.3.0.
> > > > 
> > > > Core use_cc_memcpy Mpps
> > > > E    false         9.5
> > > > E    true          9.7
> > > > P    false         16.4
> > > > P    true          13.5
> > > > 
> > > > On the P-cores, there's a significant performance regression, although
> > > > not as bad as the one you see on your Sapphire Rapids Xeon. On the
> > > > E-cores, there's actually a slight performance gain.
> > > > 
> > > > The virtio PMD does not directly invoke rte_memcpy() or anything else
> > > > from <rte_memcpy.h>, but rather use memcpy(), so I'm not sure I
> > > > understand what's going on here. Does the virtio driver delegate some
> > > > performance-critical task to some module that in turns uses
> > > > rte_memcpy()?
> > > 
> > > This is because Vhost is the bottleneck here, not Virtio driver.
> > > Indeed, the virtqueues memory belongs to the Virtio driver and the
> > > descriptors buffers are Virtio's mbufs, so not much memcpy's are done
> > > there.
> > > 
> > > Vhost however, is a heavy memcpy user, as all the descriptors buffers
> > > are copied to/from its mbufs.
> > 
> > Would be good to now the size (if small it is inlining that matters, or
> > maybe alignment matters), and have test results for multiple compiler versions.
> > Ideally, feed results back and update Gcc and Clang.
> 
> I was testing with GCC 11 on RHEL-9:
> gcc (GCC) 11.4.1 20231218 (Red Hat 11.4.1-3)
> 
> I was using the default one, 64B packets.
> 
> I don't have time to perform these tests, but if you are willing to do
> it I'll be happy to review the results.
> 
> > DPDK doesn't need to be in the optimize C library space.
> 
> Certainly, but we already have an optimized version currently, so not
> much to do now on our side. When C libraries implementations will be on
> par, we should definitely use them by default.
>

I think it's not so much about optimized versus non-optimized at this
point. It's just that cc/libc memcpy sometimes performs better than
RTE memcpy, and sometimes doesn't.

For virtio, a single memory copy in
lib/vhost/virtio_net.c:do_data_copy_enqueue()
is responsible for >95% of the performance regression introduced by
the cc memcpy patch for small packets on Intel P-cores.

I'm not so sure this performance regression will go away in newer
compilers. PGO would certainly help, but PGO is a hassle.

One way to fix this issue would be to introduce a custom,
memcpy()-based packet copying routine. I tried the below patch, with
the following results:

Raptor Lake @ 3,2 GHz
GCC 12

64 bytes packets
Core  Mode              Mpps
----------------------------
E     RTE memcpy        9.5
E     cc memcpy         9.7
E     cc memcpy+pktcpy  9.0

P     RTE memcpy        16.4
P     cc memcpy         13.5
P     cc memcpy+pktcpy  16.2

1500 bytes
Core  Mode              Mpps
----------------------------
P    RTE memcpy         5.8
P    cc memcpy          5.9
P    cc memcpy+pktcpy   5.9

As you can see, most of the regression is eliminated, at the cost of
worse E-core performance. I didn't look at the generated code, but one
could suspect heavy use of wide SIMD is to blame, which E-cores don't
necessarily benefit from.

The below prototype assumes the source and destination buffers are
16-byte aligned. Does that always hold?

I'm sure one could further improve performance using context-specific
information, such as packets always being >= 64 bytes. One could also
consider having special cases, maybe for 64 bytes and MTU-sized
packets. Such special cases are always a hassle when you try to
characterize performance, though.

diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 370402d849..7b595a6622 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -231,6 +231,26 @@ vhost_async_dma_check_completed(struct virtio_net *dev, int16_t dma_id, uint16_t
 	return nr_copies;
 }
 
+static inline void
+pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
+{
+	void *dst = __builtin_assume_aligned(in_dst, 16);
+	const void *src = __builtin_assume_aligned(in_src, 16);
+
+	if (len <= 256) {
+		size_t left;
+
+		for (left = len; left >= 32; left -= 32) {
+			memcpy(dst, src, 32);
+			dst = RTE_PTR_ADD(dst, 32);
+			src = RTE_PTR_ADD(src, 32);
+		}
+
+		memcpy(dst, src, left);
+	} else
+		memcpy(dst, src, len);
+}
+
 static inline void
 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
 	__rte_shared_locks_required(&vq->iotlb_lock)
@@ -240,7 +260,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
 	int i;
 
 	for (i = 0; i < count; i++) {
-		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
+		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
 		vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
 					   elem[i].len);
 		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
@@ -257,7 +277,7 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq)
 	int i;
 
 	for (i = 0; i < count; i++)
-		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
+		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
 
 	vq->batch_copy_nb_elems = 0;
 }



> Maxime
> 

^ permalink raw reply	[flat|nested] 128+ messages in thread

* RE: [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy
  2024-06-26 18:47                             ` Mattias Rönnblom
@ 2024-06-26 20:16                               ` Morten Brørup
  2024-06-27 11:06                                 ` Mattias Rönnblom
  0 siblings, 1 reply; 128+ messages in thread
From: Morten Brørup @ 2024-06-26 20:16 UTC (permalink / raw)
  To: Mattias Rönnblom, Maxime Coquelin
  Cc: Stephen Hemminger, Mattias Rönnblom, dev, Abdullah Sevincer,
	Pavan Nikhilesh, David Hunt, Vladimir Medvedkin,
	Bruce Richardson

> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> Sent: Wednesday, 26 June 2024 20.48
> 
> On Wed, Jun 26, 2024 at 05:24:04PM +0200, Maxime Coquelin wrote:
> >
> >
> > On 6/26/24 16:58, Stephen Hemminger wrote:
> > > On Wed, 26 Jun 2024 10:37:31 +0200
> > > Maxime Coquelin <maxime.coquelin@redhat.com> wrote:
> > >
> > > > On 6/25/24 21:27, Mattias Rönnblom wrote:
> > > > > On Tue, Jun 25, 2024 at 05:29:35PM +0200, Maxime Coquelin wrote:
> > > > > > Hi Mattias,
> > > > > >
> > > > > > On 6/20/24 19:57, Mattias Rönnblom wrote:
> > > > > > > This patch set make DPDK library, driver, and application
> code use the
> > > > > > > compiler/libc memcpy() by default when functions in
> <rte_memcpy.h> are
> > > > > > > invoked.
> > > > > > >
> > > > > > > The various custom DPDK rte_memcpy() implementations may be
> retained
> > > > > > > by means of a build-time option.
> > > > > > >
> > > > > > > This patch set only make a difference on x86, PPC and ARM.
> Loongarch
> > > > > > > and RISCV already used compiler/libc memcpy().
> > > > > >
> > > > > > It indeed makes a difference on x86!
> > > > > >
> > > > > > Just tested latest main with and without your series on
> > > > > > Intel(R) Xeon(R) Gold 6438N.
> > > > > >
> > > > > > The test is a simple IO loop between a Vhost PMD and a Virtio-
> user PMD:
> > > > > > # dpdk-testpmd -l 4-6   --file-prefix=virtio1 --no-pci --vdev
> 'net_virtio_user0,mac=00:01:02:03:04:05,path=./vhost-
> net,server=1,mrg_rxbuf=1,in_order=1'
> > > > > > --single-file-segments -- -i
> > > > > > testpmd> start
> > > > > >
> > > > > > # dpdk-testpmd -l 8-10   --file-prefix=vhost1 --no-pci --vdev
> > > > > > 'net_vhost0,iface=vhost-net,client=1'   --single-file-segments
> -- -i
> > > > > > testpmd> start tx_first 32
> > > > > >
> > > > > > Latest main: 14.5Mpps
> > > > > > Latest main + this series: 10Mpps
> > > > >
> > > > > I ran the above benchmark on my Raptor Lake desktop (locked to
> 3,2
> > > > > GHz). GCC 12.3.0.
> > > > >
> > > > > Core use_cc_memcpy Mpps
> > > > > E    false         9.5
> > > > > E    true          9.7
> > > > > P    false         16.4
> > > > > P    true          13.5
> > > > >
> > > > > On the P-cores, there's a significant performance regression,
> although
> > > > > not as bad as the one you see on your Sapphire Rapids Xeon. On
> the
> > > > > E-cores, there's actually a slight performance gain.
> > > > >
> > > > > The virtio PMD does not directly invoke rte_memcpy() or anything
> else
> > > > > from <rte_memcpy.h>, but rather use memcpy(), so I'm not sure I
> > > > > understand what's going on here. Does the virtio driver delegate
> some
> > > > > performance-critical task to some module that in turns uses
> > > > > rte_memcpy()?
> > > >
> > > > This is because Vhost is the bottleneck here, not Virtio driver.
> > > > Indeed, the virtqueues memory belongs to the Virtio driver and the
> > > > descriptors buffers are Virtio's mbufs, so not much memcpy's are
> done
> > > > there.
> > > >
> > > > Vhost however, is a heavy memcpy user, as all the descriptors
> buffers
> > > > are copied to/from its mbufs.
> > >
> > > Would be good to now the size (if small it is inlining that matters,
> or
> > > maybe alignment matters), and have test results for multiple
> compiler versions.
> > > Ideally, feed results back and update Gcc and Clang.
> >
> > I was testing with GCC 11 on RHEL-9:
> > gcc (GCC) 11.4.1 20231218 (Red Hat 11.4.1-3)
> >
> > I was using the default one, 64B packets.
> >
> > I don't have time to perform these tests, but if you are willing to do
> > it I'll be happy to review the results.
> >
> > > DPDK doesn't need to be in the optimize C library space.
> >
> > Certainly, but we already have an optimized version currently, so not
> > much to do now on our side. When C libraries implementations will be
> on
> > par, we should definitely use them by default.
> >
> 
> I think it's not so much about optimized versus non-optimized at this
> point. It's just that cc/libc memcpy sometimes performs better than
> RTE memcpy, and sometimes doesn't.
> 
> For virtio, a single memory copy in
> lib/vhost/virtio_net.c:do_data_copy_enqueue()
> is responsible for >95% of the performance regression introduced by
> the cc memcpy patch for small packets on Intel P-cores.
> 
> I'm not so sure this performance regression will go away in newer
> compilers. PGO would certainly help, but PGO is a hassle.
> 
> One way to fix this issue would be to introduce a custom,
> memcpy()-based packet copying routine. I tried the below patch, with
> the following results:
> 
> Raptor Lake @ 3,2 GHz
> GCC 12
> 
> 64 bytes packets
> Core  Mode              Mpps
> ----------------------------
> E     RTE memcpy        9.5
> E     cc memcpy         9.7
> E     cc memcpy+pktcpy  9.0
> 
> P     RTE memcpy        16.4
> P     cc memcpy         13.5
> P     cc memcpy+pktcpy  16.2
> 
> 1500 bytes
> Core  Mode              Mpps
> ----------------------------
> P    RTE memcpy         5.8
> P    cc memcpy          5.9
> P    cc memcpy+pktcpy   5.9
> 
> As you can see, most of the regression is eliminated, at the cost of
> worse E-core performance. I didn't look at the generated code, but one
> could suspect heavy use of wide SIMD is to blame, which E-cores don't
> necessarily benefit from.
> 
> The below prototype assumes the source and destination buffers are
> 16-byte aligned. Does that always hold?

Perhaps always for this specific function; I don't know.
Not *always* in general; I guess in many cases packet copies would have 64-byte aligned pointers, but not 64-byte aligned sizes.
Unless explicitly stated by the developer, it is unsafe to make assumptions about alignment.

A future rte_memcpy() function might take flags with explicit alignment information for optimized copying, as was part of my non-temporal memcpy(). (The development on this is still on hold.)

> 
> I'm sure one could further improve performance using context-specific
> information, such as packets always being >= 64 bytes. One could also
> consider having special cases, maybe for 64 bytes and MTU-sized
> packets. Such are always a hassle when you try to characterize
> performance though.

Absolutely!

This got me thinking:
These tests are run with 64 byte packets only.
Perhaps branch prediction pollutes the results, by optimizing the branches in the copy routine for the case where all packets are 64 bytes.

You really should be testing with IMIX or random packet sizes.

In my experience, most internet packets are large (> 1024 bytes, but not 1514 bytes due to QUIC's conservative max packet size), closely followed by 64-byte (excl. any VLAN tags) packets; only a minority of packets are medium-sized.

> 
> diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
> index 370402d849..7b595a6622 100644
> --- a/lib/vhost/virtio_net.c
> +++ b/lib/vhost/virtio_net.c
> @@ -231,6 +231,26 @@ vhost_async_dma_check_completed(struct virtio_net
> *dev, int16_t dma_id, uint16_t
>  	return nr_copies;
>  }
> 
> +static inline void
> +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
> +{
> +	void *dst = __builtin_assume_aligned(in_dst, 16);
> +	const void *src = __builtin_assume_aligned(in_src, 16);
> +
> +	if (len <= 256) {
> +		size_t left;
> +
> +		for (left = len; left >= 32; left -= 32) {
> +			memcpy(dst, src, 32);
> +			dst = RTE_PTR_ADD(dst, 32);
> +			src = RTE_PTR_ADD(src, 32);
> +		}
> +
> +		memcpy(dst, src, left);
> +	} else
> +		memcpy(dst, src, len);
> +}
> +
>  static inline void
>  do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue
> *vq)
>  	__rte_shared_locks_required(&vq->iotlb_lock)
> @@ -240,7 +260,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct
> vhost_virtqueue *vq)
>  	int i;
> 
>  	for (i = 0; i < count; i++) {
> -		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
> +		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
>  		vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
>  					   elem[i].len);
>  		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
> @@ -257,7 +277,7 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq)
>  	int i;
> 
>  	for (i = 0; i < count; i++)
> -		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
> +		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
> 
>  	vq->batch_copy_nb_elems = 0;
>  }
> 
> 
> 
> > Maxime
> >

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy
  2024-06-26 20:16                               ` Morten Brørup
@ 2024-06-27 11:06                                 ` Mattias Rönnblom
  2024-06-27 15:10                                   ` Stephen Hemminger
  0 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-27 11:06 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Maxime Coquelin, Stephen Hemminger, Mattias Rönnblom, dev,
	Abdullah Sevincer, Pavan Nikhilesh, David Hunt,
	Vladimir Medvedkin, Bruce Richardson

On Wed, Jun 26, 2024 at 10:16:06PM +0200, Morten Brørup wrote:
> > From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> > Sent: Wednesday, 26 June 2024 20.48
> > 
> > On Wed, Jun 26, 2024 at 05:24:04PM +0200, Maxime Coquelin wrote:
> > >
> > >
> > > On 6/26/24 16:58, Stephen Hemminger wrote:
> > > > On Wed, 26 Jun 2024 10:37:31 +0200
> > > > Maxime Coquelin <maxime.coquelin@redhat.com> wrote:
> > > >
> > > > > On 6/25/24 21:27, Mattias Rönnblom wrote:
> > > > > > On Tue, Jun 25, 2024 at 05:29:35PM +0200, Maxime Coquelin wrote:
> > > > > > > Hi Mattias,
> > > > > > >
> > > > > > > On 6/20/24 19:57, Mattias Rönnblom wrote:
> > > > > > > > This patch set make DPDK library, driver, and application
> > code use the
> > > > > > > > compiler/libc memcpy() by default when functions in
> > <rte_memcpy.h> are
> > > > > > > > invoked.
> > > > > > > >
> > > > > > > > The various custom DPDK rte_memcpy() implementations may be
> > retained
> > > > > > > > by means of a build-time option.
> > > > > > > >
> > > > > > > > This patch set only make a difference on x86, PPC and ARM.
> > Loongarch
> > > > > > > > and RISCV already used compiler/libc memcpy().
> > > > > > >
> > > > > > > It indeed makes a difference on x86!
> > > > > > >
> > > > > > > Just tested latest main with and without your series on
> > > > > > > Intel(R) Xeon(R) Gold 6438N.
> > > > > > >
> > > > > > > The test is a simple IO loop between a Vhost PMD and a Virtio-
> > user PMD:
> > > > > > > # dpdk-testpmd -l 4-6   --file-prefix=virtio1 --no-pci --vdev
> > 'net_virtio_user0,mac=00:01:02:03:04:05,path=./vhost-
> > net,server=1,mrg_rxbuf=1,in_order=1'
> > > > > > > --single-file-segments -- -i
> > > > > > > testpmd> start
> > > > > > >
> > > > > > > # dpdk-testpmd -l 8-10   --file-prefix=vhost1 --no-pci --vdev
> > > > > > > 'net_vhost0,iface=vhost-net,client=1'   --single-file-segments
> > -- -i
> > > > > > > testpmd> start tx_first 32
> > > > > > >
> > > > > > > Latest main: 14.5Mpps
> > > > > > > Latest main + this series: 10Mpps
> > > > > >
> > > > > > I ran the above benchmark on my Raptor Lake desktop (locked to
> > 3,2
> > > > > > GHz). GCC 12.3.0.
> > > > > >
> > > > > > Core use_cc_memcpy Mpps
> > > > > > E    false         9.5
> > > > > > E    true          9.7
> > > > > > P    false         16.4
> > > > > > P    true          13.5
> > > > > >
> > > > > > On the P-cores, there's a significant performance regression,
> > although
> > > > > > not as bad as the one you see on your Sapphire Rapids Xeon. On
> > the
> > > > > > E-cores, there's actually a slight performance gain.
> > > > > >
> > > > > > The virtio PMD does not directly invoke rte_memcpy() or anything
> > else
> > > > > > from <rte_memcpy.h>, but rather use memcpy(), so I'm not sure I
> > > > > > understand what's going on here. Does the virtio driver delegate
> > some
> > > > > > performance-critical task to some module that in turns uses
> > > > > > rte_memcpy()?
> > > > >
> > > > > This is because Vhost is the bottleneck here, not Virtio driver.
> > > > > Indeed, the virtqueues memory belongs to the Virtio driver and the
> > > > > descriptors buffers are Virtio's mbufs, so not much memcpy's are
> > done
> > > > > there.
> > > > >
> > > > > Vhost however, is a heavy memcpy user, as all the descriptors
> > buffers
> > > > > are copied to/from its mbufs.
> > > >
> > > > Would be good to now the size (if small it is inlining that matters,
> > or
> > > > maybe alignment matters), and have test results for multiple
> > compiler versions.
> > > > Ideally, feed results back and update Gcc and Clang.
> > >
> > > I was testing with GCC 11 on RHEL-9:
> > > gcc (GCC) 11.4.1 20231218 (Red Hat 11.4.1-3)
> > >
> > > I was using the default one, 64B packets.
> > >
> > > I don't have time to perform these tests, but if you are willing to do
> > > it I'll be happy to review the results.
> > >
> > > > DPDK doesn't need to be in the optimize C library space.
> > >
> > > Certainly, but we already have an optimized version currently, so not
> > > much to do now on our side. When C libraries implementations will be
> > on
> > > par, we should definitely use them by default.
> > >
> > 
> > I think it's not so much about optimized versus non-optimized at this
> > point. It's just that cc/libc memcpy sometimes performs better than
> > RTE memcpy, and sometimes doesn't.
> > 
> > For virtio, a single memory copy in
> > lib/vhost/virtio_net.c:do_data_copy_enqueue()
> > is responsible for >95% of the performance regression introduced by
> > the cc memcpy patch for small packets on Intel P-cores.
> > 
> > I'm not so sure this performance regression will go away in newer
> > compilers. PGO would certainly help, but PGO is a hassle.
> > 
> > One way to fix this issue would be to introduce a custom,
> > memcpy()-based packet copying routine. I tried the below patch, with
> > the following results:
> > 
> > Raptor Lake @ 3,2 GHz
> > GCC 12
> > 
> > 64 bytes packets
> > Core  Mode              Mpps
> > ----------------------------
> > E     RTE memcpy        9.5
> > E     cc memcpy         9.7
> > E     cc memcpy+pktcpy  9.0
> > 
> > P     RTE memcpy        16.4
> > P     cc memcpy         13.5
> > P     cc memcpy+pktcpy  16.2
> > 
> > 1500 bytes
> > Core  Mode              Mpps
> > ----------------------------
> > P    RTE memcpy         5.8
> > P    cc memcpy          5.9
> > P    cc memcpy+pktcpy   5.9
> > 
> > As you can see, most of the regression is eliminated, at the cost of
> > worse E-core performance. I didn't look at the generated code, but one
> > could suspect heavy use of wide SIMD is to blame, which E-cores don't
> > necessarily benefit from.
> > 
> > The below prototype assumes the source and destination buffers are
> > 16-byte aligned. Does that always hold?
> 
> Perhaps always for this specific function; I don't know.
> Not generally *always*, but I guess in many cases packet copies would have 64-byte aligned pointers, but not sizes.
> Unless explicitly stated by the developer, it is unsafe to make assumptions about alignment.
>

I meant always (for every packet) in DPDK virtio net.

> A future rte_memcpy() function might take flags with explicit alignment information for optimized copying, as was part of my non-temporal memcpy(). (The development on this is still on hold.)
>

There is already a mechanism to express alignment in GCC-compatible
compilers. See the below patch.

> > 
> > I'm sure one could further improve performance using context-specific
> > information, such as packets always being >= 64 bytes. One could also
> > consider having special cases, maybe for 64 bytes and MTU-sized
> > packets. Such are always a hassle when you try to characterize
> > performance though.
> 
> Absolutely!
> 
> This got me thinking:
> These tests are run with 64 byte packets only.
> Perhaps branch prediction pollutes the results, by optimizing branches in the copy routine for all packets being 64 byte.
> 
> You really should be testing with IMIX or random packet sizes.
>

Sure, and it should also be a real app, not testpmd, on top of virtio.

> In my experience, most internet packets are large (> 1024 byte, but not 1514 byte due to QUIC's conservative max packet size), closely followed by 64 byte (excl. any VLAN tags) packets; only the minority of packets are medium size.
> 
> > 
> > diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
> > index 370402d849..7b595a6622 100644
> > --- a/lib/vhost/virtio_net.c
> > +++ b/lib/vhost/virtio_net.c
> > @@ -231,6 +231,26 @@ vhost_async_dma_check_completed(struct virtio_net
> > *dev, int16_t dma_id, uint16_t
> >  	return nr_copies;
> >  }
> > 
> > +static inline void
> > +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
> > +{
> > +	void *dst = __builtin_assume_aligned(in_dst, 16);
> > +	const void *src = __builtin_assume_aligned(in_src, 16);
> > +
> > +	if (len <= 256) {
> > +		size_t left;
> > +
> > +		for (left = len; left >= 32; left -= 32) {
> > +			memcpy(dst, src, 32);
> > +			dst = RTE_PTR_ADD(dst, 32);
> > +			src = RTE_PTR_ADD(src, 32);
> > +		}
> > +
> > +		memcpy(dst, src, left);
> > +	} else
> > +		memcpy(dst, src, len);
> > +}
> > +
> >  static inline void
> >  do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue
> > *vq)
> >  	__rte_shared_locks_required(&vq->iotlb_lock)
> > @@ -240,7 +260,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct
> > vhost_virtqueue *vq)
> >  	int i;
> > 
> >  	for (i = 0; i < count; i++) {
> > -		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
> > +		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
> >  		vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
> >  					   elem[i].len);
> >  		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
> > @@ -257,7 +277,7 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq)
> >  	int i;
> > 
> >  	for (i = 0; i < count; i++)
> > -		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
> > +		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
> > 
> >  	vq->batch_copy_nb_elems = 0;
> >  }
> > 
> > 
> > 
> > > Maxime
> > >

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy
  2024-06-27 11:06                                 ` Mattias Rönnblom
@ 2024-06-27 15:10                                   ` Stephen Hemminger
  2024-06-27 15:23                                     ` Mattias Rönnblom
  0 siblings, 1 reply; 128+ messages in thread
From: Stephen Hemminger @ 2024-06-27 15:10 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: Morten Brørup, Maxime Coquelin, Mattias Rönnblom, dev,
	Abdullah Sevincer, Pavan Nikhilesh, David Hunt,
	Vladimir Medvedkin, Bruce Richardson

On Thu, 27 Jun 2024 13:06:22 +0200
Mattias Rönnblom <hofors@lysator.liu.se> wrote:

> On Wed, Jun 26, 2024 at 10:16:06PM +0200, Morten Brørup wrote:
> > > From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> > > Sent: Wednesday, 26 June 2024 20.48
> > > 
> > > On Wed, Jun 26, 2024 at 05:24:04PM +0200, Maxime Coquelin wrote:  
> > > >
> > > >
> > > > On 6/26/24 16:58, Stephen Hemminger wrote:  
> > > > > On Wed, 26 Jun 2024 10:37:31 +0200
> > > > > Maxime Coquelin <maxime.coquelin@redhat.com> wrote:
> > > > >  
> > > > > > On 6/25/24 21:27, Mattias Rönnblom wrote:  
> > > > > > > On Tue, Jun 25, 2024 at 05:29:35PM +0200, Maxime Coquelin wrote:  
> > > > > > > > Hi Mattias,
> > > > > > > >
> > > > > > > > On 6/20/24 19:57, Mattias Rönnblom wrote:  
> > > > > > > > > This patch set make DPDK library, driver, and application  
> > > code use the  
> > > > > > > > > compiler/libc memcpy() by default when functions in  
> > > <rte_memcpy.h> are  
> > > > > > > > > invoked.
> > > > > > > > >
> > > > > > > > > The various custom DPDK rte_memcpy() implementations may be  
> > > retained  
> > > > > > > > > by means of a build-time option.
> > > > > > > > >
> > > > > > > > > This patch set only make a difference on x86, PPC and ARM.  
> > > Loongarch  
> > > > > > > > > and RISCV already used compiler/libc memcpy().  
> > > > > > > >
> > > > > > > > It indeed makes a difference on x86!
> > > > > > > >
> > > > > > > > Just tested latest main with and without your series on
> > > > > > > > Intel(R) Xeon(R) Gold 6438N.
> > > > > > > >
> > > > > > > > The test is a simple IO loop between a Vhost PMD and a Virtio-  
> > > user PMD:  
> > > > > > > > # dpdk-testpmd -l 4-6   --file-prefix=virtio1 --no-pci --vdev  
> > > 'net_virtio_user0,mac=00:01:02:03:04:05,path=./vhost-
> > > net,server=1,mrg_rxbuf=1,in_order=1'  
> > > > > > > > --single-file-segments -- -i  
> > > > > > > > testpmd> start  
> > > > > > > >
> > > > > > > > # dpdk-testpmd -l 8-10   --file-prefix=vhost1 --no-pci --vdev
> > > > > > > > 'net_vhost0,iface=vhost-net,client=1'   --single-file-segments  
> > > -- -i  
> > > > > > > > testpmd> start tx_first 32  
> > > > > > > >
> > > > > > > > Latest main: 14.5Mpps
> > > > > > > > Latest main + this series: 10Mpps  
> > > > > > >
> > > > > > > I ran the above benchmark on my Raptor Lake desktop (locked to  
> > > 3,2  
> > > > > > > GHz). GCC 12.3.0.
> > > > > > >
> > > > > > > Core use_cc_memcpy Mpps
> > > > > > > E    false         9.5
> > > > > > > E    true          9.7
> > > > > > > P    false         16.4
> > > > > > > P    true          13.5
> > > > > > >
> > > > > > > On the P-cores, there's a significant performance regression,  
> > > although  
> > > > > > > not as bad as the one you see on your Sapphire Rapids Xeon. On  
> > > the  
> > > > > > > E-cores, there's actually a slight performance gain.
> > > > > > >
> > > > > > > The virtio PMD does not directly invoke rte_memcpy() or anything  
> > > else  
> > > > > > > from <rte_memcpy.h>, but rather use memcpy(), so I'm not sure I
> > > > > > > understand what's going on here. Does the virtio driver delegate  
> > > some  
> > > > > > > performance-critical task to some module that in turns uses
> > > > > > > rte_memcpy()?  
> > > > > >
> > > > > > This is because Vhost is the bottleneck here, not Virtio driver.
> > > > > > Indeed, the virtqueues memory belongs to the Virtio driver and the
> > > > > > descriptors buffers are Virtio's mbufs, so not much memcpy's are  
> > > done  
> > > > > > there.
> > > > > >
> > > > > > Vhost however, is a heavy memcpy user, as all the descriptors  
> > > buffers  
> > > > > > are copied to/from its mbufs.  
> > > > >
> > > > > Would be good to now the size (if small it is inlining that matters,  
> > > or  
> > > > > maybe alignment matters), and have test results for multiple  
> > > compiler versions.  
> > > > > Ideally, feed results back and update Gcc and Clang.  
> > > >
> > > > I was testing with GCC 11 on RHEL-9:
> > > > gcc (GCC) 11.4.1 20231218 (Red Hat 11.4.1-3)
> > > >
> > > > I was using the default one, 64B packets.
> > > >
> > > > I don't have time to perform these tests, but if you are willing to do
> > > > it I'll be happy to review the results.
> > > >  
> > > > > DPDK doesn't need to be in the optimize C library space.  
> > > >
> > > > Certainly, but we already have an optimized version currently, so not
> > > > much to do now on our side. When C libraries implementations will be  
> > > on  
> > > > par, we should definitely use them by default.
> > > >  
> > > 
> > > I think it's not so much about optimized versus non-optimized at this
> > > point. It's just that cc/libc memcpy sometimes performs better than
> > > RTE memcpy, and sometimes doesn't.
> > > 
> > > For virtio, a single memory copy in
> > > lib/vhost/virtio_net.c:do_data_copy_enqueue()
> > > is responsible for >95% of the performance regression introduced by
> > > the cc memcpy patch for small packets on Intel P-cores.
> > > 
> > > I'm not so sure this performance regression will go away in newer
> > > compilers. PGO would certainly help, but PGO is a hassle.
> > > 
> > > One way to fix this issue would be to introduce a custom,
> > > memcpy()-based packet copying routine. I tried the below patch, with
> > > the following results:
> > > 
> > > Raptor Lake @ 3,2 GHz
> > > GCC 12
> > > 
> > > 64 bytes packets
> > > Core  Mode              Mpps
> > > ----------------------------
> > > E     RTE memcpy        9.5
> > > E     cc memcpy         9.7
> > > E     cc memcpy+pktcpy  9.0
> > > 
> > > P     RTE memcpy        16.4
> > > P     cc memcpy         13.5
> > > P     cc memcpy+pktcpy  16.2
> > > 
> > > 1500 bytes
> > > Core  Mode              Mpps
> > > ----------------------------
> > > P    RTE memcpy         5.8
> > > P    cc memcpy          5.9
> > > P    cc memcpy+pktcpy   5.9
> > > 
> > > As you can see, most of the regression is eliminated, at the cost of
> > > worse E-core performance. I didn't look at the generated code, but one
> > > could suspect heavy use of wide SIMD is to blame, which E-cores don't
> > > necessarily benefit from.
> > > 
> > > The below prototype assumes the source and destination buffers are
> > > 16-byte aligned. Does that always hold?  
> > 
> > Perhaps always for this specific function; I don't know.
> > Not generally *always*, but I guess in many cases packet copies would have 64-byte aligned pointers, but not sizes.
> > Unless explicitly stated by the developer, it is unsafe to make assumptions about alignment.
> >  
> 
> I meant always (for every packet) in DPDK virtio net.
> 
> > A future rte_memcpy() function might take flags with explicit alignment information for optimized copying, as was part of my non-temporal memcpy(). (The development on this is still on hold.)
> >  
> 
> There is already a mechanism to express alignment in GCC-compatible
> compilers. See the below patch.
> 
> > > 
> > > I'm sure one could further improve performance using context-specific
> > > information, such as packets always being >= 64 bytes. One could also
> > > consider having special cases, maybe for 64 bytes and MTU-sized
> > > packets. Such are always a hassle when you try to characterize
> > > performance though.  
> > 
> > Absolutely!
> > 
> > This got me thinking:
> > These tests are run with 64 byte packets only.
> > Perhaps branch prediction pollutes the results, by optimizing branches in the copy routine for all packets being 64 byte.
> > 
> > You really should be testing with IMIX or random packet sizes.
> >  
> 
> Sure, and it should also be a real app, not testpmd, on top of virtio.
> 
> > In my experience, most internet packets are large (> 1024 byte, but not 1514 byte due to QUIC's conservative max packet size), closely followed by 64 byte (excl. any VLAN tags) packets; only the minority of packets are medium size.
> >   
> > > 
> > > diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
> > > index 370402d849..7b595a6622 100644
> > > --- a/lib/vhost/virtio_net.c
> > > +++ b/lib/vhost/virtio_net.c
> > > @@ -231,6 +231,26 @@ vhost_async_dma_check_completed(struct virtio_net
> > > *dev, int16_t dma_id, uint16_t
> > >  	return nr_copies;
> > >  }
> > > 
> > > +static inline void
> > > +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
> > > +{
> > > +	void *dst = __builtin_assume_aligned(in_dst, 16);
> > > +	const void *src = __builtin_assume_aligned(in_src, 16);
> > > +
> > > +	if (len <= 256) {
> > > +		size_t left;
> > > +
> > > +		for (left = len; left >= 32; left -= 32) {
> > > +			memcpy(dst, src, 32);
> > > +			dst = RTE_PTR_ADD(dst, 32);
> > > +			src = RTE_PTR_ADD(src, 32);
> > > +		}
> > > +
> > > +		memcpy(dst, src, left);
> > > +	} else
> > > +		memcpy(dst, src, len);
> > > +}
> > > +

Is alignment an optimization or requirement? I can think of cases in tunneling
where the packet was received into mbuf (aligned) but then the application prepends
a header making the packet unaligned before sending.

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy
  2024-06-27 15:10                                   ` Stephen Hemminger
@ 2024-06-27 15:23                                     ` Mattias Rönnblom
  0 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-06-27 15:23 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Morten Brørup, Maxime Coquelin, Mattias Rönnblom, dev,
	Abdullah Sevincer, Pavan Nikhilesh, David Hunt,
	Vladimir Medvedkin, Bruce Richardson

On Thu, Jun 27, 2024 at 08:10:19AM -0700, Stephen Hemminger wrote:
> On Thu, 27 Jun 2024 13:06:22 +0200
> Mattias Rönnblom <hofors@lysator.liu.se> wrote:
> 
> > On Wed, Jun 26, 2024 at 10:16:06PM +0200, Morten Brørup wrote:
> > > > From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> > > > Sent: Wednesday, 26 June 2024 20.48
> > > > 
> > > > On Wed, Jun 26, 2024 at 05:24:04PM +0200, Maxime Coquelin wrote:  
> > > > >
> > > > >
> > > > > On 6/26/24 16:58, Stephen Hemminger wrote:  
> > > > > > On Wed, 26 Jun 2024 10:37:31 +0200
> > > > > > Maxime Coquelin <maxime.coquelin@redhat.com> wrote:
> > > > > >  
> > > > > > > On 6/25/24 21:27, Mattias Rönnblom wrote:  
> > > > > > > > On Tue, Jun 25, 2024 at 05:29:35PM +0200, Maxime Coquelin wrote:  
> > > > > > > > > Hi Mattias,
> > > > > > > > >
> > > > > > > > > On 6/20/24 19:57, Mattias Rönnblom wrote:  
> > > > > > > > > > This patch set make DPDK library, driver, and application  
> > > > code use the  
> > > > > > > > > > compiler/libc memcpy() by default when functions in  
> > > > <rte_memcpy.h> are  
> > > > > > > > > > invoked.
> > > > > > > > > >
> > > > > > > > > > The various custom DPDK rte_memcpy() implementations may be  
> > > > retained  
> > > > > > > > > > by means of a build-time option.
> > > > > > > > > >
> > > > > > > > > > This patch set only make a difference on x86, PPC and ARM.  
> > > > Loongarch  
> > > > > > > > > > and RISCV already used compiler/libc memcpy().  
> > > > > > > > >
> > > > > > > > > It indeed makes a difference on x86!
> > > > > > > > >
> > > > > > > > > Just tested latest main with and without your series on
> > > > > > > > > Intel(R) Xeon(R) Gold 6438N.
> > > > > > > > >
> > > > > > > > > The test is a simple IO loop between a Vhost PMD and a Virtio-  
> > > > user PMD:  
> > > > > > > > > # dpdk-testpmd -l 4-6   --file-prefix=virtio1 --no-pci --vdev  
> > > > 'net_virtio_user0,mac=00:01:02:03:04:05,path=./vhost-
> > > > net,server=1,mrg_rxbuf=1,in_order=1'  
> > > > > > > > > --single-file-segments -- -i  
> > > > > > > > > testpmd> start  
> > > > > > > > >
> > > > > > > > > # dpdk-testpmd -l 8-10   --file-prefix=vhost1 --no-pci --vdev
> > > > > > > > > 'net_vhost0,iface=vhost-net,client=1'   --single-file-segments  
> > > > -- -i  
> > > > > > > > > testpmd> start tx_first 32  
> > > > > > > > >
> > > > > > > > > Latest main: 14.5Mpps
> > > > > > > > > Latest main + this series: 10Mpps  
> > > > > > > >
> > > > > > > > I ran the above benchmark on my Raptor Lake desktop (locked to  
> > > > 3,2  
> > > > > > > > GHz). GCC 12.3.0.
> > > > > > > >
> > > > > > > > Core use_cc_memcpy Mpps
> > > > > > > > E    false         9.5
> > > > > > > > E    true          9.7
> > > > > > > > P    false         16.4
> > > > > > > > P    true          13.5
> > > > > > > >
> > > > > > > > On the P-cores, there's a significant performance regression,  
> > > > although  
> > > > > > > > not as bad as the one you see on your Sapphire Rapids Xeon. On  
> > > > the  
> > > > > > > > E-cores, there's actually a slight performance gain.
> > > > > > > >
> > > > > > > > The virtio PMD does not directly invoke rte_memcpy() or anything  
> > > > else  
> > > > > > > > from <rte_memcpy.h>, but rather use memcpy(), so I'm not sure I
> > > > > > > > understand what's going on here. Does the virtio driver delegate  
> > > > some  
> > > > > > > > performance-critical task to some module that in turns uses
> > > > > > > > rte_memcpy()?  
> > > > > > >
> > > > > > > This is because Vhost is the bottleneck here, not Virtio driver.
> > > > > > > Indeed, the virtqueues memory belongs to the Virtio driver and the
> > > > > > > descriptors buffers are Virtio's mbufs, so not much memcpy's are  
> > > > done  
> > > > > > > there.
> > > > > > >
> > > > > > > Vhost however, is a heavy memcpy user, as all the descriptors  
> > > > buffers  
> > > > > > > are copied to/from its mbufs.  
> > > > > >
> > > > > > Would be good to now the size (if small it is inlining that matters,  
> > > > or  
> > > > > > maybe alignment matters), and have test results for multiple  
> > > > compiler versions.  
> > > > > > Ideally, feed results back and update Gcc and Clang.  
> > > > >
> > > > > I was testing with GCC 11 on RHEL-9:
> > > > > gcc (GCC) 11.4.1 20231218 (Red Hat 11.4.1-3)
> > > > >
> > > > > I was using the default one, 64B packets.
> > > > >
> > > > > I don't have time to perform these tests, but if you are willing to do
> > > > > it I'll be happy to review the results.
> > > > >  
> > > > > > DPDK doesn't need to be in the optimize C library space.  
> > > > >
> > > > > Certainly, but we already have an optimized version currently, so not
> > > > > much to do now on our side. When C libraries implementations will be  
> > > > on  
> > > > > par, we should definitely use them by default.
> > > > >  
> > > > 
> > > > I think it's not so much about optimized versus non-optimized at this
> > > > point. It's just that cc/libc memcpy sometimes performs better than
> > > > RTE memcpy, and sometimes doesn't.
> > > > 
> > > > For virtio, a single memory copy in
> > > > lib/vhost/virtio_net.c:do_data_copy_enqueue()
> > > > is responsible for >95% of the performance regression introduced by
> > > > the cc memcpy patch for small packets on Intel P-cores.
> > > > 
> > > > I'm not so sure this performance regression will go away in newer
> > > > compilers. PGO would certainly help, but PGO is a hassle.
> > > > 
> > > > One way to fix this issue would be to introduce a custom,
> > > > memcpy()-based packet copying routine. I tried the below patch, with
> > > > the following results:
> > > > 
> > > > Raptor Lake @ 3,2 GHz
> > > > GCC 12
> > > > 
> > > > 64 bytes packets
> > > > Core  Mode              Mpps
> > > > ----------------------------
> > > > E     RTE memcpy        9.5
> > > > E     cc memcpy         9.7
> > > > E     cc memcpy+pktcpy  9.0
> > > > 
> > > > P     RTE memcpy        16.4
> > > > P     cc memcpy         13.5
> > > > P     cc memcpy+pktcpy  16.2
> > > > 
> > > > 1500 bytes
> > > > Core  Mode              Mpps
> > > > ----------------------------
> > > > P    RTE memcpy         5.8
> > > > P    cc memcpy          5.9
> > > > P    cc memcpy+pktcpy   5.9
> > > > 
> > > > As you can see, most of the regression is eliminated, at the cost of
> > > > worse E-core performance. I didn't look at the generated code, but one
> > > > could suspect heavy use of wide SIMD is to blame, which E-cores don't
> > > > necessarily benefit from.
> > > > 
> > > > The below prototype assumes the source and destination buffers are
> > > > 16-byte aligned. Does that always hold?  
> > > 
> > > Perhaps always for this specific function; I don't know.
> > > Not generally *always*, but I guess in many cases packet copies would have 64-byte aligned pointers, but not sizes.
> > > Unless explicitly stated by the developer, it is unsafe to make assumptions about alignment.
> > >  
> > 
> > I meant always (for every packet) in DPDK virtio net.
> > 
> > > A future rte_memcpy() function might take flags with explicit alignment information for optimized copying, as was part of my non-temporal memcpy(). (The development on this is still on hold.)
> > >  
> > 
> > There is already a mechanism to express alignment in GCC-compatible
> > compilers. See the below patch.
> > 
> > > > 
> > > > I'm sure one could further improve performance using context-specific
> > > > information, such as packets always being >= 64 bytes. One could also
> > > > consider having special cases, maybe for 64 bytes and MTU-sized
> > > > packets. Such are always a hassle when you try to characterize
> > > > performance though.  
> > > 
> > > Absolutely!
> > > 
> > > This got me thinking:
> > > These tests are run with 64 byte packets only.
> > > Perhaps branch prediction pollutes the results, by optimizing branches in the copy routine for all packets being 64 byte.
> > > 
> > > You really should be testing with IMIX or random packet sizes.
> > >  
> > 
> > Sure, and it should also be a real app, not testpmd, on top of virtio.
> > 
> > > In my experience, most internet packets are large (> 1024 byte, but not 1514 byte due to QUIC's conservative max packet size), closely followed by 64 byte (excl. any VLAN tags) packets; only the minority of packets are medium size.
> > >   
> > > > 
> > > > diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
> > > > index 370402d849..7b595a6622 100644
> > > > --- a/lib/vhost/virtio_net.c
> > > > +++ b/lib/vhost/virtio_net.c
> > > > @@ -231,6 +231,26 @@ vhost_async_dma_check_completed(struct virtio_net
> > > > *dev, int16_t dma_id, uint16_t
> > > >  	return nr_copies;
> > > >  }
> > > > 
> > > > +static inline void
> > > > +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
> > > > +{
> > > > +	void *dst = __builtin_assume_aligned(in_dst, 16);
> > > > +	const void *src = __builtin_assume_aligned(in_src, 16);
> > > > +
> > > > +	if (len <= 256) {
> > > > +		size_t left;
> > > > +
> > > > +		for (left = len; left >= 32; left -= 32) {
> > > > +			memcpy(dst, src, 32);
> > > > +			dst = RTE_PTR_ADD(dst, 32);
> > > > +			src = RTE_PTR_ADD(src, 32);
> > > > +		}
> > > > +
> > > > +		memcpy(dst, src, left);
> > > > +	} else
> > > > +		memcpy(dst, src, len);
> > > > +}
> > > > +
> 
> Is alignment an optimization or requirement? I can think of cases in tunneling
> where the packet was received into mbuf (aligned) but then the application prepends
> a header making the packet unaligned before sending.

It is an optimization and it ends up requiring the packet (or should I say frame?) start address to be 16-byte aligned. You can avoid it by adding an alignment-check in the pktcpy() function (much like how rte_memcpy() does it), but that will come at a cost.

I'm guessing the contents of the frame don't matter, so if you add a header or two, it shouldn't make a difference.

^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v5 0/6] Optionally have rte_memcpy delegate to compiler memcpy
  2024-06-20 17:57                   ` [PATCH v4 01/13] net/i40e: add missing vector API header include Mattias Rönnblom
@ 2024-07-24  7:53                     ` Mattias Rönnblom
  2024-07-24  7:53                       ` [PATCH v5 1/6] net/octeon_ep: add missing vector API header include Mattias Rönnblom
                                         ` (5 more replies)
  0 siblings, 6 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-07-24  7:53 UTC (permalink / raw)
  To: dev
  Cc: Mattias Rönnblom, Morten Brørup, Stephen Hemminger,
	David Marchand, Pavan Nikhilesh, Bruce Richardson,
	Mattias Rönnblom

Note: These changes are meant for 24.11, not 24.07.

This patch set makes DPDK library, driver, and application code
optionally use the compiler/libc memcpy() when functions in
<rte_memcpy.h> are invoked.

The new compiler memcpy-based rte_memcpy() implementations may be
enabled by means of a build-time option.

This patch set only makes a difference on x86, PPC and ARM. Loongarch
and RISCV already used compiler/libc memcpy().

This patch set includes a number of fixes in drivers and libraries
which erroneously relied on <rte_memcpy.h> including header files
(i.e., <rte_vect.h>) required by its implementation.

Mattias Rönnblom (6):
  net/octeon_ep: add missing vector API header include
  distributor: add missing vector API header include
  fib: add missing vector API header include
  eal: provide option to use compiler memcpy instead of RTE
  ci: test compiler memcpy
  vhost: optimize memcpy routines when cc memcpy is used

 .ci/linux-build.sh                     |  5 +++
 .github/workflows/build.yml            |  7 +++
 config/meson.build                     |  1 +
 devtools/test-meson-builds.sh          |  4 +-
 doc/guides/rel_notes/release_24_07.rst | 21 +++++++++
 drivers/net/octeon_ep/otx_ep_ethdev.c  |  2 +
 lib/distributor/rte_distributor.c      |  1 +
 lib/eal/arm/include/rte_memcpy.h       |  9 ++++
 lib/eal/include/generic/rte_memcpy.h   | 61 +++++++++++++++++++++++---
 lib/eal/loongarch/include/rte_memcpy.h | 54 +----------------------
 lib/eal/ppc/include/rte_memcpy.h       |  9 ++++
 lib/eal/riscv/include/rte_memcpy.h     | 54 +----------------------
 lib/eal/x86/include/meson.build        |  1 +
 lib/eal/x86/include/rte_memcpy.h       |  9 ++++
 lib/fib/trie.c                         |  1 +
 lib/vhost/virtio_net.c                 | 37 +++++++++++++++-
 meson_options.txt                      |  2 +
 17 files changed, 164 insertions(+), 114 deletions(-)

-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v5 1/6] net/octeon_ep: add missing vector API header include
  2024-07-24  7:53                     ` [PATCH v5 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
@ 2024-07-24  7:53                       ` Mattias Rönnblom
  2024-09-20 10:27                         ` [PATCH v6 0/7] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2024-07-24  7:53                       ` [PATCH v5 2/6] distributor: add missing vector API header include Mattias Rönnblom
                                         ` (4 subsequent siblings)
  5 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-07-24  7:53 UTC (permalink / raw)
  To: dev
  Cc: Mattias Rönnblom, Morten Brørup, Stephen Hemminger,
	David Marchand, Pavan Nikhilesh, Bruce Richardson,
	Mattias Rönnblom

The octeon_ep driver relied on <rte_vect.h>, but failed to provide a
direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
---
 drivers/net/octeon_ep/otx_ep_ethdev.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c
index 46211361a0..b069216629 100644
--- a/drivers/net/octeon_ep/otx_ep_ethdev.c
+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c
@@ -5,6 +5,8 @@
 #include <inttypes.h>
 #include <ethdev_pci.h>
 
+#include <rte_vect.h>
+
 #include "otx_ep_common.h"
 #include "otx_ep_vf.h"
 #include "otx2_ep_vf.h"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v5 2/6] distributor: add missing vector API header include
  2024-07-24  7:53                     ` [PATCH v5 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2024-07-24  7:53                       ` [PATCH v5 1/6] net/octeon_ep: add missing vector API header include Mattias Rönnblom
@ 2024-07-24  7:53                       ` Mattias Rönnblom
  2024-07-24  7:53                       ` [PATCH v5 3/6] fib: " Mattias Rönnblom
                                         ` (3 subsequent siblings)
  5 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-07-24  7:53 UTC (permalink / raw)
  To: dev
  Cc: Mattias Rönnblom, Morten Brørup, Stephen Hemminger,
	David Marchand, Pavan Nikhilesh, Bruce Richardson,
	Mattias Rönnblom

The distributor library relied on <rte_vect.h>, but failed to provide
a direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 lib/distributor/rte_distributor.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/distributor/rte_distributor.c b/lib/distributor/rte_distributor.c
index e58727cdc2..1389efc03f 100644
--- a/lib/distributor/rte_distributor.c
+++ b/lib/distributor/rte_distributor.c
@@ -15,6 +15,7 @@
 #include <rte_eal_memconfig.h>
 #include <rte_pause.h>
 #include <rte_tailq.h>
+#include <rte_vect.h>
 
 #include "rte_distributor.h"
 #include "rte_distributor_single.h"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v5 3/6] fib: add missing vector API header include
  2024-07-24  7:53                     ` [PATCH v5 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2024-07-24  7:53                       ` [PATCH v5 1/6] net/octeon_ep: add missing vector API header include Mattias Rönnblom
  2024-07-24  7:53                       ` [PATCH v5 2/6] distributor: add missing vector API header include Mattias Rönnblom
@ 2024-07-24  7:53                       ` Mattias Rönnblom
  2024-07-24  7:53                       ` [PATCH v5 4/6] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
                                         ` (2 subsequent siblings)
  5 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-07-24  7:53 UTC (permalink / raw)
  To: dev
  Cc: Mattias Rönnblom, Morten Brørup, Stephen Hemminger,
	David Marchand, Pavan Nikhilesh, Bruce Richardson,
	Mattias Rönnblom

The trie implementation of the fib library relied on <rte_vect.h>, but
failed to provide a direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/fib/trie.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/fib/trie.c b/lib/fib/trie.c
index 09470e7287..74db8863df 100644
--- a/lib/fib/trie.c
+++ b/lib/fib/trie.c
@@ -9,6 +9,7 @@
 #include <rte_debug.h>
 #include <rte_malloc.h>
 #include <rte_errno.h>
+#include <rte_vect.h>
 
 #include <rte_rib6.h>
 #include <rte_fib6.h>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v5 4/6] eal: provide option to use compiler memcpy instead of RTE
  2024-07-24  7:53                     ` [PATCH v5 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                         ` (2 preceding siblings ...)
  2024-07-24  7:53                       ` [PATCH v5 3/6] fib: " Mattias Rönnblom
@ 2024-07-24  7:53                       ` Mattias Rönnblom
  2024-07-24  7:53                       ` [PATCH v5 5/6] ci: test compiler memcpy Mattias Rönnblom
  2024-07-24  7:53                       ` [PATCH v5 6/6] vhost: optimize memcpy routines when cc memcpy is used Mattias Rönnblom
  5 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-07-24  7:53 UTC (permalink / raw)
  To: dev
  Cc: Mattias Rönnblom, Morten Brørup, Stephen Hemminger,
	David Marchand, Pavan Nikhilesh, Bruce Richardson,
	Mattias Rönnblom

Provide build option to have functions in <rte_memcpy.h> delegate to
the standard compiler/libc memcpy(), instead of using the various
custom DPDK, handcrafted, per-architecture rte_memcpy()
implementations.

A new meson build option 'use_cc_memcpy' is added. By default, the
traditional, custom DPDK rte_memcpy() implementation is used.

The performance benefits of the custom DPDK rte_memcpy()
implementations have been diminishing with every compiler release, and
with current toolchains the use of a custom memcpy() implementation
may even be a liability.

An additional benefit of this change is that compilers and static
analysis tools have an easier time detecting incorrect usage of
rte_memcpy() (e.g., buffer overruns, or overlapping source and
destination buffers).

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>

---

PATCH v5:
 o Take a more cautious approach, setting use_cc_memcpy to disabled by
   default.
 o Fix ARM build issue in case RTE_ARCH_ARM64_MEMCPY was set.
 o Use separate macros to indicate that the rte_memcpy() is implemented
   by the compiler, and that use_cc_memcpy is set, to avoid accidental
   <rte_build_config.h> #undefs.
 o Remove redundant rte_config.h includes.

PATCH:
 o Add entry in release notes.
 o Update meson help text.

RFC v3:
 o Fix missing #endif on loongarch.
 o PPC and RISCV now implemented, meaning all architectures are supported.
 o Unnecessary <rte_vect.h> include is removed from <rte_memcpy.h>.

RFC v2:
 * Fix bug where rte_memcpy.h was not installed on x86.
 * Made attempt to make Loongarch compile.
---
 config/meson.build                     |  1 +
 doc/guides/rel_notes/release_24_07.rst | 21 +++++++++
 lib/eal/arm/include/rte_memcpy.h       |  9 ++++
 lib/eal/include/generic/rte_memcpy.h   | 61 +++++++++++++++++++++++---
 lib/eal/loongarch/include/rte_memcpy.h | 54 +----------------------
 lib/eal/ppc/include/rte_memcpy.h       |  9 ++++
 lib/eal/riscv/include/rte_memcpy.h     | 54 +----------------------
 lib/eal/x86/include/meson.build        |  1 +
 lib/eal/x86/include/rte_memcpy.h       |  9 ++++
 meson_options.txt                      |  2 +
 10 files changed, 110 insertions(+), 111 deletions(-)

diff --git a/config/meson.build b/config/meson.build
index 8c8b019c25..456056628e 100644
--- a/config/meson.build
+++ b/config/meson.build
@@ -353,6 +353,7 @@ endforeach
 # set other values pulled from the build options
 dpdk_conf.set('RTE_MAX_ETHPORTS', get_option('max_ethports'))
 dpdk_conf.set('RTE_LIBEAL_USE_HPET', get_option('use_hpet'))
+dpdk_conf.set('RTE_USE_CC_MEMCPY', get_option('use_cc_memcpy'))
 dpdk_conf.set('RTE_ENABLE_STDATOMIC', get_option('enable_stdatomic'))
 dpdk_conf.set('RTE_ENABLE_TRACE_FP', get_option('enable_trace_fp'))
 dpdk_conf.set('RTE_PKTMBUF_HEADROOM', get_option('pkt_mbuf_headroom'))
diff --git a/doc/guides/rel_notes/release_24_07.rst b/doc/guides/rel_notes/release_24_07.rst
index eb2ed1a55f..31af6303b3 100644
--- a/doc/guides/rel_notes/release_24_07.rst
+++ b/doc/guides/rel_notes/release_24_07.rst
@@ -24,6 +24,27 @@ DPDK Release 24.07
 New Features
 ------------
 
+* **Compiler memcpy replaces custom DPDK implementation.**
+
+  The memory copy functions of ``<rte_memcpy.h>`` now optionally
+  delegate to the standard memcpy() function, implemented by the
+  compiler and the C runtime (e.g., libc).
+
+  In this release of DPDK, the handcrafted, per-architecture memory
+  copy implementations are still the default. Compiler memcpy is
+  enabled by setting the new ``use_cc_memcpy`` build option to true.
+
+  The performance benefits of the custom DPDK rte_memcpy()
+  implementations have been diminishing with every new compiler
+  release, and with current toolchains the use of a custom memcpy()
+  implementation may even result in worse performance than the
+  standard memcpy().
+
+  An additional benefit of using compiler memcpy is that compilers and
+  static analysis tools have an easier time detecting incorrect usage
+  of rte_memcpy() (e.g., buffer overruns, or overlapping source and
+  destination buffers).
+
 .. This section should contain new features added in this release.
    Sample format:
 
diff --git a/lib/eal/arm/include/rte_memcpy.h b/lib/eal/arm/include/rte_memcpy.h
index 47dea9a8cc..5d2ea7dbfa 100644
--- a/lib/eal/arm/include/rte_memcpy.h
+++ b/lib/eal/arm/include/rte_memcpy.h
@@ -5,10 +5,19 @@
 #ifndef _RTE_MEMCPY_ARM_H_
 #define _RTE_MEMCPY_ARM_H_
 
+#if defined(RTE_USE_CC_MEMCPY) || !defined(RTE_ARCH_ARM64_MEMCPY)
+
+#define RTE_CC_MEMCPY
+#include <generic/rte_memcpy.h>
+
+#else
+
 #ifdef RTE_ARCH_64
 #include <rte_memcpy_64.h>
 #else
 #include <rte_memcpy_32.h>
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_ARM_H_ */
diff --git a/lib/eal/include/generic/rte_memcpy.h b/lib/eal/include/generic/rte_memcpy.h
index e7f0f8eaa9..cfb0175bd2 100644
--- a/lib/eal/include/generic/rte_memcpy.h
+++ b/lib/eal/include/generic/rte_memcpy.h
@@ -5,12 +5,19 @@
 #ifndef _RTE_MEMCPY_H_
 #define _RTE_MEMCPY_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /**
  * @file
  *
  * Functions for vectorised implementation of memcpy().
  */
 
+#include <stdint.h>
+#include <string.h>
+
 /**
  * Copy 16 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -35,8 +42,6 @@ rte_mov16(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src);
 
-#ifdef __DOXYGEN__
-
 /**
  * Copy 48 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -49,8 +54,6 @@ rte_mov32(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov48(uint8_t *dst, const uint8_t *src);
 
-#endif /* __DOXYGEN__ */
-
 /**
  * Copy 64 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -87,8 +90,6 @@ rte_mov128(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src);
 
-#ifdef __DOXYGEN__
-
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
@@ -111,6 +112,52 @@ rte_mov256(uint8_t *dst, const uint8_t *src);
 static void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
-#endif /* __DOXYGEN__ */
+#ifdef RTE_CC_MEMCPY
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 16);
+}
+
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 32);
+}
+
+static inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 48);
+}
+
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 64);
+}
+
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 128);
+}
+
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 256);
+}
+
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+	return memcpy(dst, src, n);
+}
+#endif /* RTE_CC_MEMCPY */
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif /* _RTE_MEMCPY_H_ */
diff --git a/lib/eal/loongarch/include/rte_memcpy.h b/lib/eal/loongarch/include/rte_memcpy.h
index 22578d40f4..4e6027caee 100644
--- a/lib/eal/loongarch/include/rte_memcpy.h
+++ b/lib/eal/loongarch/include/rte_memcpy.h
@@ -5,57 +5,7 @@
 #ifndef RTE_MEMCPY_LOONGARCH_H
 #define RTE_MEMCPY_LOONGARCH_H
 
-#include <stdint.h>
-#include <string.h>
-
-#include "rte_common.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "generic/rte_memcpy.h"
-
-static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 16);
-}
-
-static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 32);
-}
-
-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 48);
-}
-
-static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 64);
-}
-
-static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 128);
-}
-
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 256);
-}
-
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
-
-#ifdef __cplusplus
-}
-#endif
+#define RTE_CC_MEMCPY
+#include <generic/rte_memcpy.h>
 
 #endif /* RTE_MEMCPY_LOONGARCH_H */
diff --git a/lib/eal/ppc/include/rte_memcpy.h b/lib/eal/ppc/include/rte_memcpy.h
index 6f388c0234..162c1483f5 100644
--- a/lib/eal/ppc/include/rte_memcpy.h
+++ b/lib/eal/ppc/include/rte_memcpy.h
@@ -6,6 +6,13 @@
 #ifndef _RTE_MEMCPY_PPC_64_H_
 #define _RTE_MEMCPY_PPC_64_H_
 
+#ifdef RTE_USE_CC_MEMCPY
+
+#define RTE_CC_MEMCPY
+#include <generic/rte_memcpy.h>
+
+#else
+
 #include <stdint.h>
 #include <string.h>
 
@@ -215,4 +222,6 @@ rte_memcpy_func(void *dst, const void *src, size_t n)
 }
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_PPC_64_H_ */
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index e34f19396e..7f6c07d090 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -7,57 +7,7 @@
 #ifndef RTE_MEMCPY_RISCV_H
 #define RTE_MEMCPY_RISCV_H
 
-#include <stdint.h>
-#include <string.h>
-
-#include "rte_common.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "generic/rte_memcpy.h"
-
-static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 16);
-}
-
-static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 32);
-}
-
-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 48);
-}
-
-static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 64);
-}
-
-static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 128);
-}
-
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 256);
-}
-
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
-
-#ifdef __cplusplus
-}
-#endif
+#define RTE_CC_MEMCPY
+#include <generic/rte_memcpy.h>
 
 #endif /* RTE_MEMCPY_RISCV_H */
diff --git a/lib/eal/x86/include/meson.build b/lib/eal/x86/include/meson.build
index 52d2f8e969..09c2fe2485 100644
--- a/lib/eal/x86/include/meson.build
+++ b/lib/eal/x86/include/meson.build
@@ -16,6 +16,7 @@ arch_headers = files(
         'rte_spinlock.h',
         'rte_vect.h',
 )
+
 arch_indirect_headers = files(
         'rte_atomic_32.h',
         'rte_atomic_64.h',
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 42058e4a3f..2d9f5954f1 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -11,6 +11,13 @@
  * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
  */
 
+#ifdef RTE_USE_CC_MEMCPY
+
+#define RTE_CC_MEMCPY
+#include <generic/rte_memcpy.h>
+
+#else
+
 #include <stdio.h>
 #include <stdint.h>
 #include <string.h>
@@ -767,4 +774,6 @@ rte_memcpy(void *dst, const void *src, size_t n)
 }
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_X86_64_H_ */
diff --git a/meson_options.txt b/meson_options.txt
index e49b2fc089..69a01f6578 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -60,3 +60,5 @@ option('tests', type: 'boolean', value: true, description:
        'build unit tests')
 option('use_hpet', type: 'boolean', value: false, description:
        'use HPET timer in EAL')
+option('use_cc_memcpy', type: 'boolean', value: false, description:
+       'Have the functions of <rte_memcpy.h> delegate to compiler/libc memcpy() instead of using custom implementation.')
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v5 5/6] ci: test compiler memcpy
  2024-07-24  7:53                     ` [PATCH v5 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                         ` (3 preceding siblings ...)
  2024-07-24  7:53                       ` [PATCH v5 4/6] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
@ 2024-07-24  7:53                       ` Mattias Rönnblom
  2024-07-24  7:53                       ` [PATCH v5 6/6] vhost: optimize memcpy routines when cc memcpy is used Mattias Rönnblom
  5 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-07-24  7:53 UTC (permalink / raw)
  To: dev
  Cc: Mattias Rönnblom, Morten Brørup, Stephen Hemminger,
	David Marchand, Pavan Nikhilesh, Bruce Richardson,
	Mattias Rönnblom

Add compilation tests for the use_cc_memcpy build option.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 .ci/linux-build.sh            | 5 +++++
 .github/workflows/build.yml   | 7 +++++++
 devtools/test-meson-builds.sh | 4 +++-
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/.ci/linux-build.sh b/.ci/linux-build.sh
index 15ed51e4c1..a873f83d09 100755
--- a/.ci/linux-build.sh
+++ b/.ci/linux-build.sh
@@ -98,6 +98,11 @@ if [ "$STDATOMIC" = "true" ]; then
 else
 	OPTS="$OPTS -Dcheck_includes=true"
 fi
+if [ "$CCMEMCPY" = "true" ]; then
+	OPTS="$OPTS -Duse_cc_memcpy=true"
+else
+	OPTS="$OPTS -Duse_cc_memcpy=true"
+fi
 if [ "$MINI" = "true" ]; then
     OPTS="$OPTS -Denable_drivers=net/null"
     OPTS="$OPTS -Ddisable_libs=*"
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index dbf25626d4..cd45d6c6c1 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -31,6 +31,7 @@ jobs:
       RISCV64: ${{ matrix.config.cross == 'riscv64' }}
       RUN_TESTS: ${{ contains(matrix.config.checks, 'tests') }}
       STDATOMIC: ${{ contains(matrix.config.checks, 'stdatomic') }}
+      CCMEMCPY: ${{ contains(matrix.config.checks, 'ccmemcpy') }}
 
     strategy:
       fail-fast: false
@@ -45,6 +46,12 @@ jobs:
           - os: ubuntu-22.04
             compiler: clang
             checks: stdatomic
+          - os: ubuntu-22.04
+            compiler: gcc
+            checks: ccmemcpy
+          - os: ubuntu-22.04
+            compiler: clang
+            checks: ccmemcpy
           - os: ubuntu-22.04
             compiler: gcc
             checks: abi+debug+doc+examples+tests
diff --git a/devtools/test-meson-builds.sh b/devtools/test-meson-builds.sh
index d71bb1ded0..e72146be3b 100755
--- a/devtools/test-meson-builds.sh
+++ b/devtools/test-meson-builds.sh
@@ -228,12 +228,14 @@ for c in gcc clang ; do
 		if [ $s = shared ] ; then
 			abicheck=ABI
 			stdatomic=-Denable_stdatomic=true
+			ccmemcpy=-Duse_cc_memcpy=true
 		else
 			abicheck=skipABI # save time and disk space
 			stdatomic=-Denable_stdatomic=false
+			ccmemcpy=-Duse_cc_memcpy=false
 		fi
 		export CC="$CCACHE $c"
-		build build-$c-$s $c $abicheck $stdatomic --default-library=$s
+		build build-$c-$s $c $abicheck $stdatomic $ccmemcpy --default-library=$s
 		unset CC
 	done
 done
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v5 6/6] vhost: optimize memcpy routines when cc memcpy is used
  2024-07-24  7:53                     ` [PATCH v5 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                         ` (4 preceding siblings ...)
  2024-07-24  7:53                       ` [PATCH v5 5/6] ci: test compiler memcpy Mattias Rönnblom
@ 2024-07-24  7:53                       ` Mattias Rönnblom
  2024-07-29 11:00                         ` Morten Brørup
  5 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-07-24  7:53 UTC (permalink / raw)
  To: dev
  Cc: Mattias Rönnblom, Morten Brørup, Stephen Hemminger,
	David Marchand, Pavan Nikhilesh, Bruce Richardson,
	Mattias Rönnblom

In builds where use_cc_memcpy is set to true, the vhost user PMD
suffers a large performance drop on Intel P-cores for small packets,
at least when built by GCC and (to a much lesser extent) clang.

This patch addresses that issue by using a custom virtio
memcpy()-based packet copying routine.

Performance results from a Raptor Lake @ 3,2 GHz:

GCC 12.3.0
64 bytes packets
Core  Mode              Mpps
E     RTE memcpy        9.5
E     cc memcpy         9.7
E     cc memcpy+pktcpy  9.0

P     RTE memcpy        16.4
P     cc memcpy         13.5
P     cc memcpy+pktcpy  16.2

GCC 12.3.0
1500 bytes packets
Core  Mode              Mpps
P    RTE memcpy         5.8
P    cc memcpy          5.9
P    cc memcpy+pktcpy   5.9

clang 15.0.7
64 bytes packets
Core  Mode              Mpps
P     RTE memcpy        13.3
P     cc memcpy         12.9
P     cc memcpy+pktcpy  13.9

"RTE memcpy" is use_cc_memcpy=false, "cc memcpy" is use_cc_memcpy=true
and "pktcpy" is when this patch is applied.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 lib/vhost/virtio_net.c | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 370402d849..63571587a8 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -231,6 +231,39 @@ vhost_async_dma_check_completed(struct virtio_net *dev, int16_t dma_id, uint16_t
 	return nr_copies;
 }
 
+/* The code generated by GCC (and to a lesser extent, clang) with just
+ * a straight memcpy() to copy packets is less than optimal on Intel
+ * P-cores, for small packets. Thus the need of this specialized
+ * memcpy() in builds where use_cc_memcpy is set to true.
+ */
+#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
+static __rte_always_inline void
+pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
+{
+	void *dst = __builtin_assume_aligned(in_dst, 16);
+	const void *src = __builtin_assume_aligned(in_src, 16);
+
+	if (len <= 256) {
+		size_t left;
+
+		for (left = len; left >= 32; left -= 32) {
+			memcpy(dst, src, 32);
+			dst = RTE_PTR_ADD(dst, 32);
+			src = RTE_PTR_ADD(src, 32);
+		}
+
+		memcpy(dst, src, left);
+	} else
+		memcpy(dst, src, len);
+}
+#else
+static __rte_always_inline void
+pktcpy(void *dst, const void *src, size_t len)
+{
+	rte_memcpy(dst, src, len);
+}
+#endif
+
 static inline void
 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
 	__rte_shared_locks_required(&vq->iotlb_lock)
@@ -240,7 +273,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
 	int i;
 
 	for (i = 0; i < count; i++) {
-		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
+		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
 		vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
 					   elem[i].len);
 		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
@@ -257,7 +290,7 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq)
 	int i;
 
 	for (i = 0; i < count; i++)
-		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
+		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
 
 	vq->batch_copy_nb_elems = 0;
 }
-- 
2.34.1


^ permalink raw reply	[flat|nested] 128+ messages in thread

* RE: [PATCH v5 6/6] vhost: optimize memcpy routines when cc memcpy is used
  2024-07-24  7:53                       ` [PATCH v5 6/6] vhost: optimize memcpy routines when cc memcpy is used Mattias Rönnblom
@ 2024-07-29 11:00                         ` Morten Brørup
  2024-07-29 19:27                           ` Mattias Rönnblom
  0 siblings, 1 reply; 128+ messages in thread
From: Morten Brørup @ 2024-07-29 11:00 UTC (permalink / raw)
  To: Mattias Rönnblom, dev
  Cc: Mattias Rönnblom, Stephen Hemminger, David Marchand,
	Pavan Nikhilesh, Bruce Richardson

> From: Mattias Rönnblom [mailto:mattias.ronnblom@ericsson.com]
> Sent: Wednesday, 24 July 2024 09.54

Which packet mix was used for your tests? Synthetic IMIX, or some live data?

> +/* The code generated by GCC (and to a lesser extent, clang) with just
> + * a straight memcpy() to copy packets is less than optimal on Intel
> + * P-cores, for small packets. Thus the need of this specialized
> + * memcpy() in builds where use_cc_memcpy is set to true.
> + */
> +#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
> +static __rte_always_inline void
> +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
> +{
> +	void *dst = __builtin_assume_aligned(in_dst, 16);
> +	const void *src = __builtin_assume_aligned(in_src, 16);
> +
> +	if (len <= 256) {
> +		size_t left;
> +
> +		for (left = len; left >= 32; left -= 32) {
> +			memcpy(dst, src, 32);
> +			dst = RTE_PTR_ADD(dst, 32);
> +			src = RTE_PTR_ADD(src, 32);
> +		}
> +
> +		memcpy(dst, src, left);
> +	} else

Although the packets within a burst often have similar size, I'm not sure you can rely on the dynamic branch predictor here.

Looking at the ethdev packet size counters at an ISP (at the core of their Layer 3 network), 71 % are 256 byte or larger [1].

For static branch prediction, I would consider > 256 more likely and swap the two branches, i.e. compare (len > 256) instead of (len <= 256).

But again: I don't know how the dynamic branch predictor behaves here. Perhaps my suggested change makes no difference.

> +		memcpy(dst, src, len);
> +}

With or without suggested change,
Acked-by: Morten Brørup <mb@smartsharesystems.com>


[1]: Details (incl. one VLAN tag)
tx_size_64_packets            1,1 %
tx_size_65_to_127_packets    25,7 %
tx_size_128_to_255_packets    2,6 %
tx_size_256_to_511_packets    1,4 %
tx_size_512_to_1023_packets   1,7 %
tx_size_1024_to_1522_packets 67,6 %


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v5 6/6] vhost: optimize memcpy routines when cc memcpy is used
  2024-07-29 11:00                         ` Morten Brørup
@ 2024-07-29 19:27                           ` Mattias Rönnblom
  2024-07-29 19:56                             ` Morten Brørup
  0 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-07-29 19:27 UTC (permalink / raw)
  To: Morten Brørup, Mattias Rönnblom, dev
  Cc: Stephen Hemminger, David Marchand, Pavan Nikhilesh, Bruce Richardson

On 2024-07-29 13:00, Morten Brørup wrote:
>> From: Mattias Rönnblom [mailto:mattias.ronnblom@ericsson.com]
>> Sent: Wednesday, 24 July 2024 09.54
> 
> Which packet mix was used for your tests? Synthetic IMIX, or some live data?
> 

I used the same test as was being done when the performance regression 
was demonstrated (i.e., 2x testpmd with fixed packet size).

>> +/* The code generated by GCC (and to a lesser extent, clang) with just
>> + * a straight memcpy() to copy packets is less than optimal on Intel
>> + * P-cores, for small packets. Thus the need of this specialized
>> + * memcpy() in builds where use_cc_memcpy is set to true.
>> + */
>> +#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
>> +static __rte_always_inline void
>> +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
>> +{
>> +	void *dst = __builtin_assume_aligned(in_dst, 16);
>> +	const void *src = __builtin_assume_aligned(in_src, 16);
>> +
>> +	if (len <= 256) {
>> +		size_t left;
>> +
>> +		for (left = len; left >= 32; left -= 32) {
>> +			memcpy(dst, src, 32);
>> +			dst = RTE_PTR_ADD(dst, 32);
>> +			src = RTE_PTR_ADD(src, 32);
>> +		}
>> +
>> +		memcpy(dst, src, left);
>> +	} else
> 
> Although the packets within a burst often have similar size, I'm not sure you can rely on the dynamic branch predictor here.
> 

I agree that the pktcpy() routine will likely often suffer a 
size-related branch mispredict with real packet size mix. A benchmark 
with a real packet mix would be much better than the tests I've run.

This needs to be compared, of course, with the overhead imposed by 
conditionals included in other pktcpy() implementations.

> Looking at the ethdev packet size counters at an ISP (at the core of their Layer 3 network), 71 % are 256 byte or larger [1].
> 
> For static branch prediction, I would consider > 256 more likely and swap the two branches, i.e. compare (len > 256) instead of (len <= 256).
> 

OK, I'll add likely() instead, to make it more explicit.

> But again: I don't know how the dynamic branch predictor behaves here. Perhaps my suggested change makes no difference.
> 

I think it will, but it will be tiny. From what I understand, even when 
the branch prediction guessed correctly, one receives a slight benefit if 
the branch is not taken.

>> +		memcpy(dst, src, len);
>> +}
> 
> With or without suggested change,
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> 
> 
> [1]: Details (incl. one VLAN tag)
> tx_size_64_packets            1,1 %
> tx_size_65_to_127_packets    25,7 %
> tx_size_128_to_255_packets    2,6 %
> tx_size_256_to_511_packets    1,4 %
> tx_size_512_to_1023_packets   1,7 %
> tx_size_1024_to_1522_packets 67,6 %
> 

^ permalink raw reply	[flat|nested] 128+ messages in thread

* RE: [PATCH v5 6/6] vhost: optimize memcpy routines when cc memcpy is used
  2024-07-29 19:27                           ` Mattias Rönnblom
@ 2024-07-29 19:56                             ` Morten Brørup
  0 siblings, 0 replies; 128+ messages in thread
From: Morten Brørup @ 2024-07-29 19:56 UTC (permalink / raw)
  To: Mattias Rönnblom, Mattias Rönnblom, dev
  Cc: Stephen Hemminger, David Marchand, Pavan Nikhilesh, Bruce Richardson

> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> Sent: Monday, 29 July 2024 21.27
> 
> On 2024-07-29 13:00, Morten Brørup wrote:
> >> From: Mattias Rönnblom [mailto:mattias.ronnblom@ericsson.com]
> >> Sent: Wednesday, 24 July 2024 09.54
> >
> > Which packet mix was used for your tests? Synthetic IMIX, or some live
> data?
> >
> 
> I used the same test as was being done when the performance regression
> was demonstrated (i.e., 2x testpmd with fixed packet size).
> 
> >> +/* The code generated by GCC (and to a lesser extent, clang) with
> just
> >> + * a straight memcpy() to copy packets is less than optimal on Intel
> >> + * P-cores, for small packets. Thus the need of this specialized
> >> + * memcpy() in builds where use_cc_memcpy is set to true.
> >> + */
> >> +#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
> >> +static __rte_always_inline void
> >> +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t
> len)
> >> +{
> >> +	void *dst = __builtin_assume_aligned(in_dst, 16);
> >> +	const void *src = __builtin_assume_aligned(in_src, 16);
> >> +
> >> +	if (len <= 256) {
> >> +		size_t left;
> >> +
> >> +		for (left = len; left >= 32; left -= 32) {
> >> +			memcpy(dst, src, 32);
> >> +			dst = RTE_PTR_ADD(dst, 32);
> >> +			src = RTE_PTR_ADD(src, 32);
> >> +		}
> >> +
> >> +		memcpy(dst, src, left);
> >> +	} else
> >
> > Although the packets within a burst often have similar size, I'm not
> sure you can rely on the dynamic branch predictor here.
> >
> 
> I agree that the pktcpy() routine will likely often suffer a
> size-related branch mispredict with real packet size mix. A benchmark
> with a real packet mix would be much better than the tests I've run.
> 
> This needs to be compared, of course, with the overhead imposed by
> conditionals included in other pktcpy() implementations.

If testing with fixed packet size, only one of the branches will be taken - always!
And thus the branch predictor will always predict it correctly - in the test.

So, if this code performs better than simple memcpy(), I can conclude that you are testing with packet size <= 256.

> 
> > Looking at the ethdev packet size counters at an ISP (at the core of
> their Layer 3 network), 71 % are 256 byte or larger [1].
> >
> > For static branch prediction, I would consider > 256 more likely and
> swap the two branches, i.e. compare (len > 256) instead of (len <= 256).
> >
> 
> OK, I'll add likely() instead, to make it more explicit.
> 
> > But again: I don't know how the dynamic branch predictor behaves here.
> Perhaps my suggested change makes no difference.
> >
> 
> I think it will, but it will be tiny. From what I understand, even when
> the branch prediction guessed correctly, one receive a slight benefit if
> the branch is not taken.
> 
> >> +		memcpy(dst, src, len);
> >> +}
> >
> > With or without suggested change,
> > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> >
> >
> > [1]: Details (incl. one VLAN tag)
> > tx_size_64_packets            1,1 %
> > tx_size_65_to_127_packets    25,7 %
> > tx_size_128_to_255_packets    2,6 %
> > tx_size_256_to_511_packets    1,4 %
> > tx_size_512_to_1023_packets   1,7 %
> > tx_size_1024_to_1522_packets 67,6 %
> >

^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v6 0/7] Optionally have rte_memcpy delegate to compiler memcpy
  2024-07-24  7:53                       ` [PATCH v5 1/6] net/octeon_ep: add missing vector API header include Mattias Rönnblom
@ 2024-09-20 10:27                         ` Mattias Rönnblom
  2024-09-20 10:27                           ` [PATCH v6 1/7] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
                                             ` (6 more replies)
  0 siblings, 7 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-09-20 10:27 UTC (permalink / raw)
  To: dev
  Cc: Mattias Rönnblom, Morten Brørup, Stephen Hemminger,
	David Marchand, Pavan Nikhilesh, Bruce Richardson,
	Mattias Rönnblom

This patch set makes DPDK library, driver, and application code
optionally use the compiler/libc memcpy() when functions in
<rte_memcpy.h> are invoked.

The new compiler memcpy-based rte_memcpy() implementations may be
enabled by means of a build-time option.

This patch set only makes a difference on x86, PPC and ARM. Loongarch
and RISCV already used compiler/libc memcpy().

This patch set includes a number of fixes in drivers and libraries
which erroneously relied on <rte_memcpy.h> including header files
(i.e., <rte_vect.h>) required by its implementation.

Mattias Rönnblom (7):
  event/dlb2: include headers for vector and memory copy APIs
  net/octeon_ep: add missing vector API header include
  distributor: add missing vector API header include
  fib: add missing vector API header include
  eal: provide option to use compiler memcpy instead of RTE
  ci: test compiler memcpy
  vhost: optimize memcpy routines when cc memcpy is used

 .ci/linux-build.sh                     |  5 +++
 .github/workflows/build.yml            |  7 +++
 config/meson.build                     |  1 +
 devtools/test-meson-builds.sh          |  4 +-
 doc/guides/rel_notes/release_24_11.rst | 20 +++++++++
 drivers/event/dlb2/dlb2.c              |  2 +
 drivers/net/octeon_ep/otx_ep_ethdev.c  |  2 +
 lib/distributor/rte_distributor.c      |  1 +
 lib/eal/arm/include/rte_memcpy.h       |  9 ++++
 lib/eal/include/generic/rte_memcpy.h   | 61 +++++++++++++++++++++++---
 lib/eal/loongarch/include/rte_memcpy.h | 54 +----------------------
 lib/eal/ppc/include/rte_memcpy.h       |  9 ++++
 lib/eal/riscv/include/rte_memcpy.h     | 54 +----------------------
 lib/eal/x86/include/meson.build        |  1 +
 lib/eal/x86/include/rte_memcpy.h       |  9 ++++
 lib/fib/trie.c                         |  1 +
 lib/vhost/virtio_net.c                 | 37 +++++++++++++++-
 meson_options.txt                      |  2 +
 18 files changed, 165 insertions(+), 114 deletions(-)

-- 
2.43.0


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v6 1/7] event/dlb2: include headers for vector and memory copy APIs
  2024-09-20 10:27                         ` [PATCH v6 0/7] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
@ 2024-09-20 10:27                           ` Mattias Rönnblom
  2024-10-09 20:59                             ` Morten Brørup
  2024-10-09 22:01                             ` Stephen Hemminger
  2024-09-20 10:27                           ` [PATCH v6 2/7] net/octeon_ep: add missing vector API header include Mattias Rönnblom
                                             ` (5 subsequent siblings)
  6 siblings, 2 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-09-20 10:27 UTC (permalink / raw)
  To: dev
  Cc: Mattias Rönnblom, Morten Brørup, Stephen Hemminger,
	David Marchand, Pavan Nikhilesh, Bruce Richardson,
	Mattias Rönnblom

The DLB2 PMD depended on <rte_vect.h> being included as a side-effect
of <rte_memcpy.h> being included.

In addition, DLB2 used rte_memcpy() but did not include <rte_memcpy.h>,
but rather depended on other include files to do so.

This patch addresses both of those issues.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/event/dlb2/dlb2.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 0b91f03956..19f90b8f8d 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -25,11 +25,13 @@
 #include <rte_kvargs.h>
 #include <rte_log.h>
 #include <rte_malloc.h>
+#include <rte_memcpy.h>
 #include <rte_mbuf.h>
 #include <rte_power_intrinsics.h>
 #include <rte_prefetch.h>
 #include <rte_ring.h>
 #include <rte_string_fns.h>
+#include <rte_vect.h>
 
 #include "dlb2_priv.h"
 #include "dlb2_iface.h"
-- 
2.43.0


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v6 2/7] net/octeon_ep: add missing vector API header include
  2024-09-20 10:27                         ` [PATCH v6 0/7] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2024-09-20 10:27                           ` [PATCH v6 1/7] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
@ 2024-09-20 10:27                           ` Mattias Rönnblom
  2024-10-09 21:00                             ` Morten Brørup
  2024-09-20 10:27                           ` [PATCH v6 3/7] distributor: " Mattias Rönnblom
                                             ` (4 subsequent siblings)
  6 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-09-20 10:27 UTC (permalink / raw)
  To: dev
  Cc: Mattias Rönnblom, Morten Brørup, Stephen Hemminger,
	David Marchand, Pavan Nikhilesh, Bruce Richardson,
	Mattias Rönnblom

The octeon_ep driver relied on <rte_vect.h>, but failed to provide a
direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
---
 drivers/net/octeon_ep/otx_ep_ethdev.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c
index 46211361a0..b069216629 100644
--- a/drivers/net/octeon_ep/otx_ep_ethdev.c
+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c
@@ -5,6 +5,8 @@
 #include <inttypes.h>
 #include <ethdev_pci.h>
 
+#include <rte_vect.h>
+
 #include "otx_ep_common.h"
 #include "otx_ep_vf.h"
 #include "otx2_ep_vf.h"
-- 
2.43.0


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v6 3/7] distributor: add missing vector API header include
  2024-09-20 10:27                         ` [PATCH v6 0/7] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
  2024-09-20 10:27                           ` [PATCH v6 1/7] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
  2024-09-20 10:27                           ` [PATCH v6 2/7] net/octeon_ep: add missing vector API header include Mattias Rönnblom
@ 2024-09-20 10:27                           ` Mattias Rönnblom
  2024-10-09 21:00                             ` Morten Brørup
  2024-09-20 10:27                           ` [PATCH v6 4/7] fib: " Mattias Rönnblom
                                             ` (3 subsequent siblings)
  6 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-09-20 10:27 UTC (permalink / raw)
  To: dev
  Cc: Mattias Rönnblom, Morten Brørup, Stephen Hemminger,
	David Marchand, Pavan Nikhilesh, Bruce Richardson,
	Mattias Rönnblom

The distributor library relied on <rte_vect.h>, but failed to provide
a direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 lib/distributor/rte_distributor.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/distributor/rte_distributor.c b/lib/distributor/rte_distributor.c
index e58727cdc2..1389efc03f 100644
--- a/lib/distributor/rte_distributor.c
+++ b/lib/distributor/rte_distributor.c
@@ -15,6 +15,7 @@
 #include <rte_eal_memconfig.h>
 #include <rte_pause.h>
 #include <rte_tailq.h>
+#include <rte_vect.h>
 
 #include "rte_distributor.h"
 #include "rte_distributor_single.h"
-- 
2.43.0


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v6 4/7] fib: add missing vector API header include
  2024-09-20 10:27                         ` [PATCH v6 0/7] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                             ` (2 preceding siblings ...)
  2024-09-20 10:27                           ` [PATCH v6 3/7] distributor: " Mattias Rönnblom
@ 2024-09-20 10:27                           ` Mattias Rönnblom
  2024-10-09 21:00                             ` Morten Brørup
  2024-09-20 10:27                           ` [PATCH v6 5/7] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
                                             ` (2 subsequent siblings)
  6 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-09-20 10:27 UTC (permalink / raw)
  To: dev
  Cc: Mattias Rönnblom, Morten Brørup, Stephen Hemminger,
	David Marchand, Pavan Nikhilesh, Bruce Richardson,
	Mattias Rönnblom

The trie implementation of the fib library relied on <rte_vect.h>, but
failed to provide a direct include of this file.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/fib/trie.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/fib/trie.c b/lib/fib/trie.c
index 09470e7287..74db8863df 100644
--- a/lib/fib/trie.c
+++ b/lib/fib/trie.c
@@ -9,6 +9,7 @@
 #include <rte_debug.h>
 #include <rte_malloc.h>
 #include <rte_errno.h>
+#include <rte_vect.h>
 
 #include <rte_rib6.h>
 #include <rte_fib6.h>
-- 
2.43.0


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v6 5/7] eal: provide option to use compiler memcpy instead of RTE
  2024-09-20 10:27                         ` [PATCH v6 0/7] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                             ` (3 preceding siblings ...)
  2024-09-20 10:27                           ` [PATCH v6 4/7] fib: " Mattias Rönnblom
@ 2024-09-20 10:27                           ` Mattias Rönnblom
  2024-10-04  7:52                             ` David Marchand
  2024-09-20 10:27                           ` [PATCH v6 6/7] ci: test compiler memcpy Mattias Rönnblom
  2024-09-20 10:27                           ` [PATCH v6 7/7] vhost: optimize memcpy routines when cc memcpy is used Mattias Rönnblom
  6 siblings, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-09-20 10:27 UTC (permalink / raw)
  To: dev
  Cc: Mattias Rönnblom, Morten Brørup, Stephen Hemminger,
	David Marchand, Pavan Nikhilesh, Bruce Richardson,
	Mattias Rönnblom

Provide build option to have functions in <rte_memcpy.h> delegate to
the standard compiler/libc memcpy(), instead of using the various
custom DPDK, handcrafted, per-architecture rte_memcpy()
implementations.

A new meson build option 'use_cc_memcpy' is added. By default, the
traditional, custom DPDK rte_memcpy() implementation is used.

The performance benefits of the custom DPDK rte_memcpy()
implementations have been diminishing with every compiler release, and
with current toolchains the use of a custom memcpy() implementation
may even be a liability.

An additional benefit of this change is that compilers and static
analysis tools have an easier time detecting incorrect usage of
rte_memcpy() (e.g., buffer overruns, or overlapping source and
destination buffers).

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>

---

PATCH v6:
 * Rebase for 24.11.

PATCH v5:
 o Take a more cautious approach, setting use_cc_memcpy to disabled by
   default.
 o Fix ARM build issue in case RTE_ARCH_ARM64_MEMCPY was set.
 o Use separate macros to indicate that the rte_memcpy() is implemented
   by the compiler, and that use_cc_memcpy is set, to avoid accidental
   <rte_build_config.h> #undefs.
 o Remove redundant rte_config.h includes.

PATCH:
 o Add entry in release notes.
 o Update meson help text.

RFC v3:
 o Fix missing #endif on loongarch.
 o PPC and RISCV now implemented, meaning all architectures are supported.
 o Unnecessary <rte_vect.h> include is removed from <rte_memcpy.h>.

RFC v2:
 * Fix bug where rte_memcpy.h was not installed on x86.
 * Made attempt to make Loongarch compile.
---
 config/meson.build                     |  1 +
 doc/guides/rel_notes/release_24_11.rst | 20 +++++++++
 lib/eal/arm/include/rte_memcpy.h       |  9 ++++
 lib/eal/include/generic/rte_memcpy.h   | 61 +++++++++++++++++++++++---
 lib/eal/loongarch/include/rte_memcpy.h | 54 +----------------------
 lib/eal/ppc/include/rte_memcpy.h       |  9 ++++
 lib/eal/riscv/include/rte_memcpy.h     | 54 +----------------------
 lib/eal/x86/include/meson.build        |  1 +
 lib/eal/x86/include/rte_memcpy.h       |  9 ++++
 meson_options.txt                      |  2 +
 10 files changed, 109 insertions(+), 111 deletions(-)

diff --git a/config/meson.build b/config/meson.build
index 8c8b019c25..456056628e 100644
--- a/config/meson.build
+++ b/config/meson.build
@@ -353,6 +353,7 @@ endforeach
 # set other values pulled from the build options
 dpdk_conf.set('RTE_MAX_ETHPORTS', get_option('max_ethports'))
 dpdk_conf.set('RTE_LIBEAL_USE_HPET', get_option('use_hpet'))
+dpdk_conf.set('RTE_USE_CC_MEMCPY', get_option('use_cc_memcpy'))
 dpdk_conf.set('RTE_ENABLE_STDATOMIC', get_option('enable_stdatomic'))
 dpdk_conf.set('RTE_ENABLE_TRACE_FP', get_option('enable_trace_fp'))
 dpdk_conf.set('RTE_PKTMBUF_HEADROOM', get_option('pkt_mbuf_headroom'))
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index 0ff70d9057..8be000294d 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -55,6 +55,26 @@ New Features
      Also, make sure to start the actual text at the margin.
      =======================================================
 
+* **Compiler memcpy replaces custom DPDK implementation.**
+
+  The memory copy functions of ``<rte_memcpy.h>`` now optionally
+  delegate to the standard memcpy() function, implemented by the
+  compiler and the C runtime (e.g., libc).
+
+  In this release of DPDK, the handcrafted, per-architecture memory
+  copy implementations are still the default. Compiler memcpy is
+  enabled by setting the new ``use_cc_memcpy`` build option to true.
+
+  The performance benefits of the custom DPDK rte_memcpy()
+  implementations have been diminishing with every new compiler
+  release, and with current toolchains the use of a custom memcpy()
+  implementation may even result in worse performance than the
+  standard memcpy().
+
+  An additional benefit of using compiler memcpy is that compilers and
+  static analysis tools have an easier time detecting incorrect usage
+  of rte_memcpy() (e.g., buffer overruns, or overlapping source and
+  destination buffers).
 
 Removed Items
 -------------
diff --git a/lib/eal/arm/include/rte_memcpy.h b/lib/eal/arm/include/rte_memcpy.h
index 47dea9a8cc..5d2ea7dbfa 100644
--- a/lib/eal/arm/include/rte_memcpy.h
+++ b/lib/eal/arm/include/rte_memcpy.h
@@ -5,10 +5,19 @@
 #ifndef _RTE_MEMCPY_ARM_H_
 #define _RTE_MEMCPY_ARM_H_
 
+#if defined(RTE_USE_CC_MEMCPY) || !defined(RTE_ARCH_ARM64_MEMCPY)
+
+#define RTE_CC_MEMCPY
+#include <generic/rte_memcpy.h>
+
+#else
+
 #ifdef RTE_ARCH_64
 #include <rte_memcpy_64.h>
 #else
 #include <rte_memcpy_32.h>
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_ARM_H_ */
diff --git a/lib/eal/include/generic/rte_memcpy.h b/lib/eal/include/generic/rte_memcpy.h
index e7f0f8eaa9..cfb0175bd2 100644
--- a/lib/eal/include/generic/rte_memcpy.h
+++ b/lib/eal/include/generic/rte_memcpy.h
@@ -5,12 +5,19 @@
 #ifndef _RTE_MEMCPY_H_
 #define _RTE_MEMCPY_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /**
  * @file
  *
  * Functions for vectorised implementation of memcpy().
  */
 
+#include <stdint.h>
+#include <string.h>
+
 /**
  * Copy 16 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -35,8 +42,6 @@ rte_mov16(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src);
 
-#ifdef __DOXYGEN__
-
 /**
  * Copy 48 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -49,8 +54,6 @@ rte_mov32(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov48(uint8_t *dst, const uint8_t *src);
 
-#endif /* __DOXYGEN__ */
-
 /**
  * Copy 64 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -87,8 +90,6 @@ rte_mov128(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src);
 
-#ifdef __DOXYGEN__
-
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
@@ -111,6 +112,52 @@ rte_mov256(uint8_t *dst, const uint8_t *src);
 static void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
-#endif /* __DOXYGEN__ */
+#ifdef RTE_CC_MEMCPY
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 16);
+}
+
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 32);
+}
+
+static inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 48);
+}
+
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 64);
+}
+
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 128);
+}
+
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 256);
+}
+
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+	return memcpy(dst, src, n);
+}
+#endif /* RTE_CC_MEMCPY */
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif /* _RTE_MEMCPY_H_ */
diff --git a/lib/eal/loongarch/include/rte_memcpy.h b/lib/eal/loongarch/include/rte_memcpy.h
index 22578d40f4..4e6027caee 100644
--- a/lib/eal/loongarch/include/rte_memcpy.h
+++ b/lib/eal/loongarch/include/rte_memcpy.h
@@ -5,57 +5,7 @@
 #ifndef RTE_MEMCPY_LOONGARCH_H
 #define RTE_MEMCPY_LOONGARCH_H
 
-#include <stdint.h>
-#include <string.h>
-
-#include "rte_common.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "generic/rte_memcpy.h"
-
-static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 16);
-}
-
-static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 32);
-}
-
-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 48);
-}
-
-static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 64);
-}
-
-static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 128);
-}
-
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 256);
-}
-
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
-
-#ifdef __cplusplus
-}
-#endif
+#define RTE_CC_MEMCPY
+#include <generic/rte_memcpy.h>
 
 #endif /* RTE_MEMCPY_LOONGARCH_H */
diff --git a/lib/eal/ppc/include/rte_memcpy.h b/lib/eal/ppc/include/rte_memcpy.h
index 6f388c0234..162c1483f5 100644
--- a/lib/eal/ppc/include/rte_memcpy.h
+++ b/lib/eal/ppc/include/rte_memcpy.h
@@ -6,6 +6,13 @@
 #ifndef _RTE_MEMCPY_PPC_64_H_
 #define _RTE_MEMCPY_PPC_64_H_
 
+#ifdef RTE_USE_CC_MEMCPY
+
+#define RTE_CC_MEMCPY
+#include <generic/rte_memcpy.h>
+
+#else
+
 #include <stdint.h>
 #include <string.h>
 
@@ -215,4 +222,6 @@ rte_memcpy_func(void *dst, const void *src, size_t n)
 }
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_PPC_64_H_ */
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index e34f19396e..7f6c07d090 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -7,57 +7,7 @@
 #ifndef RTE_MEMCPY_RISCV_H
 #define RTE_MEMCPY_RISCV_H
 
-#include <stdint.h>
-#include <string.h>
-
-#include "rte_common.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "generic/rte_memcpy.h"
-
-static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 16);
-}
-
-static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 32);
-}
-
-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 48);
-}
-
-static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 64);
-}
-
-static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 128);
-}
-
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	memcpy(dst, src, 256);
-}
-
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
-
-#ifdef __cplusplus
-}
-#endif
+#define RTE_CC_MEMCPY
+#include <generic/rte_memcpy.h>
 
 #endif /* RTE_MEMCPY_RISCV_H */
diff --git a/lib/eal/x86/include/meson.build b/lib/eal/x86/include/meson.build
index 52d2f8e969..09c2fe2485 100644
--- a/lib/eal/x86/include/meson.build
+++ b/lib/eal/x86/include/meson.build
@@ -16,6 +16,7 @@ arch_headers = files(
         'rte_spinlock.h',
         'rte_vect.h',
 )
+
 arch_indirect_headers = files(
         'rte_atomic_32.h',
         'rte_atomic_64.h',
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 42058e4a3f..2d9f5954f1 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -11,6 +11,13 @@
  * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
  */
 
+#ifdef RTE_USE_CC_MEMCPY
+
+#define RTE_CC_MEMCPY
+#include <generic/rte_memcpy.h>
+
+#else
+
 #include <stdio.h>
 #include <stdint.h>
 #include <string.h>
@@ -767,4 +774,6 @@ rte_memcpy(void *dst, const void *src, size_t n)
 }
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_X86_64_H_ */
diff --git a/meson_options.txt b/meson_options.txt
index e49b2fc089..69a01f6578 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -60,3 +60,5 @@ option('tests', type: 'boolean', value: true, description:
        'build unit tests')
 option('use_hpet', type: 'boolean', value: false, description:
        'use HPET timer in EAL')
+option('use_cc_memcpy', type: 'boolean', value: false, description:
+       'Have the functions of <rte_memcpy.h> delegate to compiler/libc memcpy() instead of using custom implementation.')
-- 
2.43.0


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v6 6/7] ci: test compiler memcpy
  2024-09-20 10:27                         ` [PATCH v6 0/7] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                             ` (4 preceding siblings ...)
  2024-09-20 10:27                           ` [PATCH v6 5/7] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
@ 2024-09-20 10:27                           ` Mattias Rönnblom
  2024-10-04  7:56                             ` David Marchand
  2024-10-09 21:04                             ` Morten Brørup
  2024-09-20 10:27                           ` [PATCH v6 7/7] vhost: optimize memcpy routines when cc memcpy is used Mattias Rönnblom
  6 siblings, 2 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-09-20 10:27 UTC (permalink / raw)
  To: dev
  Cc: Mattias Rönnblom, Morten Brørup, Stephen Hemminger,
	David Marchand, Pavan Nikhilesh, Bruce Richardson,
	Mattias Rönnblom

Add compilation tests for the use_cc_memcpy build option.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 .ci/linux-build.sh            | 5 +++++
 .github/workflows/build.yml   | 7 +++++++
 devtools/test-meson-builds.sh | 4 +++-
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/.ci/linux-build.sh b/.ci/linux-build.sh
index 15ed51e4c1..a873f83d09 100755
--- a/.ci/linux-build.sh
+++ b/.ci/linux-build.sh
@@ -98,6 +98,11 @@ if [ "$STDATOMIC" = "true" ]; then
 else
 	OPTS="$OPTS -Dcheck_includes=true"
 fi
+if [ "$CCMEMCPY" = "true" ]; then
+	OPTS="$OPTS -Duse_cc_memcpy=true"
+else
+	OPTS="$OPTS -Duse_cc_memcpy=true"
+fi
 if [ "$MINI" = "true" ]; then
     OPTS="$OPTS -Denable_drivers=net/null"
     OPTS="$OPTS -Ddisable_libs=*"
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index f7d3affbaa..8c52864294 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -31,6 +31,7 @@ jobs:
       RISCV64: ${{ matrix.config.cross == 'riscv64' }}
       RUN_TESTS: ${{ contains(matrix.config.checks, 'tests') }}
       STDATOMIC: ${{ contains(matrix.config.checks, 'stdatomic') }}
+      CCMEMCPY: ${{ contains(matrix.config.checks, 'ccmemcpy') }}
 
     strategy:
       fail-fast: false
@@ -45,6 +46,12 @@ jobs:
           - os: ubuntu-22.04
             compiler: clang
             checks: stdatomic
+          - os: ubuntu-22.04
+            compiler: gcc
+            checks: ccmemcpy
+          - os: ubuntu-22.04
+            compiler: clang
+            checks: ccmemcpy
           - os: ubuntu-22.04
             compiler: gcc
             checks: debug+doc+examples+tests
diff --git a/devtools/test-meson-builds.sh b/devtools/test-meson-builds.sh
index d71bb1ded0..e72146be3b 100755
--- a/devtools/test-meson-builds.sh
+++ b/devtools/test-meson-builds.sh
@@ -228,12 +228,14 @@ for c in gcc clang ; do
 		if [ $s = shared ] ; then
 			abicheck=ABI
 			stdatomic=-Denable_stdatomic=true
+			ccmemcpy=-Duse_cc_memcpy=true
 		else
 			abicheck=skipABI # save time and disk space
 			stdatomic=-Denable_stdatomic=false
+			ccmemcpy=-Duse_cc_memcpy=false
 		fi
 		export CC="$CCACHE $c"
-		build build-$c-$s $c $abicheck $stdatomic --default-library=$s
+		build build-$c-$s $c $abicheck $stdatomic $ccmemcpy --default-library=$s
 		unset CC
 	done
 done
-- 
2.43.0


^ permalink raw reply	[flat|nested] 128+ messages in thread

* [PATCH v6 7/7] vhost: optimize memcpy routines when cc memcpy is used
  2024-09-20 10:27                         ` [PATCH v6 0/7] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
                                             ` (5 preceding siblings ...)
  2024-09-20 10:27                           ` [PATCH v6 6/7] ci: test compiler memcpy Mattias Rönnblom
@ 2024-09-20 10:27                           ` Mattias Rönnblom
  2024-10-03 11:46                             ` Maxime Coquelin
                                               ` (2 more replies)
  6 siblings, 3 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-09-20 10:27 UTC (permalink / raw)
  To: dev
  Cc: Mattias Rönnblom, Morten Brørup, Stephen Hemminger,
	David Marchand, Pavan Nikhilesh, Bruce Richardson,
	Mattias Rönnblom

In builds where use_cc_memcpy is set to true, the vhost user PMD
suffers a large performance drop on Intel P-cores for small packets,
at least when built by GCC and (to a much lesser extent) clang.

This patch addresses that issue by using a custom virtio
memcpy()-based packet copying routine.

Performance results from a Raptor Lake @ 3,2 GHz:

GCC 12.3.0
64 bytes packets
Core  Mode              Mpps
E     RTE memcpy        9.5
E     cc memcpy         9.7
E     cc memcpy+pktcpy  9.0

P     RTE memcpy        16.4
P     cc memcpy         13.5
P     cc memcpy+pktcpy  16.2

GCC 12.3.0
1500 bytes packets
Core  Mode              Mpps
P    RTE memcpy         5.8
P    cc memcpy          5.9
P    cc memcpy+pktcpy   5.9

clang 15.0.7
64 bytes packets
Core  Mode              Mpps
P     RTE memcpy        13.3
P     cc memcpy         12.9
P     cc memcpy+pktcpy  13.9

"RTE memcpy" is use_cc_memcpy=false, "cc memcpy" is use_cc_memcpy=true
and "pktcpy" is when this patch is applied.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 lib/vhost/virtio_net.c | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 370402d849..63571587a8 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -231,6 +231,39 @@ vhost_async_dma_check_completed(struct virtio_net *dev, int16_t dma_id, uint16_t
 	return nr_copies;
 }
 
+/* The code generated by GCC (and to a lesser extent, clang) with just
+ * a straight memcpy() to copy packets is less than optimal on Intel
+ * P-cores, for small packets. Thus the need of this specialized
+ * memcpy() in builds where use_cc_memcpy is set to true.
+ */
+#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
+static __rte_always_inline void
+pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
+{
+	void *dst = __builtin_assume_aligned(in_dst, 16);
+	const void *src = __builtin_assume_aligned(in_src, 16);
+
+	if (len <= 256) {
+		size_t left;
+
+		for (left = len; left >= 32; left -= 32) {
+			memcpy(dst, src, 32);
+			dst = RTE_PTR_ADD(dst, 32);
+			src = RTE_PTR_ADD(src, 32);
+		}
+
+		memcpy(dst, src, left);
+	} else
+		memcpy(dst, src, len);
+}
+#else
+static __rte_always_inline void
+pktcpy(void *dst, const void *src, size_t len)
+{
+	rte_memcpy(dst, src, len);
+}
+#endif
+
 static inline void
 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
 	__rte_shared_locks_required(&vq->iotlb_lock)
@@ -240,7 +273,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
 	int i;
 
 	for (i = 0; i < count; i++) {
-		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
+		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
 		vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
 					   elem[i].len);
 		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
@@ -257,7 +290,7 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq)
 	int i;
 
 	for (i = 0; i < count; i++)
-		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
+		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
 
 	vq->batch_copy_nb_elems = 0;
 }
-- 
2.43.0


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v6 7/7] vhost: optimize memcpy routines when cc memcpy is used
  2024-09-20 10:27                           ` [PATCH v6 7/7] vhost: optimize memcpy routines when cc memcpy is used Mattias Rönnblom
@ 2024-10-03 11:46                             ` Maxime Coquelin
  2024-10-09 21:25                             ` Morten Brørup
  2024-10-09 21:57                             ` Stephen Hemminger
  2 siblings, 0 replies; 128+ messages in thread
From: Maxime Coquelin @ 2024-10-03 11:46 UTC (permalink / raw)
  To: Mattias Rönnblom, dev
  Cc: Mattias Rönnblom, Morten Brørup, Stephen Hemminger,
	David Marchand, Pavan Nikhilesh, Bruce Richardson



On 9/20/24 12:27, Mattias Rönnblom wrote:
> In build where use_cc_memcpy is set to true, the vhost user PMD
> suffers a large performance drop on Intel P-cores for small packets,
> at least when built by GCC and (to a much lesser extent) clang.
> 
> This patch addresses that issue by using a custom virtio
> memcpy()-based packet copying routine.
> 
> Performance results from a Raptor Lake @ 3,2 GHz:
> 
> GCC 12.3.0
> 64 bytes packets
> Core  Mode              Mpps
> E     RTE memcpy        9.5
> E     cc memcpy         9.7
> E     cc memcpy+pktcpy  9.0
> 
> P     RTE memcpy        16.4
> P     cc memcpy         13.5
> P     cc memcpy+pktcpy  16.2
> 
> GCC 12.3.0
> 1500 bytes packets
> Core  Mode              Mpps
> P    RTE memcpy         5.8
> P    cc memcpy          5.9
> P    cc memcpy+pktcpy   5.9
> 
> clang 15.0.7
> 64 bytes packets
> Core  Mode              Mpps
> P     RTE memcpy        13.3
> P     cc memcpy         12.9
> P     cc memcpy+pktcpy  13.9
> 
> "RTE memcpy" is use_cc_memcpy=false, "cc memcpy" is use_cc_memcpy=true
> and "pktcpy" is when this patch is applied.
> 
> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> ---
>   lib/vhost/virtio_net.c | 37 +++++++++++++++++++++++++++++++++++--
>   1 file changed, 35 insertions(+), 2 deletions(-)
> 

As the default behaviour remains unchanged, this is good to me:

Acked-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v6 5/7] eal: provide option to use compiler memcpy instead of RTE
  2024-09-20 10:27                           ` [PATCH v6 5/7] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
@ 2024-10-04  7:52                             ` David Marchand
  2024-10-04  9:21                               ` Mattias Rönnblom
  2024-10-04  9:27                               ` Mattias Rönnblom
  0 siblings, 2 replies; 128+ messages in thread
From: David Marchand @ 2024-10-04  7:52 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: dev, Mattias Rönnblom, Morten Brørup,
	Stephen Hemminger, Pavan Nikhilesh, Bruce Richardson

On Fri, Sep 20, 2024 at 12:36 PM Mattias Rönnblom
<mattias.ronnblom@ericsson.com> wrote:
>
> Provide build option to have functions in <rte_memcpy.h> delegate to
> the standard compiler/libc memcpy(), instead of using the various
> custom DPDK, handcrafted, per-architecture rte_memcpy()
> implementations.
>
> A new meson build option 'use_cc_memcpy' is added. By default, the
> traditional, custom DPDK rte_memcpy() implementation is used.
>
> The performance benefits of the custom DPDK rte_memcpy()
> implementations have been diminishing with every compiler release, and
> with current toolchains the use of a custom memcpy() implementation
> may even be a liability.
>
> An additional benefit of this change is that compilers and static
> analysis tools have an easier time detecting incorrect usage of
> rte_memcpy() (e.g., buffer overruns, or overlapping source and
> destination buffers).
>
> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>

I like this patch and the direction we are taking: stop reinventing
memcpy and rely on the compiler to optimize it.

I have some comments on the implementation.

- When I split the headers in the early days of DPDK, the intention
with arch-specific headers in EAL was to have them include the generic
one, in all cases.
It seems that, over time, x86 rte_memcpy.h (at least) deviated from
this and stopped including generic/rte_memcpy.h...

So in this current patch, I expect every arch-specific header to first
include generic/rte_memcpy.h, regardless of any arch-specific define
coming from the configuration.

An additional note on this: ARM32 and ARM64 have their own
implementations in rte_memcpy_32.h and rte_memcpy_64.h respectively, and
I would check RTE_USE_CC_MEMCPY in each of them rather than at the top
level, as ARM32 and ARM64 are like two different arches.


- Now, looking at what was available for arches so far in DPDK:
* ARM was relying by default on compiler implementation, with specific
implementations for ARM32 and ARM64 available (see for more details
below) => possible values (default first) RTE_USE_CC_MEMCPY = true /
false
* loongarch was relying on compiler implementation, with no specific
implementations, => RTE_USE_CC_MEMCPY = true
* ppc was relying on arch specific implementation, => RTE_USE_CC_MEMCPY = false
* risc was relying on compiler implementation, with no specific
implementations, => RTE_USE_CC_MEMCPY = true
* x86 was relying on arch specific implementation, => RTE_USE_CC_MEMCPY = false

We can't get a unified default value for a meson option and keep
compat for all arches (except maybe by introducing an "auto" value).

Plus, disabling RTE_USE_CC_MEMCPY on loongarch and risc makes no
sense, as there was never a specific implementation.

My suggestion is to drop the meson option and instead just set
RTE_USE_CC_MEMCPY in config/$arch/meson.build.
Testers / interested users may edit config/$arch/meson.build on their own.


- Additionally, ARM people have introduced arch-specific
implementation config options for memcpy in ARM32 resp. ARM64:
RTE_ARCH_ARM_NEON_MEMCPY resp. RTE_ARCH_ARM64_MEMCPY.
RTE_USE_CC_MEMCPY can replace those two options (we may keep some
compat in case someone relied on those defines for arm).
That removes the need for a RTE_CC_MEMCPY define.

More comments below:

[snip]

> diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
> index 0ff70d9057..8be000294d 100644
> --- a/doc/guides/rel_notes/release_24_11.rst
> +++ b/doc/guides/rel_notes/release_24_11.rst
> @@ -55,6 +55,26 @@ New Features
>       Also, make sure to start the actual text at the margin.
>       =======================================================
>
> +* **Compiler memcpy replaces custom DPDK implementation.**
> +
> +  The memory copy functions of ``<rte_memcpy.h>`` now optionally
> +  delegates to the standard memcpy() function, implemented by the
> +  compiler and the C runtime (e.g., libc).
> +
> +  In this release of DPDK, the handcrafted, per-architecture memory
> +  copy implementations are still the default. Compiler memcpy is
> +  enabled by setting the new ``use_cc_memcpy`` build option to true.
> +
> +  The performance benefits of the custom DPDK rte_memcpy()
> +  implementations have been diminishing with every new compiler
> +  release, and with current toolchains the use of a custom memcpy()
> +  implementation may even result in worse performance than the
> +  standard memcpy().
> +
> +  An additional benefit of using compiler memcpy is that compilers and
> +  static analysis tools have an easier time detecting incorrect usage
> +  of rte_memcpy() (e.g., buffer overruns, or overlapping source and
> +  destination buffers).

As explained in the RN comments, an entry should use the form:

   * **Add a title in the past tense with a full stop.**

     Add a short 1-2 sentence description in the past tense.
     The description should be enough to allow someone scanning
     the release notes to understand the new feature.

It seems this note is a copy/paste of the commit log, please adjust
the title and make the description shorter.

>
>  Removed Items
>  -------------

[snip]

> diff --git a/lib/eal/include/generic/rte_memcpy.h b/lib/eal/include/generic/rte_memcpy.h
> index e7f0f8eaa9..cfb0175bd2 100644
> --- a/lib/eal/include/generic/rte_memcpy.h
> +++ b/lib/eal/include/generic/rte_memcpy.h
> @@ -5,12 +5,19 @@
>  #ifndef _RTE_MEMCPY_H_
>  #define _RTE_MEMCPY_H_
>
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
>  /**
>   * @file
>   *
>   * Functions for vectorised implementation of memcpy().
>   */
>
> +#include <stdint.h>
> +#include <string.h>

I don't think those includes should go in an extern "C" { block.

> +
>  /**
>   * Copy 16 bytes from one location to another using optimised
>   * instructions. The locations should not overlap.
> @@ -35,8 +42,6 @@ rte_mov16(uint8_t *dst, const uint8_t *src);
>  static inline void
>  rte_mov32(uint8_t *dst, const uint8_t *src);
>
> -#ifdef __DOXYGEN__
> -

This strange check was added as not all architectures provide
rte_mov48 (/me slaps Adrien and Thomas).
I think the CI reported no issue because of a problem in the next
patch where all that is tested is RTE_USE_CC_MEMCPY = true
combination.

Still, the overall goal of this work is to drop the whole rte_memcpy
thing in the future, so I think we can live with this #ifdef
__DOXYGEN__ nonsense hiding the absence of rte_mov48 in x86...


>  /**
>   * Copy 48 bytes from one location to another using optimised
>   * instructions. The locations should not overlap.
> @@ -49,8 +54,6 @@ rte_mov32(uint8_t *dst, const uint8_t *src);
>  static inline void
>  rte_mov48(uint8_t *dst, const uint8_t *src);
>
> -#endif /* __DOXYGEN__ */
> -
>  /**
>   * Copy 64 bytes from one location to another using optimised
>   * instructions. The locations should not overlap.
> @@ -87,8 +90,6 @@ rte_mov128(uint8_t *dst, const uint8_t *src);
>  static inline void
>  rte_mov256(uint8_t *dst, const uint8_t *src);
>
> -#ifdef __DOXYGEN__
> -
>  /**
>   * Copy bytes from one location to another. The locations must not overlap.
>   *
> @@ -111,6 +112,52 @@ rte_mov256(uint8_t *dst, const uint8_t *src);
>  static void *
>  rte_memcpy(void *dst, const void *src, size_t n);
>
> -#endif /* __DOXYGEN__ */

Removing this DOXYGEN here should be ok.
CI will tell us.


> diff --git a/lib/eal/x86/include/meson.build b/lib/eal/x86/include/meson.build
> index 52d2f8e969..09c2fe2485 100644
> --- a/lib/eal/x86/include/meson.build
> +++ b/lib/eal/x86/include/meson.build
> @@ -16,6 +16,7 @@ arch_headers = files(
>          'rte_spinlock.h',
>          'rte_vect.h',
>  )
> +

Unrelated change.


>  arch_indirect_headers = files(
>          'rte_atomic_32.h',
>          'rte_atomic_64.h',


-- 
David Marchand


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v6 6/7] ci: test compiler memcpy
  2024-09-20 10:27                           ` [PATCH v6 6/7] ci: test compiler memcpy Mattias Rönnblom
@ 2024-10-04  7:56                             ` David Marchand
  2024-10-09 21:04                             ` Morten Brørup
  1 sibling, 0 replies; 128+ messages in thread
From: David Marchand @ 2024-10-04  7:56 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: dev, Mattias Rönnblom, Morten Brørup,
	Stephen Hemminger, Pavan Nikhilesh, Bruce Richardson

On Fri, Sep 20, 2024 at 12:36 PM Mattias Rönnblom
<mattias.ronnblom@ericsson.com> wrote:
>
> Add compilation tests for the use_cc_memcpy build option.
>
> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> ---
>  .ci/linux-build.sh            | 5 +++++
>  .github/workflows/build.yml   | 7 +++++++
>  devtools/test-meson-builds.sh | 4 +++-
>  3 files changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/.ci/linux-build.sh b/.ci/linux-build.sh
> index 15ed51e4c1..a873f83d09 100755
> --- a/.ci/linux-build.sh
> +++ b/.ci/linux-build.sh
> @@ -98,6 +98,11 @@ if [ "$STDATOMIC" = "true" ]; then
>  else
>         OPTS="$OPTS -Dcheck_includes=true"
>  fi
> +if [ "$CCMEMCPY" = "true" ]; then
> +       OPTS="$OPTS -Duse_cc_memcpy=true"
> +else
> +       OPTS="$OPTS -Duse_cc_memcpy=true"
> +fi

^^
We only test the true value here.

+       OPTS="$OPTS -Duse_cc_memcpy=$CCMEMCPY"

>  if [ "$MINI" = "true" ]; then
>      OPTS="$OPTS -Denable_drivers=net/null"
>      OPTS="$OPTS -Ddisable_libs=*"


-- 
David Marchand


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v6 5/7] eal: provide option to use compiler memcpy instead of RTE
  2024-10-04  7:52                             ` David Marchand
@ 2024-10-04  9:21                               ` Mattias Rönnblom
  2024-10-04  9:54                                 ` David Marchand
  2024-10-04  9:27                               ` Mattias Rönnblom
  1 sibling, 1 reply; 128+ messages in thread
From: Mattias Rönnblom @ 2024-10-04  9:21 UTC (permalink / raw)
  To: David Marchand, Mattias Rönnblom
  Cc: dev, Morten Brørup, Stephen Hemminger, Pavan Nikhilesh,
	Bruce Richardson

On 2024-10-04 09:52, David Marchand wrote:
> On Fri, Sep 20, 2024 at 12:36 PM Mattias Rönnblom
> <mattias.ronnblom@ericsson.com> wrote:
>>
>> Provide build option to have functions in <rte_memcpy.h> delegate to
>> the standard compiler/libc memcpy(), instead of using the various
>> custom DPDK, handcrafted, per-architecture rte_memcpy()
>> implementations.
>>
>> A new meson build option 'use_cc_memcpy' is added. By default, the
>> traditional, custom DPDK rte_memcpy() implementation is used.
>>
>> The performance benefits of the custom DPDK rte_memcpy()
>> implementations have been diminishing with every compiler release, and
>> with current toolchains the use of a custom memcpy() implementation
>> may even be a liability.
>>
>> An additional benefit of this change is that compilers and static
>> analysis tools have an easier time detecting incorrect usage of
>> rte_memcpy() (e.g., buffer overruns, or overlapping source and
>> destination buffers).
>>
>> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
>> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> 
> I like this patch and the direction we are taking: stop reinvent
> memcpy and rely on compiler to optimize it.
> 
> I have some comments on the implementation.
> 
> - When I splitted headers in the early days of dpdk, the intention
> with arch-specific headers in EAL was to have them include the generic
> one, in all cases.
> It seems that, over time, x86 rte_memcpy.h (at least) deviated from
> this and stopped including generic/rte_memcpy.h...
> 
> So in this current patch, I expect every arch specific headers first
> include generic/rte_memcpy.h, regardless of any arch-specific define
> coming from the configuration.
> 
> An additional note on this, ARM32 and ARM64 have their own
> implementation in rte_memcpy_32.h resp. rte_memcpy_64.h, and I would
> check RTE_USE_CC_MEMCPY in each of them rather than in the top as
> ARM32 and ARM64 are like two different arches.
> 
> 
> - Now, looking at what was available for arches so far in DPDK:
> * ARM was relying by default on compiler implementation, with specific
> implementations for ARM32 and ARM64 available (see for more details
> below) => possible values (default first) RTE_USE_CC_MEMCPY = true /
> false
> * loongarch was relying on compiler implementation, with no specific
> implementations, => RTE_USE_CC_MEMCPY = true
> * ppc was relying on arch specific implementation, => RTE_USE_CC_MEMCPY = false
> * risc was relying on compiler implementation, with no specific
> implementations, => RTE_USE_CC_MEMCPY = true
> * x86 was relying on arch specific implementation, => RTE_USE_CC_MEMCPY = false
> 
> We can't get a unified default value for a meson option and keep
> compat for all arches (except maybe introduce a "auto" value).
> 
> Plus, disabling RTE_USE_CC_MEMCPY on loongarch and risc makes no
> sense, as there was never a specific implementation.
> 
> My suggestion is to drop the meson option and instead just set
> RTE_USE_CC_MEMCPY in config/$arch/meson.build.
> Testers / interested users may edit config/$arch/meson.build on their own.
> 

So we've gone from...

"Eliminate DPDK custom per-arch memcpy altogether"
to
"Keep custom memcpy, but make cc memcpy the default"
to
"Keep custom memcpy as the default, but make cc memcpy a build option"
to
"Keep custom memcpy as the default, and have the user modify some 
obscure build file to use cc memcpy"

It seems like the natural next step is just

"Keep the custom memcpy. Period."

If we intend to keep the custom DPDK memcpy implementations 
indefinitely, we should just provide an option to use CC memcpy on x86 
as well, just like on ARM.

That would go against the original intention of this patch set, which 
was to reduce DPDK complexity (and hopefully improve performance as 
well, on average).

> 
> - Additionnally, ARM people have introduced arch-specific
> implementation config options for memcpy in ARM32 resp. ARM64:
> RTE_ARCH_ARM_NEON_MEMCPY resp. RTE_ARCH_ARM64_MEMCPY.
> RTE_USE_CC_MEMCPY can replace those two options (we may keep some
> compat in case someone relied on those defines for arm).
> That removes the need for a RTE_CC_MEMCPY define.
> 
> More comments below:
> 
> [snip]
> 
>> diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
>> index 0ff70d9057..8be000294d 100644
>> --- a/doc/guides/rel_notes/release_24_11.rst
>> +++ b/doc/guides/rel_notes/release_24_11.rst
>> @@ -55,6 +55,26 @@ New Features
>>        Also, make sure to start the actual text at the margin.
>>        =======================================================
>>
>> +* **Compiler memcpy replaces custom DPDK implementation.**
>> +
>> +  The memory copy functions of ``<rte_memcpy.h>`` now optionally
>> +  delegates to the standard memcpy() function, implemented by the
>> +  compiler and the C runtime (e.g., libc).
>> +
>> +  In this release of DPDK, the handcrafted, per-architecture memory
>> +  copy implementations are still the default. Compiler memcpy is
>> +  enabled by setting the new ``use_cc_memcpy`` build option to true.
>> +
>> +  The performance benefits of the custom DPDK rte_memcpy()
>> +  implementations have been diminishing with every new compiler
>> +  release, and with current toolchains the use of a custom memcpy()
>> +  implementation may even result in worse performance than the
>> +  standard memcpy().
>> +
>> +  An additional benefit of using compiler memcpy is that compilers and
>> +  static analysis tools have an easier time detecting incorrect usage
>> +  of rte_memcpy() (e.g., buffer overruns, or overlapping source and
>> +  destination buffers).
> 
> As explained in the RN comments, an entry should use the form:
> 
>     * **Add a title in the past tense with a full stop.**
> 
>       Add a short 1-2 sentence description in the past tense.
>       The description should be enough to allow someone scanning
>       the release notes to understand the new feature.
> 
> It seems this note is a copy/paste of the commit log, please adjust
> the title and make the description shorter.
> 
>>
>>   Removed Items
>>   -------------
> 
> [snip]
> 
>> diff --git a/lib/eal/include/generic/rte_memcpy.h b/lib/eal/include/generic/rte_memcpy.h
>> index e7f0f8eaa9..cfb0175bd2 100644
>> --- a/lib/eal/include/generic/rte_memcpy.h
>> +++ b/lib/eal/include/generic/rte_memcpy.h
>> @@ -5,12 +5,19 @@
>>   #ifndef _RTE_MEMCPY_H_
>>   #define _RTE_MEMCPY_H_
>>
>> +#ifdef __cplusplus
>> +extern "C" {
>> +#endif
>> +
>>   /**
>>    * @file
>>    *
>>    * Functions for vectorised implementation of memcpy().
>>    */
>>
>> +#include <stdint.h>
>> +#include <string.h>
> 
> I don't think those includes should go in a extern "C" { block.
> 
>> +
>>   /**
>>    * Copy 16 bytes from one location to another using optimised
>>    * instructions. The locations should not overlap.
>> @@ -35,8 +42,6 @@ rte_mov16(uint8_t *dst, const uint8_t *src);
>>   static inline void
>>   rte_mov32(uint8_t *dst, const uint8_t *src);
>>
>> -#ifdef __DOXYGEN__
>> -
> 
> This strange check was added as not all architectures provide
> rte_mov48 (/me slaps Adrien and Thomas).
> I think the CI reported no issue because of a problem in the next
> patch where all that is tested is RTE_USE_CC_MEMCPY = true
> combination.
> 
> Still, the overall goal of this work is to drop the whole rte_memcpy
> thing in the future, so I think we can live with this #ifdef
> __DOXYGEN__ non sense hiding the absence of rte_mov48 in x86...
> 
> 
>>   /**
>>    * Copy 48 bytes from one location to another using optimised
>>    * instructions. The locations should not overlap.
>> @@ -49,8 +54,6 @@ rte_mov32(uint8_t *dst, const uint8_t *src);
>>   static inline void
>>   rte_mov48(uint8_t *dst, const uint8_t *src);
>>
>> -#endif /* __DOXYGEN__ */
>> -
>>   /**
>>    * Copy 64 bytes from one location to another using optimised
>>    * instructions. The locations should not overlap.
>> @@ -87,8 +90,6 @@ rte_mov128(uint8_t *dst, const uint8_t *src);
>>   static inline void
>>   rte_mov256(uint8_t *dst, const uint8_t *src);
>>
>> -#ifdef __DOXYGEN__
>> -
>>   /**
>>    * Copy bytes from one location to another. The locations must not overlap.
>>    *
>> @@ -111,6 +112,52 @@ rte_mov256(uint8_t *dst, const uint8_t *src);
>>   static void *
>>   rte_memcpy(void *dst, const void *src, size_t n);
>>
>> -#endif /* __DOXYGEN__ */
> 
> Removing this DOXYGEN here should be ok.
> CI will tell us.
> 
> 
>> diff --git a/lib/eal/x86/include/meson.build b/lib/eal/x86/include/meson.build
>> index 52d2f8e969..09c2fe2485 100644
>> --- a/lib/eal/x86/include/meson.build
>> +++ b/lib/eal/x86/include/meson.build
>> @@ -16,6 +16,7 @@ arch_headers = files(
>>           'rte_spinlock.h',
>>           'rte_vect.h',
>>   )
>> +
> 
> Unrelated change.
> 
> 
>>   arch_indirect_headers = files(
>>           'rte_atomic_32.h',
>>           'rte_atomic_64.h',
> 
> 


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v6 5/7] eal: provide option to use compiler memcpy instead of RTE
  2024-10-04  7:52                             ` David Marchand
  2024-10-04  9:21                               ` Mattias Rönnblom
@ 2024-10-04  9:27                               ` Mattias Rönnblom
  1 sibling, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-10-04  9:27 UTC (permalink / raw)
  To: David Marchand, Mattias Rönnblom
  Cc: dev, Morten Brørup, Stephen Hemminger, Pavan Nikhilesh,
	Bruce Richardson

On 2024-10-04 09:52, David Marchand wrote:
> On Fri, Sep 20, 2024 at 12:36 PM Mattias Rönnblom
> <mattias.ronnblom@ericsson.com> wrote:
>>
>> Provide build option to have functions in <rte_memcpy.h> delegate to
>> the standard compiler/libc memcpy(), instead of using the various
>> custom DPDK, handcrafted, per-architecture rte_memcpy()
>> implementations.
>>
>> A new meson build option 'use_cc_memcpy' is added. By default, the
>> traditional, custom DPDK rte_memcpy() implementation is used.
>>
>> The performance benefits of the custom DPDK rte_memcpy()
>> implementations have been diminishing with every compiler release, and
>> with current toolchains the use of a custom memcpy() implementation
>> may even be a liability.
>>
>> An additional benefit of this change is that compilers and static
>> analysis tools have an easier time detecting incorrect usage of
>> rte_memcpy() (e.g., buffer overruns, or overlapping source and
>> destination buffers).
>>
>> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
>> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> 
> I like this patch and the direction we are taking: stop reinvent
> memcpy and rely on compiler to optimize it.
> 
> I have some comments on the implementation.
> 
> - When I splitted headers in the early days of dpdk, the intention
> with arch-specific headers in EAL was to have them include the generic
> one, in all cases.
> It seems that, over time, x86 rte_memcpy.h (at least) deviated from
> this and stopped including generic/rte_memcpy.h...
> 
> So in this current patch, I expect every arch specific headers first
> include generic/rte_memcpy.h, regardless of any arch-specific define
> coming from the configuration.
> 
> An additional note on this, ARM32 and ARM64 have their own
> implementation in rte_memcpy_32.h resp. rte_memcpy_64.h, and I would
> check RTE_USE_CC_MEMCPY in each of them rather than in the top as
> ARM32 and ARM64 are like two different arches.
> 
> 
> - Now, looking at what was available for arches so far in DPDK:
> * ARM was relying by default on compiler implementation, with specific
> implementations for ARM32 and ARM64 available (see for more details
> below) => possible values (default first) RTE_USE_CC_MEMCPY = true /
> false
> * loongarch was relying on compiler implementation, with no specific
> implementations, => RTE_USE_CC_MEMCPY = true
> * ppc was relying on arch specific implementation, => RTE_USE_CC_MEMCPY = false
> * risc was relying on compiler implementation, with no specific
> implementations, => RTE_USE_CC_MEMCPY = true
> * x86 was relying on arch specific implementation, => RTE_USE_CC_MEMCPY = false
> 
> We can't get a unified default value for a meson option and keep
> compat for all arches (except maybe introduce a "auto" value).
> 

What if you just renamed RTE_USE_CC_MEMCPY to
RTE_ALWAYS_USE_CC_MEMCPY
RTE_FORCE_CC_MEMCPY

Then the naming would better match a scenario where cc memcpy may be the 
only option.

> Plus, disabling RTE_USE_CC_MEMCPY on loongarch and risc makes no
> sense, as there was never a specific implementation.
> 
> My suggestion is to drop the meson option and instead just set
> RTE_USE_CC_MEMCPY in config/$arch/meson.build.
> Testers / interested users may edit config/$arch/meson.build on their own.
> 
> 
> - Additionnally, ARM people have introduced arch-specific
> implementation config options for memcpy in ARM32 resp. ARM64:
> RTE_ARCH_ARM_NEON_MEMCPY resp. RTE_ARCH_ARM64_MEMCPY.
> RTE_USE_CC_MEMCPY can replace those two options (we may keep some
> compat in case someone relied on those defines for arm).
> That removes the need for a RTE_CC_MEMCPY define.
> 
> More comments below:
> 
> [snip]
> 
>> diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
>> index 0ff70d9057..8be000294d 100644
>> --- a/doc/guides/rel_notes/release_24_11.rst
>> +++ b/doc/guides/rel_notes/release_24_11.rst
>> @@ -55,6 +55,26 @@ New Features
>>        Also, make sure to start the actual text at the margin.
>>        =======================================================
>>
>> +* **Compiler memcpy replaces custom DPDK implementation.**
>> +
>> +  The memory copy functions of ``<rte_memcpy.h>`` now optionally
>> +  delegates to the standard memcpy() function, implemented by the
>> +  compiler and the C runtime (e.g., libc).
>> +
>> +  In this release of DPDK, the handcrafted, per-architecture memory
>> +  copy implementations are still the default. Compiler memcpy is
>> +  enabled by setting the new ``use_cc_memcpy`` build option to true.
>> +
>> +  The performance benefits of the custom DPDK rte_memcpy()
>> +  implementations have been diminishing with every new compiler
>> +  release, and with current toolchains the use of a custom memcpy()
>> +  implementation may even result in worse performance than the
>> +  standard memcpy().
>> +
>> +  An additional benefit of using compiler memcpy is that compilers and
>> +  static analysis tools have an easier time detecting incorrect usage
>> +  of rte_memcpy() (e.g., buffer overruns, or overlapping source and
>> +  destination buffers).
> 
> As explained in the RN comments, an entry should use the form:
> 
>     * **Add a title in the past tense with a full stop.**
> 
>       Add a short 1-2 sentence description in the past tense.
>       The description should be enough to allow someone scanning
>       the release notes to understand the new feature.
> 
> It seems this note is a copy/paste of the commit log, please adjust
> the title and make the description shorter.
> 
>>
>>   Removed Items
>>   -------------
> 
> [snip]
> 
>> diff --git a/lib/eal/include/generic/rte_memcpy.h b/lib/eal/include/generic/rte_memcpy.h
>> index e7f0f8eaa9..cfb0175bd2 100644
>> --- a/lib/eal/include/generic/rte_memcpy.h
>> +++ b/lib/eal/include/generic/rte_memcpy.h
>> @@ -5,12 +5,19 @@
>>   #ifndef _RTE_MEMCPY_H_
>>   #define _RTE_MEMCPY_H_
>>
>> +#ifdef __cplusplus
>> +extern "C" {
>> +#endif
>> +
>>   /**
>>    * @file
>>    *
>>    * Functions for vectorised implementation of memcpy().
>>    */
>>
>> +#include <stdint.h>
>> +#include <string.h>
> 
> I don't think those includes should go in a extern "C" { block.
> 
>> +
>>   /**
>>    * Copy 16 bytes from one location to another using optimised
>>    * instructions. The locations should not overlap.
>> @@ -35,8 +42,6 @@ rte_mov16(uint8_t *dst, const uint8_t *src);
>>   static inline void
>>   rte_mov32(uint8_t *dst, const uint8_t *src);
>>
>> -#ifdef __DOXYGEN__
>> -
> 
> This strange check was added as not all architectures provide
> rte_mov48 (/me slaps Adrien and Thomas).
> I think the CI reported no issue because of a problem in the next
> patch where all that is tested is RTE_USE_CC_MEMCPY = true
> combination.
> 
> Still, the overall goal of this work is to drop the whole rte_memcpy
> thing in the future, so I think we can live with this #ifdef
> __DOXYGEN__ non sense hiding the absence of rte_mov48 in x86...
> 
> 
>>   /**
>>    * Copy 48 bytes from one location to another using optimised
>>    * instructions. The locations should not overlap.
>> @@ -49,8 +54,6 @@ rte_mov32(uint8_t *dst, const uint8_t *src);
>>   static inline void
>>   rte_mov48(uint8_t *dst, const uint8_t *src);
>>
>> -#endif /* __DOXYGEN__ */
>> -
>>   /**
>>    * Copy 64 bytes from one location to another using optimised
>>    * instructions. The locations should not overlap.
>> @@ -87,8 +90,6 @@ rte_mov128(uint8_t *dst, const uint8_t *src);
>>   static inline void
>>   rte_mov256(uint8_t *dst, const uint8_t *src);
>>
>> -#ifdef __DOXYGEN__
>> -
>>   /**
>>    * Copy bytes from one location to another. The locations must not overlap.
>>    *
>> @@ -111,6 +112,52 @@ rte_mov256(uint8_t *dst, const uint8_t *src);
>>   static void *
>>   rte_memcpy(void *dst, const void *src, size_t n);
>>
>> -#endif /* __DOXYGEN__ */
> 
> Removing this DOXYGEN here should be ok.
> CI will tell us.
> 
> 
>> diff --git a/lib/eal/x86/include/meson.build b/lib/eal/x86/include/meson.build
>> index 52d2f8e969..09c2fe2485 100644
>> --- a/lib/eal/x86/include/meson.build
>> +++ b/lib/eal/x86/include/meson.build
>> @@ -16,6 +16,7 @@ arch_headers = files(
>>           'rte_spinlock.h',
>>           'rte_vect.h',
>>   )
>> +
> 
> Unrelated change.
> 
> 
>>   arch_indirect_headers = files(
>>           'rte_atomic_32.h',
>>           'rte_atomic_64.h',
> 
> 


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v6 5/7] eal: provide option to use compiler memcpy instead of RTE
  2024-10-04  9:21                               ` Mattias Rönnblom
@ 2024-10-04  9:54                                 ` David Marchand
  2024-10-04 12:07                                   ` Thomas Monjalon
  0 siblings, 1 reply; 128+ messages in thread
From: David Marchand @ 2024-10-04  9:54 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: Mattias Rönnblom, dev, Morten Brørup,
	Stephen Hemminger, Pavan Nikhilesh, Bruce Richardson,
	Thomas Monjalon

On Fri, Oct 4, 2024 at 11:21 AM Mattias Rönnblom <hofors@lysator.liu.se> wrote:
> > - Now, looking at what was available for arches so far in DPDK:
> > * ARM was relying by default on compiler implementation, with specific
> > implementations for ARM32 and ARM64 available (see for more details
> > below) => possible values (default first) RTE_USE_CC_MEMCPY = true /
> > false
> > * loongarch was relying on compiler implementation, with no specific
> > implementations, => RTE_USE_CC_MEMCPY = true
> > * ppc was relying on arch specific implementation, => RTE_USE_CC_MEMCPY = false
> > * risc was relying on compiler implementation, with no specific
> > implementations, => RTE_USE_CC_MEMCPY = true
> > * x86 was relying on arch specific implementation, => RTE_USE_CC_MEMCPY = false
> >
> > We can't get a unified default value for a meson option and keep
> > compat for all arches (except maybe introduce a "auto" value).
> >
> > Plus, disabling RTE_USE_CC_MEMCPY on loongarch and risc makes no
> > sense, as there was never a specific implementation.
> >
> > My suggestion is to drop the meson option and instead just set
> > RTE_USE_CC_MEMCPY in config/$arch/meson.build.
> > Testers / interested users may edit config/$arch/meson.build on their own.
> >
>
> So we've gone from...
>
> "Eliminate DPDK custom per-arch memcpy altogether"
> to
> "Keep custom memcpy, but make cc memcpy the default"
> to
> "Keep custom memcpy as the default, but make cc memcpy a build option"
> to
> "Keep custom memcpy as the default, and have the user modify some
> obscure build file to use cc memcpy"
>
> It seems like the natural next step is just
>
> "Keep the custom memcpy. Period."

Well, the current implementation has holes, that I tried to list so we
can move forward.

About adding a meson option, we try to have as few of them as
possible to reduce complexity.
And this is why an obscure option is probably the best so that
performance tests can be run with the compiler, without breaking
existing users.


If what I replied is irrelevant to others, well, I will let others
review _*in* *depth*_ and Thomas can merge the series.

Thanks.

-- 
David Marchand


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v6 5/7] eal: provide option to use compiler memcpy instead of RTE
  2024-10-04  9:54                                 ` David Marchand
@ 2024-10-04 12:07                                   ` Thomas Monjalon
  0 siblings, 0 replies; 128+ messages in thread
From: Thomas Monjalon @ 2024-10-04 12:07 UTC (permalink / raw)
  To: Mattias Rönnblom, Mattias Rönnblom
  Cc: dev, Morten Brørup, Stephen Hemminger, Pavan Nikhilesh,
	Bruce Richardson, David Marchand

04/10/2024 11:54, David Marchand:
> On Fri, Oct 4, 2024 at 11:21 AM Mattias Rönnblom <hofors@lysator.liu.se> wrote:
> > > - Now, looking at what was available for arches so far in DPDK:
> > > * ARM was relying by default on compiler implementation, with specific
> > > implementations for ARM32 and ARM64 available (see for more details
> > > below) => possible values (default first) RTE_USE_CC_MEMCPY = true /
> > > false
> > > * loongarch was relying on compiler implementation, with no specific
> > > implementations, => RTE_USE_CC_MEMCPY = true
> > > * ppc was relying on arch specific implementation, => RTE_USE_CC_MEMCPY = false
> > > * risc was relying on compiler implementation, with no specific
> > > implementations, => RTE_USE_CC_MEMCPY = true
> > > * x86 was relying on arch specific implementation, => RTE_USE_CC_MEMCPY = false
> > >
> > > We can't get a unified default value for a meson option and keep
> > > compat for all arches (except maybe introduce a "auto" value).
> > >
> > > Plus, disabling RTE_USE_CC_MEMCPY on loongarch and risc makes no
> > > sense, as there was never a specific implementation.
> > >
> > > My suggestion is to drop the meson option and instead just set
> > > RTE_USE_CC_MEMCPY in config/$arch/meson.build.
> > > Testers / interested users may edit config/$arch/meson.build on their own.
> > >
> >
> > So we've gone from...
> >
> > "Eliminate DPDK custom per-arch memcpy altogether"
> > to
> > "Keep custom memcpy, but make cc memcpy the default"
> > to
> > "Keep custom memcpy as the default, but make cc memcpy a build option"
> > to
> > "Keep custom memcpy as the default, and have the user modify some
> > obscure build file to use cc memcpy"
> >
> > It seems like the natural next step is just
> >
> > "Keep the custom memcpy. Period."
> 
> Well, the current implementation has holes, that I tried to list so we
> can move forward.
> 
> About adding a meson option, we try to have as few of them as
> possible to reduce complexity.
> And this is why an obscure option is probably the best so that
> performance tests can be run with the compiler, without breaking
> existing users.
> 
> 
> If what I replied is irrelevant to others, well, I will let others
> review _*in* *depth*_ and Thomas can merge the series.

No I won't merge it as is.

Mattias, please take a breath and think again about the comments David made.

We support the move to CC memcpy.
The CC memcpy was already used for some arches.
We agree having it as an option for all arches is a good step forward.
Having a compilation define to do some benchmark on any arch is good.
When it becomes good enough, we will enable it on more arches by default.

I think it looks very reasonable, we just ask you to prepare the future
without breaking anything for now.
Then it will take only very simple patches to enable it on more arches.
And because we want CC memcpy to become the only option,
it does not make sense to introduce a new option for the user.



^ permalink raw reply	[flat|nested] 128+ messages in thread

* RE: [PATCH v6 1/7] event/dlb2: include headers for vector and memory copy APIs
  2024-09-20 10:27                           ` [PATCH v6 1/7] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
@ 2024-10-09 20:59                             ` Morten Brørup
  2024-10-09 22:01                             ` Stephen Hemminger
  1 sibling, 0 replies; 128+ messages in thread
From: Morten Brørup @ 2024-10-09 20:59 UTC (permalink / raw)
  To: Mattias Rönnblom, dev
  Cc: Mattias Rönnblom, Stephen Hemminger, David Marchand,
	Pavan Nikhilesh, Bruce Richardson

Acked-by: Morten Brørup <mb@smartsharesystems.com>



^ permalink raw reply	[flat|nested] 128+ messages in thread

* RE: [PATCH v6 2/7] net/octeon_ep: add missing vector API header include
  2024-09-20 10:27                           ` [PATCH v6 2/7] net/octeon_ep: add missing vector API header include Mattias Rönnblom
@ 2024-10-09 21:00                             ` Morten Brørup
  0 siblings, 0 replies; 128+ messages in thread
From: Morten Brørup @ 2024-10-09 21:00 UTC (permalink / raw)
  To: Mattias Rönnblom, dev
  Cc: Mattias Rönnblom, Stephen Hemminger, David Marchand,
	Pavan Nikhilesh, Bruce Richardson

Acked-by: Morten Brørup <mb@smartsharesystems.com>


^ permalink raw reply	[flat|nested] 128+ messages in thread

* RE: [PATCH v6 3/7] distributor: add missing vector API header include
  2024-09-20 10:27                           ` [PATCH v6 3/7] distributor: " Mattias Rönnblom
@ 2024-10-09 21:00                             ` Morten Brørup
  0 siblings, 0 replies; 128+ messages in thread
From: Morten Brørup @ 2024-10-09 21:00 UTC (permalink / raw)
  To: Mattias Rönnblom, dev
  Cc: Mattias Rönnblom, Stephen Hemminger, David Marchand,
	Pavan Nikhilesh, Bruce Richardson

Acked-by: Morten Brørup <mb@smartsharesystems.com>


^ permalink raw reply	[flat|nested] 128+ messages in thread

* RE: [PATCH v6 4/7] fib: add missing vector API header include
  2024-09-20 10:27                           ` [PATCH v6 4/7] fib: " Mattias Rönnblom
@ 2024-10-09 21:00                             ` Morten Brørup
  0 siblings, 0 replies; 128+ messages in thread
From: Morten Brørup @ 2024-10-09 21:00 UTC (permalink / raw)
  To: Mattias Rönnblom, dev
  Cc: Mattias Rönnblom, Stephen Hemminger, David Marchand,
	Pavan Nikhilesh, Bruce Richardson

Acked-by: Morten Brørup <mb@smartsharesystems.com>


^ permalink raw reply	[flat|nested] 128+ messages in thread

* RE: [PATCH v6 6/7] ci: test compiler memcpy
  2024-09-20 10:27                           ` [PATCH v6 6/7] ci: test compiler memcpy Mattias Rönnblom
  2024-10-04  7:56                             ` David Marchand
@ 2024-10-09 21:04                             ` Morten Brørup
  1 sibling, 0 replies; 128+ messages in thread
From: Morten Brørup @ 2024-10-09 21:04 UTC (permalink / raw)
  To: Mattias Rönnblom, dev
  Cc: Mattias Rönnblom, Stephen Hemminger, David Marchand,
	Pavan Nikhilesh, Bruce Richardson

> From: Mattias Rönnblom [mailto:mattias.ronnblom@ericsson.com]
> Sent: Friday, 20 September 2024 12.27


> +if [ "$CCMEMCPY" = "true" ]; then
> +	OPTS="$OPTS -Duse_cc_memcpy=true"
> +else
> +	OPTS="$OPTS -Duse_cc_memcpy=true"
> +fi

With the bug (only testing true) found by David fixed,
Acked-by: Morten Brørup <mb@smartsharesystems.com>


^ permalink raw reply	[flat|nested] 128+ messages in thread

* RE: [PATCH v6 7/7] vhost: optimize memcpy routines when cc memcpy is used
  2024-09-20 10:27                           ` [PATCH v6 7/7] vhost: optimize memcpy routines when cc memcpy is used Mattias Rönnblom
  2024-10-03 11:46                             ` Maxime Coquelin
@ 2024-10-09 21:25                             ` Morten Brørup
  2024-10-10 10:29                               ` Mattias Rönnblom
  2024-10-09 21:57                             ` Stephen Hemminger
  2 siblings, 1 reply; 128+ messages in thread
From: Morten Brørup @ 2024-10-09 21:25 UTC (permalink / raw)
  To: Mattias Rönnblom, dev, maxime.coquelin
  Cc: Mattias Rönnblom, Stephen Hemminger, David Marchand,
	Pavan Nikhilesh, Bruce Richardson

> +#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
> +static __rte_always_inline void
> +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
> +{

A comment describing why batch_copy_elem.dst and src point to 16 byte aligned data would be nice.

> +	void *dst = __builtin_assume_aligned(in_dst, 16);
> +	const void *src = __builtin_assume_aligned(in_src, 16);
> +
> +	if (len <= 256) {
> +		size_t left;
> +
> +		for (left = len; left >= 32; left -= 32) {
> +			memcpy(dst, src, 32);
> +			dst = RTE_PTR_ADD(dst, 32);
> +			src = RTE_PTR_ADD(src, 32);
> +		}
> +
> +		memcpy(dst, src, left);
> +	} else
> +		memcpy(dst, src, len);
> +}
> +#else
> +static __rte_always_inline void
> +pktcpy(void *dst, const void *src, size_t len)
> +{
> +	rte_memcpy(dst, src, len);
> +}
> +#endif
> +
>  static inline void
>  do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue
> *vq)
>  	__rte_shared_locks_required(&vq->iotlb_lock)
> @@ -240,7 +273,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct
> vhost_virtqueue *vq)
>  	int i;
> 
>  	for (i = 0; i < count; i++) {
> -		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
> +		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
>  		vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
>  					   elem[i].len);
>  		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
> @@ -257,7 +290,7 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq)
>  	int i;
> 
>  	for (i = 0; i < count; i++)
> -		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
> +		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
> 
>  	vq->batch_copy_nb_elems = 0;
>  }
> --
> 2.43.0

Anyway,
Acked-by: Morten Brørup <mb@smartsharesystems.com>


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v6 7/7] vhost: optimize memcpy routines when cc memcpy is used
  2024-09-20 10:27                           ` [PATCH v6 7/7] vhost: optimize memcpy routines when cc memcpy is used Mattias Rönnblom
  2024-10-03 11:46                             ` Maxime Coquelin
  2024-10-09 21:25                             ` Morten Brørup
@ 2024-10-09 21:57                             ` Stephen Hemminger
  2024-10-10 10:35                               ` Mattias Rönnblom
  2 siblings, 1 reply; 128+ messages in thread
From: Stephen Hemminger @ 2024-10-09 21:57 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: dev, Mattias Rönnblom, Morten Brørup, David Marchand,
	Pavan Nikhilesh, Bruce Richardson

On Fri, 20 Sep 2024 12:27:16 +0200
Mattias Rönnblom <mattias.ronnblom@ericsson.com> wrote:

> +#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
> +static __rte_always_inline void
> +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
> +{
> +	void *dst = __builtin_assume_aligned(in_dst, 16);
> +	const void *src = __builtin_assume_aligned(in_src, 16);

Not sure if buffer is really aligned that way but x86 doesn't care.

Since src and dst can be pointers into mbuf at an offset.
The offset will be a multiple of the buffer len.

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v6 1/7] event/dlb2: include headers for vector and memory copy APIs
  2024-09-20 10:27                           ` [PATCH v6 1/7] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
  2024-10-09 20:59                             ` Morten Brørup
@ 2024-10-09 22:01                             ` Stephen Hemminger
  1 sibling, 0 replies; 128+ messages in thread
From: Stephen Hemminger @ 2024-10-09 22:01 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: dev, Mattias Rönnblom, Morten Brørup, David Marchand,
	Pavan Nikhilesh, Bruce Richardson

On Fri, 20 Sep 2024 12:27:10 +0200
Mattias Rönnblom <mattias.ronnblom@ericsson.com> wrote:

> The DLB2 PMD depended on <rte_vect.h> being included as a side-effect
> of <rte_memcpy.h> being included.
> 
> In addition, DLB2 used rte_memcpy() but did not include <rte_memcpy.h>,
> but rather depended on other include files to do so.
> 
> This patch addresses both of those issues.
> 
> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>

Have to ask, why is it using rte_memcpy when it could just use assignment?
Assignment is better since it keeps type safety.

diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index c43ab864ca..51870486ed 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -1668,7 +1668,7 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 	qm_port->issued_releases = 0;
 
 	/* Save config message too. */
-	rte_memcpy(&qm_port->cfg.ldb, &cfg, sizeof(qm_port->cfg.ldb));
+	qm_port->cfg.ldb = cfg;
 
 	/* update state */
 	qm_port->state = PORT_STARTED; /* enabled at create time */
@@ -1869,7 +1869,7 @@ dlb2_hw_create_dir_port(struct dlb2_eventdev *dlb2,
 	qm_port->issued_releases = 0;
 
 	/* Save config message too. */
-	rte_memcpy(&qm_port->cfg.dir, &cfg, sizeof(qm_port->cfg.dir));
+	qm_port->cfg.dir = cfg;
 
 	/* update state */
 	qm_port->state = PORT_STARTED; /* enabled at create time */

^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v6 7/7] vhost: optimize memcpy routines when cc memcpy is used
  2024-10-09 21:25                             ` Morten Brørup
@ 2024-10-10 10:29                               ` Mattias Rönnblom
  0 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-10-10 10:29 UTC (permalink / raw)
  To: Morten Brørup, Mattias Rönnblom, dev, maxime.coquelin
  Cc: Stephen Hemminger, David Marchand, Pavan Nikhilesh, Bruce Richardson

On 2024-10-09 23:25, Morten Brørup wrote:
>> +#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
>> +static __rte_always_inline void
>> +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
>> +{
> 
> A comment describing why batch_copy_elem.dst and src point to 16 byte aligned data would be nice.
> 

Good point. As I think I mentioned at some point, I'm not sure they are.

From what I recall, having (or pretending) the data is 16-byte aligned 
does give a noticeable performance increase on x86_64.

Is this something I should look into for 24.11, or is this patch set not 
going to make it anyway?

>> +	void *dst = __builtin_assume_aligned(in_dst, 16);
>> +	const void *src = __builtin_assume_aligned(in_src, 16);
>> +
>> +	if (len <= 256) {
>> +		size_t left;
>> +
>> +		for (left = len; left >= 32; left -= 32) {
>> +			memcpy(dst, src, 32);
>> +			dst = RTE_PTR_ADD(dst, 32);
>> +			src = RTE_PTR_ADD(src, 32);
>> +		}
>> +
>> +		memcpy(dst, src, left);
>> +	} else
>> +		memcpy(dst, src, len);
>> +}
>> +#else
>> +static __rte_always_inline void
>> +pktcpy(void *dst, const void *src, size_t len)
>> +{
>> +	rte_memcpy(dst, src, len);
>> +}
>> +#endif
>> +
>>   static inline void
>>   do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue
>> *vq)
>>   	__rte_shared_locks_required(&vq->iotlb_lock)
>> @@ -240,7 +273,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct
>> vhost_virtqueue *vq)
>>   	int i;
>>
>>   	for (i = 0; i < count; i++) {
>> -		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
>> +		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
>>   		vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
>>   					   elem[i].len);
>>   		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
>> @@ -257,7 +290,7 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq)
>>   	int i;
>>
>>   	for (i = 0; i < count; i++)
>> -		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
>> +		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
>>
>>   	vq->batch_copy_nb_elems = 0;
>>   }
>> --
>> 2.43.0
> 
> Anyway,
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> 


^ permalink raw reply	[flat|nested] 128+ messages in thread

* Re: [PATCH v6 7/7] vhost: optimize memcpy routines when cc memcpy is used
  2024-10-09 21:57                             ` Stephen Hemminger
@ 2024-10-10 10:35                               ` Mattias Rönnblom
  0 siblings, 0 replies; 128+ messages in thread
From: Mattias Rönnblom @ 2024-10-10 10:35 UTC (permalink / raw)
  To: Stephen Hemminger, Mattias Rönnblom
  Cc: dev, Morten Brørup, David Marchand, Pavan Nikhilesh,
	Bruce Richardson

On 2024-10-09 23:57, Stephen Hemminger wrote:
> On Fri, 20 Sep 2024 12:27:16 +0200
> Mattias Rönnblom <mattias.ronnblom@ericsson.com> wrote:
> 
>> +#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
>> +static __rte_always_inline void
>> +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
>> +{
>> +	void *dst = __builtin_assume_aligned(in_dst, 16);
>> +	const void *src = __builtin_assume_aligned(in_src, 16);
> 
> Not sure if buffer is really aligned that way but x86 doesn't care.
> 

I think it might care, actually. That's why this makes a difference. 
With 16-byte alignment assumed, the compiler may use MOVDQA, otherwise, 
it can't and must use MOVDQU. Generally these things don't matter from 
a performance point of view in my experience, but in this case it did 
(in my benchmark, on my CPU, with my compiler etc).

> Since src and dst can be pointers into mbuf at an offset.
> The offset will be a multiple of the buffer len.


^ permalink raw reply	[flat|nested] 128+ messages in thread

end of thread, other threads:[~2024-10-10 10:35 UTC | newest]

Thread overview: 128+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-27 11:11 [RFC] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
2024-05-28  7:43 ` [RFC v2] " Mattias Rönnblom
2024-05-28  8:19   ` Mattias Rönnblom
2024-05-28  8:27     ` Bruce Richardson
2024-05-28  8:59       ` Mattias Rönnblom
2024-05-28  9:07         ` Morten Brørup
2024-05-28 16:17           ` Mattias Rönnblom
2024-05-28 14:59     ` Stephen Hemminger
2024-05-28 15:09       ` Bruce Richardson
2024-05-31  5:19         ` Mattias Rönnblom
2024-05-31 16:50           ` Stephen Hemminger
2024-06-02 11:33             ` Mattias Rönnblom
2024-05-28 16:03       ` Mattias Rönnblom
2024-05-29 21:55         ` Stephen Hemminger
2024-05-28  8:20   ` Bruce Richardson
2024-06-02 12:39   ` [RFC v3 0/5] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
2024-06-02 12:39     ` [RFC v3 1/5] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
2024-06-05  6:49       ` [PATCH 0/5] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
2024-06-05  6:49         ` [PATCH 1/5] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
2024-06-05  6:49         ` [PATCH 2/5] net/octeon_ep: properly include vector API header file Mattias Rönnblom
2024-06-05  6:49         ` [PATCH 3/5] distributor: " Mattias Rönnblom
2024-06-10 14:27           ` Tyler Retzlaff
2024-06-05  6:49         ` [PATCH 4/5] fib: " Mattias Rönnblom
2024-06-10 14:28           ` Tyler Retzlaff
2024-06-05  6:49         ` [PATCH 5/5] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
2024-06-20  7:24         ` [PATCH v2 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
2024-06-20  7:24           ` [PATCH v2 1/6] net/fm10k: add missing intrinsic include Mattias Rönnblom
2024-06-20  9:02             ` Bruce Richardson
2024-06-20  9:28             ` Bruce Richardson
2024-06-20 11:40               ` Mattias Rönnblom
2024-06-20 11:59                 ` Bruce Richardson
2024-06-20 11:50             ` [PATCH v3 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
2024-06-20 11:50               ` [PATCH v3 1/6] net/fm10k: add missing vector API header include Mattias Rönnblom
2024-06-20 12:34                 ` Bruce Richardson
2024-06-20 17:57                 ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
2024-06-20 17:57                   ` [PATCH v4 01/13] net/i40e: add missing vector API header include Mattias Rönnblom
2024-07-24  7:53                     ` [PATCH v5 0/6] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
2024-07-24  7:53                       ` [PATCH v5 1/6] net/octeon_ep: add missing vector API header include Mattias Rönnblom
2024-09-20 10:27                         ` [PATCH v6 0/7] Optionally have rte_memcpy delegate to compiler memcpy Mattias Rönnblom
2024-09-20 10:27                           ` [PATCH v6 1/7] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
2024-10-09 20:59                             ` Morten Brørup
2024-10-09 22:01                             ` Stephen Hemminger
2024-09-20 10:27                           ` [PATCH v6 2/7] net/octeon_ep: add missing vector API header include Mattias Rönnblom
2024-10-09 21:00                             ` Morten Brørup
2024-09-20 10:27                           ` [PATCH v6 3/7] distributor: " Mattias Rönnblom
2024-10-09 21:00                             ` Morten Brørup
2024-09-20 10:27                           ` [PATCH v6 4/7] fib: " Mattias Rönnblom
2024-10-09 21:00                             ` Morten Brørup
2024-09-20 10:27                           ` [PATCH v6 5/7] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
2024-10-04  7:52                             ` David Marchand
2024-10-04  9:21                               ` Mattias Rönnblom
2024-10-04  9:54                                 ` David Marchand
2024-10-04 12:07                                   ` Thomas Monjalon
2024-10-04  9:27                               ` Mattias Rönnblom
2024-09-20 10:27                           ` [PATCH v6 6/7] ci: test compiler memcpy Mattias Rönnblom
2024-10-04  7:56                             ` David Marchand
2024-10-09 21:04                             ` Morten Brørup
2024-09-20 10:27                           ` [PATCH v6 7/7] vhost: optimize memcpy routines when cc memcpy is used Mattias Rönnblom
2024-10-03 11:46                             ` Maxime Coquelin
2024-10-09 21:25                             ` Morten Brørup
2024-10-10 10:29                               ` Mattias Rönnblom
2024-10-09 21:57                             ` Stephen Hemminger
2024-10-10 10:35                               ` Mattias Rönnblom
2024-07-24  7:53                       ` [PATCH v5 2/6] distributor: add missing vector API header include Mattias Rönnblom
2024-07-24  7:53                       ` [PATCH v5 3/6] fib: " Mattias Rönnblom
2024-07-24  7:53                       ` [PATCH v5 4/6] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
2024-07-24  7:53                       ` [PATCH v5 5/6] ci: test compiler memcpy Mattias Rönnblom
2024-07-24  7:53                       ` [PATCH v5 6/6] vhost: optimize memcpy routines when cc memcpy is used Mattias Rönnblom
2024-07-29 11:00                         ` Morten Brørup
2024-07-29 19:27                           ` Mattias Rönnblom
2024-07-29 19:56                             ` Morten Brørup
2024-06-20 17:57                   ` [PATCH v4 02/13] net/iavf: add missing vector API header include Mattias Rönnblom
2024-06-20 17:57                   ` [PATCH v4 03/13] net/ice: " Mattias Rönnblom
2024-06-20 17:57                   ` [PATCH v4 04/13] net/ixgbe: " Mattias Rönnblom
2024-06-20 17:57                   ` [PATCH v4 05/13] net/ngbe: " Mattias Rönnblom
2024-06-20 17:57                   ` [PATCH v4 06/13] net/txgbe: " Mattias Rönnblom
2024-06-20 17:57                   ` [PATCH v4 07/13] net/virtio: " Mattias Rönnblom
2024-06-20 17:57                   ` [PATCH v4 08/13] net/fm10k: " Mattias Rönnblom
2024-06-20 17:57                   ` [PATCH v4 09/13] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
2024-06-20 17:57                   ` [PATCH v4 10/13] net/octeon_ep: add missing vector API header include Mattias Rönnblom
2024-06-20 17:57                   ` [PATCH v4 11/13] distributor: " Mattias Rönnblom
2024-06-20 17:57                   ` [PATCH v4 12/13] fib: " Mattias Rönnblom
2024-06-20 17:57                   ` [PATCH v4 13/13] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
2024-06-21 15:19                     ` Stephen Hemminger
2024-06-24 10:05                     ` Thomas Monjalon
2024-06-24 17:56                       ` Mattias Rönnblom
2024-06-25 13:06                       ` Mattias Rönnblom
2024-06-25 13:34                         ` Thomas Monjalon
2024-06-20 18:53                   ` [PATCH v4 00/13] Optionally have rte_memcpy delegate to compiler memcpy Morten Brørup
2024-06-21  6:56                   ` Mattias Rönnblom
2024-06-21  7:04                     ` David Marchand
2024-06-21  7:35                       ` Mattias Rönnblom
2024-06-21  7:41                         ` David Marchand
2024-06-25 15:29                   ` Maxime Coquelin
2024-06-25 15:44                     ` Stephen Hemminger
2024-06-25 19:27                     ` Mattias Rönnblom
2024-06-26  8:37                       ` Maxime Coquelin
2024-06-26 14:58                         ` Stephen Hemminger
2024-06-26 15:24                           ` Maxime Coquelin
2024-06-26 18:47                             ` Mattias Rönnblom
2024-06-26 20:16                               ` Morten Brørup
2024-06-27 11:06                                 ` Mattias Rönnblom
2024-06-27 15:10                                   ` Stephen Hemminger
2024-06-27 15:23                                     ` Mattias Rönnblom
2024-06-20 11:50               ` [PATCH v3 2/6] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
2024-06-20 11:50               ` [PATCH v3 3/6] net/octeon_ep: add missing vector API header include Mattias Rönnblom
2024-06-20 11:50               ` [PATCH v3 4/6] distributor: " Mattias Rönnblom
2024-06-20 11:50               ` [PATCH v3 5/6] fib: " Mattias Rönnblom
2024-06-20 11:50               ` [PATCH v3 6/6] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
2024-06-20  7:24           ` [PATCH v2 2/6] event/dlb2: include headers for vector and memory copy APIs Mattias Rönnblom
2024-06-20  9:03             ` Bruce Richardson
2024-06-20  7:24           ` [PATCH v2 3/6] net/octeon_ep: properly include vector API header file Mattias Rönnblom
2024-06-20 14:43             ` Stephen Hemminger
2024-06-20  7:24           ` [PATCH v2 4/6] distributor: " Mattias Rönnblom
2024-06-20  9:13             ` Bruce Richardson
2024-06-20  7:24           ` [PATCH v2 5/6] fib: " Mattias Rönnblom
2024-06-20  9:14             ` Bruce Richardson
2024-06-20 14:43               ` Stephen Hemminger
2024-06-20  7:24           ` [PATCH v2 6/6] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
2024-06-02 12:39     ` [RFC v3 2/5] net/octeon_ep: properly include vector API header file Mattias Rönnblom
2024-06-02 12:39     ` [RFC v3 3/5] distributor: " Mattias Rönnblom
2024-06-02 12:39     ` [RFC v3 4/5] fib: " Mattias Rönnblom
2024-06-02 12:39     ` [RFC v3 5/5] eal: provide option to use compiler memcpy instead of RTE Mattias Rönnblom
2024-06-02 20:58       ` Morten Brørup
2024-06-03 17:04         ` Mattias Rönnblom
2024-06-03 17:08           ` Stephen Hemminger
2024-05-29 21:56 ` [RFC] " Stephen Hemminger
2024-06-02 11:30   ` Mattias Rönnblom

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).