* [PATCH v2] eal/x86: optimize memcpy of small sizes
2025-11-20 11:45 [PATCH] eal/x86: reduce memcpy code duplication Morten Brørup
@ 2025-11-21 10:35 ` Morten Brørup
2025-11-21 16:57 ` Stephen Hemminger
2025-11-21 10:40 ` Morten Brørup
2025-11-21 10:40 ` [PATCH v3] " Morten Brørup
2 siblings, 1 reply; 8+ messages in thread
From: Morten Brørup @ 2025-11-21 10:35 UTC (permalink / raw)
To: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese
Cc: Stephen Hemminger, Morten Brørup
The implementation for copying up to 64 bytes does not depend on address
alignment with the size of the CPU's vector registers, so the code
handling this was moved from the various implementations to the common
function.
Furthermore, the function for copying less than 16 bytes was replaced with
a smarter implementation using fewer branches and potentially fewer
load/store operations.
This function was also extended to handle copying of up to 16 bytes,
instead of up to 15 bytes. This small extension reduces the code path for
copying two pointers.
These changes provide two benefits:
1. The memory footprint of the copy function is reduced.
Previously there were two instances of the compiled code to copy up to 64
bytes, one in the "aligned" code path, and one in the "generic" code path.
Now there is only one instance, in the "common" code path.
2. The performance for copying up to 64 bytes is improved.
The memcpy performance test shows cache-to-cache copying of up to 32 bytes
now typically only takes 2 cycles (4 cycles for 64 bytes) versus
ca. 6.5 cycles before this patch.
And finally, the missing implementation of rte_mov48() was added.
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v2:
* Updated patch title to reflect that the performance is improved.
* Use the design pattern of two overlapping stores for small copies too.
* Expanded first branch from size < 16 to size <= 16.
* Handle more build time constant copy sizes.
---
lib/eal/x86/include/rte_memcpy.h | 341 ++++++++++++++++---------------
1 file changed, 181 insertions(+), 160 deletions(-)
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 46d34b8081..665902ff62 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -55,52 +55,6 @@ extern "C" {
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n);
-/**
- * Copy bytes from one location to another,
- * locations should not overlap.
- * Use with n <= 15.
- */
-static __rte_always_inline void *
-rte_mov15_or_less(void *dst, const void *src, size_t n)
-{
- /**
- * Use the following structs to avoid violating C standard
- * alignment requirements and to avoid strict aliasing bugs
- */
- struct __rte_packed_begin rte_uint64_alias {
- uint64_t val;
- } __rte_packed_end __rte_may_alias;
- struct __rte_packed_begin rte_uint32_alias {
- uint32_t val;
- } __rte_packed_end __rte_may_alias;
- struct __rte_packed_begin rte_uint16_alias {
- uint16_t val;
- } __rte_packed_end __rte_may_alias;
-
- void *ret = dst;
- if (n & 8) {
- ((struct rte_uint64_alias *)dst)->val =
- ((const struct rte_uint64_alias *)src)->val;
- src = (const uint64_t *)src + 1;
- dst = (uint64_t *)dst + 1;
- }
- if (n & 4) {
- ((struct rte_uint32_alias *)dst)->val =
- ((const struct rte_uint32_alias *)src)->val;
- src = (const uint32_t *)src + 1;
- dst = (uint32_t *)dst + 1;
- }
- if (n & 2) {
- ((struct rte_uint16_alias *)dst)->val =
- ((const struct rte_uint16_alias *)src)->val;
- src = (const uint16_t *)src + 1;
- dst = (uint16_t *)dst + 1;
- }
- if (n & 1)
- *(uint8_t *)dst = *(const uint8_t *)src;
- return ret;
-}
-
/**
* Copy 16 bytes from one location to another,
* locations should not overlap.
@@ -132,6 +86,23 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
#endif
}
+/**
+ * Copy 48 bytes from one location to another,
+ * locations should not overlap.
+ */
+static __rte_always_inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+#if defined RTE_MEMCPY_AVX
+ rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+ rte_mov32((uint8_t *)dst - 32 + 48, (const uint8_t *)src - 32 + 48);
+#else /* SSE implementation */
+ rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+ rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+ rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+#endif
+}
+
/**
* Copy 64 bytes from one location to another,
* locations should not overlap.
@@ -172,6 +143,136 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
rte_mov128(dst + 1 * 128, src + 1 * 128);
}
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n <= 16.
+ *
+ * Note: Copying uninitialized memory is perfectly acceptable.
+ * Using e.g. memcpy(dst, src, 8) instead of
+ * *(unaligned_uint64_t*) = *(const unaligned_uint64_t *)src
+ * avoids compiler warnings about source data may be uninitialized
+ * [-Wmaybe-uninitialized].
+ *
+ * Note: Using "n & X" generates 3-byte "test" instructions,
+ * instead of "n >= X", which would generate 4-byte "cmp" intructions.
+ */
+static __rte_always_inline void *
+rte_mov16_or_less(void *dst, const void *src, size_t n)
+{
+ /* Faster way when size is known at build time. */
+ if (__rte_constant(n)) {
+ if (n == 2)
+ return memcpy(dst, src, 2);
+ if (n == 4)
+ return memcpy(dst, src, 4);
+ if (n == 6) /* 4 + 2 */
+ return memcpy(dst, src, 6);
+ if (n == 8)
+ return memcpy(dst, src, 8);
+ if (n == 10) /* 8 + 2 */
+ return memcpy(dst, src, 10);
+ if (n == 12) /* 8 + 4 */
+ return memcpy(dst, src, 12);
+ if (n == 16) {
+ rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+ return dst;
+ }
+ }
+
+ if (n & 0x18) { /* n >= 8 */
+ /* copy 8 ~ 16 bytes */
+ memcpy(dst, src, 8);
+ memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src - 8 + n, 8);
+ } else if (n & 0x4) {
+ /* copy 4 ~ 7 bytes */
+ memcpy(dst, src, 4);
+ memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src - 4 + n, 4);
+ } else if (n & 0x2) {
+ /* copy 2 ~ 3 bytes */
+ memcpy(dst, src, 2);
+ memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src - 2 + n, 2);
+ } else if (n & 0x1) {
+ /* copy 1 byte */
+ memcpy(dst, src, 1);
+ }
+ return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with 17 (or 16) < n <= 32.
+ */
+static __rte_always_inline void *
+rte_mov17_to_32(void *dst, const void *src, size_t n)
+{
+ /* Faster way when size is known at build time. */
+ if (__rte_constant(n)) {
+ if (n == 16) {
+ rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+ return dst;
+ }
+ if (n == 18) /* 16 + 2 */
+ return memcpy(dst, src, 18);
+ if (n == 20) /* 16 + 4 */
+ return memcpy(dst, src, 20);
+ if (n == 24) /* 16 + 8 */
+ return memcpy(dst, src, 24);
+ if (n == 32) {
+ rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+ return dst;
+ }
+ }
+
+ /* copy 17 (or 16) ~ 32 bytes */
+ rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+ return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with 33 (or 32) < n <= 64.
+ */
+static __rte_always_inline void *
+rte_mov33_to_64(void *dst, const void *src, size_t n)
+{
+ /* Faster way when size is known at build time. */
+ if (__rte_constant(n)) {
+ if (n == 32) {
+ rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+ return dst;
+ }
+ if (n == 36) /* 32 + 4 */
+ return memcpy(dst, src, 36);
+ if (n == 40) /* 32 + 8 */
+ return memcpy(dst, src, 40);
+ if (n == 48) {
+ rte_mov48((uint8_t *)dst, (const uint8_t *)src);
+ return dst;
+ }
+ if (n == 64) {
+ rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+ return dst;
+ }
+ }
+
+ /* copy 33 (or 32) ~ 64 bytes */
+#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined RTE_MEMCPY_AVX
+ rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+ rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+#else /* SSE implementation */
+ rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+ rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+ if (n > 48)
+ rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+#endif
+ return dst;
+}
+
#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
/**
@@ -232,45 +333,21 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
}
}
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
{
void *ret = dst;
size_t dstofss;
size_t bits;
- /**
- * Copy less than 16 bytes
- */
- if (n < 16) {
- return rte_mov15_or_less(dst, src, n);
- }
-
/**
* Fast way when copy size doesn't exceed 512 bytes
*/
- if (__rte_constant(n) && n == 32) {
- rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- return ret;
- }
- if (n <= 32) {
- rte_mov16((uint8_t *)dst, (const uint8_t *)src);
- if (__rte_constant(n) && n == 16)
- return ret; /* avoid (harmless) duplicate copy */
- rte_mov16((uint8_t *)dst - 16 + n,
- (const uint8_t *)src - 16 + n);
- return ret;
- }
- if (__rte_constant(n) && n == 64) {
- rte_mov64((uint8_t *)dst, (const uint8_t *)src);
- return ret;
- }
- if (n <= 64) {
- rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- rte_mov32((uint8_t *)dst - 32 + n,
- (const uint8_t *)src - 32 + n);
- return ret;
- }
if (n <= 512) {
if (n >= 256) {
n -= 256;
@@ -381,41 +458,21 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
}
}
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
{
void *ret = dst;
size_t dstofss;
size_t bits;
- /**
- * Copy less than 16 bytes
- */
- if (n < 16) {
- return rte_mov15_or_less(dst, src, n);
- }
-
/**
* Fast way when copy size doesn't exceed 256 bytes
*/
- if (__rte_constant(n) && n == 32) {
- rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- return ret;
- }
- if (n <= 32) {
- rte_mov16((uint8_t *)dst, (const uint8_t *)src);
- if (__rte_constant(n) && n == 16)
- return ret; /* avoid (harmless) duplicate copy */
- rte_mov16((uint8_t *)dst - 16 + n,
- (const uint8_t *)src - 16 + n);
- return ret;
- }
- if (n <= 64) {
- rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- rte_mov32((uint8_t *)dst - 32 + n,
- (const uint8_t *)src - 32 + n);
- return ret;
- }
if (n <= 256) {
if (n >= 128) {
n -= 128;
@@ -573,38 +630,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
} \
}
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
void *ret = dst;
size_t dstofss;
size_t srcofs;
- /**
- * Copy less than 16 bytes
- */
- if (n < 16) {
- return rte_mov15_or_less(dst, src, n);
- }
-
/**
* Fast way when copy size doesn't exceed 512 bytes
*/
- if (n <= 32) {
- rte_mov16((uint8_t *)dst, (const uint8_t *)src);
- if (__rte_constant(n) && n == 16)
- return ret; /* avoid (harmless) duplicate copy */
- rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
- return ret;
- }
- if (n <= 64) {
- rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- if (n > 48)
- rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
- rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
- return ret;
- }
if (n <= 128) {
goto COPY_BLOCK_128_BACK15;
}
@@ -696,44 +737,16 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
#endif /* __AVX512F__ */
+/**
+ * Copy bytes from one vector register size aligned location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
static __rte_always_inline void *
-rte_memcpy_aligned(void *dst, const void *src, size_t n)
+rte_memcpy_aligned_more_than_64(void *dst, const void *src, size_t n)
{
void *ret = dst;
- /* Copy size < 16 bytes */
- if (n < 16) {
- return rte_mov15_or_less(dst, src, n);
- }
-
- /* Copy 16 <= size <= 32 bytes */
- if (__rte_constant(n) && n == 32) {
- rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- return ret;
- }
- if (n <= 32) {
- rte_mov16((uint8_t *)dst, (const uint8_t *)src);
- if (__rte_constant(n) && n == 16)
- return ret; /* avoid (harmless) duplicate copy */
- rte_mov16((uint8_t *)dst - 16 + n,
- (const uint8_t *)src - 16 + n);
-
- return ret;
- }
-
- /* Copy 32 < size <= 64 bytes */
- if (__rte_constant(n) && n == 64) {
- rte_mov64((uint8_t *)dst, (const uint8_t *)src);
- return ret;
- }
- if (n <= 64) {
- rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- rte_mov32((uint8_t *)dst - 32 + n,
- (const uint8_t *)src - 32 + n);
-
- return ret;
- }
-
/* Copy 64 bytes blocks */
for (; n > 64; n -= 64) {
rte_mov64((uint8_t *)dst, (const uint8_t *)src);
@@ -751,10 +764,18 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
+ if (n <= 16)
+ return rte_mov16_or_less(dst, src, n);
+ if (n <= 32)
+ return rte_mov17_to_32(dst, src, n);
+ if (n <= 64)
+ return rte_mov33_to_64(dst, src, n);
+
+ /* Implementation for size > 64 bytes depends on alignment with vector register size. */
if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
- return rte_memcpy_aligned(dst, src, n);
+ return rte_memcpy_aligned_more_than_64(dst, src, n);
else
- return rte_memcpy_generic(dst, src, n);
+ return rte_memcpy_generic_more_than_64(dst, src, n);
}
#undef ALIGNMENT_MASK
--
2.43.0
^ permalink raw reply [flat|nested] 8+ messages in thread

* Re: [PATCH v2] eal/x86: optimize memcpy of small sizes
2025-11-21 10:35 ` [PATCH v2] eal/x86: optimize memcpy of small sizes Morten Brørup
@ 2025-11-21 16:57 ` Stephen Hemminger
2025-11-21 17:02 ` Bruce Richardson
0 siblings, 1 reply; 8+ messages in thread
From: Stephen Hemminger @ 2025-11-21 16:57 UTC (permalink / raw)
To: Morten Brørup
Cc: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese
On Fri, 21 Nov 2025 10:35:35 +0000
Morten Brørup <mb@smartsharesystems.com> wrote:
> The implementation for copying up to 64 bytes does not depend on address
> alignment with the size of the CPU's vector registers, so the code
> handling this was moved from the various implementations to the common
> function.
>
> Furthermore, the function for copying less than 16 bytes was replaced with
> a smarter implementation using fewer branches and potentially fewer
> load/store operations.
> This function was also extended to handle copying of up to 16 bytes,
> instead of up to 15 bytes. This small extension reduces the code path for
> copying two pointers.
>
> These changes provide two benefits:
> 1. The memory footprint of the copy function is reduced.
> Previously there were two instances of the compiled code to copy up to 64
> bytes, one in the "aligned" code path, and one in the "generic" code path.
> Now there is only one instance, in the "common" code path.
> 2. The performance for copying up to 64 bytes is improved.
> The memcpy performance test shows cache-to-cache copying of up to 32 bytes
> now typically only takes 2 cycles (4 cycles for 64 bytes) versus
> ca. 6.5 cycles before this patch.
>
> And finally, the missing implementation of rte_mov48() was added.
>
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
As I have said before, I would rather that DPDK move away from having its
own specialized memcpy. How does this compare to stock inline gcc?
The main motivation is that the glibc/gcc team does more testing across
multiple architectures and has a community with more expertise on CPU
special cases.
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v2] eal/x86: optimize memcpy of small sizes
2025-11-21 16:57 ` Stephen Hemminger
@ 2025-11-21 17:02 ` Bruce Richardson
2025-11-21 17:11 ` Stephen Hemminger
0 siblings, 1 reply; 8+ messages in thread
From: Bruce Richardson @ 2025-11-21 17:02 UTC (permalink / raw)
To: Stephen Hemminger
Cc: Morten Brørup, dev, Konstantin Ananyev, Vipin Varghese
On Fri, Nov 21, 2025 at 08:57:30AM -0800, Stephen Hemminger wrote:
> On Fri, 21 Nov 2025 10:35:35 +0000
> Morten Brørup <mb@smartsharesystems.com> wrote:
>
> > The implementation for copying up to 64 bytes does not depend on address
> > alignment with the size of the CPU's vector registers, so the code
> > handling this was moved from the various implementations to the common
> > function.
> >
> > Furthermore, the function for copying less than 16 bytes was replaced with
> > a smarter implementation using fewer branches and potentially fewer
> > load/store operations.
> > This function was also extended to handle copying of up to 16 bytes,
> > instead of up to 15 bytes. This small extension reduces the code path for
> > copying two pointers.
> >
> > These changes provide two benefits:
> > 1. The memory footprint of the copy function is reduced.
> > Previously there were two instances of the compiled code to copy up to 64
> > bytes, one in the "aligned" code path, and one in the "generic" code path.
> > Now there is only one instance, in the "common" code path.
> > 2. The performance for copying up to 64 bytes is improved.
> > The memcpy performance test shows cache-to-cache copying of up to 32 bytes
> > now typically only takes 2 cycles (4 cycles for 64 bytes) versus
> > ca. 6.5 cycles before this patch.
> >
> > And finally, the missing implementation of rte_mov48() was added.
> >
> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
>
> As I have said before, I would rather that DPDK move away from having its
> own specialized memcpy. How does this compare to stock inline gcc?
> The main motivation is that the glibc/gcc team does more testing across
> multiple architectures and has a community with more expertise on CPU
> special cases.
I would tend to agree. Even if we get rte_memcpy a few cycles faster, I
suspect many apps wouldn't notice the difference. However, I understand
that the virtio/vhost libraries gain from using rte_memcpy over standard
memcpy - or at least used to. Perhaps we can consider deprecating
rte_memcpy and just putting a vhost-specific memcpy in that library?
/Bruce
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v2] eal/x86: optimize memcpy of small sizes
2025-11-21 17:02 ` Bruce Richardson
@ 2025-11-21 17:11 ` Stephen Hemminger
2025-11-21 21:36 ` Morten Brørup
0 siblings, 1 reply; 8+ messages in thread
From: Stephen Hemminger @ 2025-11-21 17:11 UTC (permalink / raw)
To: Bruce Richardson
Cc: Morten Brørup, dev, Konstantin Ananyev, Vipin Varghese
On Fri, 21 Nov 2025 17:02:17 +0000
Bruce Richardson <bruce.richardson@intel.com> wrote:
> > As I have said before, I would rather that DPDK move away from having its
> > own specialized memcpy. How does this compare to stock inline gcc?
> > The main motivation is that the glibc/gcc team does more testing across
> > multiple architectures and has a community with more expertise on CPU
> > special cases.
>
> I would tend to agree. Even if we get rte_memcpy a few cycles faster, I
> suspect many apps wouldn't notice the difference. However, I understand
> that the virtio/vhost libraries gain from using rte_memcpy over standard
> memcpy - or at least used to. Perhaps we can consider deprecating
> rte_memcpy and just putting a vhost-specific memcpy in that library?
It would be good to figure out why vhost is better with rte_memcpy,
maybe there is some alignment assumption that is in one and not the other?
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [PATCH v2] eal/x86: optimize memcpy of small sizes
2025-11-21 17:11 ` Stephen Hemminger
@ 2025-11-21 21:36 ` Morten Brørup
0 siblings, 0 replies; 8+ messages in thread
From: Morten Brørup @ 2025-11-21 21:36 UTC (permalink / raw)
To: Stephen Hemminger, Bruce Richardson
Cc: dev, Konstantin Ananyev, Vipin Varghese
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Friday, 21 November 2025 18.12
>
> On Fri, 21 Nov 2025 17:02:17 +0000
> Bruce Richardson <bruce.richardson@intel.com> wrote:
>
> > > As I have said before, I would rather that DPDK move away from having its
> > > own specialized memcpy. How does this compare to stock inline gcc?
The "./build/app/test/dpdk-test memcpy_perf_autotest" compares to standard memcpy().
On my build system, copies up to 64 bytes (with size not known at build time) take 9 cycles using memcpy() vs. 2-4 cycles using rte_memcpy().
The general difference was probably worse with older compilers.
We should compare using the oldest compiler versions officially supported by DPDK (GCC, Clang, MSVC, ...), and across the supported CPUs.
There are plenty of optimizations in DPDK that were relevant when they were added, but have become obsolete over time.
I don't think rte_memcpy() is there yet. (Gut feeling, no data to back it up with!)
Until we get there, we should keep optimizing rte_memcpy().
For any per-packet operation, shaving off a few cycles is valuable.
And if the majority of an application's per-packet copy operations were larger than a few bytes, the application would not achieve high performance anyway.
Thus, I think optimizing small copies is relevant: a normal DPDK application should perform many more small copies than large copies. (Measured by number of copy operations, not number of copied bytes.)
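To be concrete, the comparison I have in mind is along these lines - a
simplified sketch, not the actual memcpy_perf_autotest code (the real test
also varies buffer sizes, alignment and cache residency):

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <rte_cycles.h>
#include <rte_memcpy.h>

#define ITERATIONS 1000000

static uint8_t src_buf[64];
static uint8_t dst_buf[64];

/* Average cycles per copy. The size is read through a volatile, so the
 * compiler cannot specialize the copy for a build time constant size. */
uint64_t
cycles_per_copy(size_t size, int use_rte)
{
        volatile size_t n = size;
        uint64_t start = rte_rdtsc();
        unsigned int i;

        for (i = 0; i < ITERATIONS; i++) {
                if (use_rte)
                        rte_memcpy(dst_buf, src_buf, n);
                else
                        memcpy(dst_buf, src_buf, n);
        }
        return (rte_rdtsc() - start) / ITERATIONS;
}

With something like this, one can compare e.g. cycles_per_copy(32, 1)
against cycles_per_copy(32, 0); the exact numbers will of course differ
per compiler and CPU.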
> > > The main motivation is that the glibc/gcc team does more testing across
> > > multiple architectures and has a community with more expertise on CPU
> > > special cases.
> >
> > I would tend to agree. Even if we get rte_memcpy a few cycles faster, I
> > suspect many apps wouldn't notice the difference. However, I understand
> > that the virtio/vhost libraries gain from using rte_memcpy over standard
> > memcpy - or at least used to. Perhaps we can consider deprecating
> > rte_memcpy and just putting a vhost-specific memcpy in that library?
>
> It would be good to figure out why vhost is better with rte_memcpy,
> maybe there is some alignment assumption that is in one and not the
> other?
Looking at 1024 byte copies on my build system,
cache-to-mem is 12 % faster with rte_memcpy(), and
mem-to-cache is 10 % slower.
Maybe the vhost library would benefit from having access to two rte_memcpy variants, respectively optimized for cache-to-mem and mem-to-cache.
There will always be some use cases where a generic "optimized" rte_memcpy() will be suboptimal.
Providing specific functions optimized for specific use cases makes really good sense.
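E.g. a cache-to-mem oriented variant could use non-temporal stores, so the
destination data doesn't pollute the cache. A rough, untested sketch of the
idea (SSE2 only; the function name is made up, and it assumes dst is 16 byte
aligned and n is a multiple of 16):

#include <stdint.h>
#include <stddef.h>
#include <emmintrin.h> /* SSE2: _mm_loadu_si128, _mm_stream_si128 */

/* Hypothetical cache-to-mem copy: regular loads, non-temporal stores. */
static inline void *
copy_nt_store(void *dst, const void *src, size_t n)
{
        uint8_t *d = dst;
        const uint8_t *s = src;

        for (; n >= 16; n -= 16, d += 16, s += 16)
                _mm_stream_si128((__m128i *)d,
                                _mm_loadu_si128((const __m128i *)s));
        _mm_sfence(); /* order the non-temporal stores before later stores */
        return dst;
}

The mem-to-cache direction would want the opposite treatment on the load
side instead (e.g. prefetching, while keeping the stores cached). Whether
either variant actually helps vhost is exactly what would need to be
measured.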
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH v2] eal/x86: optimize memcpy of small sizes
2025-11-20 11:45 [PATCH] eal/x86: reduce memcpy code duplication Morten Brørup
2025-11-21 10:35 ` [PATCH v2] eal/x86: optimize memcpy of small sizes Morten Brørup
@ 2025-11-21 10:40 ` Morten Brørup
2025-11-21 10:40 ` [PATCH v3] " Morten Brørup
2 siblings, 0 replies; 8+ messages in thread
From: Morten Brørup @ 2025-11-21 10:40 UTC (permalink / raw)
To: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese
Cc: Stephen Hemminger, Morten Brørup
The implementation for copying up to 64 bytes does not depend on address
alignment with the size of the CPU's vector registers, so the code
handling this was moved from the various implementations to the common
function.
Furthermore, the function for copying less than 16 bytes was replaced with
a smarter implementation using fewer branches and potentially fewer
load/store operations.
This function was also extended to handle copying of up to 16 bytes,
instead of up to 15 bytes. This small extension reduces the code path for
copying two pointers.
These changes provide two benefits:
1. The memory footprint of the copy function is reduced.
Previously there were two instances of the compiled code to copy up to 64
bytes, one in the "aligned" code path, and one in the "generic" code path.
Now there is only one instance, in the "common" code path.
2. The performance for copying up to 64 bytes is improved.
The memcpy performance test shows cache-to-cache copying of up to 32 bytes
now typically only takes 2 cycles (4 cycles for 64 bytes) versus
ca. 6.5 cycles before this patch.
And finally, the missing implementation of rte_mov48() was added.
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v3:
* Fixed typo in comment.
v2:
* Updated patch title to reflect that the performance is improved.
* Use the design pattern of two overlapping stores for small copies too.
* Expanded first branch from size < 16 to size <= 16.
* Handle more build time constant copy sizes.
---
lib/eal/x86/include/rte_memcpy.h | 341 ++++++++++++++++---------------
1 file changed, 181 insertions(+), 160 deletions(-)
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 46d34b8081..665902ff62 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -55,52 +55,6 @@ extern "C" {
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n);
-/**
- * Copy bytes from one location to another,
- * locations should not overlap.
- * Use with n <= 15.
- */
-static __rte_always_inline void *
-rte_mov15_or_less(void *dst, const void *src, size_t n)
-{
- /**
- * Use the following structs to avoid violating C standard
- * alignment requirements and to avoid strict aliasing bugs
- */
- struct __rte_packed_begin rte_uint64_alias {
- uint64_t val;
- } __rte_packed_end __rte_may_alias;
- struct __rte_packed_begin rte_uint32_alias {
- uint32_t val;
- } __rte_packed_end __rte_may_alias;
- struct __rte_packed_begin rte_uint16_alias {
- uint16_t val;
- } __rte_packed_end __rte_may_alias;
-
- void *ret = dst;
- if (n & 8) {
- ((struct rte_uint64_alias *)dst)->val =
- ((const struct rte_uint64_alias *)src)->val;
- src = (const uint64_t *)src + 1;
- dst = (uint64_t *)dst + 1;
- }
- if (n & 4) {
- ((struct rte_uint32_alias *)dst)->val =
- ((const struct rte_uint32_alias *)src)->val;
- src = (const uint32_t *)src + 1;
- dst = (uint32_t *)dst + 1;
- }
- if (n & 2) {
- ((struct rte_uint16_alias *)dst)->val =
- ((const struct rte_uint16_alias *)src)->val;
- src = (const uint16_t *)src + 1;
- dst = (uint16_t *)dst + 1;
- }
- if (n & 1)
- *(uint8_t *)dst = *(const uint8_t *)src;
- return ret;
-}
-
/**
* Copy 16 bytes from one location to another,
* locations should not overlap.
@@ -132,6 +86,23 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
#endif
}
+/**
+ * Copy 48 bytes from one location to another,
+ * locations should not overlap.
+ */
+static __rte_always_inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+#if defined RTE_MEMCPY_AVX
+ rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+ rte_mov32((uint8_t *)dst - 32 + 48, (const uint8_t *)src - 32 + 48);
+#else /* SSE implementation */
+ rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+ rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+ rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+#endif
+}
+
/**
* Copy 64 bytes from one location to another,
* locations should not overlap.
@@ -172,6 +143,136 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
rte_mov128(dst + 1 * 128, src + 1 * 128);
}
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n <= 16.
+ *
+ * Note: Copying uninitialized memory is perfectly acceptable.
+ * Using e.g. memcpy(dst, src, 8) instead of
+ * *(unaligned_uint64_t*) = *(const unaligned_uint64_t *)src
+ * avoids compiler warnings about source data may be uninitialized
+ * [-Wmaybe-uninitialized].
+ *
+ * Note: Using "n & X" generates 3-byte "test" instructions,
+ * instead of "n >= X", which would generate 4-byte "cmp" instructions.
+ */
+static __rte_always_inline void *
+rte_mov16_or_less(void *dst, const void *src, size_t n)
+{
+ /* Faster way when size is known at build time. */
+ if (__rte_constant(n)) {
+ if (n == 2)
+ return memcpy(dst, src, 2);
+ if (n == 4)
+ return memcpy(dst, src, 4);
+ if (n == 6) /* 4 + 2 */
+ return memcpy(dst, src, 6);
+ if (n == 8)
+ return memcpy(dst, src, 8);
+ if (n == 10) /* 8 + 2 */
+ return memcpy(dst, src, 10);
+ if (n == 12) /* 8 + 4 */
+ return memcpy(dst, src, 12);
+ if (n == 16) {
+ rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+ return dst;
+ }
+ }
+
+ if (n & 0x18) { /* n >= 8 */
+ /* copy 8 ~ 16 bytes */
+ memcpy(dst, src, 8);
+ memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src - 8 + n, 8);
+ } else if (n & 0x4) {
+ /* copy 4 ~ 7 bytes */
+ memcpy(dst, src, 4);
+ memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src - 4 + n, 4);
+ } else if (n & 0x2) {
+ /* copy 2 ~ 3 bytes */
+ memcpy(dst, src, 2);
+ memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src - 2 + n, 2);
+ } else if (n & 0x1) {
+ /* copy 1 byte */
+ memcpy(dst, src, 1);
+ }
+ return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with 17 (or 16) < n <= 32.
+ */
+static __rte_always_inline void *
+rte_mov17_to_32(void *dst, const void *src, size_t n)
+{
+ /* Faster way when size is known at build time. */
+ if (__rte_constant(n)) {
+ if (n == 16) {
+ rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+ return dst;
+ }
+ if (n == 18) /* 16 + 2 */
+ return memcpy(dst, src, 18);
+ if (n == 20) /* 16 + 4 */
+ return memcpy(dst, src, 20);
+ if (n == 24) /* 16 + 8 */
+ return memcpy(dst, src, 24);
+ if (n == 32) {
+ rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+ return dst;
+ }
+ }
+
+ /* copy 17 (or 16) ~ 32 bytes */
+ rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+ return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with 33 (or 32) < n <= 64.
+ */
+static __rte_always_inline void *
+rte_mov33_to_64(void *dst, const void *src, size_t n)
+{
+ /* Faster way when size is known at build time. */
+ if (__rte_constant(n)) {
+ if (n == 32) {
+ rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+ return dst;
+ }
+ if (n == 36) /* 32 + 4 */
+ return memcpy(dst, src, 36);
+ if (n == 40) /* 32 + 8 */
+ return memcpy(dst, src, 40);
+ if (n == 48) {
+ rte_mov48((uint8_t *)dst, (const uint8_t *)src);
+ return dst;
+ }
+ if (n == 64) {
+ rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+ return dst;
+ }
+ }
+
+ /* copy 33 (or 32) ~ 64 bytes */
+#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined RTE_MEMCPY_AVX
+ rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+ rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+#else /* SSE implementation */
+ rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+ rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+ if (n > 48)
+ rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+#endif
+ return dst;
+}
+
#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
/**
@@ -232,45 +333,21 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
}
}
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
{
void *ret = dst;
size_t dstofss;
size_t bits;
- /**
- * Copy less than 16 bytes
- */
- if (n < 16) {
- return rte_mov15_or_less(dst, src, n);
- }
-
/**
* Fast way when copy size doesn't exceed 512 bytes
*/
- if (__rte_constant(n) && n == 32) {
- rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- return ret;
- }
- if (n <= 32) {
- rte_mov16((uint8_t *)dst, (const uint8_t *)src);
- if (__rte_constant(n) && n == 16)
- return ret; /* avoid (harmless) duplicate copy */
- rte_mov16((uint8_t *)dst - 16 + n,
- (const uint8_t *)src - 16 + n);
- return ret;
- }
- if (__rte_constant(n) && n == 64) {
- rte_mov64((uint8_t *)dst, (const uint8_t *)src);
- return ret;
- }
- if (n <= 64) {
- rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- rte_mov32((uint8_t *)dst - 32 + n,
- (const uint8_t *)src - 32 + n);
- return ret;
- }
if (n <= 512) {
if (n >= 256) {
n -= 256;
@@ -381,41 +458,21 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
}
}
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
{
void *ret = dst;
size_t dstofss;
size_t bits;
- /**
- * Copy less than 16 bytes
- */
- if (n < 16) {
- return rte_mov15_or_less(dst, src, n);
- }
-
/**
* Fast way when copy size doesn't exceed 256 bytes
*/
- if (__rte_constant(n) && n == 32) {
- rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- return ret;
- }
- if (n <= 32) {
- rte_mov16((uint8_t *)dst, (const uint8_t *)src);
- if (__rte_constant(n) && n == 16)
- return ret; /* avoid (harmless) duplicate copy */
- rte_mov16((uint8_t *)dst - 16 + n,
- (const uint8_t *)src - 16 + n);
- return ret;
- }
- if (n <= 64) {
- rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- rte_mov32((uint8_t *)dst - 32 + n,
- (const uint8_t *)src - 32 + n);
- return ret;
- }
if (n <= 256) {
if (n >= 128) {
n -= 128;
@@ -573,38 +630,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
} \
}
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
void *ret = dst;
size_t dstofss;
size_t srcofs;
- /**
- * Copy less than 16 bytes
- */
- if (n < 16) {
- return rte_mov15_or_less(dst, src, n);
- }
-
/**
* Fast way when copy size doesn't exceed 512 bytes
*/
- if (n <= 32) {
- rte_mov16((uint8_t *)dst, (const uint8_t *)src);
- if (__rte_constant(n) && n == 16)
- return ret; /* avoid (harmless) duplicate copy */
- rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
- return ret;
- }
- if (n <= 64) {
- rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- if (n > 48)
- rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
- rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
- return ret;
- }
if (n <= 128) {
goto COPY_BLOCK_128_BACK15;
}
@@ -696,44 +737,16 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
#endif /* __AVX512F__ */
+/**
+ * Copy bytes from one vector register size aligned location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
static __rte_always_inline void *
-rte_memcpy_aligned(void *dst, const void *src, size_t n)
+rte_memcpy_aligned_more_than_64(void *dst, const void *src, size_t n)
{
void *ret = dst;
- /* Copy size < 16 bytes */
- if (n < 16) {
- return rte_mov15_or_less(dst, src, n);
- }
-
- /* Copy 16 <= size <= 32 bytes */
- if (__rte_constant(n) && n == 32) {
- rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- return ret;
- }
- if (n <= 32) {
- rte_mov16((uint8_t *)dst, (const uint8_t *)src);
- if (__rte_constant(n) && n == 16)
- return ret; /* avoid (harmless) duplicate copy */
- rte_mov16((uint8_t *)dst - 16 + n,
- (const uint8_t *)src - 16 + n);
-
- return ret;
- }
-
- /* Copy 32 < size <= 64 bytes */
- if (__rte_constant(n) && n == 64) {
- rte_mov64((uint8_t *)dst, (const uint8_t *)src);
- return ret;
- }
- if (n <= 64) {
- rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- rte_mov32((uint8_t *)dst - 32 + n,
- (const uint8_t *)src - 32 + n);
-
- return ret;
- }
-
/* Copy 64 bytes blocks */
for (; n > 64; n -= 64) {
rte_mov64((uint8_t *)dst, (const uint8_t *)src);
@@ -751,10 +764,18 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
+ if (n <= 16)
+ return rte_mov16_or_less(dst, src, n);
+ if (n <= 32)
+ return rte_mov17_to_32(dst, src, n);
+ if (n <= 64)
+ return rte_mov33_to_64(dst, src, n);
+
+ /* Implementation for size > 64 bytes depends on alignment with vector register size. */
if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
- return rte_memcpy_aligned(dst, src, n);
+ return rte_memcpy_aligned_more_than_64(dst, src, n);
else
- return rte_memcpy_generic(dst, src, n);
+ return rte_memcpy_generic_more_than_64(dst, src, n);
}
#undef ALIGNMENT_MASK
--
2.43.0
^ permalink raw reply [flat|nested] 8+ messages in thread

* [PATCH v3] eal/x86: optimize memcpy of small sizes
2025-11-20 11:45 [PATCH] eal/x86: reduce memcpy code duplication Morten Brørup
2025-11-21 10:35 ` [PATCH v2] eal/x86: optimize memcpy of small sizes Morten Brørup
2025-11-21 10:40 ` Morten Brørup
@ 2025-11-21 10:40 ` Morten Brørup
2 siblings, 0 replies; 8+ messages in thread
From: Morten Brørup @ 2025-11-21 10:40 UTC (permalink / raw)
To: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese
Cc: Stephen Hemminger, Morten Brørup
The implementation for copying up to 64 bytes does not depend on address
alignment with the size of the CPU's vector registers, so the code
handling this was moved from the various implementations to the common
function.
Furthermore, the function for copying less than 16 bytes was replaced with
a smarter implementation using fewer branches and potentially fewer
load/store operations.
This function was also extended to handle copying of up to 16 bytes,
instead of up to 15 bytes. This small extension reduces the code path for
copying two pointers.
These changes provide two benefits:
1. The memory footprint of the copy function is reduced.
Previously there were two instances of the compiled code to copy up to 64
bytes, one in the "aligned" code path, and one in the "generic" code path.
Now there is only one instance, in the "common" code path.
2. The performance for copying up to 64 bytes is improved.
The memcpy performance test shows cache-to-cache copying of up to 32 bytes
now typically only takes 2 cycles (4 cycles for 64 bytes) versus
ca. 6.5 cycles before this patch.
And finally, the missing implementation of rte_mov48() was added.
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v3:
* Fixed typo in comment.
v2:
* Updated patch title to reflect that the performance is improved.
* Use the design pattern of two overlapping stores for small copies too.
* Expanded first branch from size < 16 to size <= 16.
* Handle more build time constant copy sizes.
---
lib/eal/x86/include/rte_memcpy.h | 341 ++++++++++++++++---------------
1 file changed, 181 insertions(+), 160 deletions(-)
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 46d34b8081..665902ff62 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -55,52 +55,6 @@ extern "C" {
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n);
-/**
- * Copy bytes from one location to another,
- * locations should not overlap.
- * Use with n <= 15.
- */
-static __rte_always_inline void *
-rte_mov15_or_less(void *dst, const void *src, size_t n)
-{
- /**
- * Use the following structs to avoid violating C standard
- * alignment requirements and to avoid strict aliasing bugs
- */
- struct __rte_packed_begin rte_uint64_alias {
- uint64_t val;
- } __rte_packed_end __rte_may_alias;
- struct __rte_packed_begin rte_uint32_alias {
- uint32_t val;
- } __rte_packed_end __rte_may_alias;
- struct __rte_packed_begin rte_uint16_alias {
- uint16_t val;
- } __rte_packed_end __rte_may_alias;
-
- void *ret = dst;
- if (n & 8) {
- ((struct rte_uint64_alias *)dst)->val =
- ((const struct rte_uint64_alias *)src)->val;
- src = (const uint64_t *)src + 1;
- dst = (uint64_t *)dst + 1;
- }
- if (n & 4) {
- ((struct rte_uint32_alias *)dst)->val =
- ((const struct rte_uint32_alias *)src)->val;
- src = (const uint32_t *)src + 1;
- dst = (uint32_t *)dst + 1;
- }
- if (n & 2) {
- ((struct rte_uint16_alias *)dst)->val =
- ((const struct rte_uint16_alias *)src)->val;
- src = (const uint16_t *)src + 1;
- dst = (uint16_t *)dst + 1;
- }
- if (n & 1)
- *(uint8_t *)dst = *(const uint8_t *)src;
- return ret;
-}
-
/**
* Copy 16 bytes from one location to another,
* locations should not overlap.
@@ -132,6 +86,23 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
#endif
}
+/**
+ * Copy 48 bytes from one location to another,
+ * locations should not overlap.
+ */
+static __rte_always_inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+#if defined RTE_MEMCPY_AVX
+ rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+ rte_mov32((uint8_t *)dst - 32 + 48, (const uint8_t *)src - 32 + 48);
+#else /* SSE implementation */
+ rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+ rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+ rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+#endif
+}
+
/**
* Copy 64 bytes from one location to another,
* locations should not overlap.
@@ -172,6 +143,136 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
rte_mov128(dst + 1 * 128, src + 1 * 128);
}
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n <= 16.
+ *
+ * Note: Copying uninitialized memory is perfectly acceptable.
+ * Using e.g. memcpy(dst, src, 8) instead of
+ * *(unaligned_uint64_t*) = *(const unaligned_uint64_t *)src
+ * avoids compiler warnings about source data may be uninitialized
+ * [-Wmaybe-uninitialized].
+ *
+ * Note: Using "n & X" generates 3-byte "test" instructions,
+ * instead of "n >= X", which would generate 4-byte "cmp" instructions.
+ */
+static __rte_always_inline void *
+rte_mov16_or_less(void *dst, const void *src, size_t n)
+{
+ /* Faster way when size is known at build time. */
+ if (__rte_constant(n)) {
+ if (n == 2)
+ return memcpy(dst, src, 2);
+ if (n == 4)
+ return memcpy(dst, src, 4);
+ if (n == 6) /* 4 + 2 */
+ return memcpy(dst, src, 6);
+ if (n == 8)
+ return memcpy(dst, src, 8);
+ if (n == 10) /* 8 + 2 */
+ return memcpy(dst, src, 10);
+ if (n == 12) /* 8 + 4 */
+ return memcpy(dst, src, 12);
+ if (n == 16) {
+ rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+ return dst;
+ }
+ }
+
+ if (n & 0x18) { /* n >= 8 */
+ /* copy 8 ~ 16 bytes */
+ memcpy(dst, src, 8);
+ memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src - 8 + n, 8);
+ } else if (n & 0x4) {
+ /* copy 4 ~ 7 bytes */
+ memcpy(dst, src, 4);
+ memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src - 4 + n, 4);
+ } else if (n & 0x2) {
+ /* copy 2 ~ 3 bytes */
+ memcpy(dst, src, 2);
+ memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src - 2 + n, 2);
+ } else if (n & 0x1) {
+ /* copy 1 byte */
+ memcpy(dst, src, 1);
+ }
+ return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with 17 (or 16) < n <= 32.
+ */
+static __rte_always_inline void *
+rte_mov17_to_32(void *dst, const void *src, size_t n)
+{
+ /* Faster way when size is known at build time. */
+ if (__rte_constant(n)) {
+ if (n == 16) {
+ rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+ return dst;
+ }
+ if (n == 18) /* 16 + 2 */
+ return memcpy(dst, src, 18);
+ if (n == 20) /* 16 + 4 */
+ return memcpy(dst, src, 20);
+ if (n == 24) /* 16 + 8 */
+ return memcpy(dst, src, 24);
+ if (n == 32) {
+ rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+ return dst;
+ }
+ }
+
+ /* copy 17 (or 16) ~ 32 bytes */
+ rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+ return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with 33 (or 32) < n <= 64.
+ */
+static __rte_always_inline void *
+rte_mov33_to_64(void *dst, const void *src, size_t n)
+{
+ /* Faster way when size is known at build time. */
+ if (__rte_constant(n)) {
+ if (n == 32) {
+ rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+ return dst;
+ }
+ if (n == 36) /* 32 + 4 */
+ return memcpy(dst, src, 36);
+ if (n == 40) /* 32 + 8 */
+ return memcpy(dst, src, 40);
+ if (n == 48) {
+ rte_mov48((uint8_t *)dst, (const uint8_t *)src);
+ return dst;
+ }
+ if (n == 64) {
+ rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+ return dst;
+ }
+ }
+
+ /* copy 33 (or 32) ~ 64 bytes */
+#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined RTE_MEMCPY_AVX
+ rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+ rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+#else /* SSE implementation */
+ rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+ rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+ if (n > 48)
+ rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+#endif
+ return dst;
+}
+
#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
/**
@@ -232,45 +333,21 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
}
}
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
{
void *ret = dst;
size_t dstofss;
size_t bits;
- /**
- * Copy less than 16 bytes
- */
- if (n < 16) {
- return rte_mov15_or_less(dst, src, n);
- }
-
/**
* Fast way when copy size doesn't exceed 512 bytes
*/
- if (__rte_constant(n) && n == 32) {
- rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- return ret;
- }
- if (n <= 32) {
- rte_mov16((uint8_t *)dst, (const uint8_t *)src);
- if (__rte_constant(n) && n == 16)
- return ret; /* avoid (harmless) duplicate copy */
- rte_mov16((uint8_t *)dst - 16 + n,
- (const uint8_t *)src - 16 + n);
- return ret;
- }
- if (__rte_constant(n) && n == 64) {
- rte_mov64((uint8_t *)dst, (const uint8_t *)src);
- return ret;
- }
- if (n <= 64) {
- rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- rte_mov32((uint8_t *)dst - 32 + n,
- (const uint8_t *)src - 32 + n);
- return ret;
- }
if (n <= 512) {
if (n >= 256) {
n -= 256;
@@ -381,41 +458,21 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
}
}
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
{
void *ret = dst;
size_t dstofss;
size_t bits;
- /**
- * Copy less than 16 bytes
- */
- if (n < 16) {
- return rte_mov15_or_less(dst, src, n);
- }
-
/**
* Fast way when copy size doesn't exceed 256 bytes
*/
- if (__rte_constant(n) && n == 32) {
- rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- return ret;
- }
- if (n <= 32) {
- rte_mov16((uint8_t *)dst, (const uint8_t *)src);
- if (__rte_constant(n) && n == 16)
- return ret; /* avoid (harmless) duplicate copy */
- rte_mov16((uint8_t *)dst - 16 + n,
- (const uint8_t *)src - 16 + n);
- return ret;
- }
- if (n <= 64) {
- rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- rte_mov32((uint8_t *)dst - 32 + n,
- (const uint8_t *)src - 32 + n);
- return ret;
- }
if (n <= 256) {
if (n >= 128) {
n -= 128;
@@ -573,38 +630,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
} \
}
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
void *ret = dst;
size_t dstofss;
size_t srcofs;
- /**
- * Copy less than 16 bytes
- */
- if (n < 16) {
- return rte_mov15_or_less(dst, src, n);
- }
-
/**
* Fast way when copy size doesn't exceed 512 bytes
*/
- if (n <= 32) {
- rte_mov16((uint8_t *)dst, (const uint8_t *)src);
- if (__rte_constant(n) && n == 16)
- return ret; /* avoid (harmless) duplicate copy */
- rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
- return ret;
- }
- if (n <= 64) {
- rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- if (n > 48)
- rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
- rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
- return ret;
- }
if (n <= 128) {
goto COPY_BLOCK_128_BACK15;
}
@@ -696,44 +737,16 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
#endif /* __AVX512F__ */
+/**
+ * Copy bytes from one vector register size aligned location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
static __rte_always_inline void *
-rte_memcpy_aligned(void *dst, const void *src, size_t n)
+rte_memcpy_aligned_more_than_64(void *dst, const void *src, size_t n)
{
void *ret = dst;
- /* Copy size < 16 bytes */
- if (n < 16) {
- return rte_mov15_or_less(dst, src, n);
- }
-
- /* Copy 16 <= size <= 32 bytes */
- if (__rte_constant(n) && n == 32) {
- rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- return ret;
- }
- if (n <= 32) {
- rte_mov16((uint8_t *)dst, (const uint8_t *)src);
- if (__rte_constant(n) && n == 16)
- return ret; /* avoid (harmless) duplicate copy */
- rte_mov16((uint8_t *)dst - 16 + n,
- (const uint8_t *)src - 16 + n);
-
- return ret;
- }
-
- /* Copy 32 < size <= 64 bytes */
- if (__rte_constant(n) && n == 64) {
- rte_mov64((uint8_t *)dst, (const uint8_t *)src);
- return ret;
- }
- if (n <= 64) {
- rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- rte_mov32((uint8_t *)dst - 32 + n,
- (const uint8_t *)src - 32 + n);
-
- return ret;
- }
-
/* Copy 64 bytes blocks */
for (; n > 64; n -= 64) {
rte_mov64((uint8_t *)dst, (const uint8_t *)src);
@@ -751,10 +764,18 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
+ if (n <= 16)
+ return rte_mov16_or_less(dst, src, n);
+ if (n <= 32)
+ return rte_mov17_to_32(dst, src, n);
+ if (n <= 64)
+ return rte_mov33_to_64(dst, src, n);
+
+ /* Implementation for size > 64 bytes depends on alignment with vector register size. */
if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
- return rte_memcpy_aligned(dst, src, n);
+ return rte_memcpy_aligned_more_than_64(dst, src, n);
else
- return rte_memcpy_generic(dst, src, n);
+ return rte_memcpy_generic_more_than_64(dst, src, n);
}
#undef ALIGNMENT_MASK
--
2.43.0
^ permalink raw reply [flat|nested] 8+ messages in thread