* [PATCH] eal/riscv: optimize memcpy for small copies under 64 bytes
@ 2025-10-09 6:30 Sun Yuechi
2025-10-09 8:17 ` Stephen Hemminger
0 siblings, 1 reply; 3+ messages in thread
From: Sun Yuechi @ 2025-10-09 6:30 UTC (permalink / raw)
To: dev; +Cc: Sun Yuechi, Stanisław Kardach, Bruce Richardson
Improve the rte_memcpy implementation on the RISC-V platform for copies
smaller than 64 bytes, based on the ARM implementation.
The dedicated handling of copies smaller than 64 bytes yields significant
performance benefits, while the impact at 64 bytes and above is minimal.
This optimization is disabled by default as a conservative measure,
since future glibc versions may include similar improvements that
could conflict with this implementation.
Use RTE_ARCH_RISCV_MEMCPY to enable this optimization.
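For readers who do not want to dig through the diff, the core of the sub-64-byte
path is the overlapping-copy trick sketched below. This is only an illustrative
standalone version: copy_8_to_15() is a made-up name, and it goes through
fixed-size memcpy() calls instead of the raw pointer casts used in the patch.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Copy 8..15 bytes with two possibly overlapping 8-byte accesses:
 * one anchored at the start of the buffer, one at its end.
 * The same idea, with 16-byte blocks, covers the 16..63 byte range. */
static inline void
copy_8_to_15(uint8_t *dst, const uint8_t *src, size_t n)
{
	uint64_t head, tail;

	memcpy(&head, src, sizeof(head));          /* bytes [0, 8)   */
	memcpy(&tail, src + n - 8, sizeof(tail));  /* bytes [n-8, n) */
	memcpy(dst, &head, sizeof(head));
	memcpy(dst + n - 8, &tail, sizeof(tail));
}

At any reasonable optimization level the fixed-size memcpy() calls should compile
to plain 8-byte loads and stores, so the effect matches the uint64_t accesses in
rte_memcpy_lt16() below.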
Signed-off-by: Sun Yuechi <sunyuechi@iscas.ac.cn>
---
config/riscv/meson.build | 5 ++
lib/eal/riscv/include/rte_memcpy.h | 122 +++++++++++++++++++++++++++++
2 files changed, 127 insertions(+)
diff --git a/config/riscv/meson.build b/config/riscv/meson.build
index f93ea3e145..73fd0ab4da 100644
--- a/config/riscv/meson.build
+++ b/config/riscv/meson.build
@@ -20,6 +20,11 @@ dpdk_conf.set('RTE_FORCE_INTRINSICS', 1)
# common flags to all riscv builds, with lowest priority
flags_common = [
+ # Accelerate rte_memcpy for copies smaller than 64 bytes. Be sure to run
+ # the unit test (memcpy_perf_autotest) to verify performance improvements.
+ # Refer to notes in source file (lib/eal/riscv/include/rte_memcpy.h) for
+ # more details.
+ ['RTE_ARCH_RISCV_MEMCPY', false],
['RTE_ARCH_RISCV', true],
['RTE_CACHE_LINE_SIZE', 64],
# Manually set wall time clock frequency for the target. If 0, then it is
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index d8a942c5d2..ae6e79e2fc 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -2,6 +2,7 @@
* Copyright(c) 2022 StarFive
* Copyright(c) 2022 SiFive
* Copyright(c) 2022 Semihalf
+ * Copyright(c) 2025 ISCAS
*/
#ifndef RTE_MEMCPY_RISCV_H
@@ -14,6 +15,125 @@
#include "generic/rte_memcpy.h"
+#ifdef RTE_ARCH_RISCV_MEMCPY
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This implementation is improved from eal/arm/include/rte_memcpy_64.h,
+ * targeting only cases of < 64 bytes.
+ * Currently shows significant performance improvement over various glibc versions,
+ * but is disabled by default due to uncertainty about potential performance
+ * degradation in future versions.
+ * You can use memcpy_perf_autotest to test the performance.
+ */
+
+static __rte_always_inline
+void rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+ __uint128_t *dst128 = (__uint128_t *)dst;
+ const __uint128_t *src128 = (const __uint128_t *)src;
+ *dst128 = *src128;
+}
+
+static __rte_always_inline
+void rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+ __uint128_t *dst128 = (__uint128_t *)dst;
+ const __uint128_t *src128 = (const __uint128_t *)src;
+ const __uint128_t x0 = src128[0], x1 = src128[1];
+ dst128[0] = x0;
+ dst128[1] = x1;
+}
+
+static __rte_always_inline
+void rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+ __uint128_t *dst128 = (__uint128_t *)dst;
+ const __uint128_t *src128 = (const __uint128_t *)src;
+ const __uint128_t x0 = src128[0], x1 = src128[1], x2 = src128[2];
+ dst128[0] = x0;
+ dst128[1] = x1;
+ dst128[2] = x2;
+}
+
+static __rte_always_inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+ memcpy(dst, src, 64);
+}
+
+static __rte_always_inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+ memcpy(dst, src, 128);
+}
+
+static __rte_always_inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+ memcpy(dst, src, 256);
+}
+
+static __rte_always_inline void
+rte_memcpy_lt16(uint8_t *dst, const uint8_t *src, size_t n)
+{
+ if (n & 0x08) {
+ /* copy 8 ~ 15 bytes */
+ *(uint64_t *)dst = *(const uint64_t *)src;
+ *(uint64_t *)(dst - 8 + n) = *(const uint64_t *)(src - 8 + n);
+ } else if (n & 0x04) {
+ /* copy 4 ~ 7 bytes */
+ *(uint32_t *)dst = *(const uint32_t *)src;
+ *(uint32_t *)(dst - 4 + n) = *(const uint32_t *)(src - 4 + n);
+ } else if (n & 0x02) {
+ /* copy 2 ~ 3 bytes */
+ *(uint16_t *)dst = *(const uint16_t *)src;
+ *(uint16_t *)(dst - 2 + n) = *(const uint16_t *)(src - 2 + n);
+ } else if (n & 0x01) {
+ /* copy 1 byte */
+ *dst = *src;
+ }
+}
+
+static __rte_always_inline void
+rte_memcpy_ge16_lt64(uint8_t *dst, const uint8_t *src, size_t n)
+{
+ if (n == 16) {
+ rte_mov16(dst, src);
+ } else if (n <= 32) {
+ rte_mov16(dst, src);
+ rte_mov16(dst - 16 + n, src - 16 + n);
+ } else if (n <= 48) {
+ rte_mov32(dst, src);
+ rte_mov16(dst - 16 + n, src - 16 + n);
+ } else {
+ rte_mov48(dst, src);
+ rte_mov16(dst - 16 + n, src - 16 + n);
+ }
+}
+
+static __rte_always_inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+ if (n >= 64)
+ return memcpy(dst, src, n);
+ if (n < 16) {
+ rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
+ return dst;
+ }
+ rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
+ return dst;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#else /* RTE_ARCH_RISCV_MEMCPY */
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -60,4 +180,6 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
}
#endif
+#endif /* RTE_ARCH_RISCV_MEMCPY */
+
#endif /* RTE_MEMCPY_RISCV_H */
--
2.51.0
* Re: [PATCH] eal/riscv: optimize memcpy for small copies under 64 bytes
2025-10-09 6:30 [PATCH] eal/riscv: optimize memcpy for small copies under 64 bytes Sun Yuechi
@ 2025-10-09 8:17 ` Stephen Hemminger
2025-10-09 8:43 ` sunyuechi
0 siblings, 1 reply; 3+ messages in thread
From: Stephen Hemminger @ 2025-10-09 8:17 UTC (permalink / raw)
To: Sun Yuechi; +Cc: dev, Stanisław Kardach, Bruce Richardson
How does this compare to glibc/gcc memcpy? I would like to see rte_memcpy
go away.
* Re: Re: [PATCH] eal/riscv: optimize memcpy for small copies under 64 bytes
2025-10-09 8:17 ` Stephen Hemminger
@ 2025-10-09 8:43 ` sunyuechi
0 siblings, 0 replies; 3+ messages in thread
From: sunyuechi @ 2025-10-09 8:43 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: dev, Stanisław Kardach, Bruce Richardson
On RISC-V with gcc 14.2 and glibc 2.39, the test data for copies under 64 bytes
is as follows; the impact is quite significant. Each entry shows rte_memcpy
ticks versus glibc memcpy ticks, with the relative change in parentheses.
================================= 16B aligned =================================
1 0 - 1(-80.05%) 1 - 1(-10.75%) 2 - 11(-85.80%) 2 - 11(-79.70%)
2 0 - 1(-74.77%) 2 - 8(-68.29%) 3 - 12(-74.64%) 3 - 12(-74.01%)
3 0 - 1(-78.86%) 2 - 8(-69.46%) 3 - 12(-75.36%) 3 - 13(-76.22%)
4 0 - 1(-86.02%) 2 - 8(-74.27%) 2 - 12(-79.61%) 3 - 13(-79.16%)
5 0 - 1(-86.29%) 2 - 8(-74.46%) 2 - 12(-79.00%) 3 - 13(-79.73%)
6 0 - 1(-86.22%) 2 - 8(-73.82%) 2 - 12(-79.18%) 3 - 13(-79.23%)
7 0 - 1(-89.68%) 2 - 8(-73.81%) 2 - 12(-79.54%) 3 - 13(-78.50%)
8 0 - 1(-90.05%) 2 - 8(-75.88%) 3 - 12(-75.96%) 3 - 13(-77.93%)
9 0 - 1(-89.85%) 2 - 8(-76.17%) 3 - 12(-76.74%) 3 - 13(-77.49%)
12 0 - 1(-91.32%) 2 - 8(-76.92%) 3 - 12(-75.98%) 3 - 13(-77.69%)
15 0 - 1(-91.46%) 2 - 8(-77.27%) 3 - 12(-76.36%) 3 - 13(-78.41%)
16 0 - 1(-89.70%) 2 - 8(-74.81%) 3 - 12(-75.35%) 3 - 12(-77.52%)
17 0 - 1(-81.57%) 3 - 8(-60.92%) 4 - 12(-66.96%) 5 - 13(-64.20%)
31 0 - 1(-87.58%) 3 - 8(-62.66%) 4 - 12(-68.48%) 5 - 13(-65.12%)
32 0 - 1(-84.06%) 3 - 8(-67.48%) 4 - 12(-68.33%) 4 - 13(-65.48%)
33 0 - 1(-74.64%) 4 - 8(-50.45%) 6 - 12(-51.16%) 7 - 13(-45.94%)
63 0 - 1(-79.33%) 5 - 9(-47.70%) 6 - 13(-49.47%) 9 - 13(-32.40%)
================================== Unaligned ==================================
1 0 - 1(-80.49%) 1 - 1(-15.31%) 2 - 11(-85.77%) 2 - 12(-80.65%)
2 0 - 1(-78.18%) 2 - 8(-72.49%) 3 - 12(-75.34%) 3 - 12(-74.68%)
3 0 - 1(-79.49%) 2 - 8(-73.40%) 3 - 12(-75.05%) 3 - 14(-76.72%)
4 0 - 1(-86.27%) 2 - 8(-74.48%) 2 - 12(-79.56%) 3 - 13(-79.13%)
5 0 - 1(-86.59%) 2 - 8(-74.54%) 2 - 12(-79.04%) 3 - 12(-77.99%)
6 0 - 1(-87.06%) 2 - 8(-74.01%) 2 - 12(-79.09%) 3 - 12(-78.04%)
7 0 - 1(-90.86%) 2 - 8(-74.09%) 2 - 12(-79.86%) 3 - 12(-78.32%)
8 0 - 1(-89.78%) 2 - 8(-77.01%) 3 - 12(-76.51%) 3 - 14(-79.29%)
9 0 - 1(-89.19%) 2 - 8(-75.99%) 3 - 12(-76.25%) 3 - 14(-79.28%)
12 0 - 1(-89.11%) 2 - 8(-74.25%) 3 - 12(-74.19%) 3 - 14(-77.68%)
15 0 - 1(-90.02%) 2 - 8(-75.39%) 3 - 12(-74.73%) 3 - 15(-78.67%)
16 0 - 1(-80.96%) 2 - 8(-74.49%) 3 - 14(-78.17%) 3 - 14(-77.11%)
17 0 - 1(-66.29%) 3 - 10(-66.85%) 4 - 15(-72.42%) 6 - 14(-61.18%)
31 0 - 1(-86.84%) 3 - 9(-63.75%) 4 - 13(-65.88%) 5 - 15(-64.67%)
32 0 - 1(-87.37%) 3 - 8(-61.34%) 4 - 12(-65.09%) 6 - 15(-64.04%)
33 0 - 2(-84.23%) 5 - 10(-46.98%) 6 - 14(-57.24%) 8 - 16(-53.88%)
63 0 - 2(-81.59%) 5 - 11(-52.12%) 7 - 16(-54.06%) 10 - 20(-50.01%)
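If you want to spot-check a single size outside of memcpy_perf_autotest, a rough
sketch is below. bench_copy() and its parameters are illustrative only, and a
real harness (like the autotest) also needs barriers to stop the compiler from
hoisting or dropping the copies.

#include <stdint.h>
#include <stddef.h>
#include <rte_cycles.h>
#include <rte_memcpy.h>

/* Average ticks per rte_memcpy() call for a fixed size n.
 * Treat the result only as a quick sanity check; without extra
 * barriers the compiler may optimize parts of the loop away. */
static uint64_t
bench_copy(uint8_t *dst, const uint8_t *src, size_t n, unsigned int iters)
{
	uint64_t start = rte_rdtsc();
	unsigned int i;

	for (i = 0; i < iters; i++)
		rte_memcpy(dst, src, n);
	return (rte_rdtsc() - start) / iters;
}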
end of thread, other threads:[~2025-10-09 8:43 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-09 6:30 [PATCH] eal/riscv: optimize memcpy for small copies under 64 bytes Sun Yuechi
2025-10-09 8:17 ` Stephen Hemminger
2025-10-09 8:43 ` sunyuechi