> How does this compare to glibc/gcc memcpy? I would like to see rte_memcpy go away
Improve rte_memcpy implementation on RISC-V platform for sizes under
64 bytes, based on the ARM implementation.
The enhanced handling of copies smaller than 64 bytes shows very significant
performance benefits, while the impact for sizes of 64 bytes and above is minimal.
This optimization is disabled by default as a conservative measure,
since future glibc versions may include similar improvements that
could conflict with this implementation.
Use RTE_ARCH_RISCV_MEMCPY to enable this optimization.
Signed-off-by: Sun Yuechi <sunyuechi@iscas.ac.cn>
---
config/riscv/meson.build | 5 ++
lib/eal/riscv/include/rte_memcpy.h | 122 +++++++++++++++++++++++++++++
2 files changed, 127 insertions(+)
diff --git a/config/riscv/meson.build b/config/riscv/meson.build
index f93ea3e145..73fd0ab4da 100644
--- a/config/riscv/meson.build
+++ b/config/riscv/meson.build
@@ -20,6 +20,11 @@ dpdk_conf.set('RTE_FORCE_INTRINSICS', 1)
# common flags to all riscv builds, with lowest priority
flags_common = [
+ # Accelerate rte_memcpy for copies smaller than 64 bytes. Be sure to run
+ # the unit test (memcpy_perf_autotest) to verify performance improvements.
+ # Refer to notes in source file (lib/eal/riscv/include/rte_memcpy.h) for
+ # more details.
+ ['RTE_ARCH_RISCV_MEMCPY', false],
['RTE_ARCH_RISCV', true],
['RTE_CACHE_LINE_SIZE', 64],
# Manually set wall time clock frequency for the target. If 0, then it is
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index d8a942c5d2..ae6e79e2fc 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -2,6 +2,7 @@
* Copyright(c) 2022 StarFive
* Copyright(c) 2022 SiFive
* Copyright(c) 2022 Semihalf
+ * Copyright(c) 2025 ISCAS
*/
#ifndef RTE_MEMCPY_RISCV_H
@@ -14,6 +15,125 @@
#include "generic/rte_memcpy.h"
+#ifdef RTE_ARCH_RISCV_MEMCPY
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This implementation is improved from eal/arm/include/rte_memcpy_64.h,
+ * targeting only cases of < 64 bytes.
+ * Currently shows significant performance improvement over various glibc versions,
+ * but is disabled by default due to uncertainty about potential performance
+ * degradation in future versions.
+ * You can use memcpy_perf_autotest to test the performance.
+ */
+
+/*
+ * Copy exactly 16 bytes from src to dst (regions must not overlap).
+ *
+ * A constant-size memcpy() is folded by the compiler into the same wide
+ * load/store pair as the previous __uint128_t cast, but without the
+ * strict-aliasing violation and the undefined behaviour the cast incurs
+ * when dst/src are not 16-byte aligned. Also consistent with rte_mov64()
+ * and friends below, which already use memcpy().
+ */
+static __rte_always_inline
+void rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 16);
+}
+
+/*
+ * Copy exactly 32 bytes from src to dst (regions must not overlap).
+ *
+ * memcpy() with a constant size avoids the strict-aliasing and alignment
+ * undefined behaviour of dereferencing a __uint128_t pointer derived from
+ * a byte pointer; the compiler still emits the same two 16-byte moves and
+ * schedules loads before stores on its own.
+ */
+static __rte_always_inline
+void rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 32);
+}
+
+/*
+ * Copy exactly 48 bytes from src to dst (regions must not overlap).
+ *
+ * memcpy() with a constant size avoids the strict-aliasing and alignment
+ * undefined behaviour of the former __uint128_t loads/stores while
+ * compiling to the same three 16-byte moves.
+ */
+static __rte_always_inline
+void rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 48);
+}
+
+/* Copy exactly 64 bytes; the constant size lets the compiler inline the call. */
+static __rte_always_inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 64);
+}
+
+/* Copy exactly 128 bytes; the constant size lets the compiler inline the call. */
+static __rte_always_inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 128);
+}
+
+/* Copy exactly 256 bytes; the constant size lets the compiler inline the call. */
+static __rte_always_inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 256);
+}
+
+/*
+ * Copy n < 16 bytes from src to dst (regions must not overlap).
+ *
+ * Uses the classic head+tail trick: one move from the start and one move
+ * ending exactly at dst + n, which overlap when n is not a multiple of the
+ * move width. Type punning goes through fixed-size memcpy() instead of
+ * casting byte pointers to uint64_t/uint32_t/uint16_t pointers, which was
+ * strict-aliasing and misalignment undefined behaviour; the compiler folds
+ * these constant-size calls into the same plain loads and stores.
+ * Both values are loaded before either store, so the tail load cannot
+ * observe the head store.
+ */
+static __rte_always_inline void
+rte_memcpy_lt16(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	if (n & 0x08) {
+		/* copy 8 ~ 15 bytes */
+		uint64_t head64, tail64;
+		memcpy(&head64, src, 8);
+		memcpy(&tail64, src - 8 + n, 8);
+		memcpy(dst, &head64, 8);
+		memcpy(dst - 8 + n, &tail64, 8);
+	} else if (n & 0x04) {
+		/* copy 4 ~ 7 bytes */
+		uint32_t head32, tail32;
+		memcpy(&head32, src, 4);
+		memcpy(&tail32, src - 4 + n, 4);
+		memcpy(dst, &head32, 4);
+		memcpy(dst - 4 + n, &tail32, 4);
+	} else if (n & 0x02) {
+		/* copy 2 ~ 3 bytes */
+		uint16_t head16, tail16;
+		memcpy(&head16, src, 2);
+		memcpy(&tail16, src - 2 + n, 2);
+		memcpy(dst, &head16, 2);
+		memcpy(dst - 2 + n, &tail16, 2);
+	} else if (n & 0x01) {
+		/* copy 1 byte */
+		*dst = *src;
+	}
+}
+
+/*
+ * Copy 16 <= n < 64 bytes from src to dst (regions must not overlap).
+ *
+ * Strategy: copy the leading 16/32/48 bytes with the widest fitting move,
+ * then patch the remainder with a single 16-byte move that ends exactly at
+ * dst + n. The tail move may overlap the leading copy; for n == 16 it
+ * would duplicate it entirely, so it is skipped.
+ */
+static __rte_always_inline void
+rte_memcpy_ge16_lt64(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	if (n > 48)
+		rte_mov48(dst, src);
+	else if (n > 32)
+		rte_mov32(dst, src);
+	else
+		rte_mov16(dst, src);
+	if (n != 16)
+		rte_mov16(dst - 16 + n, src - 16 + n);
+}
+
+/*
+ * Drop-in memcpy() replacement: copy n bytes from src to dst and return dst.
+ * Small copies (n < 64) are handled inline by the helpers above; everything
+ * else is delegated to the libc memcpy(). Regions must not overlap.
+ */
+static __rte_always_inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+	uint8_t *d = (uint8_t *)dst;
+	const uint8_t *s = (const uint8_t *)src;
+
+	if (n < 16)
+		rte_memcpy_lt16(d, s, n);
+	else if (n < 64)
+		rte_memcpy_ge16_lt64(d, s, n);
+	else
+		return memcpy(dst, src, n);
+	return dst;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#else /* RTE_ARCH_RISCV_MEMCPY */
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -60,4 +180,6 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
}
#endif
+#endif /* RTE_ARCH_RISCV_MEMCPY */
+
#endif /* RTE_MEMCPY_RISCV_H */
--
2.51.0