How does this compare to glibc/gcc memcpy? I would like to see rte_memcpy
go away

On Thu, Oct 9, 2025, 08:32 Sun Yuechi wrote:

> Improve rte_memcpy implementation on RISC-V platform for sizes under
> 64 bytes, based on the ARM implementation.
>
> Enhanced handling for cases smaller than 64 bytes shows very significant
> performance benefits, while the impact is minimal after 64 bytes.
>
> This optimization is disabled by default as a conservative measure,
> since future glibc versions may include similar improvements that
> could conflict with this implementation.
>
> Use RTE_ARCH_RISCV_MEMCPY to enable this optimization.
>
> Signed-off-by: Sun Yuechi
> ---
>  config/riscv/meson.build           |   5 ++
>  lib/eal/riscv/include/rte_memcpy.h | 122 +++++++++++++++++++++++++++++
>  2 files changed, 127 insertions(+)
>
> diff --git a/config/riscv/meson.build b/config/riscv/meson.build
> index f93ea3e145..73fd0ab4da 100644
> --- a/config/riscv/meson.build
> +++ b/config/riscv/meson.build
> @@ -20,6 +20,11 @@ dpdk_conf.set('RTE_FORCE_INTRINSICS', 1)
>
>  # common flags to all riscv builds, with lowest priority
>  flags_common = [
> +    # Accelerate rte_memcpy for copies smaller than 64 bytes. Be sure to run
> +    # the unit test (memcpy_perf_autotest) to verify performance improvements.
> +    # Refer to notes in source file (lib/eal/riscv/include/rte_memcpy.h) for
> +    # more details.
> +    ['RTE_ARCH_RISCV_MEMCPY', false],
>      ['RTE_ARCH_RISCV', true],
>      ['RTE_CACHE_LINE_SIZE', 64],
>      # Manually set wall time clock frequency for the target. If 0, then it is
> diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
> index d8a942c5d2..ae6e79e2fc 100644
> --- a/lib/eal/riscv/include/rte_memcpy.h
> +++ b/lib/eal/riscv/include/rte_memcpy.h
> @@ -2,6 +2,7 @@
>   * Copyright(c) 2022 StarFive
>   * Copyright(c) 2022 SiFive
>   * Copyright(c) 2022 Semihalf
> + * Copyright(c) 2025 ISCAS
>   */
>
>  #ifndef RTE_MEMCPY_RISCV_H
> @@ -14,6 +15,125 @@
>
>  #include "generic/rte_memcpy.h"
>
> +#ifdef RTE_ARCH_RISCV_MEMCPY
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/*
> + * This implementation is improved from eal/arm/include/rte_memcpy_64.h,
> + * targeting only cases of < 64 bytes.
> + * Currently shows significant performance improvement over various glibc versions,
> + * but is disabled by default due to uncertainty about potential performance
> + * degradation in future versions.
> + * You can use memcpy_perf_autotest to test the performance.
> + */
> +
> +static __rte_always_inline
> +void rte_mov16(uint8_t *dst, const uint8_t *src)
> +{
> +	__uint128_t *dst128 = (__uint128_t *)dst;
> +	const __uint128_t *src128 = (const __uint128_t *)src;
> +	*dst128 = *src128;
> +}
> +
> +static __rte_always_inline
> +void rte_mov32(uint8_t *dst, const uint8_t *src)
> +{
> +	__uint128_t *dst128 = (__uint128_t *)dst;
> +	const __uint128_t *src128 = (const __uint128_t *)src;
> +	const __uint128_t x0 = src128[0], x1 = src128[1];
> +	dst128[0] = x0;
> +	dst128[1] = x1;
> +}
> +
> +static __rte_always_inline
> +void rte_mov48(uint8_t *dst, const uint8_t *src)
> +{
> +	__uint128_t *dst128 = (__uint128_t *)dst;
> +	const __uint128_t *src128 = (const __uint128_t *)src;
> +	const __uint128_t x0 = src128[0], x1 = src128[1], x2 = src128[2];
> +	dst128[0] = x0;
> +	dst128[1] = x1;
> +	dst128[2] = x2;
> +}
> +
> +static __rte_always_inline void
> +rte_mov64(uint8_t *dst, const uint8_t *src)
> +{
> +	memcpy(dst, src, 64);
> +}
> +
> +static __rte_always_inline void
> +rte_mov128(uint8_t *dst, const uint8_t *src)
> +{
> +	memcpy(dst, src, 128);
> +}
> +
> +static __rte_always_inline void
> +rte_mov256(uint8_t *dst, const uint8_t *src)
> +{
> +	memcpy(dst, src, 256);
> +}
> +
> +static __rte_always_inline void
> +rte_memcpy_lt16(uint8_t *dst, const uint8_t *src, size_t n)
> +{
> +	if (n & 0x08) {
> +		/* copy 8 ~ 15 bytes */
> +		*(uint64_t *)dst = *(const uint64_t *)src;
> +		*(uint64_t *)(dst - 8 + n) = *(const uint64_t *)(src - 8 + n);
> +	} else if (n & 0x04) {
> +		/* copy 4 ~ 7 bytes */
> +		*(uint32_t *)dst = *(const uint32_t *)src;
> +		*(uint32_t *)(dst - 4 + n) = *(const uint32_t *)(src - 4 + n);
> +	} else if (n & 0x02) {
> +		/* copy 2 ~ 3 bytes */
> +		*(uint16_t *)dst = *(const uint16_t *)src;
> +		*(uint16_t *)(dst - 2 + n) = *(const uint16_t *)(src - 2 + n);
> +	} else if (n & 0x01) {
> +		/* copy 1 byte */
> +		*dst = *src;
> +	}
> +}
> +
> +static __rte_always_inline void
> +rte_memcpy_ge16_lt64(uint8_t *dst, const uint8_t *src, size_t n)
> +{
> +	if (n == 16) {
> +		rte_mov16(dst, src);
> +	} else if (n <= 32) {
> +		rte_mov16(dst, src);
> +		rte_mov16(dst - 16 + n, src - 16 + n);
> +	} else if (n <= 48) {
> +		rte_mov32(dst, src);
> +		rte_mov16(dst - 16 + n, src - 16 + n);
> +	} else {
> +		rte_mov48(dst, src);
> +		rte_mov16(dst - 16 + n, src - 16 + n);
> +	}
> +}
> +
> +static __rte_always_inline void *
> +rte_memcpy(void *dst, const void *src, size_t n)
> +{
> +	if (n >= 64)
> +		return memcpy(dst, src, n);
> +	if (n < 16) {
> +		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
> +		return dst;
> +	}
> +	rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
> +	return dst;
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#else /* RTE_ARCH_RISCV_MEMCPY */
> +
>  #ifdef __cplusplus
>  extern "C" {
>  #endif
> @@ -60,4 +180,6 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
>  }
>  #endif
>
> +#endif /* RTE_ARCH_RISCV_MEMCPY */
> +
>  #endif /* RTE_MEMCPY_RISCV_H */
> --
> 2.51.0