DPDK patches and discussions
* [PATCH v1 0/2] Optimization Summary for RISC-V rte_memcpy
@ 2025-10-16  9:09 Qiguo Chen
  2025-10-16  9:09 ` [PATCH v1 1/2] riscv support rte_memcpy in vector Qiguo Chen
  2025-10-16  9:09 ` [PATCH v1 2/2] benchmark report for rte_memcpy Qiguo Chen
  0 siblings, 2 replies; 25+ messages in thread
From: Qiguo Chen @ 2025-10-16  9:09 UTC (permalink / raw)
  To: stanislaw.kardach, sunyuechi, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 965 bytes --]

I've implemented optimizations to rte_memcpy targeting RISC-V architectures,
achieving an average 10%~15% reduction in execution time for data sizes between
129 and 1024 bytes (sizes in the 1025~1600 range gain little).
These enhancements draw inspiration from the x86 implementation, specifically
focusing on:
1) Alignment handling for unaligned scenarios
2) Vector configuration tuning
3) Strategic prefetching
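
For large copies the idea, in short, is: align the store address to 64
bytes first, then copy 128-byte blocks with grouped vector registers while
prefetching a few cache lines ahead. A simplified sketch of that path
follows (it assumes the helpers _rte_mov64, _rte_mov128blocks, _rte_mov and
_rte_mov_aligned introduced in patch 1; the real code is inline RVV
assembly):

/* Sketch of the n > 384 path from patch 1 (not the literal patch code). */
static inline void *
copy_large_sketch(uint8_t *dst, const uint8_t *src, size_t n)
{
	void *ret = dst;

	/* 1) Alignment handling: make the destination 64-byte aligned. */
	size_t off = (uintptr_t)dst & 0x3F;
	if (off > 0) {
		off = 64 - off;
		_rte_mov64(dst, src);
		dst += off; src += off; n -= off;
	}

	/* 2) + 3) Copy 128-byte blocks with m8 register groups; the block
	 * helpers issue prefetch.r/prefetch.w (zicbop) ahead of the loads. */
	if ((uintptr_t)src & 0x3F) {
		size_t bn = n - (n & 0x7F);
		_rte_mov128blocks(dst, src, bn);
		_rte_mov(dst + bn, src + bn, (uint32_t)(n & 0x7F));
	} else {
		_rte_mov_aligned(dst, src, (uint32_t)n);
	}
	return ret;
}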

- Patch 0/2: Cover letter
- Patch 1/2: Base implementation
- Patch 2/2: Benchmark report


Tested on SG2044 (VLEN=128)


Qiguo Chen (2):
  riscv support rte_memcpy in vector
  benchmark report for rte_memcpy

 .mailmap                           |   1 +
 benchmark_report.txt               | 149 ++++++++++++++
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 310 ++++++++++++++++++++++++++++-
 4 files changed, 472 insertions(+), 2 deletions(-)
 create mode 100644 benchmark_report.txt

-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 1861 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v1 1/2] riscv support rte_memcpy in vector
  2025-10-16  9:09 [PATCH v1 0/2] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
@ 2025-10-16  9:09 ` Qiguo Chen
  2025-10-17  5:29   ` sunyuechi
  2025-10-17  9:36   ` [PATCH v2 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
  2025-10-16  9:09 ` [PATCH v1 2/2] benchmark report for rte_memcpy Qiguo Chen
  1 sibling, 2 replies; 25+ messages in thread
From: Qiguo Chen @ 2025-10-16  9:09 UTC (permalink / raw)
  To: stanislaw.kardach, sunyuechi, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 9838 bytes --]

This patch implements RISC-V vector intrinsics
to accelerate memory copy operations for byte range (129~1600).

Signed-off-by: Qiguo Chen <chen.qiguo@zte.com.cn>
---
 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 310 ++++++++++++++++++++++++++++-
 3 files changed, 323 insertions(+), 2 deletions(-)

diff --git a/.mailmap b/.mailmap
index 08e5ec8560..178c5f44f4 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1285,6 +1285,7 @@ Qian Hao <qi_an_hao@126.com>
 Qian Xu <qian.q.xu@intel.com>
 Qiao Liu <qiao.liu@intel.com>
 Qi Fu <qi.fu@intel.com>
+Qiguo Chen <chen.qiguo@zte.com.cn>
 Qimai Xiao <qimaix.xiao@intel.com>
 Qiming Chen <chenqiming_huawei@163.com>
 Qiming Yang <qiming.yang@intel.com>
diff --git a/config/riscv/meson.build b/config/riscv/meson.build
index f3daea0c0e..abba474b5e 100644
--- a/config/riscv/meson.build
+++ b/config/riscv/meson.build
@@ -146,6 +146,20 @@ if (riscv_extension_macros and
     endif
 endif
 
+# detect extensions
+# Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
+if (riscv_extension_macros and
+    (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
+  if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
+      or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
+      message('Compiling with the zicbop extension')
+      machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
+  else
+    warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
+  endif
+endif
+
+
 # apply flags
 foreach flag: dpdk_flags
     if flag.length() > 0
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index d8a942c5d2..6f8cb0d4a4 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -11,6 +11,7 @@
 #include <string.h>
 
 #include "rte_common.h"
+#include <rte_branch_prediction.h>
 
 #include "generic/rte_memcpy.h"
 
@@ -18,6 +19,290 @@
 extern "C" {
 #endif
 
+
+#if defined(RTE_RISCV_FEATURE_V) && !(defined(RTE_RISCV_FEATURE_PREFETCH))
+#undef RTE_RISCV_FEATURE_V
+#endif
+
+
+#if defined(RTE_RISCV_FEATURE_V)
+
+#include "rte_cpuflags.h"
+
+#define RISCV_VLENB   16
+#define MEMCPY_GLIBC       (1U << 0)
+#define MEMCPY_RISCV       (1U << 1)
+#define ALIGNMENT_MASK_128   0x7F
+#define ALIGNMENT_MASK_64    0x3F
+#define ALIGNMENT_MASK_16    0xF
+
+static uint8_t memcpy_alg = MEMCPY_GLIBC;
+
+
+static __rte_always_inline void
+memcpy_prefetch64_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 64(%0)\n"
+		"prefetch.w 64(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0)\n"
+		"prefetch.w 128(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_2(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0);"
+		"prefetch.w 128(%1);"
+		"prefetch.r 192(%0);"
+		"prefetch.w 192(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+
+static __rte_always_inline void
+_rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 32;
+	asm volatile (
+	     "vsetvli t1, %2, e8, m2, ta, ma\n"
+	     "vle8.v v2, (%1)\n"
+	     "vse8.v v2, (%0)"
+	     :: "r"(dst), "r"(src), "r"(n)
+	     : "v2", "v3", "t1", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 64;
+	asm volatile (
+		"vsetvli t3, %2, e8, m4, ta, ma\n"
+		"vle8.v v8, (%1)\n"
+		"vse8.v v8, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		:  "v8", "v9", "v10", "v11", "t3", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 128;
+	asm volatile (
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"vse8.v v16, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy_prefetch128_2(src, dst);
+	_rte_mov128(dst, src);
+	_rte_mov128(dst + 128, src + 128);
+}
+
+static __rte_always_inline void
+_rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	asm volatile (
+		"prefetch.r 64(%1)\n"
+		"prefetch.w 64(%0)\n"
+		"prefetch.r 128(%1)\n"
+		"prefetch.w 128(%0)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.w 192(%0)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.w 256(%0)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.w 320(%0)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.w 384(%0)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.w 448(%0)\n"
+		"prefetch.r 512(%1)\n"
+		"li t6, 512\n"
+		"3:\n"
+		"li t5, 128;"
+		"vsetvli zero, t5, e8, m8, ta, ma\n"
+		"1:;"
+		"bgt %2, t6, 4f\n"
+		"j 2f\n"
+		"4:\n"
+		"prefetch.r 576(%1)\n"
+		"prefetch.r 640(%1)\n"
+		"2:\n"
+		"vle8.v   v16, (%1)\n"
+		"add      %1, %1, t5\n"
+		"vse8.v   v16, (%0)\n"
+		"add      %0, %0, t5\n"
+		"sub      %2, %2, t5\n"
+		"bnez     %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t5", "t6", "memory"
+	);
+}
+
+static __rte_always_inline void
+_rte_mov(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov_aligned(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"prefetch.r 128(%1)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"li t6, 640\n"
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"blt %2, t6, 3f\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"3:\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "t6", "memory"
+	 );
+}
+
+static __rte_always_inline void *
+_rte_memcpy_generic(uint8_t       *dst, const uint8_t *src, size_t n)
+{
+	void *ret = dst;
+	size_t dstofss;
+	uint32_t bn;
+
+	if (n <= 384) {
+		if (n >= 256) {
+			memcpy_prefetch128_2(src, dst);
+			n -= 256;
+			_rte_mov128(dst, src);
+			_rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
+			src = (const uint8_t *)src + 256;
+			dst = (uint8_t *)dst + 256;
+		}
+		if (n >= 128) {
+			memcpy_prefetch128_1(src, dst);
+			n -= 128;
+			_rte_mov128(dst, src);
+			src = (const uint8_t *)src + 128;
+			dst = (uint8_t *)dst + 128;
+		}
+
+		if (n >= 64) {
+			memcpy_prefetch64_1(src, dst);
+			n -= 64;
+			_rte_mov64(dst, src);
+			src = (const uint8_t *)src + 64;
+			dst = (uint8_t *)dst + 64;
+		}
+
+		if (n > 32) {
+			_rte_mov32(dst, src);
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+			return ret;
+		}
+
+		if (n > 0) {
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+		}
+		return ret;
+	}
+
+	/**
+	 * Make store aligned when copy size exceeds 256 bytes.
+	 */
+	dstofss = (uintptr_t)dst & ALIGNMENT_MASK_64;
+	if (dstofss > 0) {
+		dstofss = 64 - dstofss;
+		n -= dstofss;
+		_rte_mov64(dst, src);
+		src = (const uint8_t *)src + dstofss;
+		dst = (uint8_t *)dst + dstofss;
+	}
+
+	/**
+	 * Copy 128-byte blocks
+	 */
+	if ((uintptr_t)src & ALIGNMENT_MASK_64)	{
+		bn = n - (n & ALIGNMENT_MASK_128);
+		_rte_mov128blocks(dst, src, bn);
+		n = n & ALIGNMENT_MASK_128;
+		src = (const uint8_t *)src + bn;
+		dst = (uint8_t *)dst + bn;
+		_rte_mov(dst, src, n);
+	} else
+		_rte_mov_aligned(dst, src, n);
+
+	return ret;
+}
+
+static __rte_always_inline void *
+_rte_memcpy(void *dst, const void *src, size_t n)
+{
+	return _rte_memcpy_generic((uint8_t *)dst, (const uint8_t *)src, n);
+}
+#endif
+
+/*----------------------api---------------------------------------------------*/
+static __rte_always_inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely((memcpy_alg == MEMCPY_RISCV) && (n >= 128) && (n < 2048)))
+		return _rte_memcpy(dst, src, n);
+	/*else*/
+#endif
+		return memcpy(dst, src, n);
+}
+
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -51,10 +336,31 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	memcpy(dst, src, 256);
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely(memcpy_alg == MEMCPY_RISCV))
+		_rte_mov256(dst, src);
+	else
+#endif
+		memcpy(dst, src, 256);
+}
+/*----------------------------------------------------------------------------*/
+#if defined(RTE_RISCV_FEATURE_V)
+static inline long
+riscv_vlenb(void)
+{
+	long vlenb;
+	asm ("csrr %0, 0xc22" : "=r"(vlenb));
+	return vlenb;
 }
 
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
+RTE_INIT(rte_vect_memcpy_init)
+{
+	long vlenb = riscv_vlenb();
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V) && (vlenb >= RISCV_VLENB))
+		memcpy_alg = MEMCPY_RISCV;
+}
+#endif
+
 
 #ifdef __cplusplus
 }
-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 26648 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v1 2/2] benchmark report for rte_memcpy
  2025-10-16  9:09 [PATCH v1 0/2] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
  2025-10-16  9:09 ` [PATCH v1 1/2] riscv support rte_memcpy in vector Qiguo Chen
@ 2025-10-16  9:09 ` Qiguo Chen
  1 sibling, 0 replies; 25+ messages in thread
From: Qiguo Chen @ 2025-10-16  9:09 UTC (permalink / raw)
  To: stanislaw.kardach, sunyuechi, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 12710 bytes --]

Benchmark results show a 10~15% reduction in execution time for
data sizes of 129~1024 bytes.

Signed-off-by: Qiguo Chen <chen.qiguo@zte.com.cn>
---
 benchmark_report.txt | 149 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 149 insertions(+)
 create mode 100644 benchmark_report.txt

diff --git a/benchmark_report.txt b/benchmark_report.txt
new file mode 100644
index 0000000000..499d3fc5f0
--- /dev/null
+++ b/benchmark_report.txt
@@ -0,0 +1,149 @@
+================================= 16B aligned =================================
+      1  0 -  0( 24.27%)   1 -  1( 13.14%)   2 -  2( -3.48%)   3 -  3(  2.70%) 
+      2  0 -  0( 21.92%)   1 -  1(  4.75%)   2 -  2( -3.58%)   3 -  3(  3.05%) 
+      3  0 -  0( 23.54%)   1 -  1(  9.74%)   2 -  2( -4.24%)   3 -  3(  2.26%) 
+      4  0 -  0( 22.54%)   1 -  1(  7.10%)   2 -  2( -3.96%)   3 -  3(  0.99%) 
+      5  0 -  0( 22.73%)   1 -  1(  9.02%)   2 -  2( -3.18%)   3 -  3(  1.60%) 
+      6  0 -  0( 56.22%)   1 -  1(  8.21%)   2 -  2( -3.65%)   3 -  3(  1.10%) 
+      7  0 -  0( 23.07%)   1 -  1(  6.82%)   2 -  2( -3.53%)   3 -  3(  3.46%) 
+      8  0 -  0( 23.49%)   1 -  1(  7.70%)   2 -  2( -0.26%)   3 -  3(  2.22%) 
+      9  0 -  0( 56.70%)   1 -  1(  7.04%)   2 -  2( -3.75%)   3 -  3(  2.52%) 
+     12  0 -  0( 23.87%)   1 -  1(  5.80%)   2 -  2( -3.76%)   3 -  3(  1.49%) 
+     15  0 -  0( 22.95%)   1 -  1(  5.01%)   2 -  2( -3.52%)   3 -  3(  2.82%) 
+     16  0 -  0( 57.49%)   1 -  1(  7.30%)   2 -  2(  0.19%)   3 -  3(  3.19%) 
+     17  0 -  0( 53.78%)   3 -  2( 51.65%)   4 -  3( 37.35%)   4 -  3( 23.94%) 
+     31  0 -  0( 27.02%)   3 -  2( 51.99%)   4 -  3( 37.34%)   4 -  3( 24.09%) 
+     32  0 -  0( 56.82%)   3 -  2( 50.42%)   4 -  3( 39.73%)   4 -  3( 25.04%) 
+     33  0 -  0( 30.60%)   3 -  3( 30.94%)   6 -  4( 46.89%)   6 -  5( 26.21%) 
+     63  0 -  0( 16.84%)   4 -  3( 21.57%)   6 -  5( 31.74%)   7 -  6( 18.01%) 
+     64  0 -  0( 21.98%)   4 -  3( 21.35%)   6 -  5( 36.13%)   7 -  6( 20.05%) 
+     65  0 -  0( 20.60%)   5 -  4( 31.05%)   6 -  5( 24.16%)   8 -  7(  5.69%) 
+    127  0 -  0( 18.22%)   6 -  6(  9.34%)   8 -  7(  9.72%)  11 - 11(  2.73%) 
+    128  0 -  0( 39.80%)   6 -  6( -0.93%)   8 -  7(  9.65%)  11 - 11(  4.63%) 
+    129  0 -  1(-50.92%)   6 -  7( -4.00%)   9 - 12(-28.67%)  11 - 16(-34.28%) 
+    191  1 -  1(-45.09%)   9 -  9(  5.04%)  12 - 13(-11.82%)  13 - 16(-15.66%) 
+    192  1 -  1(-43.44%)   7 -  9(-18.67%)  12 - 13( -5.92%)  13 - 15(-18.50%) 
+    193  1 -  1(-24.84%)   9 -  9( -5.60%)  12 - 13( -7.44%)  14 - 17(-14.15%) 
+    255  1 -  1(-23.65%)  11 - 11( -4.57%)  13 - 13( -3.46%)  16 - 18( -8.81%) 
+    256  1 -  1( 16.87%)   9 - 11(-13.78%)  14 - 13(  8.58%)  16 - 16(  5.20%) 
+    257  1 -  1(-15.41%)  12 - 13( -6.90%)  15 - 16( -6.71%)  18 - 19( -6.35%) 
+    319  1 -  1(-12.93%)  15 - 19(-18.96%)  17 - 17( -0.55%)  21 - 21( -1.25%) 
+    320  1 -  1(-16.38%)  10 - 17(-39.05%)  18 - 17(  4.65%)  20 - 20( -2.94%) 
+    321  1 -  1( -6.24%)  12 - 19(-36.30%)  18 - 17(  6.65%)  20 - 22( -8.86%) 
+    383  1 -  1( -4.06%)  16 - 20(-17.87%)  18 - 17(  9.18%)  23 - 23(  1.42%) 
+    384  1 -  1( 12.87%)  11 - 18(-36.31%)  18 - 18(  1.92%)  20 - 22( -8.22%) 
+    385  2 -  2( 26.46%)  11 - 20(-46.76%)  15 - 20(-22.07%)  19 - 24(-18.04%) 
+    447  2 -  1( 55.03%)  14 - 21(-34.10%)  15 - 20(-22.75%)  21 - 27(-23.99%) 
+    448  2 -  1( 18.00%)  12 - 20(-38.82%)  16 - 20(-20.82%)  21 - 25(-18.74%) 
+    449  4 -  2(141.90%)  13 - 22(-42.36%)  16 - 20(-22.84%)  21 - 26(-18.35%) 
+    511  3 -  2( 57.68%)  14 - 23(-37.60%)  16 - 20(-18.33%)  21 - 28(-22.10%) 
+    512  2 -  1( 27.98%)  12 - 21(-40.06%)  17 - 20(-15.21%)  21 - 26(-19.65%) 
+    513  2 -  2( 22.93%)  13 - 23(-43.25%)  18 - 22(-19.53%)  23 - 31(-26.70%) 
+    767  7 -  6( 29.60%)  21 - 29(-28.37%)  29 - 23( 29.04%)  32 - 35( -9.38%) 
+    768  6 -  3( 96.51%)  19 - 27(-29.32%)  23 - 21(  6.62%)  31 - 33( -6.22%) 
+    769  7 -  4( 94.30%)  21 - 28(-27.50%)  25 - 24(  3.23%)  32 - 37(-12.00%) 
+   1023  8 -  5( 72.12%)  25 - 34(-27.27%)  34 - 26( 33.59%)  37 - 42(-11.18%) 
+   1024  8 -  6( 41.80%)  23 - 32(-26.49%)  26 - 25(  4.23%)  37 - 40( -7.72%) 
+   1025  8 -  7(  9.36%)  25 - 34(-25.78%)  29 - 27(  7.68%)  38 - 42( -8.87%) 
+   1518  7 -  4( 71.47%)  34 - 45(-24.17%)  45 - 30( 47.69%)  51 - 53( -4.93%) 
+   1522 10 -  8( 19.45%)  35 - 45(-23.62%)  46 - 31( 47.81%)  51 - 52( -0.49%) 
+   1536 10 -  6( 62.55%)  32 - 42(-23.80%)  37 - 29( 29.19%)  50 - 51( -2.70%) 
+   1600 11 -  9( 20.69%)  34 - 43(-21.19%)  47 - 32( 45.63%)  49 - 53( -7.68%) 
+   2048 13 - 10( 26.67%)  53 - 53( -0.25%)  37 - 35(  7.16%)  61 - 62( -0.90%) 
+   2560 16 - 13( 25.07%)  62 - 59(  5.23%)  44 - 45( -0.71%)  71 - 70(  1.05%) 
+   3072 20 - 20(  1.91%)  72 - 71(  1.91%)  49 - 50( -3.36%)  82 - 82( -0.59%) 
+   3584 26 - 26( -0.81%)  81 - 81( -0.17%)  58 - 57(  1.17%)  92 - 91(  1.28%) 
+   4096 25 - 27( -9.39%)  90 - 90(  0.54%)  64 - 63(  0.67%) 102 -102(  0.70%) 
+   4608 31 - 27( 18.45%)  99 - 99( -0.00%)  70 - 70(  0.47%) 111 -111(  0.09%) 
+   5120 41 - 35( 16.65%) 108 -108( -0.28%)  78 - 77(  0.52%) 120 -120(  0.37%) 
+   5632 46 - 47( -2.05%) 117 -117(  0.12%)  85 - 85(  0.38%) 130 -130( -0.19%) 
+   6144 52 - 44( 18.06%) 126 -126(  0.01%)  94 - 93(  0.80%) 139 -138(  0.27%) 
+   6656 27 - 41(-33.88%) 135 -134(  0.33%) 102 -102(  0.52%) 149 -148(  1.11%) 
+   7168 56 - 27(104.91%) 143 -142(  0.33%) 110 -110(  0.15%) 157 -157(  0.07%) 
+   7680 66 - 70( -5.18%) 152 -152(  0.03%) 118 -117(  0.27%) 166 -166(  0.17%) 
+   8192 69 - 44( 57.50%) 161 -160(  0.45%) 125 -124(  0.35%) 176 -175(  0.41%) 
+------- ----------------- ----------------- ----------------- -----------------
+C     6  0 -  0( -1.10%)   1 -  1(  9.45%)   2 -  2( -0.19%)   3 -  3(  2.77%) 
+C    64  0 -  0(  0.60%)   3 -  3(  1.28%)   4 -  4( -0.18%)   6 -  6(  0.50%) 
+C   128  0 -  0( 35.46%)   6 -  6( -3.33%)   8 -  7(  7.02%)  11 - 11(  1.72%) 
+C   192  0 -  1(-48.74%)   7 -  8(-20.51%)  12 - 13(-12.42%)  12 - 15(-22.26%) 
+C   256  1 -  1( 11.88%)   9 - 11(-15.05%)  13 - 13(  0.17%)  15 - 16( -1.65%) 
+C   512  2 -  1( 27.80%)  13 - 22(-40.28%)  16 - 19(-12.48%)  22 - 25(-13.57%) 
+C   768  2 -  2( 11.66%)  18 - 26(-30.06%)  23 - 21(  5.93%)  31 - 33( -7.73%) 
+C  1024  6 -  4( 32.78%)  23 - 31(-25.36%)  26 - 24(  5.56%)  37 - 39( -6.05%) 
+C  1536  9 -  7( 33.48%)  32 - 43(-23.71%)  37 - 29( 26.46%)  50 - 50( -0.05%) 
+================================== Unaligned ==================================
+      1  0 -  0( 32.71%)   1 -  1(  7.91%)   2 -  2(  0.99%)   3 -  3(  3.36%) 
+      2  0 -  0( 33.59%)   1 -  1(  6.69%)   2 -  2(  1.04%)   3 -  3(  1.19%) 
+      3  0 -  0( 33.20%)   1 -  1(  8.36%)   2 -  2(  0.87%)   3 -  3(  3.03%) 
+      4  0 -  0( 33.41%)   1 -  1(  6.50%)   2 -  2(  1.03%)   3 -  3(  2.77%) 
+      5  0 -  0( 32.00%)   1 -  1(  6.83%)   2 -  2(  1.16%)   3 -  3(  2.28%) 
+      6  0 -  0( 33.29%)   1 -  1(  7.94%)   2 -  2(  0.93%)   3 -  3(  0.17%) 
+      7  0 -  0( 32.69%)   1 -  1(  6.01%)   2 -  2(  0.93%)   3 -  2(  4.20%) 
+      8  0 -  0( 33.99%)   1 -  1(  5.62%)   2 -  2(  0.92%)   3 -  3(  1.09%) 
+      9  0 -  0( 32.63%)   1 -  1(  6.33%)   2 -  2(  1.13%)   3 -  3(  2.01%) 
+     12  0 -  0( 33.10%)   1 -  1(  7.30%)   4 -  3( 47.16%)   5 -  3( 41.00%) 
+     15  0 -  0( 32.30%)   1 -  1(  6.96%)   4 -  3( 47.34%)   5 -  3( 43.19%) 
+     16  0 -  0( 18.41%)   3 -  2( 68.45%)   4 -  3( 62.20%)   5 -  3( 35.47%) 
+     17  0 -  0(  7.81%)   4 -  3( 37.51%)   5 -  3( 59.08%)   6 -  4( 40.54%) 
+     31  0 -  0( 33.54%)   4 -  3( 31.79%)   6 -  4( 47.27%)   6 -  4( 39.17%) 
+     32  0 -  0( 32.98%)   4 -  3( 29.22%)   6 -  4( 46.89%)   6 -  5( 35.76%) 
+     33  0 -  0( 27.50%)   4 -  4(  6.37%)   6 -  5( 34.85%)   7 -  6( 19.56%) 
+     63  0 -  0( 44.23%)   5 -  5( 19.68%)   7 -  7(  3.62%)   9 -  9(  7.96%) 
+     64  0 -  0( 29.92%)   5 -  5( 14.45%)   7 -  7(  3.11%)   9 -  9(  7.57%) 
+     65  0 -  0(  3.00%)   6 -  5(  6.09%)   8 -  7(  2.61%)  10 - 10(  4.75%) 
+    127  1 -  0( 16.12%)   9 -  8( 10.20%)  12 - 12( -0.66%)  14 - 14(  2.06%) 
+    128  1 -  1( 11.58%)   8 -  8(  2.75%)  12 - 12( -2.33%)  13 - 14( -7.63%) 
+    129  1 -  1(-48.77%)  10 - 12(-13.37%)  12 - 16(-22.85%)  14 - 22(-35.87%) 
+    191  1 -  1(-36.20%)  11 - 12( -4.61%)  13 - 18(-27.05%)  17 - 27(-39.94%) 
+    192  1 -  1(-31.62%)  11 - 12( -9.55%)  14 - 18(-18.64%)  18 - 28(-34.80%) 
+    193  1 -  2(-36.96%)  13 - 13(  0.19%)  15 - 17(-12.88%)  20 - 28(-29.62%) 
+    255  1 -  2(-35.46%)  16 - 18(-12.89%)  17 - 17(  0.23%)  22 - 28(-21.79%) 
+    256  1 -  1(  7.89%)  17 - 19(-10.33%)  17 - 18( -3.25%)  24 - 28(-16.62%) 
+    257  1 -  2(-28.10%)  16 - 19(-11.20%)  19 - 20( -6.80%)  23 - 32(-27.58%) 
+    319  1 -  2(-21.72%)  18 - 19( -6.08%)  21 - 21(  3.22%)  25 - 33(-22.94%) 
+    320  1 -  2(-23.13%)  16 - 21(-19.75%)  21 - 21(  2.12%)  26 - 33(-22.39%) 
+    321  1 -  2(-22.90%)  16 - 21(-22.21%)  21 - 20(  2.73%)  26 - 33(-22.90%) 
+    383  2 -  2(-22.35%)  19 - 20( -7.58%)  21 - 20(  0.49%)  29 - 33(-12.06%) 
+    384  2 -  2(  3.32%)  16 - 21(-22.26%)  20 - 20(  2.75%)  28 - 33(-13.58%) 
+    385  2 -  2(-36.41%)  14 - 21(-32.50%)  18 - 23(-22.20%)  27 - 35(-23.63%) 
+    447  2 -  2(  4.13%)  14 - 20(-28.61%)  16 - 23(-29.60%)  26 - 35(-23.79%) 
+    448  2 -  2(-21.37%)  14 - 22(-35.54%)  18 - 23(-21.17%)  27 - 36(-23.90%) 
+    449  2 -  2(-26.56%)  14 - 22(-36.19%)  18 - 22(-18.43%)  27 - 35(-22.43%) 
+    511  2 -  3(-31.11%)  14 - 22(-35.23%)  19 - 22(-16.50%)  29 - 35(-16.05%) 
+    512  2 -  2( -5.05%)  15 - 24(-37.63%)  19 - 22(-12.75%)  29 - 35(-15.81%) 
+    513  2 -  3(-27.14%)  15 - 24(-38.02%)  19 - 24(-20.39%)  30 - 36(-18.37%) 
+    767  3 -  4(-24.58%)  21 - 28(-26.97%)  23 - 25( -8.20%)  34 - 40(-13.70%) 
+    768  3 -  3( -0.56%)  21 - 29(-27.01%)  23 - 25( -5.67%)  34 - 39(-13.71%) 
+    769  3 -  3(-20.43%)  21 - 29(-26.40%)  23 - 27(-13.86%)  34 - 41(-15.93%) 
+   1023  5 -  5( -7.38%)  23 - 32(-27.22%)  27 - 28( -3.98%)  39 - 44(-11.72%) 
+   1024  5 -  6(-17.62%)  25 - 33(-25.40%)  27 - 28( -2.44%)  39 - 43(-11.14%) 
+   1025  5 -  4(  3.62%)  25 - 33(-25.57%)  27 - 29( -8.17%)  39 - 46(-16.26%) 
+   1518 10 - 10( -4.77%)  33 - 42(-20.47%)  36 - 34(  6.36%)  53 - 54( -2.01%) 
+   1522 10 - 11( -5.28%)  34 - 42(-18.86%)  36 - 33(  8.35%)  53 - 53( -1.57%) 
+   1536  7 -  8(-12.20%)  34 - 42(-19.11%)  39 - 33( 17.70%)  53 - 54( -0.54%) 
+   1600 11 -  9( 20.88%)  35 - 43(-18.54%)  31 - 35(-10.26%)  50 - 55( -9.91%) 
+   2048 15 -  8( 99.56%)  51 - 51(  0.24%)  40 - 39(  1.22%)  64 - 62(  3.14%) 
+   2560 17 - 16(  1.33%)  59 - 60( -0.76%)  47 - 47(  0.75%)  73 - 73( -0.56%) 
+   3072 22 - 20(  8.49%)  68 - 68(  0.32%)  53 - 54( -2.01%)  82 - 83( -0.37%) 
+   3584 30 - 32( -4.26%)  76 - 76(  0.19%)  61 - 60(  1.03%)  91 - 92( -0.92%) 
+   4096 34 - 28( 22.80%)  85 - 86( -0.61%)  67 - 67(  0.03%) 100 -100( -0.08%) 
+   4608 34 - 36( -4.01%)  93 - 93(  0.17%)  74 - 75( -0.47%) 109 -109(  0.44%) 
+   5120 35 - 29( 20.42%) 102 -102( -0.11%)  82 - 82( -0.08%) 119 -119(  0.53%) 
+   5632 44 - 41(  8.71%) 110 -110(  0.14%)  89 - 90( -0.16%) 128 -127(  0.16%) 
+   6144 40 - 48(-17.75%) 119 -119(  0.12%)  98 - 99( -0.31%) 138 -137(  0.56%) 
+   6656 53 - 54( -0.83%) 127 -127(  0.14%) 107 -107( -0.07%) 146 -145(  0.50%) 
+   7168 56 - 59( -5.16%) 136 -136(  0.18%) 115 -115( -0.13%) 155 -155( -0.34%) 
+   7680 71 - 68(  4.02%) 144 -144(  0.01%) 123 -123( -0.06%) 164 -163(  0.47%) 
+   8192 76 - 65( 17.61%) 152 -153( -0.36%) 130 -130( -0.04%) 174 -174(  0.13%) 
+------- ----------------- ----------------- ----------------- -----------------
+C     6  0 -  0(  1.10%)   1 -  1(  8.55%)   2 -  2(  0.06%)   3 -  3(  4.86%) 
+C    64  0 -  0( -3.20%)   5 -  5(  0.54%)   7 -  7(  0.27%)   9 -  9( -0.50%) 
+C   128  1 -  0( 25.53%)   9 -  8(  3.56%)  12 - 12( -3.53%)  13 - 14( -8.98%) 
+C   192  1 -  1(-37.27%)  11 - 12(-10.10%)  13 - 17(-23.33%)  17 - 28(-38.96%) 
+C   256  1 -  1(  3.35%)  17 - 19( -8.99%)  16 - 18( -7.62%)  23 - 29(-20.07%) 
+C   512  2 -  2( -6.31%)  14 - 24(-38.90%)  19 - 22(-13.61%)  29 - 35(-16.90%) 
+C   768  3 -  3( -0.45%)  21 - 29(-25.43%)  23 - 25( -6.64%)  34 - 40(-13.59%) 
+C  1024  6 -  6( -5.63%)  25 - 33(-24.23%)  27 - 28( -3.26%)  39 - 43(-10.94%) 
+C  1536  8 -  8(  3.04%)  34 - 43(-19.62%)  38 - 33( 15.48%)  53 - 53( -0.43%) 
+======= ================= ================= ================= =================
-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 33307 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v1 1/2] riscv support rte_memcpy in vector
  2025-10-16  9:09 ` [PATCH v1 1/2] riscv support rte_memcpy in vector Qiguo Chen
@ 2025-10-17  5:29   ` sunyuechi
  2025-10-17 10:10     ` chen.qiguo
  2025-10-17  9:36   ` [PATCH v2 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
  1 sibling, 1 reply; 25+ messages in thread
From: sunyuechi @ 2025-10-17  5:29 UTC (permalink / raw)
  To: Qiguo Chen; +Cc: stanislaw.kardach, stephen, dev, bruce.richardson

[-- Attachment #1: Type: text/plain, Size: 12951 bytes --]

> riscv support rte_memcpy in vector
> This patch implements RISC-V vector intrinsics


Please adjust the title and commit message to mention that zicbop has been introduced, and that intrinsics are not currently being used


config/riscv/meson.build


> # detect extensions
> # Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
> if (riscv_extension_macros and
>     (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
>   if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
>       or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
>       message('Compiling with the zicbop extension')
>       machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
>   else
>     warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
>   endif
> endif


The implementation does not involve intrinsics


>     16  0 -  0( 57.49%)   1 -  1(  7.30%)   2 -  2(  0.19%)   3 -  3(  3.19%) 
>     17  0 -  0( 53.78%)   3 -  2( 51.65%)   4 -  3( 37.35%)   4 -  3( 23.94%) 
>     31  0 -  0( 27.02%)   3 -  2( 51.99%)   4 -  3( 37.34%)   4 -  3( 24.09%) 
>     32  0 -  0( 56.82%)   3 -  2( 50.42%)   4 -  3( 39.73%)   4 -  3( 25.04%) 
>     33  0 -  0( 30.60%)   3 -  3( 30.94%)   6 -  4( 46.89%)   6 -  5( 26.21%) 
>     63  0 -  0( 16.84%)   4 -  3( 21.57%)   6 -  5( 31.74%)   7 -  6( 18.01%) 
>     64  0 -  0( 21.98%)   4 -  3( 21.35%)   6 -  5( 36.13%)   7 -  6( 20.05%) 


It looks like there's a performance degradation in the 0-128 range, can you fix it?


eal/riscv/include/rte_memcpy.h


> #define ALIGNMENT_MASK_16    0xF


unused


>/*else*/


Please remove /*else*/


> static __rte_always_inline void *
> _rte_memcpy(void *dst, const void *src, size_t n)
> {
> return _rte_memcpy_generic((uint8_t *)dst, (const uint8_t *)src, n);
> }


No need for an extra function call; you can write the implementation directly in the function
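
i.e. roughly this shape (only to illustrate the suggestion, untested):

static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
#if defined(RTE_RISCV_FEATURE_V)
	if (likely((memcpy_alg == MEMCPY_RISCV) && (n >= 128) && (n < 2048)))
		return _rte_memcpy_generic((uint8_t *)dst, (const uint8_t *)src, n);
#endif
	return memcpy(dst, src, n);
}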







[-- Attachment #2: Type: text/html, Size: 30440 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v2 0/1]  Optimization Summary for RISC-V rte_memcpy
  2025-10-16  9:09 ` [PATCH v1 1/2] riscv support rte_memcpy in vector Qiguo Chen
  2025-10-17  5:29   ` sunyuechi
@ 2025-10-17  9:36   ` Qiguo Chen
  2025-10-17  9:36     ` [PATCH v2 1/1] riscv support rte_memcpy in vector Qiguo Chen
  1 sibling, 1 reply; 25+ messages in thread
From: Qiguo Chen @ 2025-10-17  9:36 UTC (permalink / raw)
  To: sunyuechi, stanislaw.kardach, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 1008 bytes --]

Changes in v2:
     1) Modified some code and descriptions according to Sunyuechi's suggestions.
     2) Removed benchmark_report.txt to avoid warnings.

[PATCH v1]
I've implemented optimizations to rte_memcpy targeting RISC-V architectures,
achieving an average 10%~15% reduction in execution time for data sizes between
129 and 1024 bytes (sizes in the 1025~1600 range gain little).
These enhancements draw inspiration from the x86 implementation, specifically
focusing on:
1) Alignment handling for unaligned scenarios
2) Vector configuration tuning
3) Strategic prefetching with zicbop

- Patch 0/2: Cover letter
- Patch 1/2: Base implementation
- Patch 2/2: Benchmark report


Tested on SG2044 (VLEN=128)


Qiguo Chen (1):
  riscv support rte_memcpy in vector

 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 310 ++++++++++++++++++++++++++++-
 3 files changed, 323 insertions(+), 2 deletions(-)

-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 1912 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v2 1/1] riscv support rte_memcpy in vector
  2025-10-17  9:36   ` [PATCH v2 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
@ 2025-10-17  9:36     ` Qiguo Chen
  2025-10-20  9:43       ` sunyuechi
  2025-10-20 12:08       ` [PATCH v3 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
  0 siblings, 2 replies; 25+ messages in thread
From: Qiguo Chen @ 2025-10-17  9:36 UTC (permalink / raw)
  To: sunyuechi, stanislaw.kardach, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 9845 bytes --]

This patch uses RISC-V vector instructions and zicbop prefetching to
optimize memory copies in the 129~1600 byte range.

Signed-off-by: Qiguo Chen <chen.qiguo@zte.com.cn>
---
 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 310 ++++++++++++++++++++++++++++-
 3 files changed, 323 insertions(+), 2 deletions(-)

diff --git a/.mailmap b/.mailmap
index 08e5ec8560..178c5f44f4 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1285,6 +1285,7 @@ Qian Hao <qi_an_hao@126.com>
 Qian Xu <qian.q.xu@intel.com>
 Qiao Liu <qiao.liu@intel.com>
 Qi Fu <qi.fu@intel.com>
+Qiguo Chen <chen.qiguo@zte.com.cn>
 Qimai Xiao <qimaix.xiao@intel.com>
 Qiming Chen <chenqiming_huawei@163.com>
 Qiming Yang <qiming.yang@intel.com>
diff --git a/config/riscv/meson.build b/config/riscv/meson.build
index f3daea0c0e..abba474b5e 100644
--- a/config/riscv/meson.build
+++ b/config/riscv/meson.build
@@ -146,6 +146,20 @@ if (riscv_extension_macros and
     endif
 endif
 
+# detect extensions
+# Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
+if (riscv_extension_macros and
+    (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
+  if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
+      or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
+      message('Compiling with the zicbop extension')
+      machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
+  else
+    warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
+  endif
+endif
+
+
 # apply flags
 foreach flag: dpdk_flags
     if flag.length() > 0
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index d8a942c5d2..6f8cb0d4a4 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -11,6 +11,7 @@
 #include <string.h>
 
 #include "rte_common.h"
+#include <rte_branch_prediction.h>
 
 #include "generic/rte_memcpy.h"
 
@@ -18,6 +19,290 @@
 extern "C" {
 #endif
 
+
+#if defined(RTE_RISCV_FEATURE_V) && !(defined(RTE_RISCV_FEATURE_PREFETCH))
+#undef RTE_RISCV_FEATURE_V
+#endif
+
+
+#if defined(RTE_RISCV_FEATURE_V)
+
+#include "rte_cpuflags.h"
+
+#define RISCV_VLENB   16
+#define MEMCPY_GLIBC       (1U << 0)
+#define MEMCPY_RISCV       (1U << 1)
+#define ALIGNMENT_MASK_128   0x7F
+#define ALIGNMENT_MASK_64    0x3F
+#define ALIGNMENT_MASK_16    0xF
+
+static uint8_t memcpy_alg = MEMCPY_GLIBC;
+
+
+static __rte_always_inline void
+memcpy_prefetch64_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 64(%0)\n"
+		"prefetch.w 64(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0)\n"
+		"prefetch.w 128(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_2(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0);"
+		"prefetch.w 128(%1);"
+		"prefetch.r 192(%0);"
+		"prefetch.w 192(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+
+static __rte_always_inline void
+_rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 32;
+	asm volatile (
+	     "vsetvli t1, %2, e8, m2, ta, ma\n"
+	     "vle8.v v2, (%1)\n"
+	     "vse8.v v2, (%0)"
+	     :: "r"(dst), "r"(src), "r"(n)
+	     : "v2", "v3", "t1", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 64;
+	asm volatile (
+		"vsetvli t3, %2, e8, m4, ta, ma\n"
+		"vle8.v v8, (%1)\n"
+		"vse8.v v8, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		:  "v8", "v9", "v10", "v11", "t3", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 128;
+	asm volatile (
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"vse8.v v16, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy_prefetch128_2(src, dst);
+	_rte_mov128(dst, src);
+	_rte_mov128(dst + 128, src + 128);
+}
+
+static __rte_always_inline void
+_rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	asm volatile (
+		"prefetch.r 64(%1)\n"
+		"prefetch.w 64(%0)\n"
+		"prefetch.r 128(%1)\n"
+		"prefetch.w 128(%0)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.w 192(%0)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.w 256(%0)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.w 320(%0)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.w 384(%0)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.w 448(%0)\n"
+		"prefetch.r 512(%1)\n"
+		"li t6, 512\n"
+		"3:\n"
+		"li t5, 128;"
+		"vsetvli zero, t5, e8, m8, ta, ma\n"
+		"1:;"
+		"bgt %2, t6, 4f\n"
+		"j 2f\n"
+		"4:\n"
+		"prefetch.r 576(%1)\n"
+		"prefetch.r 640(%1)\n"
+		"2:\n"
+		"vle8.v   v16, (%1)\n"
+		"add      %1, %1, t5\n"
+		"vse8.v   v16, (%0)\n"
+		"add      %0, %0, t5\n"
+		"sub      %2, %2, t5\n"
+		"bnez     %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t5", "t6", "memory"
+	);
+}
+
+static __rte_always_inline void
+_rte_mov(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov_aligned(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"prefetch.r 128(%1)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"li t6, 640\n"
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"blt %2, t6, 3f\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"3:\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "t6", "memory"
+	 );
+}
+
+static __rte_always_inline void *
+_rte_memcpy_generic(uint8_t       *dst, const uint8_t *src, size_t n)
+{
+	void *ret = dst;
+	size_t dstofss;
+	uint32_t bn;
+
+	if (n <= 384) {
+		if (n >= 256) {
+			memcpy_prefetch128_2(src, dst);
+			n -= 256;
+			_rte_mov128(dst, src);
+			_rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
+			src = (const uint8_t *)src + 256;
+			dst = (uint8_t *)dst + 256;
+		}
+		if (n >= 128) {
+			memcpy_prefetch128_1(src, dst);
+			n -= 128;
+			_rte_mov128(dst, src);
+			src = (const uint8_t *)src + 128;
+			dst = (uint8_t *)dst + 128;
+		}
+
+		if (n >= 64) {
+			memcpy_prefetch64_1(src, dst);
+			n -= 64;
+			_rte_mov64(dst, src);
+			src = (const uint8_t *)src + 64;
+			dst = (uint8_t *)dst + 64;
+		}
+
+		if (n > 32) {
+			_rte_mov32(dst, src);
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+			return ret;
+		}
+
+		if (n > 0) {
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+		}
+		return ret;
+	}
+
+	/**
+	 * Make store aligned when copy size exceeds 256 bytes.
+	 */
+	dstofss = (uintptr_t)dst & ALIGNMENT_MASK_64;
+	if (dstofss > 0) {
+		dstofss = 64 - dstofss;
+		n -= dstofss;
+		_rte_mov64(dst, src);
+		src = (const uint8_t *)src + dstofss;
+		dst = (uint8_t *)dst + dstofss;
+	}
+
+	/**
+	 * Copy 128-byte blocks
+	 */
+	if ((uintptr_t)src & ALIGNMENT_MASK_64)	{
+		bn = n - (n & ALIGNMENT_MASK_128);
+		_rte_mov128blocks(dst, src, bn);
+		n = n & ALIGNMENT_MASK_128;
+		src = (const uint8_t *)src + bn;
+		dst = (uint8_t *)dst + bn;
+		_rte_mov(dst, src, n);
+	} else
+		_rte_mov_aligned(dst, src, n);
+
+	return ret;
+}
+
+static __rte_always_inline void *
+_rte_memcpy(void *dst, const void *src, size_t n)
+{
+	return _rte_memcpy_generic((uint8_t *)dst, (const uint8_t *)src, n);
+}
+#endif
+
+/*----------------------api---------------------------------------------------*/
+static __rte_always_inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely((memcpy_alg == MEMCPY_RISCV) && (n >= 128) && (n < 2048)))
+		return _rte_memcpy(dst, src, n);
+	/*else*/
+#endif
+		return memcpy(dst, src, n);
+}
+
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -51,10 +336,31 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	memcpy(dst, src, 256);
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely(memcpy_alg == MEMCPY_RISCV))
+		_rte_mov256(dst, src);
+	else
+#endif
+		memcpy(dst, src, 256);
+}
+/*----------------------------------------------------------------------------*/
+#if defined(RTE_RISCV_FEATURE_V)
+static inline long
+riscv_vlenb(void)
+{
+	long vlenb;
+	asm ("csrr %0, 0xc22" : "=r"(vlenb));
+	return vlenb;
 }
 
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
+RTE_INIT(rte_vect_memcpy_init)
+{
+	long vlenb = riscv_vlenb();
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V) && (vlenb >= RISCV_VLENB))
+		memcpy_alg = MEMCPY_RISCV;
+}
+#endif
+
 
 #ifdef __cplusplus
 }
-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 26665 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v1 1/2] riscv support rte_memcpy in vector
  2025-10-17  5:29   ` sunyuechi
@ 2025-10-17 10:10     ` chen.qiguo
  0 siblings, 0 replies; 25+ messages in thread
From: chen.qiguo @ 2025-10-17 10:10 UTC (permalink / raw)
  To: sunyuechi; +Cc: stanislaw.kardach, stephen, dev, bruce.richardson


[-- Attachment #1.1.1: Type: text/plain, Size: 15131 bytes --]

>     16  0 -  0( 57.49%)   1 -  1(  7.30%)   2 -  2(  0.19%)   3 -  3(  3.19%)
>     17  0 -  0( 53.78%)   3 -  2( 51.65%)   4 -  3( 37.35%)   4 -  3( 23.94%)
>     31  0 -  0( 27.02%)   3 -  2( 51.99%)   4 -  3( 37.34%)   4 -  3( 24.09%)
>     32  0 -  0( 56.82%)   3 -  2( 50.42%)   4 -  3( 39.73%)   4 -  3( 25.04%)
>     33  0 -  0( 30.60%)   3 -  3( 30.94%)   6 -  4( 46.89%)   6 -  5( 26.21%)
>     63  0 -  0( 16.84%)   4 -  3( 21.57%)   6 -  5( 31.74%)   7 -  6( 18.01%)
>     64  0 -  0( 21.98%)   4 -  3( 21.35%)   6 -  5( 36.13%)   7 -  6( 20.05%)
>
> It looks like there's a performance degradation in the 0-128 range, can you fix it?


For small-size copies we can use memcpy directly. It seems the dispatch condition (the size check in rte_memcpy) is what causes this result.
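
One option I can try (an untested sketch, assuming the overhead comes from
the extra checks on the fall-through path) is to test the size range before
loading the runtime flag, so that small copies reach memcpy with as little
extra work as possible:

static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
#if defined(RTE_RISCV_FEATURE_V)
	/* Size checks first; copies below 128 bytes skip the memcpy_alg load. */
	if (n >= 128 && n < 2048 && likely(memcpy_alg == MEMCPY_RISCV))
		return _rte_memcpy(dst, src, n);
#endif
	return memcpy(dst, src, n);
}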



 +        "add %0, %0, t4\n" 
 +        "sub %2, %2, t4\n" 
 +        "bnez %2, 1b" 
 +        : "+r"(dst), "+r"(src), "+r"(n)
 +        :
 +        : "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory" 
 +     );
 +}
 +
 +static __rte_always_inline void
 +_rte_mov_aligned(uint8_t *dst, const uint8_t *src, uint32_t n)
 +{
 +    asm volatile (
 +        "prefetch.r 128(%1)\n" 
 +        "prefetch.r 192(%1)\n" 
 +        "prefetch.r 256(%1)\n" 
 +        "prefetch.r 320(%1)\n" 
 +        "prefetch.r 384(%1)\n" 
 +        "prefetch.r 448(%1)\n" 
 +        "prefetch.r 512(%1)\n" 
 +        "prefetch.r 576(%1)\n" 
 +        "li t6, 640\n" 
 +        "1:\n" 
 +        "vsetvli t4, %2, e8, m8, ta, ma\n" 
 +        "vle8.v v16, (%1)\n" 
 +        "add %1, %1, t4\n" 
 +        "vse8.v v16, (%0)\n" 
 +        "add %0, %0, t4\n" 
 +        "sub %2, %2, t4\n" 
 +        "blt %2, t6, 3f\n" 
 +        "prefetch.r 512(%1)\n" 
 +        "prefetch.r 576(%1)\n" 
 +        "3:\n" 
 +        "bnez %2, 1b" 
 +        : "+r"(dst), "+r"(src), "+r"(n)
 +        :
 +        : "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "t6", "memory" 
 +     );
 +}
 +
 +static __rte_always_inline void *
 +_rte_memcpy_generic(uint8_t       *dst, const uint8_t *src, size_t n)
 +{
 +    void *ret = dst;
 +    size_t dstofss;
 +    uint32_t bn;
 +
 +    if (n <= 384) {
 +        if (n >= 256) {
 +            memcpy_prefetch128_2(src, dst);
 +            n -= 256;
 +            _rte_mov128(dst, src);
 +            _rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
 +            src = (const uint8_t *)src + 256;
 +            dst = (uint8_t *)dst + 256;
 +        }
 +        if (n >= 128) {
 +            memcpy_prefetch128_1(src, dst);
 +            n -= 128;
 +            _rte_mov128(dst, src);
 +            src = (const uint8_t *)src + 128;
 +            dst = (uint8_t *)dst + 128;
 +        }
 +
 +        if (n >= 64) {
 +            memcpy_prefetch64_1(src, dst);
 +            n -= 64;
 +            _rte_mov64(dst, src);
 +            src = (const uint8_t *)src + 64;
 +            dst = (uint8_t *)dst + 64;
 +        }
 +
 +        if (n > 32) {
 +            _rte_mov32(dst, src);
 +            _rte_mov32((uint8_t *)dst - 32 + n,
 +                    (const uint8_t *)src - 32 + n);
 +            return ret;
 +        }
 +
 +        if (n > 0) {
 +            _rte_mov32((uint8_t *)dst - 32 + n,
 +                    (const uint8_t *)src - 32 + n);
 +        }
 +        return ret;
 +    }
 +
 +    /**
 +     * Make store aligned when copy size exceeds 256 bytes.
 +     */
 +    dstofss = (uintptr_t)dst & ALIGNMENT_MASK_64;
 +    if (dstofss > 0) {
 +        dstofss = 64 - dstofss;
 +        n -= dstofss;
 +        _rte_mov64(dst, src);
 +        src = (const uint8_t *)src + dstofss;
 +        dst = (uint8_t *)dst + dstofss;
 +    }
 +
 +    /**
 +     * Copy 128-byte blocks
 +     */
 +    if ((uintptr_t)src & ALIGNMENT_MASK_64)    {
 +        bn = n - (n & ALIGNMENT_MASK_128);
 +        _rte_mov128blocks(dst, src, bn);
 +        n = n & ALIGNMENT_MASK_128;
 +        src = (const uint8_t *)src + bn;
 +        dst = (uint8_t *)dst + bn;
 +        _rte_mov(dst, src, n);
 +    } else
 +        _rte_mov_aligned(dst, src, n);
 +
 +    return ret;
 +}
 +
 +static __rte_always_inline void *
 +_rte_memcpy(void *dst, const void *src, size_t n)
 +{
 +    return _rte_memcpy_generic((uint8_t *)dst, (const uint8_t *)src, n);
 +}
 +#endif
 +
 +/*----------------------api---------------------------------------------------*/
 +static __rte_always_inline void *
 +rte_memcpy(void *dst, const void *src, size_t n)
 +{
 +#if defined(RTE_RISCV_FEATURE_V)
 +    if (likely((memcpy_alg == MEMCPY_RISCV) && (n >= 128) && (n < 2048)))
 +        return _rte_memcpy(dst, src, n);
 +    /*else*/
 +#endif
 +        return memcpy(dst, src, n);
 +}
 +
  static inline void
  rte_mov16(uint8_t *dst, const uint8_t *src)
  {
 @@ -51,10 +336,31 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
  static inline void
  rte_mov256(uint8_t *dst, const uint8_t *src)
  {
 -    memcpy(dst, src, 256);
 +#if defined(RTE_RISCV_FEATURE_V)
 +    if (likely(memcpy_alg == MEMCPY_RISCV))
 +        _rte_mov256(dst, src);
 +    else
 +#endif
 +        memcpy(dst, src, 256);
 +}
 +/*----------------------------------------------------------------------------*/
 +#if defined(RTE_RISCV_FEATURE_V)
 +static inline long
 +riscv_vlenb(void)
 +{
 +    long vlenb;
 +    asm ("csrr %0, 0xc22" : "=r"(vlenb));
 +    return vlenb;
  }
   
 -#define rte_memcpy(d, s, n)    memcpy((d), (s), (n))
 +RTE_INIT(rte_vect_memcpy_init)
 +{
 +    long vlenb = riscv_vlenb();
 +    if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V) && (vlenb >= RISCV_VLENB))
 +        memcpy_alg = MEMCPY_RISCV;
 +}
 +#endif
 +
   
  #ifdef __cplusplus
  }
 --  
 2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 34249 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2 1/1] riscv support rte_memcpy in vector
  2025-10-17  9:36     ` [PATCH v2 1/1] riscv support rte_memcpy in vector Qiguo Chen
@ 2025-10-20  9:43       ` sunyuechi
  2025-10-20 12:08       ` [PATCH v3 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
  1 sibling, 0 replies; 25+ messages in thread
From: sunyuechi @ 2025-10-20  9:43 UTC (permalink / raw)
  To: Qiguo Chen; +Cc: stanislaw.kardach, stephen, dev, bruce.richardson

[-- Attachment #1: Type: text/plain, Size: 11146 bytes --]

It looks like the commit message has changed, but it seems the commit title and code files haven't changed. Was the wrong version committed?


-----Original Message-----
From: "Qiguo Chen" <chen.qiguo@zte.com.cn>
Sent: 2025-10-17 17:36:17 (Friday)
To: sunyuechi@iscas.ac.cn, stanislaw.kardach@gmail.com, stephen@networkplumber.org
Cc: dev@dpdk.org, bruce.richardson@intel.com, "Qiguo Chen" <chen.qiguo@zte.com.cn>
Subject: [PATCH v2 1/1] riscv support rte_memcpy in vector

This patch uses RISC-V vector instructions and zicbop prefetching to
optimize memory copies for 129~1600 byte ranges.

Signed-off-by: Qiguo Chen <chen.qiguo@zte.com.cn>
---
 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 310 ++++++++++++++++++++++++++++-
 3 files changed, 323 insertions(+), 2 deletions(-)

diff --git a/.mailmap b/.mailmap
index 08e5ec8560..178c5f44f4 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1285,6 +1285,7 @@ Qian Hao <qi_an_hao@126.com>
 Qian Xu <qian.q.xu@intel.com>
 Qiao Liu <qiao.liu@intel.com>
 Qi Fu <qi.fu@intel.com>
+Qiguo Chen <chen.qiguo@zte.com.cn>
 Qimai Xiao <qimaix.xiao@intel.com>
 Qiming Chen <chenqiming_huawei@163.com>
 Qiming Yang <qiming.yang@intel.com>
diff --git a/config/riscv/meson.build b/config/riscv/meson.build
index f3daea0c0e..abba474b5e 100644
--- a/config/riscv/meson.build
+++ b/config/riscv/meson.build
@@ -146,6 +146,20 @@ if (riscv_extension_macros and
     endif
 endif
 
+# detect extensions
+# Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
+if (riscv_extension_macros and
+    (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
+  if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
+      or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
+      message('Compiling with the zicbop extension')
+      machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
+  else
+    warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
+  endif
+endif
+
+
 # apply flags
 foreach flag: dpdk_flags
     if flag.length() > 0
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index d8a942c5d2..6f8cb0d4a4 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -11,6 +11,7 @@
 #include <string.h>
 
 #include "rte_common.h"
+#include <rte_branch_prediction.h>
 
 #include "generic/rte_memcpy.h"
 
@@ -18,6 +19,290 @@
 extern "C" {
 #endif
 
+
+#if defined(RTE_RISCV_FEATURE_V) && !(defined(RTE_RISCV_FEATURE_PREFETCH))
+#undef RTE_RISCV_FEATURE_V
+#endif
+
+
+#if defined(RTE_RISCV_FEATURE_V)
+
+#include "rte_cpuflags.h"
+
+#define RISCV_VLENB   16
+#define MEMCPY_GLIBC       (1U << 0)
+#define MEMCPY_RISCV       (1U << 1)
+#define ALIGNMENT_MASK_128   0x7F
+#define ALIGNMENT_MASK_64    0x3F
+#define ALIGNMENT_MASK_16    0xF
+
+static uint8_t memcpy_alg = MEMCPY_GLIBC;
+
+
+static __rte_always_inline void
+memcpy_prefetch64_1(const uint8_t *src, uint8_t *dst)
+{
+    __asm__ (
+        "prefetch.r 64(%0)\n"
+        "prefetch.w 64(%1)"
+        :: "r"(src), "r"(dst)
+    );
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_1(const uint8_t *src, uint8_t *dst)
+{
+    __asm__ (
+        "prefetch.r 128(%0)\n"
+        "prefetch.w 128(%1)"
+        :: "r"(src), "r"(dst)
+    );
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_2(const uint8_t *src, uint8_t *dst)
+{
+    __asm__ (
+        "prefetch.r 128(%0);"
+        "prefetch.w 128(%1);"
+        "prefetch.r 192(%0);"
+        "prefetch.w 192(%1)"
+        :: "r"(src), "r"(dst)
+    );
+}
+
+
+static __rte_always_inline void
+_rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+    uint32_t n = 32;
+    asm volatile (
+         "vsetvli t1, %2, e8, m2, ta, ma\n"
+         "vle8.v v2, (%1)\n"
+         "vse8.v v2, (%0)"
+         :: "r"(dst), "r"(src), "r"(n)
+         : "v2", "v3", "t1", "memory"
+     );
+}
+
+static __rte_always_inline void
+_rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+    uint32_t n = 64;
+    asm volatile (
+        "vsetvli t3, %2, e8, m4, ta, ma\n"
+        "vle8.v v8, (%1)\n"
+        "vse8.v v8, (%0)"
+        :: "r"(dst), "r"(src), "r"(n)
+        :  "v8", "v9", "v10", "v11", "t3", "memory"
+     );
+}
+
+static __rte_always_inline void
+_rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+    uint32_t n = 128;
+    asm volatile (
+        "vsetvli t4, %2, e8, m8, ta, ma\n"
+        "vle8.v v16, (%1)\n"
+        "vse8.v v16, (%0)"
+        :: "r"(dst), "r"(src), "r"(n)
+        : "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+     );
+}
+
+static __rte_always_inline void
+_rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+    memcpy_prefetch128_2(src, dst);
+    _rte_mov128(dst, src);
+    _rte_mov128(dst + 128, src + 128);
+}
+
+static __rte_always_inline void
+_rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+    asm volatile (
+        "prefetch.r 64(%1)\n"
+        "prefetch.w 64(%0)\n"
+        "prefetch.r 128(%1)\n"
+        "prefetch.w 128(%0)\n"
+        "prefetch.r 192(%1)\n"
+        "prefetch.w 192(%0)\n"
+        "prefetch.r 256(%1)\n"
+        "prefetch.w 256(%0)\n"
+        "prefetch.r 320(%1)\n"
+        "prefetch.w 320(%0)\n"
+        "prefetch.r 384(%1)\n"
+        "prefetch.w 384(%0)\n"
+        "prefetch.r 448(%1)\n"
+        "prefetch.w 448(%0)\n"
+        "prefetch.r 512(%1)\n"
+        "li t6, 512\n"
+        "3:\n"
+        "li t5, 128;"
+        "vsetvli zero, t5, e8, m8, ta, ma\n"
+        "1:;"
+        "bgt %2, t6, 4f\n"
+        "j 2f\n"
+        "4:\n"
+        "prefetch.r 576(%1)\n"
+        "prefetch.r 640(%1)\n"
+        "2:\n"
+        "vle8.v   v16, (%1)\n"
+        "add      %1, %1, t5\n"
+        "vse8.v   v16, (%0)\n"
+        "add      %0, %0, t5\n"
+        "sub      %2, %2, t5\n"
+        "bnez     %2, 1b"
+        : "+r"(dst), "+r"(src), "+r"(n)
+        :
+        : "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t5", "t6", "memory"
+    );
+}
+
+static __rte_always_inline void
+_rte_mov(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+    asm volatile (
+        "1:\n"
+        "vsetvli t4, %2, e8, m8, ta, ma\n"
+        "vle8.v v16, (%1)\n"
+        "add %1, %1, t4\n"
+        "vse8.v v16, (%0)\n"
+        "add %0, %0, t4\n"
+        "sub %2, %2, t4\n"
+        "bnez %2, 1b"
+        : "+r"(dst), "+r"(src), "+r"(n)
+        :
+        : "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+     );
+}
+
+static __rte_always_inline void
+_rte_mov_aligned(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+    asm volatile (
+        "prefetch.r 128(%1)\n"
+        "prefetch.r 192(%1)\n"
+        "prefetch.r 256(%1)\n"
+        "prefetch.r 320(%1)\n"
+        "prefetch.r 384(%1)\n"
+        "prefetch.r 448(%1)\n"
+        "prefetch.r 512(%1)\n"
+        "prefetch.r 576(%1)\n"
+        "li t6, 640\n"
+        "1:\n"
+        "vsetvli t4, %2, e8, m8, ta, ma\n"
+        "vle8.v v16, (%1)\n"
+        "add %1, %1, t4\n"
+        "vse8.v v16, (%0)\n"
+        "add %0, %0, t4\n"
+        "sub %2, %2, t4\n"
+        "blt %2, t6, 3f\n"
+        "prefetch.r 512(%1)\n"
+        "prefetch.r 576(%1)\n"
+        "3:\n"
+        "bnez %2, 1b"
+        : "+r"(dst), "+r"(src), "+r"(n)
+        :
+        : "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "t6", "memory"
+     );
+}
+
+static __rte_always_inline void *
+_rte_memcpy_generic(uint8_t       *dst, const uint8_t *src, size_t n)
+{
+    void *ret = dst;
+    size_t dstofss;
+    uint32_t bn;
+
+    if (n <= 384) {
+        if (n >= 256) {
+            memcpy_prefetch128_2(src, dst);
+            n -= 256;
+            _rte_mov128(dst, src);
+            _rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
+            src = (const uint8_t *)src + 256;
+            dst = (uint8_t *)dst + 256;
+        }
+        if (n >= 128) {
+            memcpy_prefetch128_1(src, dst);
+            n -= 128;
+            _rte_mov128(dst, src);
+            src = (const uint8_t *)src + 128;
+            dst = (uint8_t *)dst + 128;
+        }
+
+        if (n >= 64) {
+            memcpy_prefetch64_1(src, dst);
+            n -= 64;
+            _rte_mov64(dst, src);
+            src = (const uint8_t *)src + 64;
+            dst = (uint8_t *)dst + 64;
+        }
+
+        if (n > 32) {
+            _rte_mov32(dst, src);
+            _rte_mov32((uint8_t *)dst - 32 + n,
+                    (const uint8_t *)src - 32 + n);
+            return ret;
+        }
+
+        if (n > 0) {
+            _rte_mov32((uint8_t *)dst - 32 + n,
+                    (const uint8_t *)src - 32 + n);
+        }
+        return ret;
+    }
+
+    /**
+     * Make store aligned when copy size exceeds 256 bytes.
+     */
+    dstofss = (uintptr_t)dst & ALIGNMENT_MASK_64;
+    if (dstofss > 0) {
+        dstofss = 64 - dstofss;
+        n -= dstofss;
+        _rte_mov64(dst, src);
+        src = (const uint8_t *)src + dstofss;
+        dst = (uint8_t *)dst + dstofss;
+    }
+
+    /**
+     * Copy 128-byte blocks
+     */
+    if ((uintptr_t)src & ALIGNMENT_MASK_64)    {
+        bn = n - (n & ALIGNMENT_MASK_128);
+        _rte_mov128blocks(dst, src, bn);
+        n = n & ALIGNMENT_MASK_128;
+        src = (const uint8_t *)src + bn;
+        dst = (uint8_t *)dst + bn;
+        _rte_mov(dst, src, n);
+    } else
+        _rte_mov_aligned(dst, src, n);
+
+    return ret;
+}
+
+static __rte_always_inline void *
+_rte_memcpy(void *dst, const void *src, size_t n)
+{
+    return _rte_memcpy_generic((uint8_t *)dst, (const uint8_t *)src, n);
+}
+#endif
+
+/*----------------------api---------------------------------------------------*/
+static __rte_always_inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+#if defined(RTE_RISCV_FEATURE_V)
+    if (likely((memcpy_alg == MEMCPY_RISCV) && (n >= 128) && (n < 2048)))
+        return _rte_memcpy(dst, src, n);
+    /*else*/
+#endif
+        return memcpy(dst, src, n);
+}
+
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -51,10 +336,31 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-    memcpy(dst, src, 256);
+#if defined(RTE_RISCV_FEATURE_V)
+    if (likely(memcpy_alg == MEMCPY_RISCV))
+        _rte_mov256(dst, src);
+    else
+#endif
+        memcpy(dst, src, 256);
+}
+/*----------------------------------------------------------------------------*/
+#if defined(RTE_RISCV_FEATURE_V)
+static inline long
+riscv_vlenb(void)
+{
+    long vlenb;
+    asm ("csrr %0, 0xc22" : "=r"(vlenb));
+    return vlenb;
 }
 
-#define rte_memcpy(d, s, n)    memcpy((d), (s), (n))
+RTE_INIT(rte_vect_memcpy_init)
+{
+    long vlenb = riscv_vlenb();
+    if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V) && (vlenb >= RISCV_VLENB))
+        memcpy_alg = MEMCPY_RISCV;
+}
+#endif
+
 
 #ifdef __cplusplus
 }
-- 
2.21.0.windows.1

[-- Attachment #2: Type: text/html, Size: 25605 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v3 0/1] Optimization Summary for RISC-V rte_memcpy
  2025-10-17  9:36     ` [PATCH v2 1/1] riscv support rte_memcpy in vector Qiguo Chen
  2025-10-20  9:43       ` sunyuechi
@ 2025-10-20 12:08       ` Qiguo Chen
  2025-10-20 12:08         ` [PATCH v3 1/1] lib/eal/riscv: optimize rte_memcpy with RISCV vector and zicbop extensions Qiguo Chen
  1 sibling, 1 reply; 25+ messages in thread
From: Qiguo Chen @ 2025-10-20 12:08 UTC (permalink / raw)
  To: sunyuechi, stanislaw.kardach, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 1201 bytes --]

Changes in v3:
     1)Change title for patch2.
     2)Apply correct patch version for patch2.
       Thanks to Sunyuechi for the reminder.

Changes in v2:
     1)Modify some codes and descriptions according to Sunyuechi's
     suggestions.
     2)Removed benchmark_report.txt to avoid warnings.

[PATCH v1]
I've implemented optimizations to rte_memcpy targeting RISC-V
architectures, achieving an average 10%~15% reduction in execution time
for data sizes between 129 and 1024 bytes (1025~1600 bytes gain little).
These enhancements draw inspiration from x86 implementations,
specifically focusing on:
1)Alignment Handling for Unaligned Scenarios
2)Vector Configuration Tuning
3)Strategic Prefetching with zicbop

- Patch 1: Cover letter
- Patch 2: Base implementation
- Patch 3: Benchmark report


Tested on SG2044 (VLEN=128)

Qiguo Chen (1):
  lib/eal/riscv: optimize rte_memcpy with RISCV vector and zicbop
    extensions

 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 303 ++++++++++++++++++++++++++++-
 3 files changed, 316 insertions(+), 2 deletions(-)

-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 2339 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v3 1/1] lib/eal/riscv: optimize rte_memcpy with RISCV vector and zicbop extensions
  2025-10-20 12:08       ` [PATCH v3 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
@ 2025-10-20 12:08         ` Qiguo Chen
  2025-10-21  6:56           ` [PATCH v4 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
  0 siblings, 1 reply; 25+ messages in thread
From: Qiguo Chen @ 2025-10-20 12:08 UTC (permalink / raw)
  To: sunyuechi, stanislaw.kardach, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 9649 bytes --]

This patch uses RISC-V vector instructions and zicbop prefetching to
optimize memory copies for 129~1600 byte ranges.

Signed-off-by: Qiguo Chen <chen.qiguo@zte.com.cn>
---
 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 303 ++++++++++++++++++++++++++++-
 3 files changed, 316 insertions(+), 2 deletions(-)

diff --git a/.mailmap b/.mailmap
index 08e5ec8560..178c5f44f4 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1285,6 +1285,7 @@ Qian Hao <qi_an_hao@126.com>
 Qian Xu <qian.q.xu@intel.com>
 Qiao Liu <qiao.liu@intel.com>
 Qi Fu <qi.fu@intel.com>
+Qiguo Chen <chen.qiguo@zte.com.cn>
 Qimai Xiao <qimaix.xiao@intel.com>
 Qiming Chen <chenqiming_huawei@163.com>
 Qiming Yang <qiming.yang@intel.com>
diff --git a/config/riscv/meson.build b/config/riscv/meson.build
index f3daea0c0e..abba474b5e 100644
--- a/config/riscv/meson.build
+++ b/config/riscv/meson.build
@@ -146,6 +146,20 @@ if (riscv_extension_macros and
     endif
 endif
 
+# detect extensions
+# Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
+if (riscv_extension_macros and
+    (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
+  if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
+      or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
+      message('Compiling with the zicbop extension')
+      machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
+  else
+    warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
+  endif
+endif
+
+
 # apply flags
 foreach flag: dpdk_flags
     if flag.length() > 0
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index d8a942c5d2..fb817e5f43 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -11,6 +11,7 @@
 #include <string.h>
 
 #include "rte_common.h"
+#include <rte_branch_prediction.h>
 
 #include "generic/rte_memcpy.h"
 
@@ -18,6 +19,283 @@
 extern "C" {
 #endif
 
+
+#if defined(RTE_RISCV_FEATURE_V) && !(defined(RTE_RISCV_FEATURE_PREFETCH))
+#undef RTE_RISCV_FEATURE_V
+#endif
+
+
+#if defined(RTE_RISCV_FEATURE_V)
+
+#include "rte_cpuflags.h"
+
+#define RISCV_VLENB         16
+#define MEMCPY_GLIBC        (1U << 0)
+#define MEMCPY_RISCV        (1U << 1)
+#define ALIGNMENT_MASK_128  0x7F
+#define ALIGNMENT_MASK_64   0x3F
+
+static uint8_t memcpy_alg = MEMCPY_GLIBC;
+
+
+static __rte_always_inline void
+memcpy_prefetch64_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 64(%0)\n"
+		"prefetch.w 64(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0)\n"
+		"prefetch.w 128(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_2(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0);"
+		"prefetch.w 128(%1);"
+		"prefetch.r 192(%0);"
+		"prefetch.w 192(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+
+static __rte_always_inline void
+_rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 32;
+	asm volatile (
+	     "vsetvli t1, %2, e8, m2, ta, ma\n"
+	     "vle8.v v2, (%1)\n"
+	     "vse8.v v2, (%0)"
+	     :: "r"(dst), "r"(src), "r"(n)
+	     : "v2", "v3", "t1", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 64;
+	asm volatile (
+		"vsetvli t3, %2, e8, m4, ta, ma\n"
+		"vle8.v v8, (%1)\n"
+		"vse8.v v8, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		:  "v8", "v9", "v10", "v11", "t3", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 128;
+	asm volatile (
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"vse8.v v16, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy_prefetch128_2(src, dst);
+	_rte_mov128(dst, src);
+	_rte_mov128(dst + 128, src + 128);
+}
+
+static __rte_always_inline void
+_rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	asm volatile (
+		"prefetch.r 64(%1)\n"
+		"prefetch.w 64(%0)\n"
+		"prefetch.r 128(%1)\n"
+		"prefetch.w 128(%0)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.w 192(%0)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.w 256(%0)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.w 320(%0)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.w 384(%0)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.w 448(%0)\n"
+		"prefetch.r 512(%1)\n"
+		"li t6, 512\n"
+		"3:\n"
+		"li t5, 128;"
+		"vsetvli zero, t5, e8, m8, ta, ma\n"
+		"1:;"
+		"bgt %2, t6, 4f\n"
+		"j 2f\n"
+		"4:\n"
+		"prefetch.r 576(%1)\n"
+		"prefetch.r 640(%1)\n"
+		"2:\n"
+		"vle8.v   v16, (%1)\n"
+		"add      %1, %1, t5\n"
+		"vse8.v   v16, (%0)\n"
+		"add      %0, %0, t5\n"
+		"sub      %2, %2, t5\n"
+		"bnez     %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t5", "t6", "memory"
+	);
+}
+
+static __rte_always_inline void
+_rte_mov(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov_aligned(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"prefetch.r 128(%1)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"li t6, 640\n"
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"blt %2, t6, 3f\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"3:\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "t6", "memory"
+	 );
+}
+
+static __rte_always_inline void *
+_rte_memcpy(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	void *ret = dst;
+	size_t dstofss;
+	uint32_t bn;
+
+	if (n <= 384) {
+		if (n >= 256) {
+			memcpy_prefetch128_2(src, dst);
+			n -= 256;
+			_rte_mov128(dst, src);
+			_rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
+			src = (const uint8_t *)src + 256;
+			dst = (uint8_t *)dst + 256;
+		}
+		if (n >= 128) {
+			memcpy_prefetch128_1(src, dst);
+			n -= 128;
+			_rte_mov128(dst, src);
+			src = (const uint8_t *)src + 128;
+			dst = (uint8_t *)dst + 128;
+		}
+
+		if (n >= 64) {
+			memcpy_prefetch64_1(src, dst);
+			n -= 64;
+			_rte_mov64(dst, src);
+			src = (const uint8_t *)src + 64;
+			dst = (uint8_t *)dst + 64;
+		}
+
+		if (n > 32) {
+			_rte_mov32(dst, src);
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+			return ret;
+		}
+
+		if (n > 0) {
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+		}
+		return ret;
+	}
+
+	/**
+	 * Make store aligned when copy size exceeds 256 bytes.
+	 */
+	dstofss = (uintptr_t)dst & ALIGNMENT_MASK_64;
+	if (dstofss > 0) {
+		dstofss = 64 - dstofss;
+		n -= dstofss;
+		_rte_mov64(dst, src);
+		src = (const uint8_t *)src + dstofss;
+		dst = (uint8_t *)dst + dstofss;
+	}
+
+	/**
+	 * Copy 128-byte blocks
+	 */
+	if ((uintptr_t)src & ALIGNMENT_MASK_64)	{
+		bn = n - (n & ALIGNMENT_MASK_128);
+		_rte_mov128blocks(dst, src, bn);
+		n = n & ALIGNMENT_MASK_128;
+		src = (const uint8_t *)src + bn;
+		dst = (uint8_t *)dst + bn;
+		_rte_mov(dst, src, n);
+	} else
+		_rte_mov_aligned(dst, src, n);
+
+	return ret;
+}
+
+#endif
+
+/*----------------------api---------------------------------------------------*/
+static __rte_always_inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely((memcpy_alg == MEMCPY_RISCV) && (n >= 128) && (n < 2048)))
+		return _rte_memcpy((uint8_t *)dst, (const uint8_t *)src, n);
+#endif
+	return memcpy(dst, src, n);
+}
+
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -51,10 +329,31 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	memcpy(dst, src, 256);
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely(memcpy_alg == MEMCPY_RISCV))
+		_rte_mov256(dst, src);
+	else
+#endif
+		memcpy(dst, src, 256);
+}
+/*----------------------------------------------------------------------------*/
+#if defined(RTE_RISCV_FEATURE_V)
+static inline long
+riscv_vlenb(void)
+{
+	long vlenb;
+	asm ("csrr %0, 0xc22" : "=r"(vlenb));
+	return vlenb;
 }
 
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
+RTE_INIT(rte_vect_memcpy_init)
+{
+	long vlenb = riscv_vlenb();
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V) && (vlenb >= RISCV_VLENB))
+		memcpy_alg = MEMCPY_RISCV;
+}
+#endif
+
 
 #ifdef __cplusplus
 }
-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 26287 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v4 0/1] Optimization Summary for RISC-V rte_memcpy
  2025-10-20 12:08         ` [PATCH v3 1/1] lib/eal/riscv: optimize rte_memcpy with RISCV vector and zicbop extensions Qiguo Chen
@ 2025-10-21  6:56           ` Qiguo Chen
  2025-10-21  6:56             ` [PATCH v4 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
  0 siblings, 1 reply; 25+ messages in thread
From: Qiguo Chen @ 2025-10-21  6:56 UTC (permalink / raw)
  To: sunyuechi, stanislaw.kardach, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 1233 bytes --]

Changes in v4:
     1)rebase code only.

Changes in v3:
     1)Change title for patch2.
     2)Apply correct patch version for patch2.
       Thanks to Sunyuechi for the reminder.

Changes in v2:
     1)Modify some codes and descriptions according to Sunyuechi's
     suggestions.
     2)Removed benchmark_report.txt to avoid warnings.

[PATCH v1]
I've implemented optimizations to rte_memcpy targeting RISC-V
architectures, achieving an average 10%~15% reduction in execution time
for data sizes between 129 and 1024 bytes (1025~1600 bytes gain little).
These enhancements draw inspiration from x86 implementations,
specifically focusing on:
1)Alignment Handling for Unaligned Scenarios
2)Vector Configuration Tuning
3)Strategic Prefetching with zicbop

- Patch 1: Cover letter
- Patch 2: Base implementation
- Patch 3: Benchmark report


Tested on SG2044 (VLEN=128)


Qiguo Chen (1):
  eal/riscv: optimize rte_memcpy with vector and zicbop extensions

 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 303 ++++++++++++++++++++++++++++-
 3 files changed, 316 insertions(+), 2 deletions(-)

-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 2416 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v4 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions
  2025-10-21  6:56           ` [PATCH v4 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
@ 2025-10-21  6:56             ` Qiguo Chen
  2025-10-24  2:56               ` retest Qiguo Chen
                                 ` (4 more replies)
  0 siblings, 5 replies; 25+ messages in thread
From: Qiguo Chen @ 2025-10-21  6:56 UTC (permalink / raw)
  To: sunyuechi, stanislaw.kardach, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 9649 bytes --]

This patch uses RISC-V vector instructions and zicbop prefetching to
optimize memory copies for 129~1600 byte ranges.

Signed-off-by: Qiguo Chen <chen.qiguo@zte.com.cn>
---
 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 303 ++++++++++++++++++++++++++++-
 3 files changed, 316 insertions(+), 2 deletions(-)

diff --git a/.mailmap b/.mailmap
index 3817bf7cdb..85f50bce87 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1283,6 +1283,7 @@ Qian Hao <qi_an_hao@126.com>
 Qian Xu <qian.q.xu@intel.com>
 Qiao Liu <qiao.liu@intel.com>
 Qi Fu <qi.fu@intel.com>
+Qiguo Chen <chen.qiguo@zte.com.cn>
 Qimai Xiao <qimaix.xiao@intel.com>
 Qiming Chen <chenqiming_huawei@163.com>
 Qiming Yang <qiming.yang@intel.com>
diff --git a/config/riscv/meson.build b/config/riscv/meson.build
index f3daea0c0e..abba474b5e 100644
--- a/config/riscv/meson.build
+++ b/config/riscv/meson.build
@@ -146,6 +146,20 @@ if (riscv_extension_macros and
     endif
 endif
 
+# detect extensions
+# Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
+if (riscv_extension_macros and
+    (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
+  if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
+      or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
+      message('Compiling with the zicbop extension')
+      machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
+  else
+    warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
+  endif
+endif
+
+
 # apply flags
 foreach flag: dpdk_flags
     if flag.length() > 0
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index d8a942c5d2..fb817e5f43 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -11,6 +11,7 @@
 #include <string.h>
 
 #include "rte_common.h"
+#include <rte_branch_prediction.h>
 
 #include "generic/rte_memcpy.h"
 
@@ -18,6 +19,283 @@
 extern "C" {
 #endif
 
+
+#if defined(RTE_RISCV_FEATURE_V) && !(defined(RTE_RISCV_FEATURE_PREFETCH))
+#undef RTE_RISCV_FEATURE_V
+#endif
+
+
+#if defined(RTE_RISCV_FEATURE_V)
+
+#include "rte_cpuflags.h"
+
+#define RISCV_VLENB         16
+#define MEMCPY_GLIBC        (1U << 0)
+#define MEMCPY_RISCV        (1U << 1)
+#define ALIGNMENT_MASK_128  0x7F
+#define ALIGNMENT_MASK_64   0x3F
+
+static uint8_t memcpy_alg = MEMCPY_GLIBC;
+
+
+static __rte_always_inline void
+memcpy_prefetch64_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 64(%0)\n"
+		"prefetch.w 64(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0)\n"
+		"prefetch.w 128(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_2(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0);"
+		"prefetch.w 128(%1);"
+		"prefetch.r 192(%0);"
+		"prefetch.w 192(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+
+static __rte_always_inline void
+_rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 32;
+	asm volatile (
+	     "vsetvli t1, %2, e8, m2, ta, ma\n"
+	     "vle8.v v2, (%1)\n"
+	     "vse8.v v2, (%0)"
+	     :: "r"(dst), "r"(src), "r"(n)
+	     : "v2", "v3", "t1", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 64;
+	asm volatile (
+		"vsetvli t3, %2, e8, m4, ta, ma\n"
+		"vle8.v v8, (%1)\n"
+		"vse8.v v8, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		:  "v8", "v9", "v10", "v11", "t3", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 128;
+	asm volatile (
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"vse8.v v16, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy_prefetch128_2(src, dst);
+	_rte_mov128(dst, src);
+	_rte_mov128(dst + 128, src + 128);
+}
+
+static __rte_always_inline void
+_rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	asm volatile (
+		"prefetch.r 64(%1)\n"
+		"prefetch.w 64(%0)\n"
+		"prefetch.r 128(%1)\n"
+		"prefetch.w 128(%0)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.w 192(%0)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.w 256(%0)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.w 320(%0)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.w 384(%0)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.w 448(%0)\n"
+		"prefetch.r 512(%1)\n"
+		"li t6, 512\n"
+		"3:\n"
+		"li t5, 128;"
+		"vsetvli zero, t5, e8, m8, ta, ma\n"
+		"1:;"
+		"bgt %2, t6, 4f\n"
+		"j 2f\n"
+		"4:\n"
+		"prefetch.r 576(%1)\n"
+		"prefetch.r 640(%1)\n"
+		"2:\n"
+		"vle8.v   v16, (%1)\n"
+		"add      %1, %1, t5\n"
+		"vse8.v   v16, (%0)\n"
+		"add      %0, %0, t5\n"
+		"sub      %2, %2, t5\n"
+		"bnez     %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t5", "t6", "memory"
+	);
+}
+
+static __rte_always_inline void
+_rte_mov(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov_aligned(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"prefetch.r 128(%1)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"li t6, 640\n"
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"blt %2, t6, 3f\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"3:\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "t6", "memory"
+	 );
+}
+
+static __rte_always_inline void *
+_rte_memcpy(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	void *ret = dst;
+	size_t dstofss;
+	uint32_t bn;
+
+	if (n <= 384) {
+		if (n >= 256) {
+			memcpy_prefetch128_2(src, dst);
+			n -= 256;
+			_rte_mov128(dst, src);
+			_rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
+			src = (const uint8_t *)src + 256;
+			dst = (uint8_t *)dst + 256;
+		}
+		if (n >= 128) {
+			memcpy_prefetch128_1(src, dst);
+			n -= 128;
+			_rte_mov128(dst, src);
+			src = (const uint8_t *)src + 128;
+			dst = (uint8_t *)dst + 128;
+		}
+
+		if (n >= 64) {
+			memcpy_prefetch64_1(src, dst);
+			n -= 64;
+			_rte_mov64(dst, src);
+			src = (const uint8_t *)src + 64;
+			dst = (uint8_t *)dst + 64;
+		}
+
+		if (n > 32) {
+			_rte_mov32(dst, src);
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+			return ret;
+		}
+
+		if (n > 0) {
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+		}
+		return ret;
+	}
+
+	/**
+	 * Make store aligned when copy size exceeds 256 bytes.
+	 */
+	dstofss = (uintptr_t)dst & ALIGNMENT_MASK_64;
+	if (dstofss > 0) {
+		dstofss = 64 - dstofss;
+		n -= dstofss;
+		_rte_mov64(dst, src);
+		src = (const uint8_t *)src + dstofss;
+		dst = (uint8_t *)dst + dstofss;
+	}
+
+	/**
+	 * Copy 128-byte blocks
+	 */
+	if ((uintptr_t)src & ALIGNMENT_MASK_64)	{
+		bn = n - (n & ALIGNMENT_MASK_128);
+		_rte_mov128blocks(dst, src, bn);
+		n = n & ALIGNMENT_MASK_128;
+		src = (const uint8_t *)src + bn;
+		dst = (uint8_t *)dst + bn;
+		_rte_mov(dst, src, n);
+	} else
+		_rte_mov_aligned(dst, src, n);
+
+	return ret;
+}
+
+#endif
+
+/*----------------------api---------------------------------------------------*/
+static __rte_always_inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely((memcpy_alg == MEMCPY_RISCV) && (n >= 128) && (n < 2048)))
+		return _rte_memcpy((uint8_t *)dst, (const uint8_t *)src, n);
+#endif
+	return memcpy(dst, src, n);
+}
+
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -51,10 +329,31 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	memcpy(dst, src, 256);
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely(memcpy_alg == MEMCPY_RISCV))
+		_rte_mov256(dst, src);
+	else
+#endif
+		memcpy(dst, src, 256);
+}
+/*----------------------------------------------------------------------------*/
+#if defined(RTE_RISCV_FEATURE_V)
+static inline long
+riscv_vlenb(void)
+{
+	long vlenb;
+	asm ("csrr %0, 0xc22" : "=r"(vlenb));
+	return vlenb;
 }
 
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
+RTE_INIT(rte_vect_memcpy_init)
+{
+	long vlenb = riscv_vlenb();
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V) && (vlenb >= RISCV_VLENB))
+		memcpy_alg = MEMCPY_RISCV;
+}
+#endif
+
 
 #ifdef __cplusplus
 }
-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 26287 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* retest
  2025-10-21  6:56             ` [PATCH v4 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
@ 2025-10-24  2:56               ` Qiguo Chen
  2025-10-30 15:35                 ` retest Stephen Hemminger
  2025-10-24  3:04               ` retest Qiguo Chen
                                 ` (3 subsequent siblings)
  4 siblings, 1 reply; 25+ messages in thread
From: Qiguo Chen @ 2025-10-24  2:56 UTC (permalink / raw)
  To: dev


[-- Attachment #1.1: Type: multipart/alternative, Size: 1 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* retest
  2025-10-21  6:56             ` [PATCH v4 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
  2025-10-24  2:56               ` retest Qiguo Chen
@ 2025-10-24  3:04               ` Qiguo Chen
  2025-10-24  3:12               ` retest Qiguo Chen
                                 ` (2 subsequent siblings)
  4 siblings, 0 replies; 25+ messages in thread
From: Qiguo Chen @ 2025-10-24  3:04 UTC (permalink / raw)
  To: dev


[-- Attachment #1.1.1: Type: text/plain, Size: 6 bytes --]

retest

[-- Attachment #1.1.2: Type: text/html , Size: 12 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* retest
  2025-10-21  6:56             ` [PATCH v4 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
  2025-10-24  2:56               ` retest Qiguo Chen
  2025-10-24  3:04               ` retest Qiguo Chen
@ 2025-10-24  3:12               ` Qiguo Chen
  2025-10-24  5:04               ` retest Qiguo Chen
  2025-10-24  5:41               ` [PATCH v5 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
  4 siblings, 0 replies; 25+ messages in thread
From: Qiguo Chen @ 2025-10-24  3:12 UTC (permalink / raw)
  To: dev


[-- Attachment #1.1.1: Type: text/plain, Size: 6 bytes --]

retest

[-- Attachment #1.1.2: Type: text/html , Size: 12 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* retest
  2025-10-21  6:56             ` [PATCH v4 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
                                 ` (2 preceding siblings ...)
  2025-10-24  3:12               ` retest Qiguo Chen
@ 2025-10-24  5:04               ` Qiguo Chen
  2025-10-24  5:41               ` [PATCH v5 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
  4 siblings, 0 replies; 25+ messages in thread
From: Qiguo Chen @ 2025-10-24  5:04 UTC (permalink / raw)
  To: dev


[-- Attachment #1.1.1: Type: text/plain, Size: 9 bytes --]

ci retest

[-- Attachment #1.1.2: Type: text/html , Size: 20 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v5 0/1] Optimization Summary for RISC-V rte_memcpy
  2025-10-21  6:56             ` [PATCH v4 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
                                 ` (3 preceding siblings ...)
  2025-10-24  5:04               ` retest Qiguo Chen
@ 2025-10-24  5:41               ` Qiguo Chen
  2025-10-24  5:41                 ` [PATCH v5 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
  4 siblings, 1 reply; 25+ messages in thread
From: Qiguo Chen @ 2025-10-24  5:41 UTC (permalink / raw)
  To: sunyuechi, stanislaw.kardach, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 1276 bytes --]

Changes in v5:
     1)to trigger CI only.

Changes in v4:
     1)rebase code only.

Changes in v3:
     1)Change title for patch2.
     2)Apply correct patch version for patch2.
       Thanks to Sunyuechi for the reminder.

Changes in v2:
     1)Modify some codes and descriptions according to Sunyuechi's
     suggestions.
     2)Removed benchmark_report.txt to avoid warnings.

[PATCH v1]
I've implemented optimizations to rte_memcpy targeting RISC-V
architectures, achieving an average 10%~15% reduction in execution time
for data sizes between 129 and 1024 bytes (1025~1600 bytes gain little).
These enhancements draw inspiration from x86 implementations,
specifically focusing on:
1)Alignment Handling for Unaligned Scenarios
2)Vector Configuration Tuning
3)Strategic Prefetching with zicbop

- Patch 1: Cover letter
- Patch 2: Base implementation
- Patch 3: Benchmark report


Tested on SG2044 (VLEN=128)


Qiguo Chen (1):
  eal/riscv: optimize rte_memcpy with vector and zicbop extensions

 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 303 ++++++++++++++++++++++++++++-
 3 files changed, 316 insertions(+), 2 deletions(-)

-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 2522 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v5 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions
  2025-10-24  5:41               ` [PATCH v5 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
@ 2025-10-24  5:41                 ` Qiguo Chen
  2025-10-24  7:27                   ` [PATCH v6 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
  2025-10-24 16:27                   ` [PATCH v5 " Stephen Hemminger
  0 siblings, 2 replies; 25+ messages in thread
From: Qiguo Chen @ 2025-10-24  5:41 UTC (permalink / raw)
  To: sunyuechi, stanislaw.kardach, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 9649 bytes --]

This patch uses RISC-V vector instructions and zicbop prefetching to
optimize memory copies for 129~1600 byte ranges.

Signed-off-by: Qiguo Chen <chen.qiguo@zte.com.cn>
---
 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 303 ++++++++++++++++++++++++++++-
 3 files changed, 316 insertions(+), 2 deletions(-)

diff --git a/.mailmap b/.mailmap
index 3817bf7cdb..85f50bce87 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1283,6 +1283,7 @@ Qian Hao <qi_an_hao@126.com>
 Qian Xu <qian.q.xu@intel.com>
 Qiao Liu <qiao.liu@intel.com>
 Qi Fu <qi.fu@intel.com>
+Qiguo Chen <chen.qiguo@zte.com.cn>
 Qimai Xiao <qimaix.xiao@intel.com>
 Qiming Chen <chenqiming_huawei@163.com>
 Qiming Yang <qiming.yang@intel.com>
diff --git a/config/riscv/meson.build b/config/riscv/meson.build
index f3daea0c0e..abba474b5e 100644
--- a/config/riscv/meson.build
+++ b/config/riscv/meson.build
@@ -146,6 +146,20 @@ if (riscv_extension_macros and
     endif
 endif
 
+# detect extensions
+# Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
+if (riscv_extension_macros and
+    (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
+  if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
+      or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
+      message('Compiling with the zicbop extension')
+      machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
+  else
+    warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
+  endif
+endif
+
+
 # apply flags
 foreach flag: dpdk_flags
     if flag.length() > 0
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index d8a942c5d2..fb817e5f43 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -11,6 +11,7 @@
 #include <string.h>
 
 #include "rte_common.h"
+#include <rte_branch_prediction.h>
 
 #include "generic/rte_memcpy.h"
 
@@ -18,6 +19,283 @@
 extern "C" {
 #endif
 
+
+#if defined(RTE_RISCV_FEATURE_V) && !(defined(RTE_RISCV_FEATURE_PREFETCH))
+#undef RTE_RISCV_FEATURE_V
+#endif
+
+
+#if defined(RTE_RISCV_FEATURE_V)
+
+#include "rte_cpuflags.h"
+
+#define RISCV_VLENB         16
+#define MEMCPY_GLIBC        (1U << 0)
+#define MEMCPY_RISCV        (1U << 1)
+#define ALIGNMENT_MASK_128  0x7F
+#define ALIGNMENT_MASK_64   0x3F
+
+static uint8_t memcpy_alg = MEMCPY_GLIBC;
+
+
+static __rte_always_inline void
+memcpy_prefetch64_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 64(%0)\n"
+		"prefetch.w 64(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0)\n"
+		"prefetch.w 128(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_2(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0);"
+		"prefetch.w 128(%1);"
+		"prefetch.r 192(%0);"
+		"prefetch.w 192(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+
+static __rte_always_inline void
+_rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 32;
+	asm volatile (
+	     "vsetvli t1, %2, e8, m2, ta, ma\n"
+	     "vle8.v v2, (%1)\n"
+	     "vse8.v v2, (%0)"
+	     :: "r"(dst), "r"(src), "r"(n)
+	     : "v2", "v3", "t1", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 64;
+	asm volatile (
+		"vsetvli t3, %2, e8, m4, ta, ma\n"
+		"vle8.v v8, (%1)\n"
+		"vse8.v v8, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		:  "v8", "v9", "v10", "v11", "t3", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 128;
+	asm volatile (
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"vse8.v v16, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy_prefetch128_2(src, dst);
+	_rte_mov128(dst, src);
+	_rte_mov128(dst + 128, src + 128);
+}
+
+static __rte_always_inline void
+_rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	asm volatile (
+		"prefetch.r 64(%1)\n"
+		"prefetch.w 64(%0)\n"
+		"prefetch.r 128(%1)\n"
+		"prefetch.w 128(%0)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.w 192(%0)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.w 256(%0)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.w 320(%0)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.w 384(%0)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.w 448(%0)\n"
+		"prefetch.r 512(%1)\n"
+		"li t6, 512\n"
+		"3:\n"
+		"li t5, 128;"
+		"vsetvli zero, t5, e8, m8, ta, ma\n"
+		"1:;"
+		"bgt %2, t6, 4f\n"
+		"j 2f\n"
+		"4:\n"
+		"prefetch.r 576(%1)\n"
+		"prefetch.r 640(%1)\n"
+		"2:\n"
+		"vle8.v   v16, (%1)\n"
+		"add      %1, %1, t5\n"
+		"vse8.v   v16, (%0)\n"
+		"add      %0, %0, t5\n"
+		"sub      %2, %2, t5\n"
+		"bnez     %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t5", "t6", "memory"
+	);
+}
+
+static __rte_always_inline void
+_rte_mov(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov_aligned(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"prefetch.r 128(%1)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"li t6, 640\n"
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"blt %2, t6, 3f\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"3:\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "t6", "memory"
+	 );
+}
+
+static __rte_always_inline void *
+_rte_memcpy(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	void *ret = dst;
+	size_t dstofss;
+	uint32_t bn;
+
+	if (n <= 384) {
+		if (n >= 256) {
+			memcpy_prefetch128_2(src, dst);
+			n -= 256;
+			_rte_mov128(dst, src);
+			_rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
+			src = (const uint8_t *)src + 256;
+			dst = (uint8_t *)dst + 256;
+		}
+		if (n >= 128) {
+			memcpy_prefetch128_1(src, dst);
+			n -= 128;
+			_rte_mov128(dst, src);
+			src = (const uint8_t *)src + 128;
+			dst = (uint8_t *)dst + 128;
+		}
+
+		if (n >= 64) {
+			memcpy_prefetch64_1(src, dst);
+			n -= 64;
+			_rte_mov64(dst, src);
+			src = (const uint8_t *)src + 64;
+			dst = (uint8_t *)dst + 64;
+		}
+
+		if (n > 32) {
+			_rte_mov32(dst, src);
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+			return ret;
+		}
+
+		if (n > 0) {
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+		}
+		return ret;
+	}
+
+	/**
+	 * Make store aligned when copy size exceeds 256 bytes.
+	 */
+	dstofss = (uintptr_t)dst & ALIGNMENT_MASK_64;
+	if (dstofss > 0) {
+		dstofss = 64 - dstofss;
+		n -= dstofss;
+		_rte_mov64(dst, src);
+		src = (const uint8_t *)src + dstofss;
+		dst = (uint8_t *)dst + dstofss;
+	}
+
+	/**
+	 * Copy 128-byte blocks
+	 */
+	if ((uintptr_t)src & ALIGNMENT_MASK_64)	{
+		bn = n - (n & ALIGNMENT_MASK_128);
+		_rte_mov128blocks(dst, src, bn);
+		n = n & ALIGNMENT_MASK_128;
+		src = (const uint8_t *)src + bn;
+		dst = (uint8_t *)dst + bn;
+		_rte_mov(dst, src, n);
+	} else
+		_rte_mov_aligned(dst, src, n);
+
+	return ret;
+}
+
+#endif
+
+/*----------------------api---------------------------------------------------*/
+static __rte_always_inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely((memcpy_alg == MEMCPY_RISCV) && (n >= 128) && (n < 2048)))
+		return _rte_memcpy((uint8_t *)dst, (const uint8_t *)src, n);
+#endif
+	return memcpy(dst, src, n);
+}
+
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -51,10 +329,31 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	memcpy(dst, src, 256);
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely(memcpy_alg == MEMCPY_RISCV))
+		_rte_mov256(dst, src);
+	else
+#endif
+		memcpy(dst, src, 256);
+}
+/*----------------------------------------------------------------------------*/
+#if defined(RTE_RISCV_FEATURE_V)
+static inline long
+riscv_vlenb(void)
+{
+	long vlenb;
+	asm ("csrr %0, 0xc22" : "=r"(vlenb));
+	return vlenb;
 }
 
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
+RTE_INIT(rte_vect_memcpy_init)
+{
+	long vlenb = riscv_vlenb();
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V) && (vlenb >= RISCV_VLENB))
+		memcpy_alg = MEMCPY_RISCV;
+}
+#endif
+
 
 #ifdef __cplusplus
 }
-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 26287 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v6 0/1] Optimization Summary for RISC-V rte_memcpy
  2025-10-24  5:41                 ` [PATCH v5 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
@ 2025-10-24  7:27                   ` Qiguo Chen
  2025-10-24  7:27                     ` [PATCH v6 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
  2025-10-24 16:27                   ` [PATCH v5 " Stephen Hemminger
  1 sibling, 1 reply; 25+ messages in thread
From: Qiguo Chen @ 2025-10-24  7:27 UTC (permalink / raw)
  To: sunyuechi, stanislaw.kardach, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 1325 bytes --]

Changes in v6:
     1) Resolve .mailmap conflict.

Changes in v5:
     1) Trigger CI only.

Changes in v4:
     1) Rebase code only.

Changes in v3:
     1) Change title for patch 2.
     2) Apply correct patch version for patch 2.
        Thanks to Sunyuechi for the reminder.

Changes in v2:
     1) Modify some code and descriptions according to Sunyuechi's
        suggestions.
     2) Removed benchmark_report.txt to avoid warnings.

[PATCH v1]
I've implemented optimizations to rte_memcpy targeting RISC-V
architectures, achieving an average 10%~15% reduction in execution time
for data sizes between 129 and 1024 bytes (1025~1600 bytes gain little).
These enhancements draw inspiration from the x86 implementation,
specifically focusing on:
1) Alignment Handling for Unaligned Scenarios
2) Vector Configuration Tuning
3) Strategic Prefetching with zicbop

- Patch 1: Cover letter
- Patch 2: Base implementation
- Patch 3: Benchmark report


Tested on SG2044 (VLEN=128)
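
As a rough illustration of point 1) above, here is a small standalone sketch
(not part of the patch; the helper name is made up for this mail) of how the
destination is brought to 64-byte alignment before the 128-byte block loop.
The vector path itself is only entered for the 128..2047 byte window guarded
in rte_memcpy().

#include <stdint.h>
#include <stddef.h>

#define ALIGNMENT_MASK_64  0x3F		/* same constant as in the patch */

/* Illustrative helper only: number of bytes the head copy has to cover so
 * that the bulk loop then runs on a 64-byte-aligned destination. */
static inline size_t
align_head_bytes(uintptr_t dst_addr)
{
	size_t dstofss = dst_addr & ALIGNMENT_MASK_64;	/* offset inside the 64B line */

	if (dstofss > 0)
		dstofss = 64 - dstofss;			/* distance to the next boundary */
	return dstofss;
}

/* Example: a destination ending in 0x28 is 40 bytes into a line, so the head
 * is 24 bytes; the patch still issues one 64-byte vector copy for it and then
 * advances src/dst by only 24, letting the next block rewrite the overlap. */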

Qiguo Chen (1):
  eal/riscv: optimize rte_memcpy with vector and zicbop extensions

 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 303 ++++++++++++++++++++++++++++-
 3 files changed, 316 insertions(+), 2 deletions(-)

-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 2624 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v6 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions
  2025-10-24  7:27                   ` [PATCH v6 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
@ 2025-10-24  7:27                     ` Qiguo Chen
  2025-11-17  4:19                       ` sunyuechi
  0 siblings, 1 reply; 25+ messages in thread
From: Qiguo Chen @ 2025-10-24  7:27 UTC (permalink / raw)
  To: sunyuechi, stanislaw.kardach, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 9698 bytes --]

This patch uses RISC-V vector instructions and zicbop prefetching to
optimize memory copies for 129~1600 byte ranges.

Signed-off-by: Qiguo Chen <chen.qiguo@zte.com.cn>
---
 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 303 ++++++++++++++++++++++++++++-
 3 files changed, 316 insertions(+), 2 deletions(-)

diff --git a/.mailmap b/.mailmap
index e4d0590451..8fcdc518f9 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1291,6 +1291,7 @@ Qi Zhang <qi.z.zhang@intel.com>
 Qian Hao <qi_an_hao@126.com>
 Qian Xu <qian.q.xu@intel.com>
 Qiao Liu <qiao.liu@intel.com>
+Qiguo Chen <chen.qiguo@zte.com.cn>
 Qimai Xiao <qimaix.xiao@intel.com>
 Qiming Chen <chenqiming_huawei@163.com>
 Qiming Yang <qiming.yang@intel.com>
diff --git a/config/riscv/meson.build b/config/riscv/meson.build
index f3daea0c0e..abba474b5e 100644
--- a/config/riscv/meson.build
+++ b/config/riscv/meson.build
@@ -146,6 +146,20 @@ if (riscv_extension_macros and
     endif
 endif
 
+# detect extensions
+# Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
+if (riscv_extension_macros and
+    (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
+  if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
+      or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
+      message('Compiling with the zicbop extension')
+      machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
+  else
+    warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
+  endif
+endif
+
+
 # apply flags
 foreach flag: dpdk_flags
     if flag.length() > 0
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index d8a942c5d2..1be3ad748a 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -11,13 +11,291 @@
 #include <string.h>
 
 #include "rte_common.h"
+#include <rte_branch_prediction.h>
 
 #include "generic/rte_memcpy.h"
 
+#if defined(RTE_RISCV_FEATURE_V) && !(defined(RTE_RISCV_FEATURE_PREFETCH))
+#undef RTE_RISCV_FEATURE_V
+#endif
+
+#if defined(RTE_RISCV_FEATURE_V)
+#include "rte_cpuflags.h"
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#if defined(RTE_RISCV_FEATURE_V)
+
+#define RISCV_VLENB         16
+#define MEMCPY_GLIBC        (1U << 0)
+#define MEMCPY_RISCV        (1U << 1)
+#define ALIGNMENT_MASK_128  0x7F
+#define ALIGNMENT_MASK_64   0x3F
+
+static uint8_t memcpy_alg = MEMCPY_GLIBC;
+
+
+static __rte_always_inline void
+memcpy_prefetch64_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 64(%0)\n"
+		"prefetch.w 64(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0)\n"
+		"prefetch.w 128(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_2(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0);"
+		"prefetch.w 128(%1);"
+		"prefetch.r 192(%0);"
+		"prefetch.w 192(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+
+static __rte_always_inline void
+_rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 32;
+	asm volatile (
+	     "vsetvli t1, %2, e8, m2, ta, ma\n"
+	     "vle8.v v2, (%1)\n"
+	     "vse8.v v2, (%0)"
+	     :: "r"(dst), "r"(src), "r"(n)
+	     : "v2", "v3", "t1", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 64;
+	asm volatile (
+		"vsetvli t3, %2, e8, m4, ta, ma\n"
+		"vle8.v v8, (%1)\n"
+		"vse8.v v8, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		:  "v8", "v9", "v10", "v11", "t3", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 128;
+	asm volatile (
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"vse8.v v16, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy_prefetch128_2(src, dst);
+	_rte_mov128(dst, src);
+	_rte_mov128(dst + 128, src + 128);
+}
+
+static __rte_always_inline void
+_rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	asm volatile (
+		"prefetch.r 64(%1)\n"
+		"prefetch.w 64(%0)\n"
+		"prefetch.r 128(%1)\n"
+		"prefetch.w 128(%0)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.w 192(%0)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.w 256(%0)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.w 320(%0)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.w 384(%0)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.w 448(%0)\n"
+		"prefetch.r 512(%1)\n"
+		"li t6, 512\n"
+		"3:\n"
+		"li t5, 128;"
+		"vsetvli zero, t5, e8, m8, ta, ma\n"
+		"1:;"
+		"bgt %2, t6, 4f\n"
+		"j 2f\n"
+		"4:\n"
+		"prefetch.r 576(%1)\n"
+		"prefetch.r 640(%1)\n"
+		"2:\n"
+		"vle8.v   v16, (%1)\n"
+		"add      %1, %1, t5\n"
+		"vse8.v   v16, (%0)\n"
+		"add      %0, %0, t5\n"
+		"sub      %2, %2, t5\n"
+		"bnez     %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t5", "t6", "memory"
+	);
+}
+
+static __rte_always_inline void
+_rte_mov(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov_aligned(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"prefetch.r 128(%1)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"li t6, 640\n"
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"blt %2, t6, 3f\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"3:\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "t6", "memory"
+	 );
+}
+
+static __rte_always_inline void *
+_rte_memcpy(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	void *ret = dst;
+	size_t dstofss;
+	uint32_t bn;
+
+	if (n <= 384) {
+		if (n >= 256) {
+			memcpy_prefetch128_2(src, dst);
+			n -= 256;
+			_rte_mov128(dst, src);
+			_rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
+			src = (const uint8_t *)src + 256;
+			dst = (uint8_t *)dst + 256;
+		}
+		if (n >= 128) {
+			memcpy_prefetch128_1(src, dst);
+			n -= 128;
+			_rte_mov128(dst, src);
+			src = (const uint8_t *)src + 128;
+			dst = (uint8_t *)dst + 128;
+		}
+
+		if (n >= 64) {
+			memcpy_prefetch64_1(src, dst);
+			n -= 64;
+			_rte_mov64(dst, src);
+			src = (const uint8_t *)src + 64;
+			dst = (uint8_t *)dst + 64;
+		}
+
+		if (n > 32) {
+			_rte_mov32(dst, src);
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+			return ret;
+		}
+
+		if (n > 0) {
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+		}
+		return ret;
+	}
+
+	/**
+	 * Make store aligned when copy size exceeds 256 bytes.
+	 */
+	dstofss = (uintptr_t)dst & ALIGNMENT_MASK_64;
+	if (dstofss > 0) {
+		dstofss = 64 - dstofss;
+		n -= dstofss;
+		_rte_mov64(dst, src);
+		src = (const uint8_t *)src + dstofss;
+		dst = (uint8_t *)dst + dstofss;
+	}
+
+	/**
+	 * Copy 128-byte blocks
+	 */
+	if ((uintptr_t)src & ALIGNMENT_MASK_64)	{
+		bn = n - (n & ALIGNMENT_MASK_128);
+		_rte_mov128blocks(dst, src, bn);
+		n = n & ALIGNMENT_MASK_128;
+		src = (const uint8_t *)src + bn;
+		dst = (uint8_t *)dst + bn;
+		_rte_mov(dst, src, n);
+	} else
+		_rte_mov_aligned(dst, src, n);
+
+	return ret;
+}
+
+#endif
+
+/*----------------------api---------------------------------------------------*/
+static __rte_always_inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely((memcpy_alg == MEMCPY_RISCV) && (n >= 128) && (n < 2048)))
+		return _rte_memcpy((uint8_t *)dst, (const uint8_t *)src, n);
+#endif
+	return memcpy(dst, src, n);
+}
+
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -51,10 +329,31 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	memcpy(dst, src, 256);
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely(memcpy_alg == MEMCPY_RISCV))
+		_rte_mov256(dst, src);
+	else
+#endif
+		memcpy(dst, src, 256);
+}
+/*----------------------------------------------------------------------------*/
+#if defined(RTE_RISCV_FEATURE_V)
+static inline long
+riscv_vlenb(void)
+{
+	long vlenb;
+	asm ("csrr %0, 0xc22" : "=r"(vlenb));
+	return vlenb;
 }
 
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
+RTE_INIT(rte_vect_memcpy_init)
+{
+	long vlenb = riscv_vlenb();
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V) && (vlenb >= RISCV_VLENB))
+		memcpy_alg = MEMCPY_RISCV;
+}
+#endif
+
 
 #ifdef __cplusplus
 }
-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 26336 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v5 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions
  2025-10-24  5:41                 ` [PATCH v5 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
  2025-10-24  7:27                   ` [PATCH v6 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
@ 2025-10-24 16:27                   ` Stephen Hemminger
  2025-11-17  4:11                     ` sunyuechi
  1 sibling, 1 reply; 25+ messages in thread
From: Stephen Hemminger @ 2025-10-24 16:27 UTC (permalink / raw)
  To: Qiguo Chen; +Cc: sunyuechi, stanislaw.kardach, dev, bruce.richardson

On Fri, 24 Oct 2025 13:41:28 +0800
Qiguo Chen <chen.qiguo@zte.com.cn> wrote:

> This patch uses RISC-V vector instructions and zicbop prefetching to
> optimize memory copies for 129~1600 byte ranges.
> 
> Signed-off-by: Qiguo Chen <chen.qiguo@zte.com.cn>
> ---

Is there any possibility of build environment being newer than
the run time? The Intel builds already have to deal with that problem.
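
For reference, the run-time gate that the patch applies to the vector path is
reproduced below with added comments (sketch only, lifted from the patch).
Note there is no corresponding run-time check for the zicbop prefetch path,
which is enabled purely from the build-time RTE_RISCV_FEATURE_PREFETCH macro.

#if defined(RTE_RISCV_FEATURE_V)
RTE_INIT(rte_vect_memcpy_init)
{
	long vlenb = riscv_vlenb();	/* reads the vlenb CSR (0xc22) */

	/* Select the vector copy only when the running CPU advertises the V
	 * extension and a sufficient VLEN; otherwise memcpy_alg stays
	 * MEMCPY_GLIBC and rte_memcpy() falls back to plain memcpy(). */
	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V) && vlenb >= RISCV_VLENB)
		memcpy_alg = MEMCPY_RISCV;
}
#endif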

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: retest
  2025-10-24  2:56               ` retest Qiguo Chen
@ 2025-10-30 15:35                 ` Stephen Hemminger
  0 siblings, 0 replies; 25+ messages in thread
From: Stephen Hemminger @ 2025-10-30 15:35 UTC (permalink / raw)
  To: dev

On Fri, 24 Oct 2025 10:56:47 +0800
Qiguo Chen <chen.qiguo@zte.com.cn> wrote:


> Currently, various testing labs perform CI testing on new patch series sent
> to dev at dpdk.org and report their results to
> https://patchwork.dpdk.org/project/dpdk/list/. On each series on the patch
> list, the results appear in the test category contexts for IOL (community
> lab), GitHub, and LoongSon.
> 
> If a reported failure on a series seems suspicious to the patch submitter
> or maintainer, then there may be an interest in requesting a retest on the
> series for the failing label(s) in order to verify the failure is not
> spurious or a false positive. This retest demonstrates to the submitter or
> maintainer that the failure can be reliably reproduced. Unfortunately, at
> present, the best way to accomplish this is to reach out to lab maintainers
> via email or Slack. This is not ideal for developers in need of quick test
> results.
> 
> Going forward, CI testing labs will be implementing the option to request
> retest for their respective test labels on patchwork via emails sent to the
> dev mailing list. This feature is ready today for labels reported by the
> UNH-IOL Community Lab, and will soon also be an option for the Github Robot
> at least.
> 
> In order to request a retest on your patch series, send an email reply to
> one of your series’s patch or cover letter emails with email content of the
> format used below:
> 
> Recheck-request: <test names>


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v5 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions
  2025-10-24 16:27                   ` [PATCH v5 " Stephen Hemminger
@ 2025-11-17  4:11                     ` sunyuechi
  0 siblings, 0 replies; 25+ messages in thread
From: sunyuechi @ 2025-11-17  4:11 UTC (permalink / raw)
  To: Stephen Hemminger, Qiguo Chen; +Cc: stanislaw.kardach, dev, bruce.richardson


>> This patch uses RISC-V vector instructions and zicbop prefetching to
>> optimize memory copies for 129~1600 byte ranges.
>>
>> Signed-off-by: Qiguo Chen <chen.qiguo@zte.com.cn>
>> ---
> Is there any possibility of build environment being newer than
> the run time? The Intel builds already have to deal with that problem.

Yes, it's best to use hwprobe for runtime detection, but the hwprobe 
detection for the zicbop extension hasn't been merged into the kernel yet.
I'd like to refer to how other projects handle RISC-V integration; 
disabling it by default at compile time should also be acceptable.
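
For readers not familiar with it, a minimal sketch of hwprobe-based detection,
assuming a kernel recent enough to provide <asm/hwprobe.h> and the
RISCV_HWPROBE_IMA_V flag; as noted above there is no zicbop flag yet, so that
part remains hypothetical:

#include <stdbool.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/hwprobe.h>	/* struct riscv_hwprobe, RISCV_HWPROBE_* (Linux 6.4+) */

static bool
have_rvv_via_hwprobe(void)
{
	struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_IMA_EXT_0 };

	/* One key/value pair, all CPUs (empty cpu set), no flags. */
	if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0) != 0)
		return false;

	/* A zicbop bit would be tested the same way once the kernel defines
	 * one; no such RISCV_HWPROBE_* flag exists at the time of writing. */
	return (pair.value & RISCV_HWPROBE_IMA_V) != 0;
}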


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v6 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions
  2025-10-24  7:27                     ` [PATCH v6 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
@ 2025-11-17  4:19                       ` sunyuechi
  2025-11-17  9:12                         ` chen.qiguo
  0 siblings, 1 reply; 25+ messages in thread
From: sunyuechi @ 2025-11-17  4:19 UTC (permalink / raw)
  To: Qiguo Chen, stanislaw.kardach, stephen; +Cc: dev, bruce.richardson

 > > # detect extensions
 > > # Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
 > > if (riscv_extension_macros and
 > >     (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
 > >   if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
 > >       or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
 > >       message('Compiling with the zicbop extension')
 > >       machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
 > >   else
 > >     warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
 > >   endif
 > > endif
 >
 > The implementation does not involve intrinsics

It looks like nothing has been changed here yet.

 > #if defined(RTE_RISCV_FEATURE_V) && !(defined(RTE_RISCV_FEATURE_PREFETCH))
 > #undef RTE_RISCV_FEATURE_V
 > #endif
 >
 > static __rte_always_inline void
 > _rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 > {
 >     asm volatile (
 >         "prefetch.r 64(%1)\n"
 >         "prefetch.w 64(%0)\n"
 >         "prefetch.r 128(%1)\n"
 >         "prefetch.w 128(%0)\n"
 >         "prefetch.r 192(%1)\n"
 >         "prefetch.w 192(%0)\n"
 >         "prefetch.r 256(%1)\n"
 >         "prefetch.w 256(%0)\n"
 >         "prefetch.r 320(%1)\n"
 >         "prefetch.w 320(%0)\n"
 >         "prefetch.r 384(%1)\n"
 >         "prefetch.w 384(%0)\n"
 >         "prefetch.r 448(%1)\n"
 >         "prefetch.w 448(%0)\n"
 >         "prefetch.r 512(%1)\n"
 >         "li t6, 512\n"
 >         "3:\n"
 >         "li t5, 128;"
 >         "vsetvli zero, t5, e8, m8, ta, ma\n"

With the current compilation conditions, if zicbop isn’t supported, the 
v-optimization also won’t be compiled.
Have you tested the performance difference if you remove these 
prefetches and only use v?
Can we use a condition like this to support only v?

#if defined(RTE_RISCV_FEATURE_V)
    #if (defined(RTE_RISCV_FEATURE_PREFETCH))
         ...
    #endif
     ...
#endif
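
Roughly, that split could look like the following in rte_memcpy.h, with the
standalone prefetch helpers turned into no-ops when only V is available
(untested sketch; the prefetches embedded in the larger asm blocks would need
the same treatment):

#if defined(RTE_RISCV_FEATURE_V)

#if defined(RTE_RISCV_FEATURE_PREFETCH)
static __rte_always_inline void
memcpy_prefetch64_1(const uint8_t *src, uint8_t *dst)
{
	__asm__ (
		"prefetch.r 64(%0)\n"
		"prefetch.w 64(%1)"
		:: "r"(src), "r"(dst)
	);
}
#else
/* No zicbop: keep the vector copy paths but drop the prefetch hints. */
static __rte_always_inline void
memcpy_prefetch64_1(const uint8_t *src, uint8_t *dst)
{
	RTE_SET_USED(src);
	RTE_SET_USED(dst);
}
#endif /* RTE_RISCV_FEATURE_PREFETCH */

/* memcpy_prefetch128_1/_2 would be handled the same way; the vector copy
 * routines themselves stay unchanged. */

#endif /* RTE_RISCV_FEATURE_V */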


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v6 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions
  2025-11-17  4:19                       ` sunyuechi
@ 2025-11-17  9:12                         ` chen.qiguo
  0 siblings, 0 replies; 25+ messages in thread
From: chen.qiguo @ 2025-11-17  9:12 UTC (permalink / raw)
  To: sunyuechi; +Cc: stanislaw.kardach, stephen, dev, bruce.richardson


[-- Attachment #1.1.1: Type: text/plain, Size: 3900 bytes --]

> > # detect extensions
> > # Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
> > if (riscv_extension_macros and
> >     (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
> >   if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
> >       or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
> >       message('Compiling with the zicbop extension')
> >       machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
> >   else
> >     warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
> >   endif
> > endif
> >
> > The implementation does not involve intrinsics
>
> It looks like nothing has been changed here yet.

Sorry, I did not notice this. I'll revise it later.

> With the current compilation conditions, if zicbop isn’t supported, the
> v-optimization also won’t be compiled.
> Have you tested the performance difference if you remove these
> prefetches and only use v?

Yes. When we use vector but without zicbop, the performance is worse than memcpy.

> Can we use a condition like this to support only v?

Since the code affects several areas and for the reason mentioned above, I
prefer to keep the current logic, as it looks simpler.

Thanks again for your review.

Original


From: sunyuechi <sunyuechi@iscas.ac.cn>
To: 陈其国10108961;stanislaw.kardach@gmail.com <stanislaw.kardach@gmail.com>;stephen@networkplumber.org <stephen@networkplumber.org>;
Cc: dev@dpdk.org <dev@dpdk.org>;bruce.richardson@intel.com <bruce.richardson@intel.com>;
Date: 2025年11月17日 12:19
Subject: Re: [PATCH v6 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions

 > > # detect extensions
 > > # Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
 > > if (riscv_extension_macros and
 > >     (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
 > >   if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
 > >       or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
 > >       message('Compiling with the zicbop extension')
 > >       machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
 > >   else
 > >     warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
 > >   endif
 > > endif
 > 
 > The implementation does not involve intrinsics
 
It looks like nothing has been changed here yet.
 
 > #if defined(RTE_RISCV_FEATURE_V) && !(defined(RTE_RISCV_FEATURE_PREFETCH))
 > #undef RTE_RISCV_FEATURE_V
 > #endif
 > 
 > static __rte_always_inline void
 > _rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 > {
 >     asm volatile (
 >         "prefetch.r 64(%1)\n" 
 >         "prefetch.w 64(%0)\n" 
 >         "prefetch.r 128(%1)\n" 
 >         "prefetch.w 128(%0)\n" 
 >         "prefetch.r 192(%1)\n" 
 >         "prefetch.w 192(%0)\n" 
 >         "prefetch.r 256(%1)\n" 
 >         "prefetch.w 256(%0)\n" 
 >         "prefetch.r 320(%1)\n" 
 >         "prefetch.w 320(%0)\n" 
 >         "prefetch.r 384(%1)\n" 
 >         "prefetch.w 384(%0)\n" 
 >         "prefetch.r 448(%1)\n" 
 >         "prefetch.w 448(%0)\n" 
 >         "prefetch.r 512(%1)\n" 
 >         "li t6, 512\n" 
 >         "3:\n" 
 >         "li t5, 128;" 
 >         "vsetvli zero, t5, e8, m8, ta, ma\n" 
 
With the current compilation conditions, if zicbop isn’t supported, the  
v-optimization also won’t be compiled.
Have you tested the performance difference if you remove these  
prefetches and only use v?
Can we use a condition like this to support only v?
 
#if defined(RTE_RISCV_FEATURE_V)
    #if (defined(RTE_RISCV_FEATURE_PREFETCH))
         ...
    #endif
     ...
#endif

[-- Attachment #1.1.2: Type: text/html , Size: 9845 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2025-11-17  9:13 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-16  9:09 [PATCH v1 0/2] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
2025-10-16  9:09 ` [PATCH v1 1/2] riscv support rte_memcpy in vector Qiguo Chen
2025-10-17  5:29   ` sunyuechi
2025-10-17 10:10     ` chen.qiguo
2025-10-17  9:36   ` [PATCH v2 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
2025-10-17  9:36     ` [PATCH v2 1/1] riscv support rte_memcpy in vector Qiguo Chen
2025-10-20  9:43       ` sunyuechi
2025-10-20 12:08       ` [PATCH v3 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
2025-10-20 12:08         ` [PATCH v3 1/1] lib/eal/riscv: optimize rte_memcpy with RISCV vector and zicbop extensions Qiguo Chen
2025-10-21  6:56           ` [PATCH v4 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
2025-10-21  6:56             ` [PATCH v4 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
2025-10-24  2:56               ` retest Qiguo Chen
2025-10-30 15:35                 ` retest Stephen Hemminger
2025-10-24  3:04               ` retest Qiguo Chen
2025-10-24  3:12               ` retest Qiguo Chen
2025-10-24  5:04               ` retest Qiguo Chen
2025-10-24  5:41               ` [PATCH v5 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
2025-10-24  5:41                 ` [PATCH v5 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
2025-10-24  7:27                   ` [PATCH v6 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
2025-10-24  7:27                     ` [PATCH v6 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
2025-11-17  4:19                       ` sunyuechi
2025-11-17  9:12                         ` chen.qiguo
2025-10-24 16:27                   ` [PATCH v5 " Stephen Hemminger
2025-11-17  4:11                     ` sunyuechi
2025-10-16  9:09 ` [PATCH v1 2/2] benchmark report for rte_memcpy Qiguo Chen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).