DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH 1/2] lib/eal: add amd epyc2 memcpy routine to eal
@ 2021-08-23  8:44 Aman Kumar
  2021-08-23  8:44 ` [dpdk-dev] [PATCH 2/2] net/mlx5: optimize mprq memcpy for AMD EPYC2 platforms Aman Kumar
                   ` (2 more replies)
  0 siblings, 3 replies; 43+ messages in thread
From: Aman Kumar @ 2021-08-23  8:44 UTC (permalink / raw)
  To: dev
  Cc: rasland, asafp, shys, viacheslavo, akozyrev, matan,
	anatoly.burakov, keesang.song, aman.kumar

This patch provides rte_memcpy* calls optimized for
AMD EPYC Gen2 platforms. This option is disabled by
default and can be enabled by setting the
'rte_memcpy_amdepyc2' meson build option to true.

Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
---
 lib/eal/x86/include/meson.build  |   1 +
 lib/eal/x86/include/rte_memcpy.h | 502 +++++++++++++++++++++++++++++++
 meson_options.txt                |   2 +
 3 files changed, 505 insertions(+)

diff --git a/lib/eal/x86/include/meson.build b/lib/eal/x86/include/meson.build
index 12c2e00035..a03683779d 100644
--- a/lib/eal/x86/include/meson.build
+++ b/lib/eal/x86/include/meson.build
@@ -27,3 +27,4 @@ arch_indirect_headers = files(
 )
 install_headers(arch_headers + arch_indirect_headers, subdir: get_option('include_subdir_arch'))
 dpdk_chkinc_headers += arch_headers
+dpdk_conf.set('RTE_MEMCPY_AMDEPYC2', get_option('rte_memcpy_amdepyc2'))
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 79f381dd9b..47dda9cb87 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -368,6 +368,498 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+#if defined RTE_MEMCPY_AMDEPYC2
+
+/**
+ * Copy 16 bytes from one location to another,
+ * with temporal stores
+ */
+static __rte_always_inline void
+rte_copy16_ts(uint8_t *dst, uint8_t *src)
+{
+	__m128i var128;
+
+	var128 = _mm_stream_load_si128((__m128i *)src);
+	_mm_storeu_si128((__m128i *)dst, var128);
+}
+
+/**
+ * Copy 32 bytes from one location to another,
+ * with temporal stores
+ */
+static __rte_always_inline void
+rte_copy32_ts(uint8_t *dst, uint8_t *src)
+{
+	__m256i ymm0;
+
+	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+}
+
+/**
+ * Copy 64 bytes from one location to another,
+ * with temporal stores
+ */
+static __rte_always_inline void
+rte_copy64_ts(uint8_t *dst, uint8_t *src)
+{
+	rte_copy32_ts(dst + 0 * 32, src + 0 * 32);
+	rte_copy32_ts(dst + 1 * 32, src + 1 * 32);
+}
+
+/**
+ * Copy 128 bytes from one location to another,
+ * with temporal stores
+ */
+static __rte_always_inline void
+rte_copy128_ts(uint8_t *dst, uint8_t *src)
+{
+	rte_copy32_ts(dst + 0 * 32, src + 0 * 32);
+	rte_copy32_ts(dst + 1 * 32, src + 1 * 32);
+	rte_copy32_ts(dst + 2 * 32, src + 2 * 32);
+	rte_copy32_ts(dst + 3 * 32, src + 3 * 32);
+}
+
+/**
+ * Copy len bytes from one location to another,
+ * with temporal stores 16B aligned
+ */
+static __rte_always_inline void *
+rte_memcpy_aligned_tstore16_generic(void *dst, void *src, int len)
+{
+	void *dest = dst;
+
+	while (len >= 128) {
+		rte_copy128_ts((uint8_t *)dst, (uint8_t *)src);
+		dst = (uint8_t *)dst + 128;
+		src = (uint8_t *)src + 128;
+		len -= 128;
+	}
+	while (len >= 64) {
+		rte_copy64_ts((uint8_t *)dst, (uint8_t *)src);
+		dst = (uint8_t *)dst + 64;
+		src = (uint8_t *)src + 64;
+		len -= 64;
+	}
+	while (len >= 32) {
+		rte_copy32_ts((uint8_t *)dst, (uint8_t *)src);
+		dst = (uint8_t *)dst + 32;
+		src = (uint8_t *)src + 32;
+		len -= 32;
+	}
+	if (len >= 16) {
+		rte_copy16_ts((uint8_t *)dst, (uint8_t *)src);
+		dst = (uint8_t *)dst + 16;
+		src = (uint8_t *)src + 16;
+		len -= 16;
+	}
+	if (len >= 8) {
+		*(uint64_t *)dst = *(const uint64_t *)src;
+		dst = (uint8_t *)dst + 8;
+		src = (uint8_t *)src + 8;
+		len -= 8;
+	}
+	if (len >= 4) {
+		*(uint32_t *)dst = *(const uint32_t *)src;
+		dst = (uint8_t *)dst + 4;
+		src = (uint8_t *)src + 4;
+		len -= 4;
+	}
+	if (len != 0) {
+		dst = (uint8_t *)dst - (4 - len);
+		src = (uint8_t *)src - (4 - len);
+		*(uint32_t *)dst = *(const uint32_t *)src;
+	}
+
+	return dest;
+}
+
+static __rte_always_inline void *
+rte_memcpy_aligned_ntload_tstore16_amdepyc2(void *dst,
+					    const void *src,
+					    size_t size)
+{
+	asm volatile goto("movq %0, %%rsi\n\t"
+	"movq %1, %%rdi\n\t"
+	"movq %2, %%rdx\n\t"
+	"cmpq   $(128), %%rdx\n\t"
+	"jb     202f\n\t"
+	"201:\n\t"
+	"vmovntdqa (%%rsi), %%ymm0\n\t"
+	"vmovntdqa 32(%%rsi), %%ymm1\n\t"
+	"vmovntdqa 64(%%rsi), %%ymm2\n\t"
+	"vmovntdqa 96(%%rsi), %%ymm3\n\t"
+	"vmovdqu  %%ymm0, (%%rdi)\n\t"
+	"vmovdqu  %%ymm1, 32(%%rdi)\n\t"
+	"vmovdqu  %%ymm2, 64(%%rdi)\n\t"
+	"vmovdqu  %%ymm3, 96(%%rdi)\n\t"
+	"addq   $128, %%rsi\n\t"
+	"addq   $128, %%rdi\n\t"
+	"subq   $128, %%rdx\n\t"
+	"jz     %l[done]\n\t"
+	"cmpq   $128, %%rdx\n\t" /*Vector Size 32B.  */
+	"jae    201b\n\t"
+	"202:\n\t"
+	"cmpq   $64, %%rdx\n\t"
+	"jb     203f\n\t"
+	"vmovntdqa (%%rsi), %%ymm0\n\t"
+	"vmovntdqa 32(%%rsi), %%ymm1\n\t"
+	"vmovdqu  %%ymm0, (%%rdi)\n\t"
+	"vmovdqu  %%ymm1, 32(%%rdi)\n\t"
+	"addq   $64, %%rsi\n\t"
+	"addq   $64, %%rdi\n\t"
+	"subq   $64, %%rdx\n\t"
+	"jz     %l[done]\n\t"
+	"203:\n\t"
+	"cmpq   $32, %%rdx\n\t"
+	"jb     204f\n\t"
+	"vmovntdqa (%%rsi), %%ymm0\n\t"
+	"vmovdqu  %%ymm0, (%%rdi)\n\t"
+	"addq   $32, %%rsi\n\t"
+	"addq   $32, %%rdi\n\t"
+	"subq   $32, %%rdx\n\t"
+	"jz     %l[done]\n\t"
+	"204:\n\t"
+	"cmpb   $16, %%dl\n\t"
+	"jb     205f\n\t"
+	"vmovntdqa (%%rsi), %%xmm0\n\t"
+	"vmovdqu  %%xmm0, (%%rdi)\n\t"
+	"addq   $16, %%rsi\n\t"
+	"addq   $16, %%rdi\n\t"
+	"subq   $16, %%rdx\n\t"
+	"jz     %l[done]\n\t"
+	"205:\n\t"
+	"cmpb   $2, %%dl\n\t"
+	"jb     208f\n\t"
+	"cmpb   $4, %%dl\n\t"
+	"jbe    207f\n\t"
+	"cmpb   $8, %%dl\n\t"
+	"jbe    206f\n\t"
+	"movq   -8(%%rsi,%%rdx), %%rcx\n\t"
+	"movq   (%%rsi), %%rsi\n\t"
+	"movq   %%rcx, -8(%%rdi,%%rdx)\n\t"
+	"movq   %%rsi, (%%rdi)\n\t"
+	"jmp    %l[done]\n\t"
+	"206:\n\t"
+	"movl   -4(%%rsi,%%rdx), %%ecx\n\t"
+	"movl   (%%rsi), %%esi\n\t"
+	"movl   %%ecx, -4(%%rdi,%%rdx)\n\t"
+	"movl   %%esi, (%%rdi)\n\t"
+	"jmp    %l[done]\n\t"
+	"207:\n\t"
+	"movzwl -2(%%rsi,%%rdx), %%ecx\n\t"
+	"movzwl (%%rsi), %%esi\n\t"
+	"movw   %%cx, -2(%%rdi,%%rdx)\n\t"
+	"movw   %%si, (%%rdi)\n\t"
+	"jmp    %l[done]\n\t"
+	"208:\n\t"
+	"movzbl (%%rsi), %%ecx\n\t"
+	"movb   %%cl, (%%rdi)"
+	:
+	: "r"(src), "r"(dst), "r"(size)
+	: "rcx", "rdx", "rsi", "rdi", "ymm0", "ymm1", "ymm2", "ymm3", "memory"
+	: done
+	);
+done:
+	return dst;
+}
+
+static __rte_always_inline void *
+rte_memcpy_generic(void *dst, const void *src, size_t len)
+{
+	asm goto("movq	%0, %%rsi\n\t"
+	"movq	%1, %%rdi\n\t"
+	"movq	%2, %%rdx\n\t"
+	"movq    %%rdi, %%rax\n\t"
+	"cmp     $32, %%rdx\n\t"
+	"jb      101f\n\t"
+	"cmp     $(32 * 2), %%rdx\n\t"
+	"ja      108f\n\t"
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   -32(%%rsi,%%rdx), %%ymm1\n\t"
+	"vmovdqu   %%ymm0, (%%rdi)\n\t"
+	"vmovdqu   %%ymm1, -32(%%rdi,%%rdx)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	"101:\n\t"
+	/* Less than 1 VEC.  */
+	"cmpb    $32, %%dl\n\t"
+	"jae     103f\n\t"
+	"cmpb    $16, %%dl\n\t"
+	"jae     104f\n\t"
+	"cmpb    $8, %%dl\n\t"
+	"jae     105f\n\t"
+	"cmpb    $4, %%dl\n\t"
+	"jae     106f\n\t"
+	"cmpb    $1, %%dl\n\t"
+	"ja      107f\n\t"
+	"jb      102f\n\t"
+	"movzbl  (%%rsi), %%ecx\n\t"
+	"movb    %%cl, (%%rdi)\n\t"
+	"102:\n\t"
+	"jmp %l[done]\n\t"
+	"103:\n\t"
+	/* From 32 to 63.  No branch when size == 32.  */
+	"vmovdqu (%%rsi), %%ymm0\n\t"
+	"vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
+	"vmovdqu %%ymm0, (%%rdi)\n\t"
+	"vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	/* From 16 to 31.  No branch when size == 16.  */
+	"104:\n\t"
+	"vmovdqu (%%rsi), %%xmm0\n\t"
+	"vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
+	"vmovdqu %%xmm0, (%%rdi)\n\t"
+	"vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
+	"jmp %l[done]\n\t"
+	"105:\n\t"
+	/* From 8 to 15.  No branch when size == 8.  */
+	"movq    -8(%%rsi,%%rdx), %%rcx\n\t"
+	"movq    (%%rsi), %%rsi\n\t"
+	"movq    %%rcx, -8(%%rdi,%%rdx)\n\t"
+	"movq    %%rsi, (%%rdi)\n\t"
+	"jmp %l[done]\n\t"
+	"106:\n\t"
+	/* From 4 to 7.  No branch when size == 4.  */
+	"movl    -4(%%rsi,%%rdx), %%ecx\n\t"
+	"movl    (%%rsi), %%esi\n\t"
+	"movl    %%ecx, -4(%%rdi,%%rdx)\n\t"
+	"movl    %%esi, (%%rdi)\n\t"
+	"jmp %l[done]\n\t"
+	"107:\n\t"
+	/* From 2 to 3.  No branch when size == 2.  */
+	"movzwl  -2(%%rsi,%%rdx), %%ecx\n\t"
+	"movzwl  (%%rsi), %%esi\n\t"
+	"movw    %%cx, -2(%%rdi,%%rdx)\n\t"
+	"movw    %%si, (%%rdi)\n\t"
+	"jmp %l[done]\n\t"
+	"108:\n\t"
+	/* More than 2 * VEC and there may be overlap between destination */
+	/* and source.  */
+	"cmpq    $(32 * 8), %%rdx\n\t"
+	"ja      111f\n\t"
+	"cmpq    $(32 * 4), %%rdx\n\t"
+	"jb      109f\n\t"
+	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   32(%%rsi), %%ymm1\n\t"
+	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
+	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
+	"vmovdqu   -32(%%rsi,%%rdx), %%ymm4\n\t"
+	"vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
+	"vmovdqu   -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
+	"vmovdqu   -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
+	"vmovdqu   %%ymm0, (%%rdi)\n\t"
+	"vmovdqu   %%ymm1, 32(%%rdi)\n\t"
+	"vmovdqu   %%ymm2, (32 * 2)(%%rdi)\n\t"
+	"vmovdqu   %%ymm3, (32 * 3)(%%rdi)\n\t"
+	"vmovdqu   %%ymm4, -32(%%rdi,%%rdx)\n\t"
+	"vmovdqu   %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
+	"vmovdqu   %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
+	"vmovdqu   %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	"109:\n\t"
+	/* Copy from 2 * VEC to 4 * VEC. */
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   32(%%rsi), %%ymm1\n\t"
+	"vmovdqu   -32(%%rsi,%%rdx), %%ymm2\n\t"
+	"vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
+	"vmovdqu   %%ymm0, (%%rdi)\n\t"
+	"vmovdqu   %%ymm1, 32(%%rdi)\n\t"
+	"vmovdqu   %%ymm2, -32(%%rdi,%%rdx)\n\t"
+	"vmovdqu   %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
+	"vzeroupper\n\t"
+	"110:\n\t"
+	"jmp %l[done]\n\t"
+	"111:\n\t"
+	"cmpq    %%rsi, %%rdi\n\t"
+	"ja      113f\n\t"
+	/* Source == destination is less common.  */
+	"je      110b\n\t"
+	/* Load the first VEC and last 4 * VEC to
+	 * support overlapping addresses.
+	 */
+	"vmovdqu   (%%rsi), %%ymm4\n\t"
+	"vmovdqu   -32(%%rsi, %%rdx), %%ymm5\n\t"
+	"vmovdqu   -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
+	"vmovdqu   -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
+	"vmovdqu   -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
+	/* Save start and stop of the destination buffer.  */
+	"movq    %%rdi, %%r11\n\t"
+	"leaq    -32(%%rdi, %%rdx), %%rcx\n\t"
+	/* Align destination for aligned stores in the loop.  Compute */
+	/* how much destination is misaligned.  */
+	"movq    %%rdi, %%r8\n\t"
+	"andq    $(32 - 1), %%r8\n\t"
+	/* Get the negative of offset for alignment.  */
+	"subq    $32, %%r8\n\t"
+	/* Adjust source.  */
+	"subq    %%r8, %%rsi\n\t"
+	/* Adjust destination which should be aligned now.  */
+	"subq    %%r8, %%rdi\n\t"
+	/* Adjust length.  */
+	"addq    %%r8, %%rdx\n\t"
+	/* Check non-temporal store threshold.  */
+	"cmpq	 $(1024*1024), %%rdx\n\t"
+	"ja      115f\n\t"
+	"112:\n\t"
+	/* Copy 4 * VEC a time forward.  */
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   32(%%rsi), %%ymm1\n\t"
+	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
+	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
+	"addq    $(32 * 4), %%rsi\n\t"
+	"subq    $(32 * 4), %%rdx\n\t"
+	"vmovdqa   %%ymm0, (%%rdi)\n\t"
+	"vmovdqa   %%ymm1, 32(%%rdi)\n\t"
+	"vmovdqa   %%ymm2, (32 * 2)(%%rdi)\n\t"
+	"vmovdqa   %%ymm3, (32 * 3)(%%rdi)\n\t"
+	"addq    $(32 * 4), %%rdi\n\t"
+	"cmpq    $(32 * 4), %%rdx\n\t"
+	"ja      112b\n\t"
+	/* Store the last 4 * VEC.  */
+	"vmovdqu   %%ymm5, (%%rcx)\n\t"
+	"vmovdqu   %%ymm6, -32(%%rcx)\n\t"
+	"vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
+	"vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
+	/* Store the first VEC.  */
+	"vmovdqu   %%ymm4, (%%r11)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	"113:\n\t"
+	/* Load the first 4*VEC and last VEC to support overlapping addresses.*/
+	"vmovdqu   (%%rsi), %%ymm4\n\t"
+	"vmovdqu   32(%%rsi), %%ymm5\n\t"
+	"vmovdqu   (32 * 2)(%%rsi), %%ymm6\n\t"
+	"vmovdqu   (32 * 3)(%%rsi), %%ymm7\n\t"
+	"vmovdqu   -32(%%rsi,%%rdx), %%ymm8\n\t"
+	/* Save stop of the destination buffer.  */
+	"leaq    -32(%%rdi, %%rdx), %%r11\n\t"
+	/* Align destination end for aligned stores in the loop.  Compute */
+	/* how much destination end is misaligned.  */
+	"leaq    -32(%%rsi, %%rdx), %%rcx\n\t"
+	"movq    %%r11, %%r9\n\t"
+	"movq    %%r11, %%r8\n\t"
+	"andq    $(32 - 1), %%r8\n\t"
+	/* Adjust source.  */
+	"subq    %%r8, %%rcx\n\t"
+	/* Adjust the end of destination which should be aligned now.  */
+	"subq    %%r8, %%r9\n\t"
+	/* Adjust length.  */
+	"subq    %%r8, %%rdx\n\t"
+	 /* Check non-temporal store threshold.  */
+	"cmpq	 $(1024*1024), %%rdx\n\t"
+	"ja      117f\n\t"
+	"114:\n\t"
+	/* Copy 4 * VEC a time backward.  */
+	"vmovdqu   (%%rcx), %%ymm0\n\t"
+	"vmovdqu   -32(%%rcx), %%ymm1\n\t"
+	"vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
+	"vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
+	"subq    $(32 * 4), %%rcx\n\t"
+	"subq    $(32 * 4), %%rdx\n\t"
+	"vmovdqa   %%ymm0, (%%r9)\n\t"
+	"vmovdqa   %%ymm1, -32(%%r9)\n\t"
+	"vmovdqa   %%ymm2, -(32 * 2)(%%r9)\n\t"
+	"vmovdqa   %%ymm3, -(32 * 3)(%%r9)\n\t"
+	"subq    $(32 * 4), %%r9\n\t"
+	"cmpq    $(32 * 4), %%rdx\n\t"
+	"ja      114b\n\t"
+	/* Store the first 4 * VEC. */
+	"vmovdqu   %%ymm4, (%%rdi)\n\t"
+	"vmovdqu   %%ymm5, 32(%%rdi)\n\t"
+	"vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
+	"vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
+	/* Store the last VEC. */
+	"vmovdqu   %%ymm8, (%%r11)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+
+	"115:\n\t"
+	/* Don't use non-temporal store if there is overlap between */
+	/* destination and source since destination may be in cache */
+	/* when source is loaded. */
+	"leaq    (%%rdi, %%rdx), %%r10\n\t"
+	"cmpq    %%r10, %%rsi\n\t"
+	"jb      112b\n\t"
+	"116:\n\t"
+	/* Copy 4 * VEC a time forward with non-temporal stores.  */
+	"prefetcht0 (32*4*2)(%%rsi)\n\t"
+	"prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
+	"prefetcht0 (32*4*3)(%%rsi)\n\t"
+	"prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   32(%%rsi), %%ymm1\n\t"
+	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
+	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
+	"addq    $(32*4), %%rsi\n\t"
+	"subq    $(32*4), %%rdx\n\t"
+	"vmovntdq  %%ymm0, (%%rdi)\n\t"
+	"vmovntdq  %%ymm1, 32(%%rdi)\n\t"
+	"vmovntdq  %%ymm2, (32 * 2)(%%rdi)\n\t"
+	"vmovntdq  %%ymm3, (32 * 3)(%%rdi)\n\t"
+	"addq    $(32*4), %%rdi\n\t"
+	"cmpq    $(32*4), %%rdx\n\t"
+	"ja      116b\n\t"
+	"sfence\n\t"
+	/* Store the last 4 * VEC.  */
+	"vmovdqu   %%ymm5, (%%rcx)\n\t"
+	"vmovdqu   %%ymm6, -32(%%rcx)\n\t"
+	"vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
+	"vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
+	/* Store the first VEC.  */
+	"vmovdqu   %%ymm4, (%%r11)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	"117:\n\t"
+	/* Don't use non-temporal store if there is overlap between */
+	/* destination and source since destination may be in cache */
+	/* when source is loaded.  */
+	"leaq    (%%rcx, %%rdx), %%r10\n\t"
+	"cmpq    %%r10, %%r9\n\t"
+	"jb      114b\n\t"
+	"118:\n\t"
+	/* Copy 4 * VEC a time backward with non-temporal stores. */
+	"prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
+	"prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
+	"prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
+	"prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
+	"vmovdqu   (%%rcx), %%ymm0\n\t"
+	"vmovdqu   -32(%%rcx), %%ymm1\n\t"
+	"vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
+	"vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
+	"subq    $(32*4), %%rcx\n\t"
+	"subq    $(32*4), %%rdx\n\t"
+	"vmovntdq  %%ymm0, (%%r9)\n\t"
+	"vmovntdq  %%ymm1, -32(%%r9)\n\t"
+	"vmovntdq  %%ymm2, -(32 * 2)(%%r9)\n\t"
+	"vmovntdq  %%ymm3, -(32 * 3)(%%r9)\n\t"
+	"subq    $(32 * 4), %%r9\n\t"
+	"cmpq    $(32 * 4), %%rdx\n\t"
+	"ja      118b\n\t"
+	"sfence\n\t"
+	/* Store the first 4 * VEC.  */
+	"vmovdqu   %%ymm4, (%%rdi)\n\t"
+	"vmovdqu   %%ymm5, 32(%%rdi)\n\t"
+	"vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
+	"vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
+	/* Store the last VEC.  */
+	"vmovdqu   %%ymm8, (%%r11)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]"
+	:
+	: "r"(src), "r"(dst), "r"(len)
+	: "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "ymm0",
+	"ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
+	: done
+	);
+done:
+	return dst;
+}
+
+#else
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
@@ -479,6 +971,8 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	goto COPY_BLOCK_128_BACK31;
 }
 
+#endif /* RTE_MEMCPY_AMDEPYC2 */
+
 #else /* __AVX512F__ */
 
 #define ALIGNMENT_MASK 0x0F
@@ -874,6 +1368,14 @@ rte_memcpy(void *dst, const void *src, size_t n)
 		return rte_memcpy_generic(dst, src, n);
 }
 
+#if defined __AVX2__ && defined(RTE_MEMCPY_AMDEPYC2)
+static __rte_always_inline void *
+rte_memcpy_aligned_tstore16(void *dst, void *src, int len)
+{
+	return rte_memcpy_aligned_ntload_tstore16_amdepyc2(dst, src, len);
+}
+#endif
+
 #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
 #pragma GCC diagnostic pop
 #endif
diff --git a/meson_options.txt b/meson_options.txt
index 0e92734c49..e232c9c340 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -42,6 +42,8 @@ option('platform', type: 'string', value: 'native', description:
        'Platform to build, either "native", "generic" or a SoC. Please refer to the Linux build guide for more information.')
 option('enable_trace_fp', type: 'boolean', value: false, description:
        'enable fast path trace points.')
+option('rte_memcpy_amdepyc2', type: 'boolean', value: false, description:
+       'to enable amd epyc memcpy routines')
 option('tests', type: 'boolean', value: true, description:
        'build unit tests')
 option('use_hpet', type: 'boolean', value: false, description:
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [dpdk-dev] [PATCH 2/2] net/mlx5: optimize mprq memcpy for AMD EPYC2 platforms
  2021-08-23  8:44 [dpdk-dev] [PATCH 1/2] lib/eal: add amd epyc2 memcpy routine to eal Aman Kumar
@ 2021-08-23  8:44 ` Aman Kumar
  2021-10-13 16:53   ` Thomas Monjalon
  2021-08-23 15:21 ` [dpdk-dev] [PATCH 1/2] lib/eal: add amd epyc2 memcpy routine to eal Jerin Jacob
  2021-10-19 10:47 ` [dpdk-dev] [PATCH v2 " Aman Kumar
  2 siblings, 1 reply; 43+ messages in thread
From: Aman Kumar @ 2021-08-23  8:44 UTC (permalink / raw)
  To: dev
  Cc: rasland, asafp, shys, viacheslavo, akozyrev, matan,
	anatoly.burakov, keesang.song, aman.kumar

Add non-temporal load and temporal store support for
MPRQ memcpy. Set the 'mlx5_ntload_tstore' option in the
meson build configuration to enable this optimization.
This utilizes the AMD EPYC2-optimized rte_memcpy* routines.

Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
---
 drivers/net/mlx5/meson.build |  1 +
 drivers/net/mlx5/mlx5.c      | 12 ++++++++++++
 drivers/net/mlx5/mlx5.h      |  3 +++
 drivers/net/mlx5/mlx5_rx.h   | 24 ++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_rxq.c  |  4 ++++
 meson_options.txt            |  2 ++
 6 files changed, 46 insertions(+)

diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
index dac7f1fabf..0d2888742c 100644
--- a/drivers/net/mlx5/meson.build
+++ b/drivers/net/mlx5/meson.build
@@ -61,6 +61,7 @@ foreach option:cflags_options
         cflags += option
     endif
 endforeach
+dpdk_conf.set('RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY', get_option('mlx5_ntload_tstore'))
 if get_option('buildtype').contains('debug')
     cflags += [ '-pedantic', '-DPEDANTIC' ]
 else
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index f84e061fe7..cf57867c25 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -161,6 +161,11 @@
 /* Configure timeout of LRO session (in microseconds). */
 #define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"
 
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+/* mprq_tstore_memcpy */
+#define MLX5_MPRQ_TSTORE_MEMCPY "mprq_tstore_memcpy"
+#endif
+
 /*
  * Device parameter to configure the total data buffer size for a single
  * hairpin queue (logarithm value).
@@ -1991,6 +1996,10 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 		config->decap_en = !!tmp;
 	} else if (strcmp(MLX5_ALLOW_DUPLICATE_PATTERN, key) == 0) {
 		config->allow_duplicate_pattern = !!tmp;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	} else if (strcmp(MLX5_MPRQ_TSTORE_MEMCPY, key) == 0) {
+		config->mprq_tstore_memcpy = tmp;
+#endif
 	} else {
 		DRV_LOG(WARNING, "%s: unknown parameter", key);
 		rte_errno = EINVAL;
@@ -2051,6 +2060,9 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
 		MLX5_SYS_MEM_EN,
 		MLX5_DECAP_EN,
 		MLX5_ALLOW_DUPLICATE_PATTERN,
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+		MLX5_MPRQ_TSTORE_MEMCPY,
+#endif
 		NULL,
 	};
 	struct rte_kvargs *kvlist;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index e02714e231..7d5617f5ca 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -298,6 +298,9 @@ struct mlx5_dev_config {
 	int tx_skew; /* Tx scheduling skew between WQE and data on wire. */
 	struct mlx5_hca_attr hca_attr; /* HCA attributes. */
 	struct mlx5_lro_config lro; /* LRO configuration. */
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	unsigned int mprq_tstore_memcpy:1;
+#endif
 };
 
 
diff --git a/drivers/net/mlx5/mlx5_rx.h b/drivers/net/mlx5/mlx5_rx.h
index 3f2b99fb65..19318bdd1b 100644
--- a/drivers/net/mlx5/mlx5_rx.h
+++ b/drivers/net/mlx5/mlx5_rx.h
@@ -148,6 +148,9 @@ struct mlx5_rxq_data {
 	uint32_t rxseg_n; /* Number of split segment descriptions. */
 	struct mlx5_eth_rxseg rxseg[MLX5_MAX_RXQ_NSEG];
 	/* Buffer split segment descriptions - sizes, offsets, pools. */
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	unsigned int mprq_tstore_memcpy:1;
+#endif
 } __rte_cache_aligned;
 
 enum mlx5_rxq_type {
@@ -422,6 +425,15 @@ mprq_buf_to_pkt(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, uint32_t len,
 	const uint32_t offset = strd_idx * strd_sz + strd_shift;
 	void *addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
 
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	if (unlikely(!rxq->mprq_tstore_memcpy) &&
+	    len <= rxq->mprq_max_memcpy_len) {
+		rte_prefetch1(addr);
+		if (len > RTE_CACHE_LINE_SIZE)
+			rte_prefetch2((void *)((uintptr_t)addr +
+					       RTE_CACHE_LINE_SIZE));
+	}
+#endif
 	/*
 	 * Memcpy packets to the target mbuf if:
 	 * - The size of packet is smaller than mprq_max_memcpy_len.
@@ -433,8 +445,20 @@ mprq_buf_to_pkt(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, uint32_t len,
 	    (hdrm_overlap > 0 && !rxq->strd_scatter_en)) {
 		if (likely(len <=
 			   (uint32_t)(pkt->buf_len - RTE_PKTMBUF_HEADROOM))) {
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+			uintptr_t data_addr;
+
+			data_addr = (uintptr_t)rte_pktmbuf_mtod(pkt, void *);
+			if (!((data_addr | (uintptr_t)addr) & ALIGNMENT_MASK) &&
+				rxq->mprq_tstore_memcpy)
+				rte_memcpy_aligned_tstore16((void *)data_addr,
+					   addr, len);
+			else
+				rte_memcpy((void *)data_addr, addr, len);
+#else
 			rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
 				   addr, len);
+#endif
 			DATA_LEN(pkt) = len;
 		} else if (rxq->strd_scatter_en) {
 			struct rte_mbuf *prev = pkt;
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index abd8ce7989..a1b0fa6455 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1449,6 +1449,10 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	tmpl->socket = socket;
 	if (dev->data->dev_conf.intr_conf.rxq)
 		tmpl->irq = 1;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	tmpl->rxq.mprq_tstore_memcpy = config->mprq_tstore_memcpy;
+#endif
+
 	/*
 	 * This Rx queue can be configured as a Multi-Packet RQ if all of the
 	 * following conditions are met:
diff --git a/meson_options.txt b/meson_options.txt
index e232c9c340..cc7b629d17 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -38,6 +38,8 @@ option('max_lcores', type: 'integer', value: 128, description:
        'maximum number of cores/threads supported by EAL')
 option('max_numa_nodes', type: 'integer', value: 32, description:
        'maximum number of NUMA nodes supported by EAL')
+option('mlx5_ntload_tstore', type: 'boolean', value: false, description:
+       'to enable optimized MPRQ in RX datapath')
 option('platform', type: 'string', value: 'native', description:
        'Platform to build, either "native", "generic" or a SoC. Please refer to the Linux build guide for more information.')
 option('enable_trace_fp', type: 'boolean', value: false, description:
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH 1/2] lib/eal: add amd epyc2 memcpy routine to eal
  2021-08-23  8:44 [dpdk-dev] [PATCH 1/2] lib/eal: add amd epyc2 memcpy routine to eal Aman Kumar
  2021-08-23  8:44 ` [dpdk-dev] [PATCH 2/2] net/mlx5: optimize mprq memcpy for AMD EPYC2 platforms Aman Kumar
@ 2021-08-23 15:21 ` Jerin Jacob
  2021-08-30  9:39   ` Aman Kumar
  2021-10-19 10:47 ` [dpdk-dev] [PATCH v2 " Aman Kumar
  2 siblings, 1 reply; 43+ messages in thread
From: Jerin Jacob @ 2021-08-23 15:21 UTC (permalink / raw)
  To: Aman Kumar
  Cc: dpdk-dev, Raslan Darawsheh, Asaf Penso, shys,
	Viacheslav Ovsiienko, Alexander Kozyrev, Matan Azrad,
	Anatoly Burakov, keesang.song

On Mon, Aug 23, 2021 at 2:14 PM Aman Kumar <aman.kumar@vvdntech.in> wrote:
>
> This patch provides rte_memcpy* calls optimized for
> AMD EPYC Gen2 platforms. This option is disabled by
> default and can be enabled by defining 'rte_memcpy_amdepyc2'

Generic options should support all the architectures.
Another, more scalable alternative is to introduce a config/x86/x86_amd_epyc2
cross file with a new parameter under [properties].


> in the meson build.
>
> Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
> ---
>  lib/eal/x86/include/meson.build  |   1 +
>  lib/eal/x86/include/rte_memcpy.h | 502 +++++++++++++++++++++++++++++++
>  meson_options.txt                |   2 +
>  3 files changed, 505 insertions(+)
>
> diff --git a/lib/eal/x86/include/meson.build b/lib/eal/x86/include/meson.build
> index 12c2e00035..a03683779d 100644
> --- a/lib/eal/x86/include/meson.build
> +++ b/lib/eal/x86/include/meson.build
> @@ -27,3 +27,4 @@ arch_indirect_headers = files(
>  )
>  install_headers(arch_headers + arch_indirect_headers, subdir: get_option('include_subdir_arch'))
>  dpdk_chkinc_headers += arch_headers
> +dpdk_conf.set('RTE_MEMCPY_AMDEPYC2', get_option('rte_memcpy_amdepyc2'))
> diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
> index 79f381dd9b..47dda9cb87 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h
> @@ -368,6 +368,498 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
>         }
>  }
>
> +#if defined RTE_MEMCPY_AMDEPYC2
> +
> +/**
> + * Copy 16 bytes from one location to another,
> + * with temporal stores
> + */
> +static __rte_always_inline void
> +rte_copy16_ts(uint8_t *dst, uint8_t *src)
> +{
> +       __m128i var128;
> +
> +       var128 = _mm_stream_load_si128((__m128i *)src);
> +       _mm_storeu_si128((__m128i *)dst, var128);
> +}
> +
> +/**
> + * Copy 32 bytes from one location to another,
> + * with temporal stores
> + */
> +static __rte_always_inline void
> +rte_copy32_ts(uint8_t *dst, uint8_t *src)
> +{
> +       __m256i ymm0;
> +
> +       ymm0 = _mm256_stream_load_si256((const __m256i *)src);
> +       _mm256_storeu_si256((__m256i *)dst, ymm0);
> +}
> +
> +/**
> + * Copy 64 bytes from one location to another,
> + * with temporal stores
> + */
> +static __rte_always_inline void
> +rte_copy64_ts(uint8_t *dst, uint8_t *src)
> +{
> +       rte_copy32_ts(dst + 0 * 32, src + 0 * 32);
> +       rte_copy32_ts(dst + 1 * 32, src + 1 * 32);
> +}
> +
> +/**
> + * Copy 128 bytes from one location to another,
> + * with temporal stores
> + */
> +static __rte_always_inline void
> +rte_copy128_ts(uint8_t *dst, uint8_t *src)
> +{
> +       rte_copy32_ts(dst + 0 * 32, src + 0 * 32);
> +       rte_copy32_ts(dst + 1 * 32, src + 1 * 32);
> +       rte_copy32_ts(dst + 2 * 32, src + 2 * 32);
> +       rte_copy32_ts(dst + 3 * 32, src + 3 * 32);
> +}
> +
> +/**
> + * Copy len bytes from one location to another,
> + * with temporal stores 16B aligned
> + */
> +static __rte_always_inline void *
> +rte_memcpy_aligned_tstore16_generic(void *dst, void *src, int len)
> +{
> +       void *dest = dst;
> +
> +       while (len >= 128) {
> +               rte_copy128_ts((uint8_t *)dst, (uint8_t *)src);
> +               dst = (uint8_t *)dst + 128;
> +               src = (uint8_t *)src + 128;
> +               len -= 128;
> +       }
> +       while (len >= 64) {
> +               rte_copy64_ts((uint8_t *)dst, (uint8_t *)src);
> +               dst = (uint8_t *)dst + 64;
> +               src = (uint8_t *)src + 64;
> +               len -= 64;
> +       }
> +       while (len >= 32) {
> +               rte_copy32_ts((uint8_t *)dst, (uint8_t *)src);
> +               dst = (uint8_t *)dst + 32;
> +               src = (uint8_t *)src + 32;
> +               len -= 32;
> +       }
> +       if (len >= 16) {
> +               rte_copy16_ts((uint8_t *)dst, (uint8_t *)src);
> +               dst = (uint8_t *)dst + 16;
> +               src = (uint8_t *)src + 16;
> +               len -= 16;
> +       }
> +       if (len >= 8) {
> +               *(uint64_t *)dst = *(const uint64_t *)src;
> +               dst = (uint8_t *)dst + 8;
> +               src = (uint8_t *)src + 8;
> +               len -= 8;
> +       }
> +       if (len >= 4) {
> +               *(uint32_t *)dst = *(const uint32_t *)src;
> +               dst = (uint8_t *)dst + 4;
> +               src = (uint8_t *)src + 4;
> +               len -= 4;
> +       }
> +       if (len != 0) {
> +               dst = (uint8_t *)dst - (4 - len);
> +               src = (uint8_t *)src - (4 - len);
> +               *(uint32_t *)dst = *(const uint32_t *)src;
> +       }
> +
> +       return dest;
> +}
> +
> +static __rte_always_inline void *
> +rte_memcpy_aligned_ntload_tstore16_amdepyc2(void *dst,
> +                                           const void *src,
> +                                           size_t size)
> +{
> +       asm volatile goto("movq %0, %%rsi\n\t"
> +       "movq %1, %%rdi\n\t"
> +       "movq %2, %%rdx\n\t"
> +       "cmpq   $(128), %%rdx\n\t"
> +       "jb     202f\n\t"
> +       "201:\n\t"
> +       "vmovntdqa (%%rsi), %%ymm0\n\t"
> +       "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> +       "vmovntdqa 64(%%rsi), %%ymm2\n\t"
> +       "vmovntdqa 96(%%rsi), %%ymm3\n\t"
> +       "vmovdqu  %%ymm0, (%%rdi)\n\t"
> +       "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> +       "vmovdqu  %%ymm2, 64(%%rdi)\n\t"
> +       "vmovdqu  %%ymm3, 96(%%rdi)\n\t"
> +       "addq   $128, %%rsi\n\t"
> +       "addq   $128, %%rdi\n\t"
> +       "subq   $128, %%rdx\n\t"
> +       "jz     %l[done]\n\t"
> +       "cmpq   $128, %%rdx\n\t" /*Vector Size 32B.  */
> +       "jae    201b\n\t"
> +       "202:\n\t"
> +       "cmpq   $64, %%rdx\n\t"
> +       "jb     203f\n\t"
> +       "vmovntdqa (%%rsi), %%ymm0\n\t"
> +       "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> +       "vmovdqu  %%ymm0, (%%rdi)\n\t"
> +       "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> +       "addq   $64, %%rsi\n\t"
> +       "addq   $64, %%rdi\n\t"
> +       "subq   $64, %%rdx\n\t"
> +       "jz     %l[done]\n\t"
> +       "203:\n\t"
> +       "cmpq   $32, %%rdx\n\t"
> +       "jb     204f\n\t"
> +       "vmovntdqa (%%rsi), %%ymm0\n\t"
> +       "vmovdqu  %%ymm0, (%%rdi)\n\t"
> +       "addq   $32, %%rsi\n\t"
> +       "addq   $32, %%rdi\n\t"
> +       "subq   $32, %%rdx\n\t"
> +       "jz     %l[done]\n\t"
> +       "204:\n\t"
> +       "cmpb   $16, %%dl\n\t"
> +       "jb     205f\n\t"
> +       "vmovntdqa (%%rsi), %%xmm0\n\t"
> +       "vmovdqu  %%xmm0, (%%rdi)\n\t"
> +       "addq   $16, %%rsi\n\t"
> +       "addq   $16, %%rdi\n\t"
> +       "subq   $16, %%rdx\n\t"
> +       "jz     %l[done]\n\t"
> +       "205:\n\t"
> +       "cmpb   $2, %%dl\n\t"
> +       "jb     208f\n\t"
> +       "cmpb   $4, %%dl\n\t"
> +       "jbe    207f\n\t"
> +       "cmpb   $8, %%dl\n\t"
> +       "jbe    206f\n\t"
> +       "movq   -8(%%rsi,%%rdx), %%rcx\n\t"
> +       "movq   (%%rsi), %%rsi\n\t"
> +       "movq   %%rcx, -8(%%rdi,%%rdx)\n\t"
> +       "movq   %%rsi, (%%rdi)\n\t"
> +       "jmp    %l[done]\n\t"
> +       "206:\n\t"
> +       "movl   -4(%%rsi,%%rdx), %%ecx\n\t"
> +       "movl   (%%rsi), %%esi\n\t"
> +       "movl   %%ecx, -4(%%rdi,%%rdx)\n\t"
> +       "movl   %%esi, (%%rdi)\n\t"
> +       "jmp    %l[done]\n\t"
> +       "207:\n\t"
> +       "movzwl -2(%%rsi,%%rdx), %%ecx\n\t"
> +       "movzwl (%%rsi), %%esi\n\t"
> +       "movw   %%cx, -2(%%rdi,%%rdx)\n\t"
> +       "movw   %%si, (%%rdi)\n\t"
> +       "jmp    %l[done]\n\t"
> +       "208:\n\t"
> +       "movzbl (%%rsi), %%ecx\n\t"
> +       "movb   %%cl, (%%rdi)"
> +       :
> +       : "r"(src), "r"(dst), "r"(size)
> +       : "rcx", "rdx", "rsi", "rdi", "ymm0", "ymm1", "ymm2", "ymm3", "memory"
> +       : done
> +       );
> +done:
> +       return dst;
> +}
> +
> +static __rte_always_inline void *
> +rte_memcpy_generic(void *dst, const void *src, size_t len)
> +{
> +       asm goto("movq  %0, %%rsi\n\t"
> +       "movq   %1, %%rdi\n\t"
> +       "movq   %2, %%rdx\n\t"
> +       "movq    %%rdi, %%rax\n\t"
> +       "cmp     $32, %%rdx\n\t"
> +       "jb      101f\n\t"
> +       "cmp     $(32 * 2), %%rdx\n\t"
> +       "ja      108f\n\t"
> +       "vmovdqu   (%%rsi), %%ymm0\n\t"
> +       "vmovdqu   -32(%%rsi,%%rdx), %%ymm1\n\t"
> +       "vmovdqu   %%ymm0, (%%rdi)\n\t"
> +       "vmovdqu   %%ymm1, -32(%%rdi,%%rdx)\n\t"
> +       "vzeroupper\n\t"
> +       "jmp %l[done]\n\t"
> +       "101:\n\t"
> +       /* Less than 1 VEC.  */
> +       "cmpb    $32, %%dl\n\t"
> +       "jae     103f\n\t"
> +       "cmpb    $16, %%dl\n\t"
> +       "jae     104f\n\t"
> +       "cmpb    $8, %%dl\n\t"
> +       "jae     105f\n\t"
> +       "cmpb    $4, %%dl\n\t"
> +       "jae     106f\n\t"
> +       "cmpb    $1, %%dl\n\t"
> +       "ja      107f\n\t"
> +       "jb      102f\n\t"
> +       "movzbl  (%%rsi), %%ecx\n\t"
> +       "movb    %%cl, (%%rdi)\n\t"
> +       "102:\n\t"
> +       "jmp %l[done]\n\t"
> +       "103:\n\t"
> +       /* From 32 to 63.  No branch when size == 32.  */
> +       "vmovdqu (%%rsi), %%ymm0\n\t"
> +       "vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
> +       "vmovdqu %%ymm0, (%%rdi)\n\t"
> +       "vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
> +       "vzeroupper\n\t"
> +       "jmp %l[done]\n\t"
> +       /* From 16 to 31.  No branch when size == 16.  */
> +       "104:\n\t"
> +       "vmovdqu (%%rsi), %%xmm0\n\t"
> +       "vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
> +       "vmovdqu %%xmm0, (%%rdi)\n\t"
> +       "vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
> +       "jmp %l[done]\n\t"
> +       "105:\n\t"
> +       /* From 8 to 15.  No branch when size == 8.  */
> +       "movq    -8(%%rsi,%%rdx), %%rcx\n\t"
> +       "movq    (%%rsi), %%rsi\n\t"
> +       "movq    %%rcx, -8(%%rdi,%%rdx)\n\t"
> +       "movq    %%rsi, (%%rdi)\n\t"
> +       "jmp %l[done]\n\t"
> +       "106:\n\t"
> +       /* From 4 to 7.  No branch when size == 4.  */
> +       "movl    -4(%%rsi,%%rdx), %%ecx\n\t"
> +       "movl    (%%rsi), %%esi\n\t"
> +       "movl    %%ecx, -4(%%rdi,%%rdx)\n\t"
> +       "movl    %%esi, (%%rdi)\n\t"
> +       "jmp %l[done]\n\t"
> +       "107:\n\t"
> +       /* From 2 to 3.  No branch when size == 2.  */
> +       "movzwl  -2(%%rsi,%%rdx), %%ecx\n\t"
> +       "movzwl  (%%rsi), %%esi\n\t"
> +       "movw    %%cx, -2(%%rdi,%%rdx)\n\t"
> +       "movw    %%si, (%%rdi)\n\t"
> +       "jmp %l[done]\n\t"
> +       "108:\n\t"
> +       /* More than 2 * VEC and there may be overlap between destination */
> +       /* and source.  */
> +       "cmpq    $(32 * 8), %%rdx\n\t"
> +       "ja      111f\n\t"
> +       "cmpq    $(32 * 4), %%rdx\n\t"
> +       "jb      109f\n\t"
> +       /* Copy from 4 * VEC to 8 * VEC, inclusively. */
> +       "vmovdqu   (%%rsi), %%ymm0\n\t"
> +       "vmovdqu   32(%%rsi), %%ymm1\n\t"
> +       "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> +       "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> +       "vmovdqu   -32(%%rsi,%%rdx), %%ymm4\n\t"
> +       "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
> +       "vmovdqu   -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
> +       "vmovdqu   -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
> +       "vmovdqu   %%ymm0, (%%rdi)\n\t"
> +       "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> +       "vmovdqu   %%ymm2, (32 * 2)(%%rdi)\n\t"
> +       "vmovdqu   %%ymm3, (32 * 3)(%%rdi)\n\t"
> +       "vmovdqu   %%ymm4, -32(%%rdi,%%rdx)\n\t"
> +       "vmovdqu   %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
> +       "vmovdqu   %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
> +       "vmovdqu   %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
> +       "vzeroupper\n\t"
> +       "jmp %l[done]\n\t"
> +       "109:\n\t"
> +       /* Copy from 2 * VEC to 4 * VEC. */
> +       "vmovdqu   (%%rsi), %%ymm0\n\t"
> +       "vmovdqu   32(%%rsi), %%ymm1\n\t"
> +       "vmovdqu   -32(%%rsi,%%rdx), %%ymm2\n\t"
> +       "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
> +       "vmovdqu   %%ymm0, (%%rdi)\n\t"
> +       "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> +       "vmovdqu   %%ymm2, -32(%%rdi,%%rdx)\n\t"
> +       "vmovdqu   %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
> +       "vzeroupper\n\t"
> +       "110:\n\t"
> +       "jmp %l[done]\n\t"
> +       "111:\n\t"
> +       "cmpq    %%rsi, %%rdi\n\t"
> +       "ja      113f\n\t"
> +       /* Source == destination is less common.  */
> +       "je      110b\n\t"
> +       /* Load the first VEC and last 4 * VEC to
> +        * support overlapping addresses.
> +        */
> +       "vmovdqu   (%%rsi), %%ymm4\n\t"
> +       "vmovdqu   -32(%%rsi, %%rdx), %%ymm5\n\t"
> +       "vmovdqu   -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
> +       "vmovdqu   -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
> +       "vmovdqu   -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
> +       /* Save start and stop of the destination buffer.  */
> +       "movq    %%rdi, %%r11\n\t"
> +       "leaq    -32(%%rdi, %%rdx), %%rcx\n\t"
> +       /* Align destination for aligned stores in the loop.  Compute */
> +       /* how much destination is misaligned.  */
> +       "movq    %%rdi, %%r8\n\t"
> +       "andq    $(32 - 1), %%r8\n\t"
> +       /* Get the negative of offset for alignment.  */
> +       "subq    $32, %%r8\n\t"
> +       /* Adjust source.  */
> +       "subq    %%r8, %%rsi\n\t"
> +       /* Adjust destination which should be aligned now.  */
> +       "subq    %%r8, %%rdi\n\t"
> +       /* Adjust length.  */
> +       "addq    %%r8, %%rdx\n\t"
> +       /* Check non-temporal store threshold.  */
> +       "cmpq    $(1024*1024), %%rdx\n\t"
> +       "ja      115f\n\t"
> +       "112:\n\t"
> +       /* Copy 4 * VEC a time forward.  */
> +       "vmovdqu   (%%rsi), %%ymm0\n\t"
> +       "vmovdqu   32(%%rsi), %%ymm1\n\t"
> +       "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> +       "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> +       "addq    $(32 * 4), %%rsi\n\t"
> +       "subq    $(32 * 4), %%rdx\n\t"
> +       "vmovdqa   %%ymm0, (%%rdi)\n\t"
> +       "vmovdqa   %%ymm1, 32(%%rdi)\n\t"
> +       "vmovdqa   %%ymm2, (32 * 2)(%%rdi)\n\t"
> +       "vmovdqa   %%ymm3, (32 * 3)(%%rdi)\n\t"
> +       "addq    $(32 * 4), %%rdi\n\t"
> +       "cmpq    $(32 * 4), %%rdx\n\t"
> +       "ja      112b\n\t"
> +       /* Store the last 4 * VEC.  */
> +       "vmovdqu   %%ymm5, (%%rcx)\n\t"
> +       "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> +       "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> +       "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> +       /* Store the first VEC.  */
> +       "vmovdqu   %%ymm4, (%%r11)\n\t"
> +       "vzeroupper\n\t"
> +       "jmp %l[done]\n\t"
> +       "113:\n\t"
> +       /* Load the first 4*VEC and last VEC to support overlapping addresses.*/
> +       "vmovdqu   (%%rsi), %%ymm4\n\t"
> +       "vmovdqu   32(%%rsi), %%ymm5\n\t"
> +       "vmovdqu   (32 * 2)(%%rsi), %%ymm6\n\t"
> +       "vmovdqu   (32 * 3)(%%rsi), %%ymm7\n\t"
> +       "vmovdqu   -32(%%rsi,%%rdx), %%ymm8\n\t"
> +       /* Save stop of the destination buffer.  */
> +       "leaq    -32(%%rdi, %%rdx), %%r11\n\t"
> +       /* Align destination end for aligned stores in the loop.  Compute */
> +       /* how much destination end is misaligned.  */
> +       "leaq    -32(%%rsi, %%rdx), %%rcx\n\t"
> +       "movq    %%r11, %%r9\n\t"
> +       "movq    %%r11, %%r8\n\t"
> +       "andq    $(32 - 1), %%r8\n\t"
> +       /* Adjust source.  */
> +       "subq    %%r8, %%rcx\n\t"
> +       /* Adjust the end of destination which should be aligned now.  */
> +       "subq    %%r8, %%r9\n\t"
> +       /* Adjust length.  */
> +       "subq    %%r8, %%rdx\n\t"
> +        /* Check non-temporal store threshold.  */
> +       "cmpq    $(1024*1024), %%rdx\n\t"
> +       "ja      117f\n\t"
> +       "114:\n\t"
> +       /* Copy 4 * VEC a time backward.  */
> +       "vmovdqu   (%%rcx), %%ymm0\n\t"
> +       "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> +       "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> +       "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> +       "subq    $(32 * 4), %%rcx\n\t"
> +       "subq    $(32 * 4), %%rdx\n\t"
> +       "vmovdqa   %%ymm0, (%%r9)\n\t"
> +       "vmovdqa   %%ymm1, -32(%%r9)\n\t"
> +       "vmovdqa   %%ymm2, -(32 * 2)(%%r9)\n\t"
> +       "vmovdqa   %%ymm3, -(32 * 3)(%%r9)\n\t"
> +       "subq    $(32 * 4), %%r9\n\t"
> +       "cmpq    $(32 * 4), %%rdx\n\t"
> +       "ja      114b\n\t"
> +       /* Store the first 4 * VEC. */
> +       "vmovdqu   %%ymm4, (%%rdi)\n\t"
> +       "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> +       "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> +       "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> +       /* Store the last VEC. */
> +       "vmovdqu   %%ymm8, (%%r11)\n\t"
> +       "vzeroupper\n\t"
> +       "jmp %l[done]\n\t"
> +
> +       "115:\n\t"
> +       /* Don't use non-temporal store if there is overlap between */
> +       /* destination and source since destination may be in cache */
> +       /* when source is loaded. */
> +       "leaq    (%%rdi, %%rdx), %%r10\n\t"
> +       "cmpq    %%r10, %%rsi\n\t"
> +       "jb      112b\n\t"
> +       "116:\n\t"
> +       /* Copy 4 * VEC a time forward with non-temporal stores.  */
> +       "prefetcht0 (32*4*2)(%%rsi)\n\t"
> +       "prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
> +       "prefetcht0 (32*4*3)(%%rsi)\n\t"
> +       "prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
> +       "vmovdqu   (%%rsi), %%ymm0\n\t"
> +       "vmovdqu   32(%%rsi), %%ymm1\n\t"
> +       "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> +       "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> +       "addq    $(32*4), %%rsi\n\t"
> +       "subq    $(32*4), %%rdx\n\t"
> +       "vmovntdq  %%ymm0, (%%rdi)\n\t"
> +       "vmovntdq  %%ymm1, 32(%%rdi)\n\t"
> +       "vmovntdq  %%ymm2, (32 * 2)(%%rdi)\n\t"
> +       "vmovntdq  %%ymm3, (32 * 3)(%%rdi)\n\t"
> +       "addq    $(32*4), %%rdi\n\t"
> +       "cmpq    $(32*4), %%rdx\n\t"
> +       "ja      116b\n\t"
> +       "sfence\n\t"
> +       /* Store the last 4 * VEC.  */
> +       "vmovdqu   %%ymm5, (%%rcx)\n\t"
> +       "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> +       "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> +       "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> +       /* Store the first VEC.  */
> +       "vmovdqu   %%ymm4, (%%r11)\n\t"
> +       "vzeroupper\n\t"
> +       "jmp %l[done]\n\t"
> +       "117:\n\t"
> +       /* Don't use non-temporal store if there is overlap between */
> +       /* destination and source since destination may be in cache */
> +       /* when source is loaded.  */
> +       "leaq    (%%rcx, %%rdx), %%r10\n\t"
> +       "cmpq    %%r10, %%r9\n\t"
> +       "jb      114b\n\t"
> +       "118:\n\t"
> +       /* Copy 4 * VEC a time backward with non-temporal stores. */
> +       "prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
> +       "prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
> +       "prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
> +       "prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
> +       "vmovdqu   (%%rcx), %%ymm0\n\t"
> +       "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> +       "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> +       "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> +       "subq    $(32*4), %%rcx\n\t"
> +       "subq    $(32*4), %%rdx\n\t"
> +       "vmovntdq  %%ymm0, (%%r9)\n\t"
> +       "vmovntdq  %%ymm1, -32(%%r9)\n\t"
> +       "vmovntdq  %%ymm2, -(32 * 2)(%%r9)\n\t"
> +       "vmovntdq  %%ymm3, -(32 * 3)(%%r9)\n\t"
> +       "subq    $(32 * 4), %%r9\n\t"
> +       "cmpq    $(32 * 4), %%rdx\n\t"
> +       "ja      118b\n\t"
> +       "sfence\n\t"
> +       /* Store the first 4 * VEC.  */
> +       "vmovdqu   %%ymm4, (%%rdi)\n\t"
> +       "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> +       "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> +       "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> +       /* Store the last VEC.  */
> +       "vmovdqu   %%ymm8, (%%r11)\n\t"
> +       "vzeroupper\n\t"
> +       "jmp %l[done]"
> +       :
> +       : "r"(src), "r"(dst), "r"(len)
> +       : "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "ymm0",
> +       "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
> +       : done
> +       );
> +done:
> +       return dst;
> +}
> +
> +#else
>  static __rte_always_inline void *
>  rte_memcpy_generic(void *dst, const void *src, size_t n)
>  {
> @@ -479,6 +971,8 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>         goto COPY_BLOCK_128_BACK31;
>  }
>
> +#endif /* RTE_MEMCPY_AMDEPYC2 */
> +
>  #else /* __AVX512F__ */
>
>  #define ALIGNMENT_MASK 0x0F
> @@ -874,6 +1368,14 @@ rte_memcpy(void *dst, const void *src, size_t n)
>                 return rte_memcpy_generic(dst, src, n);
>  }
>
> +#if defined __AVX2__ && defined(RTE_MEMCPY_AMDEPYC2)
> +static __rte_always_inline void *
> +rte_memcpy_aligned_tstore16(void *dst, void *src, int len)
> +{
> +       return rte_memcpy_aligned_ntload_tstore16_amdepyc2(dst, src, len);
> +}
> +#endif
> +
>  #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
>  #pragma GCC diagnostic pop
>  #endif
> diff --git a/meson_options.txt b/meson_options.txt
> index 0e92734c49..e232c9c340 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -42,6 +42,8 @@ option('platform', type: 'string', value: 'native', description:
>         'Platform to build, either "native", "generic" or a SoC. Please refer to the Linux build guide for more information.')
>  option('enable_trace_fp', type: 'boolean', value: false, description:
>         'enable fast path trace points.')
> +option('rte_memcpy_amdepyc2', type: 'boolean', value: false, description:
> +       'to enable amd epyc memcpy routines')
>  option('tests', type: 'boolean', value: true, description:
>         'build unit tests')
>  option('use_hpet', type: 'boolean', value: false, description:
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH 1/2] lib/eal: add amd epyc2 memcpy routine to eal
  2021-08-23 15:21 ` [dpdk-dev] [PATCH 1/2] lib/eal: add amd epyc2 memcpy routine to eal Jerin Jacob
@ 2021-08-30  9:39   ` Aman Kumar
  0 siblings, 0 replies; 43+ messages in thread
From: Aman Kumar @ 2021-08-30  9:39 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: dpdk-dev, Raslan Darawsheh, Asaf Penso, Shy Shyman,
	Viacheslav Ovsiienko, Alexander Kozyrev, Matan Azrad,
	Anatoly Burakov, Song, Keesang

Hi Jerin,
Thanks for your comments and suggestions.
I will try to update this in V2.

On Mon, Aug 23, 2021 at 8:51 PM Jerin Jacob <jerinjacobk@gmail.com> wrote:

> On Mon, Aug 23, 2021 at 2:14 PM Aman Kumar <aman.kumar@vvdntech.in> wrote:
> >
> > This patch provides rte_memcpy* calls optimized for
> > AMD EPYC Gen2 platforms. This option is disabled by
> > default and can be enabled by defining 'rte_memcpy_amdepyc2'
>
> Generic options should support all the architectures.
> A more scalable alternative is to introduce config/x86/x86_amd_epyc2
> and a new parameter under [properties].
>
>
> > in the meson build.
> >
> > Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
> > ---
> >  lib/eal/x86/include/meson.build  |   1 +
> >  lib/eal/x86/include/rte_memcpy.h | 502 +++++++++++++++++++++++++++++++
> >  meson_options.txt                |   2 +
> >  3 files changed, 505 insertions(+)
> >
> > diff --git a/lib/eal/x86/include/meson.build
> b/lib/eal/x86/include/meson.build
> > index 12c2e00035..a03683779d 100644
> > --- a/lib/eal/x86/include/meson.build
> > +++ b/lib/eal/x86/include/meson.build
> > @@ -27,3 +27,4 @@ arch_indirect_headers = files(
> >  )
> >  install_headers(arch_headers + arch_indirect_headers, subdir:
> get_option('include_subdir_arch'))
> >  dpdk_chkinc_headers += arch_headers
> > +dpdk_conf.set('RTE_MEMCPY_AMDEPYC2', get_option('rte_memcpy_amdepyc2'))
> > diff --git a/lib/eal/x86/include/rte_memcpy.h
> b/lib/eal/x86/include/rte_memcpy.h
> > index 79f381dd9b..47dda9cb87 100644
> > --- a/lib/eal/x86/include/rte_memcpy.h
> > +++ b/lib/eal/x86/include/rte_memcpy.h
> > @@ -368,6 +368,498 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src,
> size_t n)
> >         }
> >  }
> >
> > +#if defined RTE_MEMCPY_AMDEPYC2
> > +
> > +/**
> > + * Copy 16 bytes from one location to another,
> > + * with temporal stores
> > + */
> > +static __rte_always_inline void
> > +rte_copy16_ts(uint8_t *dst, uint8_t *src)
> > +{
> > +       __m128i var128;
> > +
> > +       var128 = _mm_stream_load_si128((__m128i *)src);
> > +       _mm_storeu_si128((__m128i *)dst, var128);
> > +}
> > +
> > +/**
> > + * Copy 32 bytes from one location to another,
> > + * with temporal stores
> > + */
> > +static __rte_always_inline void
> > +rte_copy32_ts(uint8_t *dst, uint8_t *src)
> > +{
> > +       __m256i ymm0;
> > +
> > +       ymm0 = _mm256_stream_load_si256((const __m256i *)src);
> > +       _mm256_storeu_si256((__m256i *)dst, ymm0);
> > +}
> > +
> > +/**
> > + * Copy 64 bytes from one location to another,
> > + * with temporal stores
> > + */
> > +static __rte_always_inline void
> > +rte_copy64_ts(uint8_t *dst, uint8_t *src)
> > +{
> > +       rte_copy32_ts(dst + 0 * 32, src + 0 * 32);
> > +       rte_copy32_ts(dst + 1 * 32, src + 1 * 32);
> > +}
> > +
> > +/**
> > + * Copy 128 bytes from one location to another,
> > + * with temporal stores
> > + */
> > +static __rte_always_inline void
> > +rte_copy128_ts(uint8_t *dst, uint8_t *src)
> > +{
> > +       rte_copy32_ts(dst + 0 * 32, src + 0 * 32);
> > +       rte_copy32_ts(dst + 1 * 32, src + 1 * 32);
> > +       rte_copy32_ts(dst + 2 * 32, src + 2 * 32);
> > +       rte_copy32_ts(dst + 3 * 32, src + 3 * 32);
> > +}
> > +
> > +/**
> > + * Copy len bytes from one location to another,
> > + * with temporal stores 16B aligned
> > + */
> > +static __rte_always_inline void *
> > +rte_memcpy_aligned_tstore16_generic(void *dst, void *src, int len)
> > +{
> > +       void *dest = dst;
> > +
> > +       while (len >= 128) {
> > +               rte_copy128_ts((uint8_t *)dst, (uint8_t *)src);
> > +               dst = (uint8_t *)dst + 128;
> > +               src = (uint8_t *)src + 128;
> > +               len -= 128;
> > +       }
> > +       while (len >= 64) {
> > +               rte_copy64_ts((uint8_t *)dst, (uint8_t *)src);
> > +               dst = (uint8_t *)dst + 64;
> > +               src = (uint8_t *)src + 64;
> > +               len -= 64;
> > +       }
> > +       while (len >= 32) {
> > +               rte_copy32_ts((uint8_t *)dst, (uint8_t *)src);
> > +               dst = (uint8_t *)dst + 32;
> > +               src = (uint8_t *)src + 32;
> > +               len -= 32;
> > +       }
> > +       if (len >= 16) {
> > +               rte_copy16_ts((uint8_t *)dst, (uint8_t *)src);
> > +               dst = (uint8_t *)dst + 16;
> > +               src = (uint8_t *)src + 16;
> > +               len -= 16;
> > +       }
> > +       if (len >= 8) {
> > +               *(uint64_t *)dst = *(const uint64_t *)src;
> > +               dst = (uint8_t *)dst + 8;
> > +               src = (uint8_t *)src + 8;
> > +               len -= 8;
> > +       }
> > +       if (len >= 4) {
> > +               *(uint32_t *)dst = *(const uint32_t *)src;
> > +               dst = (uint8_t *)dst + 4;
> > +               src = (uint8_t *)src + 4;
> > +               len -= 4;
> > +       }
> > +       if (len != 0) {
> > +               dst = (uint8_t *)dst - (4 - len);
> > +               src = (uint8_t *)src - (4 - len);
> > +               *(uint32_t *)dst = *(const uint32_t *)src;
> > +       }
> > +
> > +       return dest;
> > +}
> > +
> > +static __rte_always_inline void *
> > +rte_memcpy_aligned_ntload_tstore16_amdepyc2(void *dst,
> > +                                           const void *src,
> > +                                           size_t size)
> > +{
> > +       asm volatile goto("movq %0, %%rsi\n\t"
> > +       "movq %1, %%rdi\n\t"
> > +       "movq %2, %%rdx\n\t"
> > +       "cmpq   $(128), %%rdx\n\t"
> > +       "jb     202f\n\t"
> > +       "201:\n\t"
> > +       "vmovntdqa (%%rsi), %%ymm0\n\t"
> > +       "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> > +       "vmovntdqa 64(%%rsi), %%ymm2\n\t"
> > +       "vmovntdqa 96(%%rsi), %%ymm3\n\t"
> > +       "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > +       "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> > +       "vmovdqu  %%ymm2, 64(%%rdi)\n\t"
> > +       "vmovdqu  %%ymm3, 96(%%rdi)\n\t"
> > +       "addq   $128, %%rsi\n\t"
> > +       "addq   $128, %%rdi\n\t"
> > +       "subq   $128, %%rdx\n\t"
> > +       "jz     %l[done]\n\t"
> > +       "cmpq   $128, %%rdx\n\t" /*Vector Size 32B.  */
> > +       "jae    201b\n\t"
> > +       "202:\n\t"
> > +       "cmpq   $64, %%rdx\n\t"
> > +       "jb     203f\n\t"
> > +       "vmovntdqa (%%rsi), %%ymm0\n\t"
> > +       "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> > +       "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > +       "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> > +       "addq   $64, %%rsi\n\t"
> > +       "addq   $64, %%rdi\n\t"
> > +       "subq   $64, %%rdx\n\t"
> > +       "jz     %l[done]\n\t"
> > +       "203:\n\t"
> > +       "cmpq   $32, %%rdx\n\t"
> > +       "jb     204f\n\t"
> > +       "vmovntdqa (%%rsi), %%ymm0\n\t"
> > +       "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > +       "addq   $32, %%rsi\n\t"
> > +       "addq   $32, %%rdi\n\t"
> > +       "subq   $32, %%rdx\n\t"
> > +       "jz     %l[done]\n\t"
> > +       "204:\n\t"
> > +       "cmpb   $16, %%dl\n\t"
> > +       "jb     205f\n\t"
> > +       "vmovntdqa (%%rsi), %%xmm0\n\t"
> > +       "vmovdqu  %%xmm0, (%%rdi)\n\t"
> > +       "addq   $16, %%rsi\n\t"
> > +       "addq   $16, %%rdi\n\t"
> > +       "subq   $16, %%rdx\n\t"
> > +       "jz     %l[done]\n\t"
> > +       "205:\n\t"
> > +       "cmpb   $2, %%dl\n\t"
> > +       "jb     208f\n\t"
> > +       "cmpb   $4, %%dl\n\t"
> > +       "jbe    207f\n\t"
> > +       "cmpb   $8, %%dl\n\t"
> > +       "jbe    206f\n\t"
> > +       "movq   -8(%%rsi,%%rdx), %%rcx\n\t"
> > +       "movq   (%%rsi), %%rsi\n\t"
> > +       "movq   %%rcx, -8(%%rdi,%%rdx)\n\t"
> > +       "movq   %%rsi, (%%rdi)\n\t"
> > +       "jmp    %l[done]\n\t"
> > +       "206:\n\t"
> > +       "movl   -4(%%rsi,%%rdx), %%ecx\n\t"
> > +       "movl   (%%rsi), %%esi\n\t"
> > +       "movl   %%ecx, -4(%%rdi,%%rdx)\n\t"
> > +       "movl   %%esi, (%%rdi)\n\t"
> > +       "jmp    %l[done]\n\t"
> > +       "207:\n\t"
> > +       "movzwl -2(%%rsi,%%rdx), %%ecx\n\t"
> > +       "movzwl (%%rsi), %%esi\n\t"
> > +       "movw   %%cx, -2(%%rdi,%%rdx)\n\t"
> > +       "movw   %%si, (%%rdi)\n\t"
> > +       "jmp    %l[done]\n\t"
> > +       "208:\n\t"
> > +       "movzbl (%%rsi), %%ecx\n\t"
> > +       "movb   %%cl, (%%rdi)"
> > +       :
> > +       : "r"(src), "r"(dst), "r"(size)
> > +       : "rcx", "rdx", "rsi", "rdi", "ymm0", "ymm1", "ymm2", "ymm3",
> "memory"
> > +       : done
> > +       );
> > +done:
> > +       return dst;
> > +}
> > +
> > +static __rte_always_inline void *
> > +rte_memcpy_generic(void *dst, const void *src, size_t len)
> > +{
> > +       asm goto("movq  %0, %%rsi\n\t"
> > +       "movq   %1, %%rdi\n\t"
> > +       "movq   %2, %%rdx\n\t"
> > +       "movq    %%rdi, %%rax\n\t"
> > +       "cmp     $32, %%rdx\n\t"
> > +       "jb      101f\n\t"
> > +       "cmp     $(32 * 2), %%rdx\n\t"
> > +       "ja      108f\n\t"
> > +       "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +       "vmovdqu   -32(%%rsi,%%rdx), %%ymm1\n\t"
> > +       "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > +       "vmovdqu   %%ymm1, -32(%%rdi,%%rdx)\n\t"
> > +       "vzeroupper\n\t"
> > +       "jmp %l[done]\n\t"
> > +       "101:\n\t"
> > +       /* Less than 1 VEC.  */
> > +       "cmpb    $32, %%dl\n\t"
> > +       "jae     103f\n\t"
> > +       "cmpb    $16, %%dl\n\t"
> > +       "jae     104f\n\t"
> > +       "cmpb    $8, %%dl\n\t"
> > +       "jae     105f\n\t"
> > +       "cmpb    $4, %%dl\n\t"
> > +       "jae     106f\n\t"
> > +       "cmpb    $1, %%dl\n\t"
> > +       "ja      107f\n\t"
> > +       "jb      102f\n\t"
> > +       "movzbl  (%%rsi), %%ecx\n\t"
> > +       "movb    %%cl, (%%rdi)\n\t"
> > +       "102:\n\t"
> > +       "jmp %l[done]\n\t"
> > +       "103:\n\t"
> > +       /* From 32 to 63.  No branch when size == 32.  */
> > +       "vmovdqu (%%rsi), %%ymm0\n\t"
> > +       "vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
> > +       "vmovdqu %%ymm0, (%%rdi)\n\t"
> > +       "vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
> > +       "vzeroupper\n\t"
> > +       "jmp %l[done]\n\t"
> > +       /* From 16 to 31.  No branch when size == 16.  */
> > +       "104:\n\t"
> > +       "vmovdqu (%%rsi), %%xmm0\n\t"
> > +       "vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
> > +       "vmovdqu %%xmm0, (%%rdi)\n\t"
> > +       "vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
> > +       "jmp %l[done]\n\t"
> > +       "105:\n\t"
> > +       /* From 8 to 15.  No branch when size == 8.  */
> > +       "movq    -8(%%rsi,%%rdx), %%rcx\n\t"
> > +       "movq    (%%rsi), %%rsi\n\t"
> > +       "movq    %%rcx, -8(%%rdi,%%rdx)\n\t"
> > +       "movq    %%rsi, (%%rdi)\n\t"
> > +       "jmp %l[done]\n\t"
> > +       "106:\n\t"
> > +       /* From 4 to 7.  No branch when size == 4.  */
> > +       "movl    -4(%%rsi,%%rdx), %%ecx\n\t"
> > +       "movl    (%%rsi), %%esi\n\t"
> > +       "movl    %%ecx, -4(%%rdi,%%rdx)\n\t"
> > +       "movl    %%esi, (%%rdi)\n\t"
> > +       "jmp %l[done]\n\t"
> > +       "107:\n\t"
> > +       /* From 2 to 3.  No branch when size == 2.  */
> > +       "movzwl  -2(%%rsi,%%rdx), %%ecx\n\t"
> > +       "movzwl  (%%rsi), %%esi\n\t"
> > +       "movw    %%cx, -2(%%rdi,%%rdx)\n\t"
> > +       "movw    %%si, (%%rdi)\n\t"
> > +       "jmp %l[done]\n\t"
> > +       "108:\n\t"
> > +       /* More than 2 * VEC and there may be overlap between
> destination */
> > +       /* and source.  */
> > +       "cmpq    $(32 * 8), %%rdx\n\t"
> > +       "ja      111f\n\t"
> > +       "cmpq    $(32 * 4), %%rdx\n\t"
> > +       "jb      109f\n\t"
> > +       /* Copy from 4 * VEC to 8 * VEC, inclusively. */
> > +       "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +       "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > +       "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > +       "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > +       "vmovdqu   -32(%%rsi,%%rdx), %%ymm4\n\t"
> > +       "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
> > +       "vmovdqu   -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
> > +       "vmovdqu   -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
> > +       "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > +       "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> > +       "vmovdqu   %%ymm2, (32 * 2)(%%rdi)\n\t"
> > +       "vmovdqu   %%ymm3, (32 * 3)(%%rdi)\n\t"
> > +       "vmovdqu   %%ymm4, -32(%%rdi,%%rdx)\n\t"
> > +       "vmovdqu   %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
> > +       "vmovdqu   %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
> > +       "vmovdqu   %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
> > +       "vzeroupper\n\t"
> > +       "jmp %l[done]\n\t"
> > +       "109:\n\t"
> > +       /* Copy from 2 * VEC to 4 * VEC. */
> > +       "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +       "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > +       "vmovdqu   -32(%%rsi,%%rdx), %%ymm2\n\t"
> > +       "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
> > +       "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > +       "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> > +       "vmovdqu   %%ymm2, -32(%%rdi,%%rdx)\n\t"
> > +       "vmovdqu   %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
> > +       "vzeroupper\n\t"
> > +       "110:\n\t"
> > +       "jmp %l[done]\n\t"
> > +       "111:\n\t"
> > +       "cmpq    %%rsi, %%rdi\n\t"
> > +       "ja      113f\n\t"
> > +       /* Source == destination is less common.  */
> > +       "je      110b\n\t"
> > +       /* Load the first VEC and last 4 * VEC to
> > +        * support overlapping addresses.
> > +        */
> > +       "vmovdqu   (%%rsi), %%ymm4\n\t"
> > +       "vmovdqu   -32(%%rsi, %%rdx), %%ymm5\n\t"
> > +       "vmovdqu   -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
> > +       "vmovdqu   -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
> > +       "vmovdqu   -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
> > +       /* Save start and stop of the destination buffer.  */
> > +       "movq    %%rdi, %%r11\n\t"
> > +       "leaq    -32(%%rdi, %%rdx), %%rcx\n\t"
> > +       /* Align destination for aligned stores in the loop.  Compute */
> > +       /* how much destination is misaligned.  */
> > +       "movq    %%rdi, %%r8\n\t"
> > +       "andq    $(32 - 1), %%r8\n\t"
> > +       /* Get the negative of offset for alignment.  */
> > +       "subq    $32, %%r8\n\t"
> > +       /* Adjust source.  */
> > +       "subq    %%r8, %%rsi\n\t"
> > +       /* Adjust destination which should be aligned now.  */
> > +       "subq    %%r8, %%rdi\n\t"
> > +       /* Adjust length.  */
> > +       "addq    %%r8, %%rdx\n\t"
> > +       /* Check non-temporal store threshold.  */
> > +       "cmpq    $(1024*1024), %%rdx\n\t"
> > +       "ja      115f\n\t"
> > +       "112:\n\t"
> > +       /* Copy 4 * VEC a time forward.  */
> > +       "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +       "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > +       "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > +       "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > +       "addq    $(32 * 4), %%rsi\n\t"
> > +       "subq    $(32 * 4), %%rdx\n\t"
> > +       "vmovdqa   %%ymm0, (%%rdi)\n\t"
> > +       "vmovdqa   %%ymm1, 32(%%rdi)\n\t"
> > +       "vmovdqa   %%ymm2, (32 * 2)(%%rdi)\n\t"
> > +       "vmovdqa   %%ymm3, (32 * 3)(%%rdi)\n\t"
> > +       "addq    $(32 * 4), %%rdi\n\t"
> > +       "cmpq    $(32 * 4), %%rdx\n\t"
> > +       "ja      112b\n\t"
> > +       /* Store the last 4 * VEC.  */
> > +       "vmovdqu   %%ymm5, (%%rcx)\n\t"
> > +       "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> > +       "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> > +       "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> > +       /* Store the first VEC.  */
> > +       "vmovdqu   %%ymm4, (%%r11)\n\t"
> > +       "vzeroupper\n\t"
> > +       "jmp %l[done]\n\t"
> > +       "113:\n\t"
> > +       /* Load the first 4*VEC and last VEC to support overlapping
> addresses.*/
> > +       "vmovdqu   (%%rsi), %%ymm4\n\t"
> > +       "vmovdqu   32(%%rsi), %%ymm5\n\t"
> > +       "vmovdqu   (32 * 2)(%%rsi), %%ymm6\n\t"
> > +       "vmovdqu   (32 * 3)(%%rsi), %%ymm7\n\t"
> > +       "vmovdqu   -32(%%rsi,%%rdx), %%ymm8\n\t"
> > +       /* Save stop of the destination buffer.  */
> > +       "leaq    -32(%%rdi, %%rdx), %%r11\n\t"
> > +       /* Align destination end for aligned stores in the loop.
> Compute */
> > +       /* how much destination end is misaligned.  */
> > +       "leaq    -32(%%rsi, %%rdx), %%rcx\n\t"
> > +       "movq    %%r11, %%r9\n\t"
> > +       "movq    %%r11, %%r8\n\t"
> > +       "andq    $(32 - 1), %%r8\n\t"
> > +       /* Adjust source.  */
> > +       "subq    %%r8, %%rcx\n\t"
> > +       /* Adjust the end of destination which should be aligned now.  */
> > +       "subq    %%r8, %%r9\n\t"
> > +       /* Adjust length.  */
> > +       "subq    %%r8, %%rdx\n\t"
> > +        /* Check non-temporal store threshold.  */
> > +       "cmpq    $(1024*1024), %%rdx\n\t"
> > +       "ja      117f\n\t"
> > +       "114:\n\t"
> > +       /* Copy 4 * VEC a time backward.  */
> > +       "vmovdqu   (%%rcx), %%ymm0\n\t"
> > +       "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> > +       "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> > +       "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> > +       "subq    $(32 * 4), %%rcx\n\t"
> > +       "subq    $(32 * 4), %%rdx\n\t"
> > +       "vmovdqa   %%ymm0, (%%r9)\n\t"
> > +       "vmovdqa   %%ymm1, -32(%%r9)\n\t"
> > +       "vmovdqa   %%ymm2, -(32 * 2)(%%r9)\n\t"
> > +       "vmovdqa   %%ymm3, -(32 * 3)(%%r9)\n\t"
> > +       "subq    $(32 * 4), %%r9\n\t"
> > +       "cmpq    $(32 * 4), %%rdx\n\t"
> > +       "ja      114b\n\t"
> > +       /* Store the first 4 * VEC. */
> > +       "vmovdqu   %%ymm4, (%%rdi)\n\t"
> > +       "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> > +       "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> > +       "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> > +       /* Store the last VEC. */
> > +       "vmovdqu   %%ymm8, (%%r11)\n\t"
> > +       "vzeroupper\n\t"
> > +       "jmp %l[done]\n\t"
> > +
> > +       "115:\n\t"
> > +       /* Don't use non-temporal store if there is overlap between */
> > +       /* destination and source since destination may be in cache */
> > +       /* when source is loaded. */
> > +       "leaq    (%%rdi, %%rdx), %%r10\n\t"
> > +       "cmpq    %%r10, %%rsi\n\t"
> > +       "jb      112b\n\t"
> > +       "116:\n\t"
> > +       /* Copy 4 * VEC a time forward with non-temporal stores.  */
> > +       "prefetcht0 (32*4*2)(%%rsi)\n\t"
> > +       "prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
> > +       "prefetcht0 (32*4*3)(%%rsi)\n\t"
> > +       "prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
> > +       "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +       "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > +       "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > +       "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > +       "addq    $(32*4), %%rsi\n\t"
> > +       "subq    $(32*4), %%rdx\n\t"
> > +       "vmovntdq  %%ymm0, (%%rdi)\n\t"
> > +       "vmovntdq  %%ymm1, 32(%%rdi)\n\t"
> > +       "vmovntdq  %%ymm2, (32 * 2)(%%rdi)\n\t"
> > +       "vmovntdq  %%ymm3, (32 * 3)(%%rdi)\n\t"
> > +       "addq    $(32*4), %%rdi\n\t"
> > +       "cmpq    $(32*4), %%rdx\n\t"
> > +       "ja      116b\n\t"
> > +       "sfence\n\t"
> > +       /* Store the last 4 * VEC.  */
> > +       "vmovdqu   %%ymm5, (%%rcx)\n\t"
> > +       "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> > +       "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> > +       "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> > +       /* Store the first VEC.  */
> > +       "vmovdqu   %%ymm4, (%%r11)\n\t"
> > +       "vzeroupper\n\t"
> > +       "jmp %l[done]\n\t"
> > +       "117:\n\t"
> > +       /* Don't use non-temporal store if there is overlap between */
> > +       /* destination and source since destination may be in cache */
> > +       /* when source is loaded.  */
> > +       "leaq    (%%rcx, %%rdx), %%r10\n\t"
> > +       "cmpq    %%r10, %%r9\n\t"
> > +       "jb      114b\n\t"
> > +       "118:\n\t"
> > +       /* Copy 4 * VEC a time backward with non-temporal stores. */
> > +       "prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
> > +       "prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
> > +       "prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
> > +       "prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
> > +       "vmovdqu   (%%rcx), %%ymm0\n\t"
> > +       "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> > +       "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> > +       "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> > +       "subq    $(32*4), %%rcx\n\t"
> > +       "subq    $(32*4), %%rdx\n\t"
> > +       "vmovntdq  %%ymm0, (%%r9)\n\t"
> > +       "vmovntdq  %%ymm1, -32(%%r9)\n\t"
> > +       "vmovntdq  %%ymm2, -(32 * 2)(%%r9)\n\t"
> > +       "vmovntdq  %%ymm3, -(32 * 3)(%%r9)\n\t"
> > +       "subq    $(32 * 4), %%r9\n\t"
> > +       "cmpq    $(32 * 4), %%rdx\n\t"
> > +       "ja      118b\n\t"
> > +       "sfence\n\t"
> > +       /* Store the first 4 * VEC.  */
> > +       "vmovdqu   %%ymm4, (%%rdi)\n\t"
> > +       "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> > +       "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> > +       "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> > +       /* Store the last VEC.  */
> > +       "vmovdqu   %%ymm8, (%%r11)\n\t"
> > +       "vzeroupper\n\t"
> > +       "jmp %l[done]"
> > +       :
> > +       : "r"(src), "r"(dst), "r"(len)
> > +       : "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11",
> "r12", "ymm0",
> > +       "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8",
> "memory"
> > +       : done
> > +       );
> > +done:
> > +       return dst;
> > +}
> > +
> > +#else
> >  static __rte_always_inline void *
> >  rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  {
> > @@ -479,6 +971,8 @@ rte_memcpy_generic(void *dst, const void *src,
> size_t n)
> >         goto COPY_BLOCK_128_BACK31;
> >  }
> >
> > +#endif /* RTE_MEMCPY_AMDEPYC2 */
> > +
> >  #else /* __AVX512F__ */
> >
> >  #define ALIGNMENT_MASK 0x0F
> > @@ -874,6 +1368,14 @@ rte_memcpy(void *dst, const void *src, size_t n)
> >                 return rte_memcpy_generic(dst, src, n);
> >  }
> >
> > +#if defined __AVX2__ && defined(RTE_MEMCPY_AMDEPYC2)
> > +static __rte_always_inline void *
> > +rte_memcpy_aligned_tstore16(void *dst, void *src, int len)
> > +{
> > +       return rte_memcpy_aligned_ntload_tstore16_amdepyc2(dst, src,
> len);
> > +}
> > +#endif
> > +
> >  #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
> >  #pragma GCC diagnostic pop
> >  #endif
> > diff --git a/meson_options.txt b/meson_options.txt
> > index 0e92734c49..e232c9c340 100644
> > --- a/meson_options.txt
> > +++ b/meson_options.txt
> > @@ -42,6 +42,8 @@ option('platform', type: 'string', value: 'native',
> description:
> >         'Platform to build, either "native", "generic" or a SoC. Please
> refer to the Linux build guide for more information.')
> >  option('enable_trace_fp', type: 'boolean', value: false, description:
> >         'enable fast path trace points.')
> > +option('rte_memcpy_amdepyc2', type: 'boolean', value: false,
> description:
> > +       'to enable amd epyc memcpy routines')
> >  option('tests', type: 'boolean', value: true, description:
> >         'build unit tests')
> >  option('use_hpet', type: 'boolean', value: false, description:
> > --
> > 2.25.1
> >
>

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH 2/2] net/mlx5: optimize mprq memcpy for AMD EPYC2 platforms
  2021-08-23  8:44 ` [dpdk-dev] [PATCH 2/2] net/mlx5: optimize mprq memcpy for AMD EPYC2 platforms Aman Kumar
@ 2021-10-13 16:53   ` Thomas Monjalon
  2021-10-19 10:52     ` Aman Kumar
  0 siblings, 1 reply; 43+ messages in thread
From: Thomas Monjalon @ 2021-10-13 16:53 UTC (permalink / raw)
  To: Aman Kumar
  Cc: dev, rasland, asafp, shys, viacheslavo, akozyrev, matan,
	anatoly.burakov, keesang.song

23/08/2021 10:44, Aman Kumar:
> add non temporal load and temporal store for mprq memcpy.
> define mlx5_ntload_tstore in meson build configuration to
> enable this optimization. This utilizes AMD EPYC2 optimized
> rte_memcpy* routines.
[...]
> +option('mlx5_ntload_tstore', type: 'boolean', value: false, description:
> +       'to enable optimized MPRQ in RX datapath')

Please don't make it a compilation option.
Why isn't it always enabled?

There was a comment on the first patch.
Do you plan to make a new version?



^ permalink raw reply	[flat|nested] 43+ messages in thread

* [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal
  2021-08-23  8:44 [dpdk-dev] [PATCH 1/2] lib/eal: add amd epyc2 memcpy routine to eal Aman Kumar
  2021-08-23  8:44 ` [dpdk-dev] [PATCH 2/2] net/mlx5: optimize mprq memcpy for AMD EPYC2 platforms Aman Kumar
  2021-08-23 15:21 ` [dpdk-dev] [PATCH 1/2] lib/eal: add amd epyc2 memcpy routine to eal Jerin Jacob
@ 2021-10-19 10:47 ` Aman Kumar
  2021-10-19 10:47   ` [dpdk-dev] [PATCH v2 2/2] net/mlx5: optimize mprq memcpy for AMD EPYC2 platform Aman Kumar
                     ` (3 more replies)
  2 siblings, 4 replies; 43+ messages in thread
From: Aman Kumar @ 2021-10-19 10:47 UTC (permalink / raw)
  To: dev
  Cc: rasland, asafp, shys, viacheslavo, akozyrev, matan,
	anatoly.burakov, keesang.song, aman.kumar, jerinjacobk

This patch provides rte_memcpy* calls optimized for
AMD EPYC platforms. Use config/x86/x86_amd_epyc_linux_gcc
as cross-file with meson to build dpdk for AMD EPYC platforms.

Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
---
 config/x86/meson.build            |   7 +
 config/x86/x86_amd_epyc_linux_gcc |  16 +
 lib/eal/x86/include/rte_memcpy.h  | 502 ++++++++++++++++++++++++++++++
 3 files changed, 525 insertions(+)
 create mode 100644 config/x86/x86_amd_epyc_linux_gcc

diff --git a/config/x86/meson.build b/config/x86/meson.build
index 29f3dea181..598b0e62ce 100644
--- a/config/x86/meson.build
+++ b/config/x86/meson.build
@@ -72,3 +72,10 @@ endif
 dpdk_conf.set('RTE_CACHE_LINE_SIZE', 64)
 dpdk_conf.set('RTE_MAX_LCORE', 128)
 dpdk_conf.set('RTE_MAX_NUMA_NODES', 32)
+
+if meson.is_cross_build()
+	if meson.get_cross_property('platform') == 'amd-epyc'
+	    dpdk_conf.set('RTE_MAX_LCORE', 512)
+	    dpdk_conf.set('RTE_MEMCPY_AMDEPYC', 1)
+	endif
+endif
diff --git a/config/x86/x86_amd_epyc_linux_gcc b/config/x86/x86_amd_epyc_linux_gcc
new file mode 100644
index 0000000000..0b2453135f
--- /dev/null
+++ b/config/x86/x86_amd_epyc_linux_gcc
@@ -0,0 +1,16 @@
+[binaries]
+c = 'x86_64-linux-gnu-gcc'
+cpp = 'x86_64-linux-gnu-g++'
+ld = 'x86_64-linux-gnu-ld'
+ar = 'x86_64-linux-gnu-ar'
+strip = 'x86_64-linux-gnu-strip'
+pkgconfig = 'x86_64-linux-gnu-pkg-config'
+
+[host_machine]
+system = 'linux'
+cpu_family = 'x86_64'
+cpu = 'native'
+endian = 'little'
+
+[properties]
+platform = 'amd-epyc'
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 79f381dd9b..8f66c115e3 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -368,6 +368,498 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+#if defined RTE_MEMCPY_AMDEPYC
+
+/**
+ * Copy 16 bytes from one location to another,
+ * with temporal stores
+ */
+static __rte_always_inline void
+rte_copy16_ts(uint8_t *dst, uint8_t *src)
+{
+	__m128i var128;
+
+	var128 = _mm_stream_load_si128((__m128i *)src);
+	_mm_storeu_si128((__m128i *)dst, var128);
+}
+
+/**
+ * Copy 32 bytes from one location to another,
+ * with temporal stores
+ */
+static __rte_always_inline void
+rte_copy32_ts(uint8_t *dst, uint8_t *src)
+{
+	__m256i ymm0;
+
+	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+}
+
+/**
+ * Copy 64 bytes from one location to another,
+ * with temporal stores
+ */
+static __rte_always_inline void
+rte_copy64_ts(uint8_t *dst, uint8_t *src)
+{
+	rte_copy32_ts(dst + 0 * 32, src + 0 * 32);
+	rte_copy32_ts(dst + 1 * 32, src + 1 * 32);
+}
+
+/**
+ * Copy 128 bytes from one location to another,
+ * with temporal stores
+ */
+static __rte_always_inline void
+rte_copy128_ts(uint8_t *dst, uint8_t *src)
+{
+	rte_copy32_ts(dst + 0 * 32, src + 0 * 32);
+	rte_copy32_ts(dst + 1 * 32, src + 1 * 32);
+	rte_copy32_ts(dst + 2 * 32, src + 2 * 32);
+	rte_copy32_ts(dst + 3 * 32, src + 3 * 32);
+}
+
+/**
+ * Copy len bytes from one location to another,
+ * with temporal stores 16B aligned
+ */
+static __rte_always_inline void *
+rte_memcpy_aligned_tstore16_generic(void *dst, void *src, int len)
+{
+	void *dest = dst;
+
+	while (len >= 128) {
+		rte_copy128_ts((uint8_t *)dst, (uint8_t *)src);
+		dst = (uint8_t *)dst + 128;
+		src = (uint8_t *)src + 128;
+		len -= 128;
+	}
+	while (len >= 64) {
+		rte_copy64_ts((uint8_t *)dst, (uint8_t *)src);
+		dst = (uint8_t *)dst + 64;
+		src = (uint8_t *)src + 64;
+		len -= 64;
+	}
+	while (len >= 32) {
+		rte_copy32_ts((uint8_t *)dst, (uint8_t *)src);
+		dst = (uint8_t *)dst + 32;
+		src = (uint8_t *)src + 32;
+		len -= 32;
+	}
+	if (len >= 16) {
+		rte_copy16_ts((uint8_t *)dst, (uint8_t *)src);
+		dst = (uint8_t *)dst + 16;
+		src = (uint8_t *)src + 16;
+		len -= 16;
+	}
+	if (len >= 8) {
+		*(uint64_t *)dst = *(const uint64_t *)src;
+		dst = (uint8_t *)dst + 8;
+		src = (uint8_t *)src + 8;
+		len -= 8;
+	}
+	if (len >= 4) {
+		*(uint32_t *)dst = *(const uint32_t *)src;
+		dst = (uint8_t *)dst + 4;
+		src = (uint8_t *)src + 4;
+		len -= 4;
+	}
+	if (len != 0) {
+		dst = (uint8_t *)dst - (4 - len);
+		src = (uint8_t *)src - (4 - len);
+		*(uint32_t *)dst = *(const uint32_t *)src;
+	}
+
+	return dest;
+}
+
+static __rte_always_inline void *
+rte_memcpy_aligned_ntload_tstore16_amdepyc2(void *dst,
+					    const void *src,
+					    size_t size)
+{
+	asm volatile goto("movq %0, %%rsi\n\t"
+	"movq %1, %%rdi\n\t"
+	"movq %2, %%rdx\n\t"
+	"cmpq   $(128), %%rdx\n\t"
+	"jb     202f\n\t"
+	"201:\n\t"
+	"vmovntdqa (%%rsi), %%ymm0\n\t"
+	"vmovntdqa 32(%%rsi), %%ymm1\n\t"
+	"vmovntdqa 64(%%rsi), %%ymm2\n\t"
+	"vmovntdqa 96(%%rsi), %%ymm3\n\t"
+	"vmovdqu  %%ymm0, (%%rdi)\n\t"
+	"vmovdqu  %%ymm1, 32(%%rdi)\n\t"
+	"vmovdqu  %%ymm2, 64(%%rdi)\n\t"
+	"vmovdqu  %%ymm3, 96(%%rdi)\n\t"
+	"addq   $128, %%rsi\n\t"
+	"addq   $128, %%rdi\n\t"
+	"subq   $128, %%rdx\n\t"
+	"jz     %l[done]\n\t"
+	"cmpq   $128, %%rdx\n\t" /*Vector Size 32B.  */
+	"jae    201b\n\t"
+	"202:\n\t"
+	"cmpq   $64, %%rdx\n\t"
+	"jb     203f\n\t"
+	"vmovntdqa (%%rsi), %%ymm0\n\t"
+	"vmovntdqa 32(%%rsi), %%ymm1\n\t"
+	"vmovdqu  %%ymm0, (%%rdi)\n\t"
+	"vmovdqu  %%ymm1, 32(%%rdi)\n\t"
+	"addq   $64, %%rsi\n\t"
+	"addq   $64, %%rdi\n\t"
+	"subq   $64, %%rdx\n\t"
+	"jz     %l[done]\n\t"
+	"203:\n\t"
+	"cmpq   $32, %%rdx\n\t"
+	"jb     204f\n\t"
+	"vmovntdqa (%%rsi), %%ymm0\n\t"
+	"vmovdqu  %%ymm0, (%%rdi)\n\t"
+	"addq   $32, %%rsi\n\t"
+	"addq   $32, %%rdi\n\t"
+	"subq   $32, %%rdx\n\t"
+	"jz     %l[done]\n\t"
+	"204:\n\t"
+	"cmpb   $16, %%dl\n\t"
+	"jb     205f\n\t"
+	"vmovntdqa (%%rsi), %%xmm0\n\t"
+	"vmovdqu  %%xmm0, (%%rdi)\n\t"
+	"addq   $16, %%rsi\n\t"
+	"addq   $16, %%rdi\n\t"
+	"subq   $16, %%rdx\n\t"
+	"jz     %l[done]\n\t"
+	"205:\n\t"
+	"cmpb   $2, %%dl\n\t"
+	"jb     208f\n\t"
+	"cmpb   $4, %%dl\n\t"
+	"jbe    207f\n\t"
+	"cmpb   $8, %%dl\n\t"
+	"jbe    206f\n\t"
+	"movq   -8(%%rsi,%%rdx), %%rcx\n\t"
+	"movq   (%%rsi), %%rsi\n\t"
+	"movq   %%rcx, -8(%%rdi,%%rdx)\n\t"
+	"movq   %%rsi, (%%rdi)\n\t"
+	"jmp    %l[done]\n\t"
+	"206:\n\t"
+	"movl   -4(%%rsi,%%rdx), %%ecx\n\t"
+	"movl   (%%rsi), %%esi\n\t"
+	"movl   %%ecx, -4(%%rdi,%%rdx)\n\t"
+	"movl   %%esi, (%%rdi)\n\t"
+	"jmp    %l[done]\n\t"
+	"207:\n\t"
+	"movzwl -2(%%rsi,%%rdx), %%ecx\n\t"
+	"movzwl (%%rsi), %%esi\n\t"
+	"movw   %%cx, -2(%%rdi,%%rdx)\n\t"
+	"movw   %%si, (%%rdi)\n\t"
+	"jmp    %l[done]\n\t"
+	"208:\n\t"
+	"movzbl (%%rsi), %%ecx\n\t"
+	"movb   %%cl, (%%rdi)"
+	:
+	: "r"(src), "r"(dst), "r"(size)
+	: "rcx", "rdx", "rsi", "rdi", "ymm0", "ymm1", "ymm2", "ymm3", "memory"
+	: done
+	);
+done:
+	return dst;
+}
+
+static __rte_always_inline void *
+rte_memcpy_generic(void *dst, const void *src, size_t len)
+{
+	asm goto("movq	%0, %%rsi\n\t"
+	"movq	%1, %%rdi\n\t"
+	"movq	%2, %%rdx\n\t"
+	"movq    %%rdi, %%rax\n\t"
+	"cmp     $32, %%rdx\n\t"
+	"jb      101f\n\t"
+	"cmp     $(32 * 2), %%rdx\n\t"
+	"ja      108f\n\t"
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   -32(%%rsi,%%rdx), %%ymm1\n\t"
+	"vmovdqu   %%ymm0, (%%rdi)\n\t"
+	"vmovdqu   %%ymm1, -32(%%rdi,%%rdx)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	"101:\n\t"
+	/* Less than 1 VEC.  */
+	"cmpb    $32, %%dl\n\t"
+	"jae     103f\n\t"
+	"cmpb    $16, %%dl\n\t"
+	"jae     104f\n\t"
+	"cmpb    $8, %%dl\n\t"
+	"jae     105f\n\t"
+	"cmpb    $4, %%dl\n\t"
+	"jae     106f\n\t"
+	"cmpb    $1, %%dl\n\t"
+	"ja      107f\n\t"
+	"jb      102f\n\t"
+	"movzbl  (%%rsi), %%ecx\n\t"
+	"movb    %%cl, (%%rdi)\n\t"
+	"102:\n\t"
+	"jmp %l[done]\n\t"
+	"103:\n\t"
+	/* From 32 to 63.  No branch when size == 32.  */
+	"vmovdqu (%%rsi), %%ymm0\n\t"
+	"vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
+	"vmovdqu %%ymm0, (%%rdi)\n\t"
+	"vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	/* From 16 to 31.  No branch when size == 16.  */
+	"104:\n\t"
+	"vmovdqu (%%rsi), %%xmm0\n\t"
+	"vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
+	"vmovdqu %%xmm0, (%%rdi)\n\t"
+	"vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
+	"jmp %l[done]\n\t"
+	"105:\n\t"
+	/* From 8 to 15.  No branch when size == 8.  */
+	"movq    -8(%%rsi,%%rdx), %%rcx\n\t"
+	"movq    (%%rsi), %%rsi\n\t"
+	"movq    %%rcx, -8(%%rdi,%%rdx)\n\t"
+	"movq    %%rsi, (%%rdi)\n\t"
+	"jmp %l[done]\n\t"
+	"106:\n\t"
+	/* From 4 to 7.  No branch when size == 4.  */
+	"movl    -4(%%rsi,%%rdx), %%ecx\n\t"
+	"movl    (%%rsi), %%esi\n\t"
+	"movl    %%ecx, -4(%%rdi,%%rdx)\n\t"
+	"movl    %%esi, (%%rdi)\n\t"
+	"jmp %l[done]\n\t"
+	"107:\n\t"
+	/* From 2 to 3.  No branch when size == 2.  */
+	"movzwl  -2(%%rsi,%%rdx), %%ecx\n\t"
+	"movzwl  (%%rsi), %%esi\n\t"
+	"movw    %%cx, -2(%%rdi,%%rdx)\n\t"
+	"movw    %%si, (%%rdi)\n\t"
+	"jmp %l[done]\n\t"
+	"108:\n\t"
+	/* More than 2 * VEC and there may be overlap between destination */
+	/* and source.  */
+	"cmpq    $(32 * 8), %%rdx\n\t"
+	"ja      111f\n\t"
+	"cmpq    $(32 * 4), %%rdx\n\t"
+	"jb      109f\n\t"
+	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   32(%%rsi), %%ymm1\n\t"
+	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
+	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
+	"vmovdqu   -32(%%rsi,%%rdx), %%ymm4\n\t"
+	"vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
+	"vmovdqu   -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
+	"vmovdqu   -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
+	"vmovdqu   %%ymm0, (%%rdi)\n\t"
+	"vmovdqu   %%ymm1, 32(%%rdi)\n\t"
+	"vmovdqu   %%ymm2, (32 * 2)(%%rdi)\n\t"
+	"vmovdqu   %%ymm3, (32 * 3)(%%rdi)\n\t"
+	"vmovdqu   %%ymm4, -32(%%rdi,%%rdx)\n\t"
+	"vmovdqu   %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
+	"vmovdqu   %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
+	"vmovdqu   %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	"109:\n\t"
+	/* Copy from 2 * VEC to 4 * VEC. */
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   32(%%rsi), %%ymm1\n\t"
+	"vmovdqu   -32(%%rsi,%%rdx), %%ymm2\n\t"
+	"vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
+	"vmovdqu   %%ymm0, (%%rdi)\n\t"
+	"vmovdqu   %%ymm1, 32(%%rdi)\n\t"
+	"vmovdqu   %%ymm2, -32(%%rdi,%%rdx)\n\t"
+	"vmovdqu   %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
+	"vzeroupper\n\t"
+	"110:\n\t"
+	"jmp %l[done]\n\t"
+	"111:\n\t"
+	"cmpq    %%rsi, %%rdi\n\t"
+	"ja      113f\n\t"
+	/* Source == destination is less common.  */
+	"je      110b\n\t"
+	/* Load the first VEC and last 4 * VEC to
+	 * support overlapping addresses.
+	 */
+	"vmovdqu   (%%rsi), %%ymm4\n\t"
+	"vmovdqu   -32(%%rsi, %%rdx), %%ymm5\n\t"
+	"vmovdqu   -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
+	"vmovdqu   -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
+	"vmovdqu   -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
+	/* Save start and stop of the destination buffer.  */
+	"movq    %%rdi, %%r11\n\t"
+	"leaq    -32(%%rdi, %%rdx), %%rcx\n\t"
+	/* Align destination for aligned stores in the loop.  Compute */
+	/* how much destination is misaligned.  */
+	"movq    %%rdi, %%r8\n\t"
+	"andq    $(32 - 1), %%r8\n\t"
+	/* Get the negative of offset for alignment.  */
+	"subq    $32, %%r8\n\t"
+	/* Adjust source.  */
+	"subq    %%r8, %%rsi\n\t"
+	/* Adjust destination which should be aligned now.  */
+	"subq    %%r8, %%rdi\n\t"
+	/* Adjust length.  */
+	"addq    %%r8, %%rdx\n\t"
+	/* Check non-temporal store threshold.  */
+	"cmpq	 $(1024*1024), %%rdx\n\t"
+	"ja      115f\n\t"
+	"112:\n\t"
+	/* Copy 4 * VEC a time forward.  */
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   32(%%rsi), %%ymm1\n\t"
+	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
+	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
+	"addq    $(32 * 4), %%rsi\n\t"
+	"subq    $(32 * 4), %%rdx\n\t"
+	"vmovdqa   %%ymm0, (%%rdi)\n\t"
+	"vmovdqa   %%ymm1, 32(%%rdi)\n\t"
+	"vmovdqa   %%ymm2, (32 * 2)(%%rdi)\n\t"
+	"vmovdqa   %%ymm3, (32 * 3)(%%rdi)\n\t"
+	"addq    $(32 * 4), %%rdi\n\t"
+	"cmpq    $(32 * 4), %%rdx\n\t"
+	"ja      112b\n\t"
+	/* Store the last 4 * VEC.  */
+	"vmovdqu   %%ymm5, (%%rcx)\n\t"
+	"vmovdqu   %%ymm6, -32(%%rcx)\n\t"
+	"vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
+	"vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
+	/* Store the first VEC.  */
+	"vmovdqu   %%ymm4, (%%r11)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	"113:\n\t"
+	/* Load the first 4*VEC and last VEC to support overlapping addresses.*/
+	"vmovdqu   (%%rsi), %%ymm4\n\t"
+	"vmovdqu   32(%%rsi), %%ymm5\n\t"
+	"vmovdqu   (32 * 2)(%%rsi), %%ymm6\n\t"
+	"vmovdqu   (32 * 3)(%%rsi), %%ymm7\n\t"
+	"vmovdqu   -32(%%rsi,%%rdx), %%ymm8\n\t"
+	/* Save stop of the destination buffer.  */
+	"leaq    -32(%%rdi, %%rdx), %%r11\n\t"
+	/* Align destination end for aligned stores in the loop.  Compute */
+	/* how much destination end is misaligned.  */
+	"leaq    -32(%%rsi, %%rdx), %%rcx\n\t"
+	"movq    %%r11, %%r9\n\t"
+	"movq    %%r11, %%r8\n\t"
+	"andq    $(32 - 1), %%r8\n\t"
+	/* Adjust source.  */
+	"subq    %%r8, %%rcx\n\t"
+	/* Adjust the end of destination which should be aligned now.  */
+	"subq    %%r8, %%r9\n\t"
+	/* Adjust length.  */
+	"subq    %%r8, %%rdx\n\t"
+	 /* Check non-temporal store threshold.  */
+	"cmpq	 $(1024*1024), %%rdx\n\t"
+	"ja      117f\n\t"
+	"114:\n\t"
+	/* Copy 4 * VEC a time backward.  */
+	"vmovdqu   (%%rcx), %%ymm0\n\t"
+	"vmovdqu   -32(%%rcx), %%ymm1\n\t"
+	"vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
+	"vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
+	"subq    $(32 * 4), %%rcx\n\t"
+	"subq    $(32 * 4), %%rdx\n\t"
+	"vmovdqa   %%ymm0, (%%r9)\n\t"
+	"vmovdqa   %%ymm1, -32(%%r9)\n\t"
+	"vmovdqa   %%ymm2, -(32 * 2)(%%r9)\n\t"
+	"vmovdqa   %%ymm3, -(32 * 3)(%%r9)\n\t"
+	"subq    $(32 * 4), %%r9\n\t"
+	"cmpq    $(32 * 4), %%rdx\n\t"
+	"ja      114b\n\t"
+	/* Store the first 4 * VEC. */
+	"vmovdqu   %%ymm4, (%%rdi)\n\t"
+	"vmovdqu   %%ymm5, 32(%%rdi)\n\t"
+	"vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
+	"vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
+	/* Store the last VEC. */
+	"vmovdqu   %%ymm8, (%%r11)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+
+	"115:\n\t"
+	/* Don't use non-temporal store if there is overlap between */
+	/* destination and source since destination may be in cache */
+	/* when source is loaded. */
+	"leaq    (%%rdi, %%rdx), %%r10\n\t"
+	"cmpq    %%r10, %%rsi\n\t"
+	"jb      112b\n\t"
+	"116:\n\t"
+	/* Copy 4 * VEC a time forward with non-temporal stores.  */
+	"prefetcht0 (32*4*2)(%%rsi)\n\t"
+	"prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
+	"prefetcht0 (32*4*3)(%%rsi)\n\t"
+	"prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   32(%%rsi), %%ymm1\n\t"
+	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
+	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
+	"addq    $(32*4), %%rsi\n\t"
+	"subq    $(32*4), %%rdx\n\t"
+	"vmovntdq  %%ymm0, (%%rdi)\n\t"
+	"vmovntdq  %%ymm1, 32(%%rdi)\n\t"
+	"vmovntdq  %%ymm2, (32 * 2)(%%rdi)\n\t"
+	"vmovntdq  %%ymm3, (32 * 3)(%%rdi)\n\t"
+	"addq    $(32*4), %%rdi\n\t"
+	"cmpq    $(32*4), %%rdx\n\t"
+	"ja      116b\n\t"
+	"sfence\n\t"
+	/* Store the last 4 * VEC.  */
+	"vmovdqu   %%ymm5, (%%rcx)\n\t"
+	"vmovdqu   %%ymm6, -32(%%rcx)\n\t"
+	"vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
+	"vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
+	/* Store the first VEC.  */
+	"vmovdqu   %%ymm4, (%%r11)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	"117:\n\t"
+	/* Don't use non-temporal store if there is overlap between */
+	/* destination and source since destination may be in cache */
+	/* when source is loaded.  */
+	"leaq    (%%rcx, %%rdx), %%r10\n\t"
+	"cmpq    %%r10, %%r9\n\t"
+	"jb      114b\n\t"
+	"118:\n\t"
+	/* Copy 4 * VEC a time backward with non-temporal stores. */
+	"prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
+	"prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
+	"prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
+	"prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
+	"vmovdqu   (%%rcx), %%ymm0\n\t"
+	"vmovdqu   -32(%%rcx), %%ymm1\n\t"
+	"vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
+	"vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
+	"subq    $(32*4), %%rcx\n\t"
+	"subq    $(32*4), %%rdx\n\t"
+	"vmovntdq  %%ymm0, (%%r9)\n\t"
+	"vmovntdq  %%ymm1, -32(%%r9)\n\t"
+	"vmovntdq  %%ymm2, -(32 * 2)(%%r9)\n\t"
+	"vmovntdq  %%ymm3, -(32 * 3)(%%r9)\n\t"
+	"subq    $(32 * 4), %%r9\n\t"
+	"cmpq    $(32 * 4), %%rdx\n\t"
+	"ja      118b\n\t"
+	"sfence\n\t"
+	/* Store the first 4 * VEC.  */
+	"vmovdqu   %%ymm4, (%%rdi)\n\t"
+	"vmovdqu   %%ymm5, 32(%%rdi)\n\t"
+	"vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
+	"vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
+	/* Store the last VEC.  */
+	"vmovdqu   %%ymm8, (%%r11)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]"
+	:
+	: "r"(src), "r"(dst), "r"(len)
+	: "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "ymm0",
+	"ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
+	: done
+	);
+done:
+	return dst;
+}
+
+#else
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
@@ -479,6 +971,8 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	goto COPY_BLOCK_128_BACK31;
 }
 
+#endif /* RTE_MEMCPY_AMDEPYC */
+
 #else /* __AVX512F__ */
 
 #define ALIGNMENT_MASK 0x0F
@@ -874,6 +1368,14 @@ rte_memcpy(void *dst, const void *src, size_t n)
 		return rte_memcpy_generic(dst, src, n);
 }
 
+#if defined __AVX2__ && defined(RTE_MEMCPY_AMDEPYC)
+static __rte_always_inline void *
+rte_memcpy_aligned_tstore16(void *dst, void *src, int len)
+{
+	return rte_memcpy_aligned_ntload_tstore16_amdepyc2(dst, src, len);
+}
+#endif
+
 #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
 #pragma GCC diagnostic pop
 #endif
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [dpdk-dev] [PATCH v2 2/2] net/mlx5: optimize mprq memcpy for AMD EPYC2 platform
  2021-10-19 10:47 ` [dpdk-dev] [PATCH v2 " Aman Kumar
@ 2021-10-19 10:47   ` Aman Kumar
  2021-10-19 12:31   ` [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal Thomas Monjalon
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 43+ messages in thread
From: Aman Kumar @ 2021-10-19 10:47 UTC (permalink / raw)
  To: dev
  Cc: rasland, asafp, shys, viacheslavo, akozyrev, matan,
	anatoly.burakov, keesang.song, aman.kumar, jerinjacobk

add non-temporal load and temporal store for MPRQ memcpy.
This utilizes AMD EPYC2 optimized rte_memcpy* routines and
is only enabled when DPDK is built with the
config/x86/x86_amd_epyc_linux_gcc cross-file.

Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
---
 drivers/net/mlx5/mlx5_rx.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/drivers/net/mlx5/mlx5_rx.h b/drivers/net/mlx5/mlx5_rx.h
index 2b7ad3e48b..cda6aa02f2 100644
--- a/drivers/net/mlx5/mlx5_rx.h
+++ b/drivers/net/mlx5/mlx5_rx.h
@@ -422,6 +422,14 @@ mprq_buf_to_pkt(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, uint32_t len,
 	const uint32_t offset = strd_idx * strd_sz + strd_shift;
 	void *addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
 
+#ifdef RTE_MEMCPY_AMDEPYC
+	if (len <= rxq->mprq_max_memcpy_len) {
+		rte_prefetch1(addr);
+		if (len > RTE_CACHE_LINE_SIZE)
+			rte_prefetch2((void *)((uintptr_t)addr +
+					       RTE_CACHE_LINE_SIZE));
+	}
+#endif
 	/*
 	 * Memcpy packets to the target mbuf if:
 	 * - The size of packet is smaller than mprq_max_memcpy_len.
@@ -433,8 +441,19 @@ mprq_buf_to_pkt(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, uint32_t len,
 	    (hdrm_overlap > 0 && !rxq->strd_scatter_en)) {
 		if (likely(len <=
 			   (uint32_t)(pkt->buf_len - RTE_PKTMBUF_HEADROOM))) {
+#ifdef RTE_MEMCPY_AMDEPYC
+			uintptr_t data_addr;
+
+			data_addr = (uintptr_t)rte_pktmbuf_mtod(pkt, void *);
+			if (!((data_addr | (uintptr_t)addr) & ALIGNMENT_MASK))
+				rte_memcpy_aligned_tstore16((void *)data_addr,
+					   addr, len);
+			else
+				rte_memcpy((void *)data_addr, addr, len);
+#else
 			rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
 				   addr, len);
+#endif
 			DATA_LEN(pkt) = len;
 		} else if (rxq->strd_scatter_en) {
 			struct rte_mbuf *prev = pkt;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH 2/2] net/mlx5: optimize mprq memcpy for AMD EPYC2 platforms
  2021-10-13 16:53   ` Thomas Monjalon
@ 2021-10-19 10:52     ` Aman Kumar
  0 siblings, 0 replies; 43+ messages in thread
From: Aman Kumar @ 2021-10-19 10:52 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: dpdk-dev, Raslan Darawsheh, Asaf Penso, Shy Shyman,
	Slava Ovsiienko, Alexander Kozyrev, Matan Azrad, Anatoly Burakov,
	Song, Keesang

Dear Thomas,

Please check V2 patchset
<https://patchwork.dpdk.org/project/dpdk/patch/20211019104724.19416-1-aman.kumar@vvdntech.in/>.
Following Jerin's suggestion, we've moved this build option into the
config/x86/x86_amd_epyc_linux_gcc cross-file.
It is no longer a separate compilation option and is enabled by default
in v2 once DPDK is cross-built with that file. Please let us know your
comments.


*With Best Regards*
Aman Kumar
VVDN Technologies Pvt. Ltd.
*web:* www.vvdntech.com




On Wed, Oct 13, 2021 at 10:23 PM Thomas Monjalon <thomas@monjalon.net>
wrote:

> 23/08/2021 10:44, Aman Kumar:
> > add non temporal load and temporal store for mprq memcpy.
> > define mlx5_ntload_tstore in meson build configuration to
> > enable this optimization. This utilizes AMD EPYC2 optimized
> > rte_memcpy* routines.
> [...]
> > +option('mlx5_ntload_tstore', type: 'boolean', value: false, description:
> > +       'to enable optimized MPRQ in RX datapath')
>
> Please don't make it a compilation option.
> Why isn't it always enabled?
>
> There was a comment on the first patch.
> Do you plan to make a new version?
>
>
>

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal
  2021-10-19 10:47 ` [dpdk-dev] [PATCH v2 " Aman Kumar
  2021-10-19 10:47   ` [dpdk-dev] [PATCH v2 2/2] net/mlx5: optimize mprq memcpy for AMD EPYC2 platform Aman Kumar
@ 2021-10-19 12:31   ` Thomas Monjalon
  2021-10-19 15:35     ` Stephen Hemminger
  2021-10-21 17:10     ` Song, Keesang
  2021-10-21 20:14   ` Thomas Monjalon
  2021-10-26 15:56   ` [dpdk-dev] [PATCH v3 1/3] config/x86: add support for AMD platform Aman Kumar
  3 siblings, 2 replies; 43+ messages in thread
From: Thomas Monjalon @ 2021-10-19 12:31 UTC (permalink / raw)
  To: Aman Kumar
  Cc: dev, rasland, asafp, shys, viacheslavo, akozyrev, matan,
	anatoly.burakov, keesang.song, aman.kumar, jerinjacobk,
	bruce.richardson, konstantin.ananyev, david.marchand

19/10/2021 12:47, Aman Kumar:
> This patch provides rte_memcpy* calls optimized for
> AMD EPYC platforms. Use config/x86/x86_amd_epyc_linux_gcc
> as cross-file with meson to build dpdk for AMD EPYC platforms.

Please split in 2 patches: platform & memcpy.

What optimization is specific to EPYC?

I dislike the asm code below.
What is AMD specific inside?
Can it use compiler intrinsics as it is done elsewhere?

> +static __rte_always_inline void *
> +rte_memcpy_aligned_ntload_tstore16_amdepyc2(void *dst,
> +					    const void *src,
> +					    size_t size)
> +{
> +	asm volatile goto("movq %0, %%rsi\n\t"
> +	"movq %1, %%rdi\n\t"
> +	"movq %2, %%rdx\n\t"
> +	"cmpq   $(128), %%rdx\n\t"
> +	"jb     202f\n\t"
> +	"201:\n\t"
> +	"vmovntdqa (%%rsi), %%ymm0\n\t"
> +	"vmovntdqa 32(%%rsi), %%ymm1\n\t"
> +	"vmovntdqa 64(%%rsi), %%ymm2\n\t"
> +	"vmovntdqa 96(%%rsi), %%ymm3\n\t"
> +	"vmovdqu  %%ymm0, (%%rdi)\n\t"
> +	"vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> +	"vmovdqu  %%ymm2, 64(%%rdi)\n\t"
> +	"vmovdqu  %%ymm3, 96(%%rdi)\n\t"
> +	"addq   $128, %%rsi\n\t"
> +	"addq   $128, %%rdi\n\t"
> +	"subq   $128, %%rdx\n\t"
> +	"jz     %l[done]\n\t"
> +	"cmpq   $128, %%rdx\n\t" /*Vector Size 32B.  */
> +	"jae    201b\n\t"
> +	"202:\n\t"
> +	"cmpq   $64, %%rdx\n\t"
> +	"jb     203f\n\t"
> +	"vmovntdqa (%%rsi), %%ymm0\n\t"
> +	"vmovntdqa 32(%%rsi), %%ymm1\n\t"
> +	"vmovdqu  %%ymm0, (%%rdi)\n\t"
> +	"vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> +	"addq   $64, %%rsi\n\t"
> +	"addq   $64, %%rdi\n\t"
> +	"subq   $64, %%rdx\n\t"
> +	"jz     %l[done]\n\t"
> +	"203:\n\t"
> +	"cmpq   $32, %%rdx\n\t"
> +	"jb     204f\n\t"
> +	"vmovntdqa (%%rsi), %%ymm0\n\t"
> +	"vmovdqu  %%ymm0, (%%rdi)\n\t"
> +	"addq   $32, %%rsi\n\t"
> +	"addq   $32, %%rdi\n\t"
> +	"subq   $32, %%rdx\n\t"
> +	"jz     %l[done]\n\t"
> +	"204:\n\t"
> +	"cmpb   $16, %%dl\n\t"
> +	"jb     205f\n\t"
> +	"vmovntdqa (%%rsi), %%xmm0\n\t"
> +	"vmovdqu  %%xmm0, (%%rdi)\n\t"
> +	"addq   $16, %%rsi\n\t"
> +	"addq   $16, %%rdi\n\t"
> +	"subq   $16, %%rdx\n\t"
> +	"jz     %l[done]\n\t"
> +	"205:\n\t"
> +	"cmpb   $2, %%dl\n\t"
> +	"jb     208f\n\t"
> +	"cmpb   $4, %%dl\n\t"
> +	"jbe    207f\n\t"
> +	"cmpb   $8, %%dl\n\t"
> +	"jbe    206f\n\t"
> +	"movq   -8(%%rsi,%%rdx), %%rcx\n\t"
> +	"movq   (%%rsi), %%rsi\n\t"
> +	"movq   %%rcx, -8(%%rdi,%%rdx)\n\t"
> +	"movq   %%rsi, (%%rdi)\n\t"
> +	"jmp    %l[done]\n\t"
> +	"206:\n\t"
> +	"movl   -4(%%rsi,%%rdx), %%ecx\n\t"
> +	"movl   (%%rsi), %%esi\n\t"
> +	"movl   %%ecx, -4(%%rdi,%%rdx)\n\t"
> +	"movl   %%esi, (%%rdi)\n\t"
> +	"jmp    %l[done]\n\t"
> +	"207:\n\t"
> +	"movzwl -2(%%rsi,%%rdx), %%ecx\n\t"
> +	"movzwl (%%rsi), %%esi\n\t"
> +	"movw   %%cx, -2(%%rdi,%%rdx)\n\t"
> +	"movw   %%si, (%%rdi)\n\t"
> +	"jmp    %l[done]\n\t"
> +	"208:\n\t"
> +	"movzbl (%%rsi), %%ecx\n\t"
> +	"movb   %%cl, (%%rdi)"
> +	:
> +	: "r"(src), "r"(dst), "r"(size)
> +	: "rcx", "rdx", "rsi", "rdi", "ymm0", "ymm1", "ymm2", "ymm3", "memory"
> +	: done
> +	);
> +done:
> +	return dst;
> +}
> +
> +static __rte_always_inline void *
> +rte_memcpy_generic(void *dst, const void *src, size_t len)
> +{
> +	asm goto("movq	%0, %%rsi\n\t"
> +	"movq	%1, %%rdi\n\t"
> +	"movq	%2, %%rdx\n\t"
> +	"movq    %%rdi, %%rax\n\t"
> +	"cmp     $32, %%rdx\n\t"
> +	"jb      101f\n\t"
> +	"cmp     $(32 * 2), %%rdx\n\t"
> +	"ja      108f\n\t"
> +	"vmovdqu   (%%rsi), %%ymm0\n\t"
> +	"vmovdqu   -32(%%rsi,%%rdx), %%ymm1\n\t"
> +	"vmovdqu   %%ymm0, (%%rdi)\n\t"
> +	"vmovdqu   %%ymm1, -32(%%rdi,%%rdx)\n\t"
> +	"vzeroupper\n\t"
> +	"jmp %l[done]\n\t"
> +	"101:\n\t"
> +	/* Less than 1 VEC.  */
> +	"cmpb    $32, %%dl\n\t"
> +	"jae     103f\n\t"
> +	"cmpb    $16, %%dl\n\t"
> +	"jae     104f\n\t"
> +	"cmpb    $8, %%dl\n\t"
> +	"jae     105f\n\t"
> +	"cmpb    $4, %%dl\n\t"
> +	"jae     106f\n\t"
> +	"cmpb    $1, %%dl\n\t"
> +	"ja      107f\n\t"
> +	"jb      102f\n\t"
> +	"movzbl  (%%rsi), %%ecx\n\t"
> +	"movb    %%cl, (%%rdi)\n\t"
> +	"102:\n\t"
> +	"jmp %l[done]\n\t"
> +	"103:\n\t"
> +	/* From 32 to 63.  No branch when size == 32.  */
> +	"vmovdqu (%%rsi), %%ymm0\n\t"
> +	"vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
> +	"vmovdqu %%ymm0, (%%rdi)\n\t"
> +	"vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
> +	"vzeroupper\n\t"
> +	"jmp %l[done]\n\t"
> +	/* From 16 to 31.  No branch when size == 16.  */
> +	"104:\n\t"
> +	"vmovdqu (%%rsi), %%xmm0\n\t"
> +	"vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
> +	"vmovdqu %%xmm0, (%%rdi)\n\t"
> +	"vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
> +	"jmp %l[done]\n\t"
> +	"105:\n\t"
> +	/* From 8 to 15.  No branch when size == 8.  */
> +	"movq    -8(%%rsi,%%rdx), %%rcx\n\t"
> +	"movq    (%%rsi), %%rsi\n\t"
> +	"movq    %%rcx, -8(%%rdi,%%rdx)\n\t"
> +	"movq    %%rsi, (%%rdi)\n\t"
> +	"jmp %l[done]\n\t"
> +	"106:\n\t"
> +	/* From 4 to 7.  No branch when size == 4.  */
> +	"movl    -4(%%rsi,%%rdx), %%ecx\n\t"
> +	"movl    (%%rsi), %%esi\n\t"
> +	"movl    %%ecx, -4(%%rdi,%%rdx)\n\t"
> +	"movl    %%esi, (%%rdi)\n\t"
> +	"jmp %l[done]\n\t"
> +	"107:\n\t"
> +	/* From 2 to 3.  No branch when size == 2.  */
> +	"movzwl  -2(%%rsi,%%rdx), %%ecx\n\t"
> +	"movzwl  (%%rsi), %%esi\n\t"
> +	"movw    %%cx, -2(%%rdi,%%rdx)\n\t"
> +	"movw    %%si, (%%rdi)\n\t"
> +	"jmp %l[done]\n\t"
> +	"108:\n\t"
> +	/* More than 2 * VEC and there may be overlap between destination */
> +	/* and source.  */
> +	"cmpq    $(32 * 8), %%rdx\n\t"
> +	"ja      111f\n\t"
> +	"cmpq    $(32 * 4), %%rdx\n\t"
> +	"jb      109f\n\t"
> +	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
> +	"vmovdqu   (%%rsi), %%ymm0\n\t"
> +	"vmovdqu   32(%%rsi), %%ymm1\n\t"
> +	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> +	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> +	"vmovdqu   -32(%%rsi,%%rdx), %%ymm4\n\t"
> +	"vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
> +	"vmovdqu   -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
> +	"vmovdqu   -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
> +	"vmovdqu   %%ymm0, (%%rdi)\n\t"
> +	"vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> +	"vmovdqu   %%ymm2, (32 * 2)(%%rdi)\n\t"
> +	"vmovdqu   %%ymm3, (32 * 3)(%%rdi)\n\t"
> +	"vmovdqu   %%ymm4, -32(%%rdi,%%rdx)\n\t"
> +	"vmovdqu   %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
> +	"vmovdqu   %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
> +	"vmovdqu   %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
> +	"vzeroupper\n\t"
> +	"jmp %l[done]\n\t"
> +	"109:\n\t"
> +	/* Copy from 2 * VEC to 4 * VEC. */
> +	"vmovdqu   (%%rsi), %%ymm0\n\t"
> +	"vmovdqu   32(%%rsi), %%ymm1\n\t"
> +	"vmovdqu   -32(%%rsi,%%rdx), %%ymm2\n\t"
> +	"vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
> +	"vmovdqu   %%ymm0, (%%rdi)\n\t"
> +	"vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> +	"vmovdqu   %%ymm2, -32(%%rdi,%%rdx)\n\t"
> +	"vmovdqu   %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
> +	"vzeroupper\n\t"
> +	"110:\n\t"
> +	"jmp %l[done]\n\t"
> +	"111:\n\t"
> +	"cmpq    %%rsi, %%rdi\n\t"
> +	"ja      113f\n\t"
> +	/* Source == destination is less common.  */
> +	"je      110b\n\t"
> +	/* Load the first VEC and last 4 * VEC to
> +	 * support overlapping addresses.
> +	 */
> +	"vmovdqu   (%%rsi), %%ymm4\n\t"
> +	"vmovdqu   -32(%%rsi, %%rdx), %%ymm5\n\t"
> +	"vmovdqu   -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
> +	"vmovdqu   -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
> +	"vmovdqu   -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
> +	/* Save start and stop of the destination buffer.  */
> +	"movq    %%rdi, %%r11\n\t"
> +	"leaq    -32(%%rdi, %%rdx), %%rcx\n\t"
> +	/* Align destination for aligned stores in the loop.  Compute */
> +	/* how much destination is misaligned.  */
> +	"movq    %%rdi, %%r8\n\t"
> +	"andq    $(32 - 1), %%r8\n\t"
> +	/* Get the negative of offset for alignment.  */
> +	"subq    $32, %%r8\n\t"
> +	/* Adjust source.  */
> +	"subq    %%r8, %%rsi\n\t"
> +	/* Adjust destination which should be aligned now.  */
> +	"subq    %%r8, %%rdi\n\t"
> +	/* Adjust length.  */
> +	"addq    %%r8, %%rdx\n\t"
> +	/* Check non-temporal store threshold.  */
> +	"cmpq	 $(1024*1024), %%rdx\n\t"
> +	"ja      115f\n\t"
> +	"112:\n\t"
> +	/* Copy 4 * VEC a time forward.  */
> +	"vmovdqu   (%%rsi), %%ymm0\n\t"
> +	"vmovdqu   32(%%rsi), %%ymm1\n\t"
> +	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> +	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> +	"addq    $(32 * 4), %%rsi\n\t"
> +	"subq    $(32 * 4), %%rdx\n\t"
> +	"vmovdqa   %%ymm0, (%%rdi)\n\t"
> +	"vmovdqa   %%ymm1, 32(%%rdi)\n\t"
> +	"vmovdqa   %%ymm2, (32 * 2)(%%rdi)\n\t"
> +	"vmovdqa   %%ymm3, (32 * 3)(%%rdi)\n\t"
> +	"addq    $(32 * 4), %%rdi\n\t"
> +	"cmpq    $(32 * 4), %%rdx\n\t"
> +	"ja      112b\n\t"
> +	/* Store the last 4 * VEC.  */
> +	"vmovdqu   %%ymm5, (%%rcx)\n\t"
> +	"vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> +	"vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> +	"vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> +	/* Store the first VEC.  */
> +	"vmovdqu   %%ymm4, (%%r11)\n\t"
> +	"vzeroupper\n\t"
> +	"jmp %l[done]\n\t"
> +	"113:\n\t"
> +	/* Load the first 4*VEC and last VEC to support overlapping addresses.*/
> +	"vmovdqu   (%%rsi), %%ymm4\n\t"
> +	"vmovdqu   32(%%rsi), %%ymm5\n\t"
> +	"vmovdqu   (32 * 2)(%%rsi), %%ymm6\n\t"
> +	"vmovdqu   (32 * 3)(%%rsi), %%ymm7\n\t"
> +	"vmovdqu   -32(%%rsi,%%rdx), %%ymm8\n\t"
> +	/* Save stop of the destination buffer.  */
> +	"leaq    -32(%%rdi, %%rdx), %%r11\n\t"
> +	/* Align destination end for aligned stores in the loop.  Compute */
> +	/* how much destination end is misaligned.  */
> +	"leaq    -32(%%rsi, %%rdx), %%rcx\n\t"
> +	"movq    %%r11, %%r9\n\t"
> +	"movq    %%r11, %%r8\n\t"
> +	"andq    $(32 - 1), %%r8\n\t"
> +	/* Adjust source.  */
> +	"subq    %%r8, %%rcx\n\t"
> +	/* Adjust the end of destination which should be aligned now.  */
> +	"subq    %%r8, %%r9\n\t"
> +	/* Adjust length.  */
> +	"subq    %%r8, %%rdx\n\t"
> +	 /* Check non-temporal store threshold.  */
> +	"cmpq	 $(1024*1024), %%rdx\n\t"
> +	"ja      117f\n\t"
> +	"114:\n\t"
> +	/* Copy 4 * VEC a time backward.  */
> +	"vmovdqu   (%%rcx), %%ymm0\n\t"
> +	"vmovdqu   -32(%%rcx), %%ymm1\n\t"
> +	"vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> +	"vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> +	"subq    $(32 * 4), %%rcx\n\t"
> +	"subq    $(32 * 4), %%rdx\n\t"
> +	"vmovdqa   %%ymm0, (%%r9)\n\t"
> +	"vmovdqa   %%ymm1, -32(%%r9)\n\t"
> +	"vmovdqa   %%ymm2, -(32 * 2)(%%r9)\n\t"
> +	"vmovdqa   %%ymm3, -(32 * 3)(%%r9)\n\t"
> +	"subq    $(32 * 4), %%r9\n\t"
> +	"cmpq    $(32 * 4), %%rdx\n\t"
> +	"ja      114b\n\t"
> +	/* Store the first 4 * VEC. */
> +	"vmovdqu   %%ymm4, (%%rdi)\n\t"
> +	"vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> +	"vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> +	"vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> +	/* Store the last VEC. */
> +	"vmovdqu   %%ymm8, (%%r11)\n\t"
> +	"vzeroupper\n\t"
> +	"jmp %l[done]\n\t"
> +
> +	"115:\n\t"
> +	/* Don't use non-temporal store if there is overlap between */
> +	/* destination and source since destination may be in cache */
> +	/* when source is loaded. */
> +	"leaq    (%%rdi, %%rdx), %%r10\n\t"
> +	"cmpq    %%r10, %%rsi\n\t"
> +	"jb      112b\n\t"
> +	"116:\n\t"
> +	/* Copy 4 * VEC a time forward with non-temporal stores.  */
> +	"prefetcht0 (32*4*2)(%%rsi)\n\t"
> +	"prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
> +	"prefetcht0 (32*4*3)(%%rsi)\n\t"
> +	"prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
> +	"vmovdqu   (%%rsi), %%ymm0\n\t"
> +	"vmovdqu   32(%%rsi), %%ymm1\n\t"
> +	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> +	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> +	"addq    $(32*4), %%rsi\n\t"
> +	"subq    $(32*4), %%rdx\n\t"
> +	"vmovntdq  %%ymm0, (%%rdi)\n\t"
> +	"vmovntdq  %%ymm1, 32(%%rdi)\n\t"
> +	"vmovntdq  %%ymm2, (32 * 2)(%%rdi)\n\t"
> +	"vmovntdq  %%ymm3, (32 * 3)(%%rdi)\n\t"
> +	"addq    $(32*4), %%rdi\n\t"
> +	"cmpq    $(32*4), %%rdx\n\t"
> +	"ja      116b\n\t"
> +	"sfence\n\t"
> +	/* Store the last 4 * VEC.  */
> +	"vmovdqu   %%ymm5, (%%rcx)\n\t"
> +	"vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> +	"vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> +	"vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> +	/* Store the first VEC.  */
> +	"vmovdqu   %%ymm4, (%%r11)\n\t"
> +	"vzeroupper\n\t"
> +	"jmp %l[done]\n\t"
> +	"117:\n\t"
> +	/* Don't use non-temporal store if there is overlap between */
> +	/* destination and source since destination may be in cache */
> +	/* when source is loaded.  */
> +	"leaq    (%%rcx, %%rdx), %%r10\n\t"
> +	"cmpq    %%r10, %%r9\n\t"
> +	"jb      114b\n\t"
> +	"118:\n\t"
> +	/* Copy 4 * VEC a time backward with non-temporal stores. */
> +	"prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
> +	"prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
> +	"prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
> +	"prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
> +	"vmovdqu   (%%rcx), %%ymm0\n\t"
> +	"vmovdqu   -32(%%rcx), %%ymm1\n\t"
> +	"vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> +	"vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> +	"subq    $(32*4), %%rcx\n\t"
> +	"subq    $(32*4), %%rdx\n\t"
> +	"vmovntdq  %%ymm0, (%%r9)\n\t"
> +	"vmovntdq  %%ymm1, -32(%%r9)\n\t"
> +	"vmovntdq  %%ymm2, -(32 * 2)(%%r9)\n\t"
> +	"vmovntdq  %%ymm3, -(32 * 3)(%%r9)\n\t"
> +	"subq    $(32 * 4), %%r9\n\t"
> +	"cmpq    $(32 * 4), %%rdx\n\t"
> +	"ja      118b\n\t"
> +	"sfence\n\t"
> +	/* Store the first 4 * VEC.  */
> +	"vmovdqu   %%ymm4, (%%rdi)\n\t"
> +	"vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> +	"vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> +	"vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> +	/* Store the last VEC.  */
> +	"vmovdqu   %%ymm8, (%%r11)\n\t"
> +	"vzeroupper\n\t"
> +	"jmp %l[done]"
> +	:
> +	: "r"(src), "r"(dst), "r"(len)
> +	: "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "ymm0",
> +	"ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
> +	: done
> +	);
> +done:
> +	return dst;
> +}




^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal
  2021-10-19 12:31   ` [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal Thomas Monjalon
@ 2021-10-19 15:35     ` Stephen Hemminger
  2021-10-21 17:10     ` Song, Keesang
  1 sibling, 0 replies; 43+ messages in thread
From: Stephen Hemminger @ 2021-10-19 15:35 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: Aman Kumar, dev, rasland, asafp, shys, viacheslavo, akozyrev,
	matan, anatoly.burakov, keesang.song, jerinjacobk,
	bruce.richardson, konstantin.ananyev, david.marchand

On Tue, 19 Oct 2021 14:31:01 +0200
Thomas Monjalon <thomas@monjalon.net> wrote:

> 19/10/2021 12:47, Aman Kumar:
> > This patch provides rte_memcpy* calls optimized for
> > AMD EPYC platforms. Use config/x86/x86_amd_epyc_linux_gcc
> > as cross-file with meson to build dpdk for AMD EPYC platforms.  
> 
> Please split in 2 patches: platform & memcpy.
> 
> What optimization is specific to EPYC?
> 
> I dislike the asm code below.
> What is AMD specific inside?
> Can it use compiler intrinsics as it is done elsewhere?

And why is this not done by Gcc?

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal
  2021-10-19 12:31   ` [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal Thomas Monjalon
  2021-10-19 15:35     ` Stephen Hemminger
@ 2021-10-21 17:10     ` Song, Keesang
  2021-10-21 17:40       ` Ananyev, Konstantin
  1 sibling, 1 reply; 43+ messages in thread
From: Song, Keesang @ 2021-10-21 17:10 UTC (permalink / raw)
  To: Thomas Monjalon, Aman Kumar
  Cc: dev, rasland, asafp, shys, viacheslavo, akozyrev, matan,
	anatoly.burakov, aman.kumar, jerinjacobk, bruce.richardson,
	konstantin.ananyev, david.marchand

[AMD Official Use Only]

Hi Thomas,

I hope this can make some explanation to your question.
We (the AMD Linux library support team) have implemented a custom-tailored memcpy solution which is a close match with DPDK use-case requirements, as below.
1)      Minimum 64B-length data packet with cache-aligned source and destination.
2)      Non-temporal load and temporal store for a cache-aligned source for both RX and TX paths. We could not implement the non-temporal store for TX_PATH, as non-temporal loads/stores work only with 32B-aligned addresses for AVX2.
3)      This solution works for all AVX2-supported AMD machines.

Internally we have completed integrity testing and benchmarking of the solution and found gains of 8.4% to 14.5%, specifically on the Milan CPU (3rd Gen EPYC processor).

Thanks for your support,
Keesang

-----Original Message-----
From: Thomas Monjalon <thomas@monjalon.net>
Sent: Tuesday, October 19, 2021 5:31 AM
To: Aman Kumar <aman.kumar@vvdntech.in>
Cc: dev@dpdk.org; rasland@nvidia.com; asafp@nvidia.com; shys@nvidia.com; viacheslavo@nvidia.com; akozyrev@nvidia.com; matan@nvidia.com; anatoly.burakov@intel.com; Song, Keesang <Keesang.Song@amd.com>; aman.kumar@vvdntech.in; jerinjacobk@gmail.com; bruce.richardson@intel.com; konstantin.ananyev@intel.com; david.marchand@redhat.com
Subject: Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal

[CAUTION: External Email]

19/10/2021 12:47, Aman Kumar:
> This patch provides rte_memcpy* calls optimized for AMD EPYC
> platforms. Use config/x86/x86_amd_epyc_linux_gcc as cross-file with
> meson to build dpdk for AMD EPYC platforms.

Please split in 2 patches: platform & memcpy.

What optimization is specific to EPYC?

I dislike the asm code below.
What is AMD specific inside?
Can it use compiler intrinsics as it is done elsewhere?

> +static __rte_always_inline void *
> +rte_memcpy_aligned_ntload_tstore16_amdepyc2(void *dst,
> +                                         const void *src,
> +                                         size_t size) {
> +     asm volatile goto("movq %0, %%rsi\n\t"
> +     "movq %1, %%rdi\n\t"
> +     "movq %2, %%rdx\n\t"
> +     "cmpq   $(128), %%rdx\n\t"
> +     "jb     202f\n\t"
> +     "201:\n\t"
> +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> +     "vmovntdqa 64(%%rsi), %%ymm2\n\t"
> +     "vmovntdqa 96(%%rsi), %%ymm3\n\t"
> +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> +     "vmovdqu  %%ymm2, 64(%%rdi)\n\t"
> +     "vmovdqu  %%ymm3, 96(%%rdi)\n\t"
> +     "addq   $128, %%rsi\n\t"
> +     "addq   $128, %%rdi\n\t"
> +     "subq   $128, %%rdx\n\t"
> +     "jz     %l[done]\n\t"
> +     "cmpq   $128, %%rdx\n\t" /*Vector Size 32B.  */
> +     "jae    201b\n\t"
> +     "202:\n\t"
> +     "cmpq   $64, %%rdx\n\t"
> +     "jb     203f\n\t"
> +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> +     "addq   $64, %%rsi\n\t"
> +     "addq   $64, %%rdi\n\t"
> +     "subq   $64, %%rdx\n\t"
> +     "jz     %l[done]\n\t"
> +     "203:\n\t"
> +     "cmpq   $32, %%rdx\n\t"
> +     "jb     204f\n\t"
> +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> +     "addq   $32, %%rsi\n\t"
> +     "addq   $32, %%rdi\n\t"
> +     "subq   $32, %%rdx\n\t"
> +     "jz     %l[done]\n\t"
> +     "204:\n\t"
> +     "cmpb   $16, %%dl\n\t"
> +     "jb     205f\n\t"
> +     "vmovntdqa (%%rsi), %%xmm0\n\t"
> +     "vmovdqu  %%xmm0, (%%rdi)\n\t"
> +     "addq   $16, %%rsi\n\t"
> +     "addq   $16, %%rdi\n\t"
> +     "subq   $16, %%rdx\n\t"
> +     "jz     %l[done]\n\t"
> +     "205:\n\t"
> +     "cmpb   $2, %%dl\n\t"
> +     "jb     208f\n\t"
> +     "cmpb   $4, %%dl\n\t"
> +     "jbe    207f\n\t"
> +     "cmpb   $8, %%dl\n\t"
> +     "jbe    206f\n\t"
> +     "movq   -8(%%rsi,%%rdx), %%rcx\n\t"
> +     "movq   (%%rsi), %%rsi\n\t"
> +     "movq   %%rcx, -8(%%rdi,%%rdx)\n\t"
> +     "movq   %%rsi, (%%rdi)\n\t"
> +     "jmp    %l[done]\n\t"
> +     "206:\n\t"
> +     "movl   -4(%%rsi,%%rdx), %%ecx\n\t"
> +     "movl   (%%rsi), %%esi\n\t"
> +     "movl   %%ecx, -4(%%rdi,%%rdx)\n\t"
> +     "movl   %%esi, (%%rdi)\n\t"
> +     "jmp    %l[done]\n\t"
> +     "207:\n\t"
> +     "movzwl -2(%%rsi,%%rdx), %%ecx\n\t"
> +     "movzwl (%%rsi), %%esi\n\t"
> +     "movw   %%cx, -2(%%rdi,%%rdx)\n\t"
> +     "movw   %%si, (%%rdi)\n\t"
> +     "jmp    %l[done]\n\t"
> +     "208:\n\t"
> +     "movzbl (%%rsi), %%ecx\n\t"
> +     "movb   %%cl, (%%rdi)"
> +     :
> +     : "r"(src), "r"(dst), "r"(size)
> +     : "rcx", "rdx", "rsi", "rdi", "ymm0", "ymm1", "ymm2", "ymm3", "memory"
> +     : done
> +     );
> +done:
> +     return dst;
> +}
> +
> +static __rte_always_inline void *
> +rte_memcpy_generic(void *dst, const void *src, size_t len) {
> +     asm goto("movq  %0, %%rsi\n\t"
> +     "movq   %1, %%rdi\n\t"
> +     "movq   %2, %%rdx\n\t"
> +     "movq    %%rdi, %%rax\n\t"
> +     "cmp     $32, %%rdx\n\t"
> +     "jb      101f\n\t"
> +     "cmp     $(32 * 2), %%rdx\n\t"
> +     "ja      108f\n\t"
> +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm1\n\t"
> +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> +     "vmovdqu   %%ymm1, -32(%%rdi,%%rdx)\n\t"
> +     "vzeroupper\n\t"
> +     "jmp %l[done]\n\t"
> +     "101:\n\t"
> +     /* Less than 1 VEC.  */
> +     "cmpb    $32, %%dl\n\t"
> +     "jae     103f\n\t"
> +     "cmpb    $16, %%dl\n\t"
> +     "jae     104f\n\t"
> +     "cmpb    $8, %%dl\n\t"
> +     "jae     105f\n\t"
> +     "cmpb    $4, %%dl\n\t"
> +     "jae     106f\n\t"
> +     "cmpb    $1, %%dl\n\t"
> +     "ja      107f\n\t"
> +     "jb      102f\n\t"
> +     "movzbl  (%%rsi), %%ecx\n\t"
> +     "movb    %%cl, (%%rdi)\n\t"
> +     "102:\n\t"
> +     "jmp %l[done]\n\t"
> +     "103:\n\t"
> +     /* From 32 to 63.  No branch when size == 32.  */
> +     "vmovdqu (%%rsi), %%ymm0\n\t"
> +     "vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
> +     "vmovdqu %%ymm0, (%%rdi)\n\t"
> +     "vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
> +     "vzeroupper\n\t"
> +     "jmp %l[done]\n\t"
> +     /* From 16 to 31.  No branch when size == 16.  */
> +     "104:\n\t"
> +     "vmovdqu (%%rsi), %%xmm0\n\t"
> +     "vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
> +     "vmovdqu %%xmm0, (%%rdi)\n\t"
> +     "vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
> +     "jmp %l[done]\n\t"
> +     "105:\n\t"
> +     /* From 8 to 15.  No branch when size == 8.  */
> +     "movq    -8(%%rsi,%%rdx), %%rcx\n\t"
> +     "movq    (%%rsi), %%rsi\n\t"
> +     "movq    %%rcx, -8(%%rdi,%%rdx)\n\t"
> +     "movq    %%rsi, (%%rdi)\n\t"
> +     "jmp %l[done]\n\t"
> +     "106:\n\t"
> +     /* From 4 to 7.  No branch when size == 4.  */
> +     "movl    -4(%%rsi,%%rdx), %%ecx\n\t"
> +     "movl    (%%rsi), %%esi\n\t"
> +     "movl    %%ecx, -4(%%rdi,%%rdx)\n\t"
> +     "movl    %%esi, (%%rdi)\n\t"
> +     "jmp %l[done]\n\t"
> +     "107:\n\t"
> +     /* From 2 to 3.  No branch when size == 2.  */
> +     "movzwl  -2(%%rsi,%%rdx), %%ecx\n\t"
> +     "movzwl  (%%rsi), %%esi\n\t"
> +     "movw    %%cx, -2(%%rdi,%%rdx)\n\t"
> +     "movw    %%si, (%%rdi)\n\t"
> +     "jmp %l[done]\n\t"
> +     "108:\n\t"
> +     /* More than 2 * VEC and there may be overlap between destination */
> +     /* and source.  */
> +     "cmpq    $(32 * 8), %%rdx\n\t"
> +     "ja      111f\n\t"
> +     "cmpq    $(32 * 4), %%rdx\n\t"
> +     "jb      109f\n\t"
> +     /* Copy from 4 * VEC to 8 * VEC, inclusively. */
> +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm4\n\t"
> +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
> +     "vmovdqu   -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
> +     "vmovdqu   -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
> +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> +     "vmovdqu   %%ymm2, (32 * 2)(%%rdi)\n\t"
> +     "vmovdqu   %%ymm3, (32 * 3)(%%rdi)\n\t"
> +     "vmovdqu   %%ymm4, -32(%%rdi,%%rdx)\n\t"
> +     "vmovdqu   %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
> +     "vmovdqu   %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
> +     "vmovdqu   %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
> +     "vzeroupper\n\t"
> +     "jmp %l[done]\n\t"
> +     "109:\n\t"
> +     /* Copy from 2 * VEC to 4 * VEC. */
> +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm2\n\t"
> +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
> +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> +     "vmovdqu   %%ymm2, -32(%%rdi,%%rdx)\n\t"
> +     "vmovdqu   %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
> +     "vzeroupper\n\t"
> +     "110:\n\t"
> +     "jmp %l[done]\n\t"
> +     "111:\n\t"
> +     "cmpq    %%rsi, %%rdi\n\t"
> +     "ja      113f\n\t"
> +     /* Source == destination is less common.  */
> +     "je      110b\n\t"
> +     /* Load the first VEC and last 4 * VEC to
> +      * support overlapping addresses.
> +      */
> +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> +     "vmovdqu   -32(%%rsi, %%rdx), %%ymm5\n\t"
> +     "vmovdqu   -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
> +     "vmovdqu   -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
> +     "vmovdqu   -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
> +     /* Save start and stop of the destination buffer.  */
> +     "movq    %%rdi, %%r11\n\t"
> +     "leaq    -32(%%rdi, %%rdx), %%rcx\n\t"
> +     /* Align destination for aligned stores in the loop.  Compute */
> +     /* how much destination is misaligned.  */
> +     "movq    %%rdi, %%r8\n\t"
> +     "andq    $(32 - 1), %%r8\n\t"
> +     /* Get the negative of offset for alignment.  */
> +     "subq    $32, %%r8\n\t"
> +     /* Adjust source.  */
> +     "subq    %%r8, %%rsi\n\t"
> +     /* Adjust destination which should be aligned now.  */
> +     "subq    %%r8, %%rdi\n\t"
> +     /* Adjust length.  */
> +     "addq    %%r8, %%rdx\n\t"
> +     /* Check non-temporal store threshold.  */
> +     "cmpq    $(1024*1024), %%rdx\n\t"
> +     "ja      115f\n\t"
> +     "112:\n\t"
> +     /* Copy 4 * VEC a time forward.  */
> +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> +     "addq    $(32 * 4), %%rsi\n\t"
> +     "subq    $(32 * 4), %%rdx\n\t"
> +     "vmovdqa   %%ymm0, (%%rdi)\n\t"
> +     "vmovdqa   %%ymm1, 32(%%rdi)\n\t"
> +     "vmovdqa   %%ymm2, (32 * 2)(%%rdi)\n\t"
> +     "vmovdqa   %%ymm3, (32 * 3)(%%rdi)\n\t"
> +     "addq    $(32 * 4), %%rdi\n\t"
> +     "cmpq    $(32 * 4), %%rdx\n\t"
> +     "ja      112b\n\t"
> +     /* Store the last 4 * VEC.  */
> +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> +     /* Store the first VEC.  */
> +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> +     "vzeroupper\n\t"
> +     "jmp %l[done]\n\t"
> +     "113:\n\t"
> +     /* Load the first 4*VEC and last VEC to support overlapping addresses.*/
> +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> +     "vmovdqu   32(%%rsi), %%ymm5\n\t"
> +     "vmovdqu   (32 * 2)(%%rsi), %%ymm6\n\t"
> +     "vmovdqu   (32 * 3)(%%rsi), %%ymm7\n\t"
> +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm8\n\t"
> +     /* Save stop of the destination buffer.  */
> +     "leaq    -32(%%rdi, %%rdx), %%r11\n\t"
> +     /* Align destination end for aligned stores in the loop.  Compute */
> +     /* how much destination end is misaligned.  */
> +     "leaq    -32(%%rsi, %%rdx), %%rcx\n\t"
> +     "movq    %%r11, %%r9\n\t"
> +     "movq    %%r11, %%r8\n\t"
> +     "andq    $(32 - 1), %%r8\n\t"
> +     /* Adjust source.  */
> +     "subq    %%r8, %%rcx\n\t"
> +     /* Adjust the end of destination which should be aligned now.  */
> +     "subq    %%r8, %%r9\n\t"
> +     /* Adjust length.  */
> +     "subq    %%r8, %%rdx\n\t"
> +      /* Check non-temporal store threshold.  */
> +     "cmpq    $(1024*1024), %%rdx\n\t"
> +     "ja      117f\n\t"
> +     "114:\n\t"
> +     /* Copy 4 * VEC a time backward.  */
> +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> +     "subq    $(32 * 4), %%rcx\n\t"
> +     "subq    $(32 * 4), %%rdx\n\t"
> +     "vmovdqa   %%ymm0, (%%r9)\n\t"
> +     "vmovdqa   %%ymm1, -32(%%r9)\n\t"
> +     "vmovdqa   %%ymm2, -(32 * 2)(%%r9)\n\t"
> +     "vmovdqa   %%ymm3, -(32 * 3)(%%r9)\n\t"
> +     "subq    $(32 * 4), %%r9\n\t"
> +     "cmpq    $(32 * 4), %%rdx\n\t"
> +     "ja      114b\n\t"
> +     /* Store the first 4 * VEC. */
> +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> +     /* Store the last VEC. */
> +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> +     "vzeroupper\n\t"
> +     "jmp %l[done]\n\t"
> +
> +     "115:\n\t"
> +     /* Don't use non-temporal store if there is overlap between */
> +     /* destination and source since destination may be in cache */
> +     /* when source is loaded. */
> +     "leaq    (%%rdi, %%rdx), %%r10\n\t"
> +     "cmpq    %%r10, %%rsi\n\t"
> +     "jb      112b\n\t"
> +     "116:\n\t"
> +     /* Copy 4 * VEC a time forward with non-temporal stores.  */
> +     "prefetcht0 (32*4*2)(%%rsi)\n\t"
> +     "prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
> +     "prefetcht0 (32*4*3)(%%rsi)\n\t"
> +     "prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
> +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> +     "addq    $(32*4), %%rsi\n\t"
> +     "subq    $(32*4), %%rdx\n\t"
> +     "vmovntdq  %%ymm0, (%%rdi)\n\t"
> +     "vmovntdq  %%ymm1, 32(%%rdi)\n\t"
> +     "vmovntdq  %%ymm2, (32 * 2)(%%rdi)\n\t"
> +     "vmovntdq  %%ymm3, (32 * 3)(%%rdi)\n\t"
> +     "addq    $(32*4), %%rdi\n\t"
> +     "cmpq    $(32*4), %%rdx\n\t"
> +     "ja      116b\n\t"
> +     "sfence\n\t"
> +     /* Store the last 4 * VEC.  */
> +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> +     /* Store the first VEC.  */
> +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> +     "vzeroupper\n\t"
> +     "jmp %l[done]\n\t"
> +     "117:\n\t"
> +     /* Don't use non-temporal store if there is overlap between */
> +     /* destination and source since destination may be in cache */
> +     /* when source is loaded.  */
> +     "leaq    (%%rcx, %%rdx), %%r10\n\t"
> +     "cmpq    %%r10, %%r9\n\t"
> +     "jb      114b\n\t"
> +     "118:\n\t"
> +     /* Copy 4 * VEC a time backward with non-temporal stores. */
> +     "prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
> +     "prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
> +     "prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
> +     "prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
> +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> +     "subq    $(32*4), %%rcx\n\t"
> +     "subq    $(32*4), %%rdx\n\t"
> +     "vmovntdq  %%ymm0, (%%r9)\n\t"
> +     "vmovntdq  %%ymm1, -32(%%r9)\n\t"
> +     "vmovntdq  %%ymm2, -(32 * 2)(%%r9)\n\t"
> +     "vmovntdq  %%ymm3, -(32 * 3)(%%r9)\n\t"
> +     "subq    $(32 * 4), %%r9\n\t"
> +     "cmpq    $(32 * 4), %%rdx\n\t"
> +     "ja      118b\n\t"
> +     "sfence\n\t"
> +     /* Store the first 4 * VEC.  */
> +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> +     /* Store the last VEC.  */
> +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> +     "vzeroupper\n\t"
> +     "jmp %l[done]"
> +     :
> +     : "r"(src), "r"(dst), "r"(len)
> +     : "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "ymm0",
> +     "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
> +     : done
> +     );
> +done:
> +     return dst;
> +}




^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal
  2021-10-21 17:10     ` Song, Keesang
@ 2021-10-21 17:40       ` Ananyev, Konstantin
  2021-10-21 18:12         ` Song, Keesang
  0 siblings, 1 reply; 43+ messages in thread
From: Ananyev, Konstantin @ 2021-10-21 17:40 UTC (permalink / raw)
  To: Song, Keesang, Thomas Monjalon, Aman Kumar
  Cc: dev, rasland, asafp, shys, viacheslavo, akozyrev, matan, Burakov,
	Anatoly, aman.kumar, jerinjacobk, Richardson, Bruce,
	david.marchand


> 
> Hi Thomas,
> 
> I hope this can make some explanation to your question.
> We(AMD Linux library support team) have implemented the custom tailored memcpy solution which is a close match with DPDK use case
> requirements like the below.
> 1)      Min 64B length data packet with cache aligned Source and Destination.
> 2)      Non-Temporal load and temporal store for cache aligned source for both RX and TX paths. Could not implement the non-temporal
> store for TX_PATH, as non-Temporal load/stores works only with 32B aligned addresses for AVX2
> 3)      This solution works for all AVX2 supported AMD machines.
> 
> Internally we have completed the integrity testing and benchmarking of the solution and found gains of 8.4% to 14.5% specifically on Milan
> CPU(3rd Gen of EPYC Processor)

It is still not clear to me why it has to be written in assembler.
Why can't similar stuff be written in C with intrinsics, as the rest of rte_memcpy.h does?

> 
> Thanks for your support,
> Keesang
> 
> -----Original Message-----
> From: Thomas Monjalon <thomas@monjalon.net>
> Sent: Tuesday, October 19, 2021 5:31 AM
> To: Aman Kumar <aman.kumar@vvdntech.in>
> Cc: dev@dpdk.org; rasland@nvidia.com; asafp@nvidia.com; shys@nvidia.com; viacheslavo@nvidia.com; akozyrev@nvidia.com;
> matan@nvidia.com; anatoly.burakov@intel.com; Song, Keesang <Keesang.Song@amd.com>; aman.kumar@vvdntech.in;
> jerinjacobk@gmail.com; bruce.richardson@intel.com; konstantin.ananyev@intel.com; david.marchand@redhat.com
> Subject: Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal
> 
> [CAUTION: External Email]
> 
> 19/10/2021 12:47, Aman Kumar:
> > This patch provides rte_memcpy* calls optimized for AMD EPYC
> > platforms. Use config/x86/x86_amd_epyc_linux_gcc as cross-file with
> > meson to build dpdk for AMD EPYC platforms.
> 
> Please split in 2 patches: platform & memcpy.
> 
> What optimization is specific to EPYC?
> 
> I dislike the asm code below.
> What is AMD specific inside?
> Can it use compiler intrinsics as it is done elsewhere?
> 
> > +static __rte_always_inline void *
> > +rte_memcpy_aligned_ntload_tstore16_amdepyc2(void *dst,
> > +                                         const void *src,
> > +                                         size_t size) {
> > +     asm volatile goto("movq %0, %%rsi\n\t"
> > +     "movq %1, %%rdi\n\t"
> > +     "movq %2, %%rdx\n\t"
> > +     "cmpq   $(128), %%rdx\n\t"
> > +     "jb     202f\n\t"
> > +     "201:\n\t"
> > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> > +     "vmovntdqa 64(%%rsi), %%ymm2\n\t"
> > +     "vmovntdqa 96(%%rsi), %%ymm3\n\t"
> > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> > +     "vmovdqu  %%ymm2, 64(%%rdi)\n\t"
> > +     "vmovdqu  %%ymm3, 96(%%rdi)\n\t"
> > +     "addq   $128, %%rsi\n\t"
> > +     "addq   $128, %%rdi\n\t"
> > +     "subq   $128, %%rdx\n\t"
> > +     "jz     %l[done]\n\t"
> > +     "cmpq   $128, %%rdx\n\t" /*Vector Size 32B.  */
> > +     "jae    201b\n\t"
> > +     "202:\n\t"
> > +     "cmpq   $64, %%rdx\n\t"
> > +     "jb     203f\n\t"
> > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> > +     "addq   $64, %%rsi\n\t"
> > +     "addq   $64, %%rdi\n\t"
> > +     "subq   $64, %%rdx\n\t"
> > +     "jz     %l[done]\n\t"
> > +     "203:\n\t"
> > +     "cmpq   $32, %%rdx\n\t"
> > +     "jb     204f\n\t"
> > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > +     "addq   $32, %%rsi\n\t"
> > +     "addq   $32, %%rdi\n\t"
> > +     "subq   $32, %%rdx\n\t"
> > +     "jz     %l[done]\n\t"
> > +     "204:\n\t"
> > +     "cmpb   $16, %%dl\n\t"
> > +     "jb     205f\n\t"
> > +     "vmovntdqa (%%rsi), %%xmm0\n\t"
> > +     "vmovdqu  %%xmm0, (%%rdi)\n\t"
> > +     "addq   $16, %%rsi\n\t"
> > +     "addq   $16, %%rdi\n\t"
> > +     "subq   $16, %%rdx\n\t"
> > +     "jz     %l[done]\n\t"
> > +     "205:\n\t"
> > +     "cmpb   $2, %%dl\n\t"
> > +     "jb     208f\n\t"
> > +     "cmpb   $4, %%dl\n\t"
> > +     "jbe    207f\n\t"
> > +     "cmpb   $8, %%dl\n\t"
> > +     "jbe    206f\n\t"
> > +     "movq   -8(%%rsi,%%rdx), %%rcx\n\t"
> > +     "movq   (%%rsi), %%rsi\n\t"
> > +     "movq   %%rcx, -8(%%rdi,%%rdx)\n\t"
> > +     "movq   %%rsi, (%%rdi)\n\t"
> > +     "jmp    %l[done]\n\t"
> > +     "206:\n\t"
> > +     "movl   -4(%%rsi,%%rdx), %%ecx\n\t"
> > +     "movl   (%%rsi), %%esi\n\t"
> > +     "movl   %%ecx, -4(%%rdi,%%rdx)\n\t"
> > +     "movl   %%esi, (%%rdi)\n\t"
> > +     "jmp    %l[done]\n\t"
> > +     "207:\n\t"
> > +     "movzwl -2(%%rsi,%%rdx), %%ecx\n\t"
> > +     "movzwl (%%rsi), %%esi\n\t"
> > +     "movw   %%cx, -2(%%rdi,%%rdx)\n\t"
> > +     "movw   %%si, (%%rdi)\n\t"
> > +     "jmp    %l[done]\n\t"
> > +     "208:\n\t"
> > +     "movzbl (%%rsi), %%ecx\n\t"
> > +     "movb   %%cl, (%%rdi)"
> > +     :
> > +     : "r"(src), "r"(dst), "r"(size)
> > +     : "rcx", "rdx", "rsi", "rdi", "ymm0", "ymm1", "ymm2", "ymm3", "memory"
> > +     : done
> > +     );
> > +done:
> > +     return dst;
> > +}
> > +
> > +static __rte_always_inline void *
> > +rte_memcpy_generic(void *dst, const void *src, size_t len) {
> > +     asm goto("movq  %0, %%rsi\n\t"
> > +     "movq   %1, %%rdi\n\t"
> > +     "movq   %2, %%rdx\n\t"
> > +     "movq    %%rdi, %%rax\n\t"
> > +     "cmp     $32, %%rdx\n\t"
> > +     "jb      101f\n\t"
> > +     "cmp     $(32 * 2), %%rdx\n\t"
> > +     "ja      108f\n\t"
> > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm1\n\t"
> > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu   %%ymm1, -32(%%rdi,%%rdx)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "101:\n\t"
> > +     /* Less than 1 VEC.  */
> > +     "cmpb    $32, %%dl\n\t"
> > +     "jae     103f\n\t"
> > +     "cmpb    $16, %%dl\n\t"
> > +     "jae     104f\n\t"
> > +     "cmpb    $8, %%dl\n\t"
> > +     "jae     105f\n\t"
> > +     "cmpb    $4, %%dl\n\t"
> > +     "jae     106f\n\t"
> > +     "cmpb    $1, %%dl\n\t"
> > +     "ja      107f\n\t"
> > +     "jb      102f\n\t"
> > +     "movzbl  (%%rsi), %%ecx\n\t"
> > +     "movb    %%cl, (%%rdi)\n\t"
> > +     "102:\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "103:\n\t"
> > +     /* From 32 to 63.  No branch when size == 32.  */
> > +     "vmovdqu (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
> > +     "vmovdqu %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +     /* From 16 to 31.  No branch when size == 16.  */
> > +     "104:\n\t"
> > +     "vmovdqu (%%rsi), %%xmm0\n\t"
> > +     "vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
> > +     "vmovdqu %%xmm0, (%%rdi)\n\t"
> > +     "vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "105:\n\t"
> > +     /* From 8 to 15.  No branch when size == 8.  */
> > +     "movq    -8(%%rsi,%%rdx), %%rcx\n\t"
> > +     "movq    (%%rsi), %%rsi\n\t"
> > +     "movq    %%rcx, -8(%%rdi,%%rdx)\n\t"
> > +     "movq    %%rsi, (%%rdi)\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "106:\n\t"
> > +     /* From 4 to 7.  No branch when size == 4.  */
> > +     "movl    -4(%%rsi,%%rdx), %%ecx\n\t"
> > +     "movl    (%%rsi), %%esi\n\t"
> > +     "movl    %%ecx, -4(%%rdi,%%rdx)\n\t"
> > +     "movl    %%esi, (%%rdi)\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "107:\n\t"
> > +     /* From 2 to 3.  No branch when size == 2.  */
> > +     "movzwl  -2(%%rsi,%%rdx), %%ecx\n\t"
> > +     "movzwl  (%%rsi), %%esi\n\t"
> > +     "movw    %%cx, -2(%%rdi,%%rdx)\n\t"
> > +     "movw    %%si, (%%rdi)\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "108:\n\t"
> > +     /* More than 2 * VEC and there may be overlap between destination */
> > +     /* and source.  */
> > +     "cmpq    $(32 * 8), %%rdx\n\t"
> > +     "ja      111f\n\t"
> > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > +     "jb      109f\n\t"
> > +     /* Copy from 4 * VEC to 8 * VEC, inclusively. */
> > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm4\n\t"
> > +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
> > +     "vmovdqu   -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
> > +     "vmovdqu   -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
> > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm2, (32 * 2)(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm3, (32 * 3)(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm4, -32(%%rdi,%%rdx)\n\t"
> > +     "vmovdqu   %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
> > +     "vmovdqu   %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
> > +     "vmovdqu   %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "109:\n\t"
> > +     /* Copy from 2 * VEC to 4 * VEC. */
> > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm2\n\t"
> > +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
> > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm2, -32(%%rdi,%%rdx)\n\t"
> > +     "vmovdqu   %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
> > +     "vzeroupper\n\t"
> > +     "110:\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "111:\n\t"
> > +     "cmpq    %%rsi, %%rdi\n\t"
> > +     "ja      113f\n\t"
> > +     /* Source == destination is less common.  */
> > +     "je      110b\n\t"
> > +     /* Load the first VEC and last 4 * VEC to
> > +      * support overlapping addresses.
> > +      */
> > +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> > +     "vmovdqu   -32(%%rsi, %%rdx), %%ymm5\n\t"
> > +     "vmovdqu   -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
> > +     "vmovdqu   -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
> > +     "vmovdqu   -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
> > +     /* Save start and stop of the destination buffer.  */
> > +     "movq    %%rdi, %%r11\n\t"
> > +     "leaq    -32(%%rdi, %%rdx), %%rcx\n\t"
> > +     /* Align destination for aligned stores in the loop.  Compute */
> > +     /* how much destination is misaligned.  */
> > +     "movq    %%rdi, %%r8\n\t"
> > +     "andq    $(32 - 1), %%r8\n\t"
> > +     /* Get the negative of offset for alignment.  */
> > +     "subq    $32, %%r8\n\t"
> > +     /* Adjust source.  */
> > +     "subq    %%r8, %%rsi\n\t"
> > +     /* Adjust destination which should be aligned now.  */
> > +     "subq    %%r8, %%rdi\n\t"
> > +     /* Adjust length.  */
> > +     "addq    %%r8, %%rdx\n\t"
> > +     /* Check non-temporal store threshold.  */
> > +     "cmpq    $(1024*1024), %%rdx\n\t"
> > +     "ja      115f\n\t"
> > +     "112:\n\t"
> > +     /* Copy 4 * VEC a time forward.  */
> > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > +     "addq    $(32 * 4), %%rsi\n\t"
> > +     "subq    $(32 * 4), %%rdx\n\t"
> > +     "vmovdqa   %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqa   %%ymm1, 32(%%rdi)\n\t"
> > +     "vmovdqa   %%ymm2, (32 * 2)(%%rdi)\n\t"
> > +     "vmovdqa   %%ymm3, (32 * 3)(%%rdi)\n\t"
> > +     "addq    $(32 * 4), %%rdi\n\t"
> > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > +     "ja      112b\n\t"
> > +     /* Store the last 4 * VEC.  */
> > +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> > +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> > +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> > +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> > +     /* Store the first VEC.  */
> > +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "113:\n\t"
> > +     /* Load the first 4*VEC and last VEC to support overlapping addresses.*/
> > +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> > +     "vmovdqu   32(%%rsi), %%ymm5\n\t"
> > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm6\n\t"
> > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm7\n\t"
> > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm8\n\t"
> > +     /* Save stop of the destination buffer.  */
> > +     "leaq    -32(%%rdi, %%rdx), %%r11\n\t"
> > +     /* Align destination end for aligned stores in the loop.  Compute */
> > +     /* how much destination end is misaligned.  */
> > +     "leaq    -32(%%rsi, %%rdx), %%rcx\n\t"
> > +     "movq    %%r11, %%r9\n\t"
> > +     "movq    %%r11, %%r8\n\t"
> > +     "andq    $(32 - 1), %%r8\n\t"
> > +     /* Adjust source.  */
> > +     "subq    %%r8, %%rcx\n\t"
> > +     /* Adjust the end of destination which should be aligned now.  */
> > +     "subq    %%r8, %%r9\n\t"
> > +     /* Adjust length.  */
> > +     "subq    %%r8, %%rdx\n\t"
> > +      /* Check non-temporal store threshold.  */
> > +     "cmpq    $(1024*1024), %%rdx\n\t"
> > +     "ja      117f\n\t"
> > +     "114:\n\t"
> > +     /* Copy 4 * VEC a time backward.  */
> > +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> > +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> > +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> > +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> > +     "subq    $(32 * 4), %%rcx\n\t"
> > +     "subq    $(32 * 4), %%rdx\n\t"
> > +     "vmovdqa   %%ymm0, (%%r9)\n\t"
> > +     "vmovdqa   %%ymm1, -32(%%r9)\n\t"
> > +     "vmovdqa   %%ymm2, -(32 * 2)(%%r9)\n\t"
> > +     "vmovdqa   %%ymm3, -(32 * 3)(%%r9)\n\t"
> > +     "subq    $(32 * 4), %%r9\n\t"
> > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > +     "ja      114b\n\t"
> > +     /* Store the first 4 * VEC. */
> > +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> > +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> > +     /* Store the last VEC. */
> > +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +
> > +     "115:\n\t"
> > +     /* Don't use non-temporal store if there is overlap between */
> > +     /* destination and source since destination may be in cache */
> > +     /* when source is loaded. */
> > +     "leaq    (%%rdi, %%rdx), %%r10\n\t"
> > +     "cmpq    %%r10, %%rsi\n\t"
> > +     "jb      112b\n\t"
> > +     "116:\n\t"
> > +     /* Copy 4 * VEC a time forward with non-temporal stores.  */
> > +     "prefetcht0 (32*4*2)(%%rsi)\n\t"
> > +     "prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
> > +     "prefetcht0 (32*4*3)(%%rsi)\n\t"
> > +     "prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
> > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > +     "addq    $(32*4), %%rsi\n\t"
> > +     "subq    $(32*4), %%rdx\n\t"
> > +     "vmovntdq  %%ymm0, (%%rdi)\n\t"
> > +     "vmovntdq  %%ymm1, 32(%%rdi)\n\t"
> > +     "vmovntdq  %%ymm2, (32 * 2)(%%rdi)\n\t"
> > +     "vmovntdq  %%ymm3, (32 * 3)(%%rdi)\n\t"
> > +     "addq    $(32*4), %%rdi\n\t"
> > +     "cmpq    $(32*4), %%rdx\n\t"
> > +     "ja      116b\n\t"
> > +     "sfence\n\t"
> > +     /* Store the last 4 * VEC.  */
> > +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> > +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> > +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> > +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> > +     /* Store the first VEC.  */
> > +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "117:\n\t"
> > +     /* Don't use non-temporal store if there is overlap between */
> > +     /* destination and source since destination may be in cache */
> > +     /* when source is loaded.  */
> > +     "leaq    (%%rcx, %%rdx), %%r10\n\t"
> > +     "cmpq    %%r10, %%r9\n\t"
> > +     "jb      114b\n\t"
> > +     "118:\n\t"
> > +     /* Copy 4 * VEC a time backward with non-temporal stores. */
> > +     "prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
> > +     "prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
> > +     "prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
> > +     "prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
> > +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> > +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> > +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> > +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> > +     "subq    $(32*4), %%rcx\n\t"
> > +     "subq    $(32*4), %%rdx\n\t"
> > +     "vmovntdq  %%ymm0, (%%r9)\n\t"
> > +     "vmovntdq  %%ymm1, -32(%%r9)\n\t"
> > +     "vmovntdq  %%ymm2, -(32 * 2)(%%r9)\n\t"
> > +     "vmovntdq  %%ymm3, -(32 * 3)(%%r9)\n\t"
> > +     "subq    $(32 * 4), %%r9\n\t"
> > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > +     "ja      118b\n\t"
> > +     "sfence\n\t"
> > +     /* Store the first 4 * VEC.  */
> > +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> > +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> > +     /* Store the last VEC.  */
> > +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]"
> > +     :
> > +     : "r"(src), "r"(dst), "r"(len)
> > +     : "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "ymm0",
> > +     "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
> > +     : done
> > +     );
> > +done:
> > +     return dst;
> > +}
> 
> 


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal
  2021-10-21 17:40       ` Ananyev, Konstantin
@ 2021-10-21 18:12         ` Song, Keesang
  2021-10-21 18:41           ` Thomas Monjalon
  0 siblings, 1 reply; 43+ messages in thread
From: Song, Keesang @ 2021-10-21 18:12 UTC (permalink / raw)
  To: Ananyev, Konstantin, Thomas Monjalon, Aman Kumar
  Cc: dev, rasland, asafp, shys, viacheslavo, akozyrev, matan, Burakov,
	Anatoly, aman.kumar, jerinjacobk, Richardson, Bruce,
	david.marchand

[AMD Official Use Only]

Hi Ananyev,

The current memcpy implementation in Glibc is based out of assembly coding.
Although memcpy could have been implemented with intrinsics, since our AMD library developers are working on the Glibc functions, they have provided a tailored implementation based on inline assembly coding.

Thanks for your support,
Keesang

-----Original Message-----
From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
Sent: Thursday, October 21, 2021 10:40 AM
To: Song, Keesang <Keesang.Song@amd.com>; Thomas Monjalon <thomas@monjalon.net>; Aman Kumar <aman.kumar@vvdntech.in>
Cc: dev@dpdk.org; rasland@nvidia.com; asafp@nvidia.com; shys@nvidia.com; viacheslavo@nvidia.com; akozyrev@nvidia.com; matan@nvidia.com; Burakov, Anatoly <anatoly.burakov@intel.com>; aman.kumar@vvdntech.in; jerinjacobk@gmail.com; Richardson, Bruce <bruce.richardson@intel.com>; david.marchand@redhat.com
Subject: RE: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal

[AMD Official Use Only]

[CAUTION: External Email]

>
> Hi Thomas,
>
> I hope this can make some explanation to your question.
> We(AMD Linux library support team) have implemented the custom
> tailored memcpy solution which is a close match with DPDK use case requirements like the below.
> 1)      Min 64B length data packet with cache aligned Source and Destination.
> 2)      Non-Temporal load and temporal store for cache aligned source for both RX and TX paths. Could not implement the non-temporal
> store for TX_PATH, as non-Temporal load/stores works only with 32B aligned addresses for AVX2
> 3)      This solution works for all AVX2 supported AMD machines.
>
> Internally we have completed the integrity testing and benchmarking of
> the solution and found gains of 8.4% to 14.5% specifically on Milan
> CPU(3rd Gen of EPYC Processor)

It is still not clear to me why it has to be written in assembler.
Why can't similar stuff be written in C with intrinsics, as the rest of rte_memcpy.h does?

>
> Thanks for your support,
> Keesang
>
> -----Original Message-----
> From: Thomas Monjalon <thomas@monjalon.net>
> Sent: Tuesday, October 19, 2021 5:31 AM
> To: Aman Kumar <aman.kumar@vvdntech.in>
> Cc: dev@dpdk.org; rasland@nvidia.com; asafp@nvidia.com;
> shys@nvidia.com; viacheslavo@nvidia.com; akozyrev@nvidia.com;
> matan@nvidia.com; anatoly.burakov@intel.com; Song, Keesang
> <Keesang.Song@amd.com>; aman.kumar@vvdntech.in; jerinjacobk@gmail.com;
> bruce.richardson@intel.com; konstantin.ananyev@intel.com;
> david.marchand@redhat.com
> Subject: Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy
> routine to eal
>
> [CAUTION: External Email]
>
> 19/10/2021 12:47, Aman Kumar:
> > This patch provides rte_memcpy* calls optimized for AMD EPYC
> > platforms. Use config/x86/x86_amd_epyc_linux_gcc as cross-file with
> > meson to build dpdk for AMD EPYC platforms.
>
> Please split in 2 patches: platform & memcpy.
>
> What optimization is specific to EPYC?
>
> I dislike the asm code below.
> What is AMD specific inside?
> Can it use compiler intrinsics as it is done elsewhere?
>
> > +static __rte_always_inline void *
> > +rte_memcpy_aligned_ntload_tstore16_amdepyc2(void *dst,
> > +                                         const void *src,
> > +                                         size_t size) {
> > +     asm volatile goto("movq %0, %%rsi\n\t"
> > +     "movq %1, %%rdi\n\t"
> > +     "movq %2, %%rdx\n\t"
> > +     "cmpq   $(128), %%rdx\n\t"
> > +     "jb     202f\n\t"
> > +     "201:\n\t"
> > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> > +     "vmovntdqa 64(%%rsi), %%ymm2\n\t"
> > +     "vmovntdqa 96(%%rsi), %%ymm3\n\t"
> > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> > +     "vmovdqu  %%ymm2, 64(%%rdi)\n\t"
> > +     "vmovdqu  %%ymm3, 96(%%rdi)\n\t"
> > +     "addq   $128, %%rsi\n\t"
> > +     "addq   $128, %%rdi\n\t"
> > +     "subq   $128, %%rdx\n\t"
> > +     "jz     %l[done]\n\t"
> > +     "cmpq   $128, %%rdx\n\t" /*Vector Size 32B.  */
> > +     "jae    201b\n\t"
> > +     "202:\n\t"
> > +     "cmpq   $64, %%rdx\n\t"
> > +     "jb     203f\n\t"
> > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> > +     "addq   $64, %%rsi\n\t"
> > +     "addq   $64, %%rdi\n\t"
> > +     "subq   $64, %%rdx\n\t"
> > +     "jz     %l[done]\n\t"
> > +     "203:\n\t"
> > +     "cmpq   $32, %%rdx\n\t"
> > +     "jb     204f\n\t"
> > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > +     "addq   $32, %%rsi\n\t"
> > +     "addq   $32, %%rdi\n\t"
> > +     "subq   $32, %%rdx\n\t"
> > +     "jz     %l[done]\n\t"
> > +     "204:\n\t"
> > +     "cmpb   $16, %%dl\n\t"
> > +     "jb     205f\n\t"
> > +     "vmovntdqa (%%rsi), %%xmm0\n\t"
> > +     "vmovdqu  %%xmm0, (%%rdi)\n\t"
> > +     "addq   $16, %%rsi\n\t"
> > +     "addq   $16, %%rdi\n\t"
> > +     "subq   $16, %%rdx\n\t"
> > +     "jz     %l[done]\n\t"
> > +     "205:\n\t"
> > +     "cmpb   $2, %%dl\n\t"
> > +     "jb     208f\n\t"
> > +     "cmpb   $4, %%dl\n\t"
> > +     "jbe    207f\n\t"
> > +     "cmpb   $8, %%dl\n\t"
> > +     "jbe    206f\n\t"
> > +     "movq   -8(%%rsi,%%rdx), %%rcx\n\t"
> > +     "movq   (%%rsi), %%rsi\n\t"
> > +     "movq   %%rcx, -8(%%rdi,%%rdx)\n\t"
> > +     "movq   %%rsi, (%%rdi)\n\t"
> > +     "jmp    %l[done]\n\t"
> > +     "206:\n\t"
> > +     "movl   -4(%%rsi,%%rdx), %%ecx\n\t"
> > +     "movl   (%%rsi), %%esi\n\t"
> > +     "movl   %%ecx, -4(%%rdi,%%rdx)\n\t"
> > +     "movl   %%esi, (%%rdi)\n\t"
> > +     "jmp    %l[done]\n\t"
> > +     "207:\n\t"
> > +     "movzwl -2(%%rsi,%%rdx), %%ecx\n\t"
> > +     "movzwl (%%rsi), %%esi\n\t"
> > +     "movw   %%cx, -2(%%rdi,%%rdx)\n\t"
> > +     "movw   %%si, (%%rdi)\n\t"
> > +     "jmp    %l[done]\n\t"
> > +     "208:\n\t"
> > +     "movzbl (%%rsi), %%ecx\n\t"
> > +     "movb   %%cl, (%%rdi)"
> > +     :
> > +     : "r"(src), "r"(dst), "r"(size)
> > +     : "rcx", "rdx", "rsi", "rdi", "ymm0", "ymm1", "ymm2", "ymm3", "memory"
> > +     : done
> > +     );
> > +done:
> > +     return dst;
> > +}
> > +
> > +static __rte_always_inline void *
> > +rte_memcpy_generic(void *dst, const void *src, size_t len) {
> > +     asm goto("movq  %0, %%rsi\n\t"
> > +     "movq   %1, %%rdi\n\t"
> > +     "movq   %2, %%rdx\n\t"
> > +     "movq    %%rdi, %%rax\n\t"
> > +     "cmp     $32, %%rdx\n\t"
> > +     "jb      101f\n\t"
> > +     "cmp     $(32 * 2), %%rdx\n\t"
> > +     "ja      108f\n\t"
> > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm1\n\t"
> > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu   %%ymm1, -32(%%rdi,%%rdx)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "101:\n\t"
> > +     /* Less than 1 VEC.  */
> > +     "cmpb    $32, %%dl\n\t"
> > +     "jae     103f\n\t"
> > +     "cmpb    $16, %%dl\n\t"
> > +     "jae     104f\n\t"
> > +     "cmpb    $8, %%dl\n\t"
> > +     "jae     105f\n\t"
> > +     "cmpb    $4, %%dl\n\t"
> > +     "jae     106f\n\t"
> > +     "cmpb    $1, %%dl\n\t"
> > +     "ja      107f\n\t"
> > +     "jb      102f\n\t"
> > +     "movzbl  (%%rsi), %%ecx\n\t"
> > +     "movb    %%cl, (%%rdi)\n\t"
> > +     "102:\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "103:\n\t"
> > +     /* From 32 to 63.  No branch when size == 32.  */
> > +     "vmovdqu (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
> > +     "vmovdqu %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +     /* From 16 to 31.  No branch when size == 16.  */
> > +     "104:\n\t"
> > +     "vmovdqu (%%rsi), %%xmm0\n\t"
> > +     "vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
> > +     "vmovdqu %%xmm0, (%%rdi)\n\t"
> > +     "vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "105:\n\t"
> > +     /* From 8 to 15.  No branch when size == 8.  */
> > +     "movq    -8(%%rsi,%%rdx), %%rcx\n\t"
> > +     "movq    (%%rsi), %%rsi\n\t"
> > +     "movq    %%rcx, -8(%%rdi,%%rdx)\n\t"
> > +     "movq    %%rsi, (%%rdi)\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "106:\n\t"
> > +     /* From 4 to 7.  No branch when size == 4.  */
> > +     "movl    -4(%%rsi,%%rdx), %%ecx\n\t"
> > +     "movl    (%%rsi), %%esi\n\t"
> > +     "movl    %%ecx, -4(%%rdi,%%rdx)\n\t"
> > +     "movl    %%esi, (%%rdi)\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "107:\n\t"
> > +     /* From 2 to 3.  No branch when size == 2.  */
> > +     "movzwl  -2(%%rsi,%%rdx), %%ecx\n\t"
> > +     "movzwl  (%%rsi), %%esi\n\t"
> > +     "movw    %%cx, -2(%%rdi,%%rdx)\n\t"
> > +     "movw    %%si, (%%rdi)\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "108:\n\t"
> > +     /* More than 2 * VEC and there may be overlap between destination */
> > +     /* and source.  */
> > +     "cmpq    $(32 * 8), %%rdx\n\t"
> > +     "ja      111f\n\t"
> > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > +     "jb      109f\n\t"
> > +     /* Copy from 4 * VEC to 8 * VEC, inclusively. */
> > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm4\n\t"
> > +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
> > +     "vmovdqu   -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
> > +     "vmovdqu   -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
> > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm2, (32 * 2)(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm3, (32 * 3)(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm4, -32(%%rdi,%%rdx)\n\t"
> > +     "vmovdqu   %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
> > +     "vmovdqu   %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
> > +     "vmovdqu   %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "109:\n\t"
> > +     /* Copy from 2 * VEC to 4 * VEC. */
> > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm2\n\t"
> > +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
> > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm2, -32(%%rdi,%%rdx)\n\t"
> > +     "vmovdqu   %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
> > +     "vzeroupper\n\t"
> > +     "110:\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "111:\n\t"
> > +     "cmpq    %%rsi, %%rdi\n\t"
> > +     "ja      113f\n\t"
> > +     /* Source == destination is less common.  */
> > +     "je      110b\n\t"
> > +     /* Load the first VEC and last 4 * VEC to
> > +      * support overlapping addresses.
> > +      */
> > +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> > +     "vmovdqu   -32(%%rsi, %%rdx), %%ymm5\n\t"
> > +     "vmovdqu   -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
> > +     "vmovdqu   -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
> > +     "vmovdqu   -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
> > +     /* Save start and stop of the destination buffer.  */
> > +     "movq    %%rdi, %%r11\n\t"
> > +     "leaq    -32(%%rdi, %%rdx), %%rcx\n\t"
> > +     /* Align destination for aligned stores in the loop.  Compute */
> > +     /* how much destination is misaligned.  */
> > +     "movq    %%rdi, %%r8\n\t"
> > +     "andq    $(32 - 1), %%r8\n\t"
> > +     /* Get the negative of offset for alignment.  */
> > +     "subq    $32, %%r8\n\t"
> > +     /* Adjust source.  */
> > +     "subq    %%r8, %%rsi\n\t"
> > +     /* Adjust destination which should be aligned now.  */
> > +     "subq    %%r8, %%rdi\n\t"
> > +     /* Adjust length.  */
> > +     "addq    %%r8, %%rdx\n\t"
> > +     /* Check non-temporal store threshold.  */
> > +     "cmpq    $(1024*1024), %%rdx\n\t"
> > +     "ja      115f\n\t"
> > +     "112:\n\t"
> > +     /* Copy 4 * VEC a time forward.  */
> > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > +     "addq    $(32 * 4), %%rsi\n\t"
> > +     "subq    $(32 * 4), %%rdx\n\t"
> > +     "vmovdqa   %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqa   %%ymm1, 32(%%rdi)\n\t"
> > +     "vmovdqa   %%ymm2, (32 * 2)(%%rdi)\n\t"
> > +     "vmovdqa   %%ymm3, (32 * 3)(%%rdi)\n\t"
> > +     "addq    $(32 * 4), %%rdi\n\t"
> > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > +     "ja      112b\n\t"
> > +     /* Store the last 4 * VEC.  */
> > +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> > +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> > +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> > +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> > +     /* Store the first VEC.  */
> > +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "113:\n\t"
> > +     /* Load the first 4*VEC and last VEC to support overlapping addresses.*/
> > +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> > +     "vmovdqu   32(%%rsi), %%ymm5\n\t"
> > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm6\n\t"
> > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm7\n\t"
> > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm8\n\t"
> > +     /* Save stop of the destination buffer.  */
> > +     "leaq    -32(%%rdi, %%rdx), %%r11\n\t"
> > +     /* Align destination end for aligned stores in the loop.  Compute */
> > +     /* how much destination end is misaligned.  */
> > +     "leaq    -32(%%rsi, %%rdx), %%rcx\n\t"
> > +     "movq    %%r11, %%r9\n\t"
> > +     "movq    %%r11, %%r8\n\t"
> > +     "andq    $(32 - 1), %%r8\n\t"
> > +     /* Adjust source.  */
> > +     "subq    %%r8, %%rcx\n\t"
> > +     /* Adjust the end of destination which should be aligned now.  */
> > +     "subq    %%r8, %%r9\n\t"
> > +     /* Adjust length.  */
> > +     "subq    %%r8, %%rdx\n\t"
> > +      /* Check non-temporal store threshold.  */
> > +     "cmpq    $(1024*1024), %%rdx\n\t"
> > +     "ja      117f\n\t"
> > +     "114:\n\t"
> > +     /* Copy 4 * VEC a time backward.  */
> > +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> > +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> > +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> > +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> > +     "subq    $(32 * 4), %%rcx\n\t"
> > +     "subq    $(32 * 4), %%rdx\n\t"
> > +     "vmovdqa   %%ymm0, (%%r9)\n\t"
> > +     "vmovdqa   %%ymm1, -32(%%r9)\n\t"
> > +     "vmovdqa   %%ymm2, -(32 * 2)(%%r9)\n\t"
> > +     "vmovdqa   %%ymm3, -(32 * 3)(%%r9)\n\t"
> > +     "subq    $(32 * 4), %%r9\n\t"
> > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > +     "ja      114b\n\t"
> > +     /* Store the first 4 * VEC. */
> > +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> > +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> > +     /* Store the last VEC. */
> > +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +
> > +     "115:\n\t"
> > +     /* Don't use non-temporal store if there is overlap between */
> > +     /* destination and source since destination may be in cache */
> > +     /* when source is loaded. */
> > +     "leaq    (%%rdi, %%rdx), %%r10\n\t"
> > +     "cmpq    %%r10, %%rsi\n\t"
> > +     "jb      112b\n\t"
> > +     "116:\n\t"
> > +     /* Copy 4 * VEC a time forward with non-temporal stores.  */
> > +     "prefetcht0 (32*4*2)(%%rsi)\n\t"
> > +     "prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
> > +     "prefetcht0 (32*4*3)(%%rsi)\n\t"
> > +     "prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
> > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > +     "addq    $(32*4), %%rsi\n\t"
> > +     "subq    $(32*4), %%rdx\n\t"
> > +     "vmovntdq  %%ymm0, (%%rdi)\n\t"
> > +     "vmovntdq  %%ymm1, 32(%%rdi)\n\t"
> > +     "vmovntdq  %%ymm2, (32 * 2)(%%rdi)\n\t"
> > +     "vmovntdq  %%ymm3, (32 * 3)(%%rdi)\n\t"
> > +     "addq    $(32*4), %%rdi\n\t"
> > +     "cmpq    $(32*4), %%rdx\n\t"
> > +     "ja      116b\n\t"
> > +     "sfence\n\t"
> > +     /* Store the last 4 * VEC.  */
> > +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> > +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> > +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> > +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> > +     /* Store the first VEC.  */
> > +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "117:\n\t"
> > +     /* Don't use non-temporal store if there is overlap between */
> > +     /* destination and source since destination may be in cache */
> > +     /* when source is loaded.  */
> > +     "leaq    (%%rcx, %%rdx), %%r10\n\t"
> > +     "cmpq    %%r10, %%r9\n\t"
> > +     "jb      114b\n\t"
> > +     "118:\n\t"
> > +     /* Copy 4 * VEC a time backward with non-temporal stores. */
> > +     "prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
> > +     "prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
> > +     "prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
> > +     "prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
> > +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> > +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> > +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> > +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> > +     "subq    $(32*4), %%rcx\n\t"
> > +     "subq    $(32*4), %%rdx\n\t"
> > +     "vmovntdq  %%ymm0, (%%r9)\n\t"
> > +     "vmovntdq  %%ymm1, -32(%%r9)\n\t"
> > +     "vmovntdq  %%ymm2, -(32 * 2)(%%r9)\n\t"
> > +     "vmovntdq  %%ymm3, -(32 * 3)(%%r9)\n\t"
> > +     "subq    $(32 * 4), %%r9\n\t"
> > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > +     "ja      118b\n\t"
> > +     "sfence\n\t"
> > +     /* Store the first 4 * VEC.  */
> > +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> > +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> > +     /* Store the last VEC.  */
> > +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]"
> > +     :
> > +     : "r"(src), "r"(dst), "r"(len)
> > +     : "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "ymm0",
> > +     "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
> > +     : done
> > +     );
> > +done:
> > +     return dst;
> > +}
>
>


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal
  2021-10-21 18:12         ` Song, Keesang
@ 2021-10-21 18:41           ` Thomas Monjalon
  2021-10-21 19:03             ` Song, Keesang
  0 siblings, 1 reply; 43+ messages in thread
From: Thomas Monjalon @ 2021-10-21 18:41 UTC (permalink / raw)
  To: Aman Kumar, Song, Keesang
  Cc: Ananyev, Konstantin, dev, rasland, asafp, shys, viacheslavo,
	akozyrev, matan, Burakov, Anatoly, aman.kumar, jerinjacobk,
	Richardson, Bruce, david.marchand

Please convert it to C code, thanks.


21/10/2021 20:12, Song, Keesang:
> [AMD Official Use Only]
> 
> Hi Ananyev,
> 
> The current memcpy implementation in Glibc is based on assembly code.
> Although memcpy could have been implemented with intrinsics, since our AMD library developers are working on the Glibc functions, they have provided a tailored implementation based on inline assembly code.
> 
> Thanks for your support,
> Keesang
> 
> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Thursday, October 21, 2021 10:40 AM
> To: Song, Keesang <Keesang.Song@amd.com>; Thomas Monjalon <thomas@monjalon.net>; Aman Kumar <aman.kumar@vvdntech.in>
> Cc: dev@dpdk.org; rasland@nvidia.com; asafp@nvidia.com; shys@nvidia.com; viacheslavo@nvidia.com; akozyrev@nvidia.com; matan@nvidia.com; Burakov, Anatoly <anatoly.burakov@intel.com>; aman.kumar@vvdntech.in; jerinjacobk@gmail.com; Richardson, Bruce <bruce.richardson@intel.com>; david.marchand@redhat.com
> Subject: RE: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal
> 
> [AMD Official Use Only]
> 
> [CAUTION: External Email]
> 
> >
> > Hi Thomas,
> >
> > I hope this can make some explanation to your question.
> > We(AMD Linux library support team) have implemented the custom
> > tailored memcpy solution which is a close match with DPDK use case requirements like the below.
> > 1)      Min 64B length data packet with cache aligned Source and Destination.
> > 2)      Non-Temporal load and temporal store for cache aligned source for both RX and TX paths. Could not implement the non-temporal
> > store for TX_PATH, as non-Temporal load/stores works only with 32B aligned addresses for AVX2
> > 3)      This solution works for all AVX2 supported AMD machines.
> >
> > Internally we have completed the integrity testing and benchmarking of
> > the solution and found gains of 8.4% to 14.5% specifically on Milan
> > CPU(3rd Gen of EPYC Processor)
> 
> It is still not clear to me why it has to be written in assembler.
> Why can't similar stuff be written in C with intrinsics, as the rest of rte_memcpy.h does?
> 
> >
> > Thanks for your support,
> > Keesang
> >
> > -----Original Message-----
> > From: Thomas Monjalon <thomas@monjalon.net>
> > Sent: Tuesday, October 19, 2021 5:31 AM
> > To: Aman Kumar <aman.kumar@vvdntech.in>
> > Cc: dev@dpdk.org; rasland@nvidia.com; asafp@nvidia.com;
> > shys@nvidia.com; viacheslavo@nvidia.com; akozyrev@nvidia.com;
> > matan@nvidia.com; anatoly.burakov@intel.com; Song, Keesang
> > <Keesang.Song@amd.com>; aman.kumar@vvdntech.in; jerinjacobk@gmail.com;
> > bruce.richardson@intel.com; konstantin.ananyev@intel.com;
> > david.marchand@redhat.com
> > Subject: Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy
> > routine to eal
> >
> > [CAUTION: External Email]
> >
> > 19/10/2021 12:47, Aman Kumar:
> > > This patch provides rte_memcpy* calls optimized for AMD EPYC
> > > platforms. Use config/x86/x86_amd_epyc_linux_gcc as cross-file with
> > > meson to build dpdk for AMD EPYC platforms.
> >
> > Please split in 2 patches: platform & memcpy.
> >
> > What optimization is specific to EPYC?
> >
> > I dislike the asm code below.
> > What is AMD specific inside?
> > Can it use compiler intrinsics as it is done elsewhere?
> >
> > > +static __rte_always_inline void *
> > > +rte_memcpy_aligned_ntload_tstore16_amdepyc2(void *dst,
> > > +                                         const void *src,
> > > +                                         size_t size) {
> > > +     asm volatile goto("movq %0, %%rsi\n\t"
> > > +     "movq %1, %%rdi\n\t"
> > > +     "movq %2, %%rdx\n\t"
> > > +     "cmpq   $(128), %%rdx\n\t"
> > > +     "jb     202f\n\t"
> > > +     "201:\n\t"
> > > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > > +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> > > +     "vmovntdqa 64(%%rsi), %%ymm2\n\t"
> > > +     "vmovntdqa 96(%%rsi), %%ymm3\n\t"
> > > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> > > +     "vmovdqu  %%ymm2, 64(%%rdi)\n\t"
> > > +     "vmovdqu  %%ymm3, 96(%%rdi)\n\t"
> > > +     "addq   $128, %%rsi\n\t"
> > > +     "addq   $128, %%rdi\n\t"
> > > +     "subq   $128, %%rdx\n\t"
> > > +     "jz     %l[done]\n\t"
> > > +     "cmpq   $128, %%rdx\n\t" /*Vector Size 32B.  */
> > > +     "jae    201b\n\t"
> > > +     "202:\n\t"
> > > +     "cmpq   $64, %%rdx\n\t"
> > > +     "jb     203f\n\t"
> > > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > > +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> > > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> > > +     "addq   $64, %%rsi\n\t"
> > > +     "addq   $64, %%rdi\n\t"
> > > +     "subq   $64, %%rdx\n\t"
> > > +     "jz     %l[done]\n\t"
> > > +     "203:\n\t"
> > > +     "cmpq   $32, %%rdx\n\t"
> > > +     "jb     204f\n\t"
> > > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > > +     "addq   $32, %%rsi\n\t"
> > > +     "addq   $32, %%rdi\n\t"
> > > +     "subq   $32, %%rdx\n\t"
> > > +     "jz     %l[done]\n\t"
> > > +     "204:\n\t"
> > > +     "cmpb   $16, %%dl\n\t"
> > > +     "jb     205f\n\t"
> > > +     "vmovntdqa (%%rsi), %%xmm0\n\t"
> > > +     "vmovdqu  %%xmm0, (%%rdi)\n\t"
> > > +     "addq   $16, %%rsi\n\t"
> > > +     "addq   $16, %%rdi\n\t"
> > > +     "subq   $16, %%rdx\n\t"
> > > +     "jz     %l[done]\n\t"
> > > +     "205:\n\t"
> > > +     "cmpb   $2, %%dl\n\t"
> > > +     "jb     208f\n\t"
> > > +     "cmpb   $4, %%dl\n\t"
> > > +     "jbe    207f\n\t"
> > > +     "cmpb   $8, %%dl\n\t"
> > > +     "jbe    206f\n\t"
> > > +     "movq   -8(%%rsi,%%rdx), %%rcx\n\t"
> > > +     "movq   (%%rsi), %%rsi\n\t"
> > > +     "movq   %%rcx, -8(%%rdi,%%rdx)\n\t"
> > > +     "movq   %%rsi, (%%rdi)\n\t"
> > > +     "jmp    %l[done]\n\t"
> > > +     "206:\n\t"
> > > +     "movl   -4(%%rsi,%%rdx), %%ecx\n\t"
> > > +     "movl   (%%rsi), %%esi\n\t"
> > > +     "movl   %%ecx, -4(%%rdi,%%rdx)\n\t"
> > > +     "movl   %%esi, (%%rdi)\n\t"
> > > +     "jmp    %l[done]\n\t"
> > > +     "207:\n\t"
> > > +     "movzwl -2(%%rsi,%%rdx), %%ecx\n\t"
> > > +     "movzwl (%%rsi), %%esi\n\t"
> > > +     "movw   %%cx, -2(%%rdi,%%rdx)\n\t"
> > > +     "movw   %%si, (%%rdi)\n\t"
> > > +     "jmp    %l[done]\n\t"
> > > +     "208:\n\t"
> > > +     "movzbl (%%rsi), %%ecx\n\t"
> > > +     "movb   %%cl, (%%rdi)"
> > > +     :
> > > +     : "r"(src), "r"(dst), "r"(size)
> > > +     : "rcx", "rdx", "rsi", "rdi", "ymm0", "ymm1", "ymm2", "ymm3", "memory"
> > > +     : done
> > > +     );
> > > +done:
> > > +     return dst;
> > > +}
> > > +
> > > +static __rte_always_inline void *
> > > +rte_memcpy_generic(void *dst, const void *src, size_t len) {
> > > +     asm goto("movq  %0, %%rsi\n\t"
> > > +     "movq   %1, %%rdi\n\t"
> > > +     "movq   %2, %%rdx\n\t"
> > > +     "movq    %%rdi, %%rax\n\t"
> > > +     "cmp     $32, %%rdx\n\t"
> > > +     "jb      101f\n\t"
> > > +     "cmp     $(32 * 2), %%rdx\n\t"
> > > +     "ja      108f\n\t"
> > > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm1\n\t"
> > > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm1, -32(%%rdi,%%rdx)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "101:\n\t"
> > > +     /* Less than 1 VEC.  */
> > > +     "cmpb    $32, %%dl\n\t"
> > > +     "jae     103f\n\t"
> > > +     "cmpb    $16, %%dl\n\t"
> > > +     "jae     104f\n\t"
> > > +     "cmpb    $8, %%dl\n\t"
> > > +     "jae     105f\n\t"
> > > +     "cmpb    $4, %%dl\n\t"
> > > +     "jae     106f\n\t"
> > > +     "cmpb    $1, %%dl\n\t"
> > > +     "ja      107f\n\t"
> > > +     "jb      102f\n\t"
> > > +     "movzbl  (%%rsi), %%ecx\n\t"
> > > +     "movb    %%cl, (%%rdi)\n\t"
> > > +     "102:\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "103:\n\t"
> > > +     /* From 32 to 63.  No branch when size == 32.  */
> > > +     "vmovdqu (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
> > > +     "vmovdqu %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     /* From 16 to 31.  No branch when size == 16.  */
> > > +     "104:\n\t"
> > > +     "vmovdqu (%%rsi), %%xmm0\n\t"
> > > +     "vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
> > > +     "vmovdqu %%xmm0, (%%rdi)\n\t"
> > > +     "vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "105:\n\t"
> > > +     /* From 8 to 15.  No branch when size == 8.  */
> > > +     "movq    -8(%%rsi,%%rdx), %%rcx\n\t"
> > > +     "movq    (%%rsi), %%rsi\n\t"
> > > +     "movq    %%rcx, -8(%%rdi,%%rdx)\n\t"
> > > +     "movq    %%rsi, (%%rdi)\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "106:\n\t"
> > > +     /* From 4 to 7.  No branch when size == 4.  */
> > > +     "movl    -4(%%rsi,%%rdx), %%ecx\n\t"
> > > +     "movl    (%%rsi), %%esi\n\t"
> > > +     "movl    %%ecx, -4(%%rdi,%%rdx)\n\t"
> > > +     "movl    %%esi, (%%rdi)\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "107:\n\t"
> > > +     /* From 2 to 3.  No branch when size == 2.  */
> > > +     "movzwl  -2(%%rsi,%%rdx), %%ecx\n\t"
> > > +     "movzwl  (%%rsi), %%esi\n\t"
> > > +     "movw    %%cx, -2(%%rdi,%%rdx)\n\t"
> > > +     "movw    %%si, (%%rdi)\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "108:\n\t"
> > > +     /* More than 2 * VEC and there may be overlap between destination */
> > > +     /* and source.  */
> > > +     "cmpq    $(32 * 8), %%rdx\n\t"
> > > +     "ja      111f\n\t"
> > > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > > +     "jb      109f\n\t"
> > > +     /* Copy from 4 * VEC to 8 * VEC, inclusively. */
> > > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm4\n\t"
> > > +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
> > > +     "vmovdqu   -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
> > > +     "vmovdqu   -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
> > > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm2, (32 * 2)(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm3, (32 * 3)(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm4, -32(%%rdi,%%rdx)\n\t"
> > > +     "vmovdqu   %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
> > > +     "vmovdqu   %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
> > > +     "vmovdqu   %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "109:\n\t"
> > > +     /* Copy from 2 * VEC to 4 * VEC. */
> > > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm2\n\t"
> > > +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
> > > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm2, -32(%%rdi,%%rdx)\n\t"
> > > +     "vmovdqu   %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "110:\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "111:\n\t"
> > > +     "cmpq    %%rsi, %%rdi\n\t"
> > > +     "ja      113f\n\t"
> > > +     /* Source == destination is less common.  */
> > > +     "je      110b\n\t"
> > > +     /* Load the first VEC and last 4 * VEC to
> > > +      * support overlapping addresses.
> > > +      */
> > > +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> > > +     "vmovdqu   -32(%%rsi, %%rdx), %%ymm5\n\t"
> > > +     "vmovdqu   -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
> > > +     "vmovdqu   -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
> > > +     "vmovdqu   -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
> > > +     /* Save start and stop of the destination buffer.  */
> > > +     "movq    %%rdi, %%r11\n\t"
> > > +     "leaq    -32(%%rdi, %%rdx), %%rcx\n\t"
> > > +     /* Align destination for aligned stores in the loop.  Compute */
> > > +     /* how much destination is misaligned.  */
> > > +     "movq    %%rdi, %%r8\n\t"
> > > +     "andq    $(32 - 1), %%r8\n\t"
> > > +     /* Get the negative of offset for alignment.  */
> > > +     "subq    $32, %%r8\n\t"
> > > +     /* Adjust source.  */
> > > +     "subq    %%r8, %%rsi\n\t"
> > > +     /* Adjust destination which should be aligned now.  */
> > > +     "subq    %%r8, %%rdi\n\t"
> > > +     /* Adjust length.  */
> > > +     "addq    %%r8, %%rdx\n\t"
> > > +     /* Check non-temporal store threshold.  */
> > > +     "cmpq    $(1024*1024), %%rdx\n\t"
> > > +     "ja      115f\n\t"
> > > +     "112:\n\t"
> > > +     /* Copy 4 * VEC a time forward.  */
> > > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > > +     "addq    $(32 * 4), %%rsi\n\t"
> > > +     "subq    $(32 * 4), %%rdx\n\t"
> > > +     "vmovdqa   %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqa   %%ymm1, 32(%%rdi)\n\t"
> > > +     "vmovdqa   %%ymm2, (32 * 2)(%%rdi)\n\t"
> > > +     "vmovdqa   %%ymm3, (32 * 3)(%%rdi)\n\t"
> > > +     "addq    $(32 * 4), %%rdi\n\t"
> > > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > > +     "ja      112b\n\t"
> > > +     /* Store the last 4 * VEC.  */
> > > +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> > > +     /* Store the first VEC.  */
> > > +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "113:\n\t"
> > > +     /* Load the first 4*VEC and last VEC to support overlapping addresses.*/
> > > +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> > > +     "vmovdqu   32(%%rsi), %%ymm5\n\t"
> > > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm6\n\t"
> > > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm7\n\t"
> > > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm8\n\t"
> > > +     /* Save stop of the destination buffer.  */
> > > +     "leaq    -32(%%rdi, %%rdx), %%r11\n\t"
> > > +     /* Align destination end for aligned stores in the loop.  Compute */
> > > +     /* how much destination end is misaligned.  */
> > > +     "leaq    -32(%%rsi, %%rdx), %%rcx\n\t"
> > > +     "movq    %%r11, %%r9\n\t"
> > > +     "movq    %%r11, %%r8\n\t"
> > > +     "andq    $(32 - 1), %%r8\n\t"
> > > +     /* Adjust source.  */
> > > +     "subq    %%r8, %%rcx\n\t"
> > > +     /* Adjust the end of destination which should be aligned now.  */
> > > +     "subq    %%r8, %%r9\n\t"
> > > +     /* Adjust length.  */
> > > +     "subq    %%r8, %%rdx\n\t"
> > > +      /* Check non-temporal store threshold.  */
> > > +     "cmpq    $(1024*1024), %%rdx\n\t"
> > > +     "ja      117f\n\t"
> > > +     "114:\n\t"
> > > +     /* Copy 4 * VEC a time backward.  */
> > > +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> > > +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> > > +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> > > +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> > > +     "subq    $(32 * 4), %%rcx\n\t"
> > > +     "subq    $(32 * 4), %%rdx\n\t"
> > > +     "vmovdqa   %%ymm0, (%%r9)\n\t"
> > > +     "vmovdqa   %%ymm1, -32(%%r9)\n\t"
> > > +     "vmovdqa   %%ymm2, -(32 * 2)(%%r9)\n\t"
> > > +     "vmovdqa   %%ymm3, -(32 * 3)(%%r9)\n\t"
> > > +     "subq    $(32 * 4), %%r9\n\t"
> > > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > > +     "ja      114b\n\t"
> > > +     /* Store the first 4 * VEC. */
> > > +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> > > +     /* Store the last VEC. */
> > > +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +
> > > +     "115:\n\t"
> > > +     /* Don't use non-temporal store if there is overlap between */
> > > +     /* destination and source since destination may be in cache */
> > > +     /* when source is loaded. */
> > > +     "leaq    (%%rdi, %%rdx), %%r10\n\t"
> > > +     "cmpq    %%r10, %%rsi\n\t"
> > > +     "jb      112b\n\t"
> > > +     "116:\n\t"
> > > +     /* Copy 4 * VEC a time forward with non-temporal stores.  */
> > > +     "prefetcht0 (32*4*2)(%%rsi)\n\t"
> > > +     "prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
> > > +     "prefetcht0 (32*4*3)(%%rsi)\n\t"
> > > +     "prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
> > > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > > +     "addq    $(32*4), %%rsi\n\t"
> > > +     "subq    $(32*4), %%rdx\n\t"
> > > +     "vmovntdq  %%ymm0, (%%rdi)\n\t"
> > > +     "vmovntdq  %%ymm1, 32(%%rdi)\n\t"
> > > +     "vmovntdq  %%ymm2, (32 * 2)(%%rdi)\n\t"
> > > +     "vmovntdq  %%ymm3, (32 * 3)(%%rdi)\n\t"
> > > +     "addq    $(32*4), %%rdi\n\t"
> > > +     "cmpq    $(32*4), %%rdx\n\t"
> > > +     "ja      116b\n\t"
> > > +     "sfence\n\t"
> > > +     /* Store the last 4 * VEC.  */
> > > +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> > > +     /* Store the first VEC.  */
> > > +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "117:\n\t"
> > > +     /* Don't use non-temporal store if there is overlap between */
> > > +     /* destination and source since destination may be in cache */
> > > +     /* when source is loaded.  */
> > > +     "leaq    (%%rcx, %%rdx), %%r10\n\t"
> > > +     "cmpq    %%r10, %%r9\n\t"
> > > +     "jb      114b\n\t"
> > > +     "118:\n\t"
> > > +     /* Copy 4 * VEC a time backward with non-temporal stores. */
> > > +     "prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
> > > +     "prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
> > > +     "prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
> > > +     "prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
> > > +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> > > +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> > > +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> > > +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> > > +     "subq    $(32*4), %%rcx\n\t"
> > > +     "subq    $(32*4), %%rdx\n\t"
> > > +     "vmovntdq  %%ymm0, (%%r9)\n\t"
> > > +     "vmovntdq  %%ymm1, -32(%%r9)\n\t"
> > > +     "vmovntdq  %%ymm2, -(32 * 2)(%%r9)\n\t"
> > > +     "vmovntdq  %%ymm3, -(32 * 3)(%%r9)\n\t"
> > > +     "subq    $(32 * 4), %%r9\n\t"
> > > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > > +     "ja      118b\n\t"
> > > +     "sfence\n\t"
> > > +     /* Store the first 4 * VEC.  */
> > > +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> > > +     /* Store the last VEC.  */
> > > +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]"
> > > +     :
> > > +     : "r"(src), "r"(dst), "r"(len)
> > > +     : "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "ymm0",
> > > +     "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
> > > +     : done
> > > +     );
> > > +done:
> > > +     return dst;
> > > +}
> >
> >
> 
> 






^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal
  2021-10-21 18:41           ` Thomas Monjalon
@ 2021-10-21 19:03             ` Song, Keesang
  2021-10-21 19:50               ` Thomas Monjalon
  0 siblings, 1 reply; 43+ messages in thread
From: Song, Keesang @ 2021-10-21 19:03 UTC (permalink / raw)
  To: Thomas Monjalon, Aman Kumar
  Cc: Ananyev, Konstantin, dev, rasland, asafp, shys, viacheslavo,
	akozyrev, matan, Burakov, Anatoly, aman.kumar, jerinjacobk,
	Richardson, Bruce, david.marchand

[AMD Official Use Only]

Hi Thomas,

I've already asked our AMD tools team, but they're saying they are not really familiar with C code implementation. We need your approval for now since we really need to get this patch submitted to 21.11 LTS.

Thanks,
Keesang

-----Original Message-----
From: Thomas Monjalon <thomas@monjalon.net>
Sent: Thursday, October 21, 2021 11:42 AM
To: Aman Kumar <aman.kumar@vvdntech.in>; Song, Keesang <Keesang.Song@amd.com>
Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org; rasland@nvidia.com; asafp@nvidia.com; shys@nvidia.com; viacheslavo@nvidia.com; akozyrev@nvidia.com; matan@nvidia.com; Burakov, Anatoly <anatoly.burakov@intel.com>; aman.kumar@vvdntech.in; jerinjacobk@gmail.com; Richardson, Bruce <bruce.richardson@intel.com>; david.marchand@redhat.com
Subject: Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal

[CAUTION: External Email]

Please convert it to C code, thanks.


21/10/2021 20:12, Song, Keesang:
> [AMD Official Use Only]
>
> Hi Ananyev,
>
> The current memcpy implementation in Glibc is based on assembly code.
> Although memcpy could have been implemented with intrinsics, since our AMD library developers are working on the Glibc functions, they have provided a tailored implementation based on inline assembly code.
>
> Thanks for your support,
> Keesang
>
> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Thursday, October 21, 2021 10:40 AM
> To: Song, Keesang <Keesang.Song@amd.com>; Thomas Monjalon
> <thomas@monjalon.net>; Aman Kumar <aman.kumar@vvdntech.in>
> Cc: dev@dpdk.org; rasland@nvidia.com; asafp@nvidia.com;
> shys@nvidia.com; viacheslavo@nvidia.com; akozyrev@nvidia.com;
> matan@nvidia.com; Burakov, Anatoly <anatoly.burakov@intel.com>;
> aman.kumar@vvdntech.in; jerinjacobk@gmail.com; Richardson, Bruce
> <bruce.richardson@intel.com>; david.marchand@redhat.com
> Subject: RE: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy
> routine to eal
>
> [AMD Official Use Only]
>
> [CAUTION: External Email]
>
> >
> > Hi Thomas,
> >
> > I hope this can make some explanation to your question.
> > We(AMD Linux library support team) have implemented the custom
> > tailored memcpy solution which is a close match with DPDK use case requirements like the below.
> > 1)      Min 64B length data packet with cache aligned Source and Destination.
> > 2)      Non-Temporal load and temporal store for cache aligned source for both RX and TX paths. Could not implement the non-temporal
> > store for TX_PATH, as non-Temporal load/stores works only with 32B aligned addresses for AVX2
> > 3)      This solution works for all AVX2 supported AMD machines.
> >
> > Internally we have completed the integrity testing and benchmarking
> > of the solution and found gains of 8.4% to 14.5% specifically on
> > Milan CPU(3rd Gen of EPYC Processor)
>
> It is still not clear to me why it has to be written in assembler.
> Why can't similar stuff be written in C with intrinsics, as the rest of rte_memcpy.h does?
>
> >
> > Thanks for your support,
> > Keesang
> >
> > -----Original Message-----
> > From: Thomas Monjalon <thomas@monjalon.net>
> > Sent: Tuesday, October 19, 2021 5:31 AM
> > To: Aman Kumar <aman.kumar@vvdntech.in>
> > Cc: dev@dpdk.org; rasland@nvidia.com; asafp@nvidia.com;
> > shys@nvidia.com; viacheslavo@nvidia.com; akozyrev@nvidia.com;
> > matan@nvidia.com; anatoly.burakov@intel.com; Song, Keesang
> > <Keesang.Song@amd.com>; aman.kumar@vvdntech.in;
> > jerinjacobk@gmail.com; bruce.richardson@intel.com;
> > konstantin.ananyev@intel.com; david.marchand@redhat.com
> > Subject: Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy
> > routine to eal
> >
> > [CAUTION: External Email]
> >
> > 19/10/2021 12:47, Aman Kumar:
> > > This patch provides rte_memcpy* calls optimized for AMD EPYC
> > > platforms. Use config/x86/x86_amd_epyc_linux_gcc as cross-file
> > > with meson to build dpdk for AMD EPYC platforms.
> >
> > Please split in 2 patches: platform & memcpy.
> >
> > What optimization is specific to EPYC?
> >
> > I dislike the asm code below.
> > What is AMD specific inside?
> > Can it use compiler intrinsics as it is done elsewhere?
> >
> > > +static __rte_always_inline void *
> > > +rte_memcpy_aligned_ntload_tstore16_amdepyc2(void *dst,
> > > +                                         const void *src,
> > > +                                         size_t size) {
> > > +     asm volatile goto("movq %0, %%rsi\n\t"
> > > +     "movq %1, %%rdi\n\t"
> > > +     "movq %2, %%rdx\n\t"
> > > +     "cmpq   $(128), %%rdx\n\t"
> > > +     "jb     202f\n\t"
> > > +     "201:\n\t"
> > > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > > +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> > > +     "vmovntdqa 64(%%rsi), %%ymm2\n\t"
> > > +     "vmovntdqa 96(%%rsi), %%ymm3\n\t"
> > > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> > > +     "vmovdqu  %%ymm2, 64(%%rdi)\n\t"
> > > +     "vmovdqu  %%ymm3, 96(%%rdi)\n\t"
> > > +     "addq   $128, %%rsi\n\t"
> > > +     "addq   $128, %%rdi\n\t"
> > > +     "subq   $128, %%rdx\n\t"
> > > +     "jz     %l[done]\n\t"
> > > +     "cmpq   $128, %%rdx\n\t" /*Vector Size 32B.  */
> > > +     "jae    201b\n\t"
> > > +     "202:\n\t"
> > > +     "cmpq   $64, %%rdx\n\t"
> > > +     "jb     203f\n\t"
> > > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > > +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> > > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> > > +     "addq   $64, %%rsi\n\t"
> > > +     "addq   $64, %%rdi\n\t"
> > > +     "subq   $64, %%rdx\n\t"
> > > +     "jz     %l[done]\n\t"
> > > +     "203:\n\t"
> > > +     "cmpq   $32, %%rdx\n\t"
> > > +     "jb     204f\n\t"
> > > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > > +     "addq   $32, %%rsi\n\t"
> > > +     "addq   $32, %%rdi\n\t"
> > > +     "subq   $32, %%rdx\n\t"
> > > +     "jz     %l[done]\n\t"
> > > +     "204:\n\t"
> > > +     "cmpb   $16, %%dl\n\t"
> > > +     "jb     205f\n\t"
> > > +     "vmovntdqa (%%rsi), %%xmm0\n\t"
> > > +     "vmovdqu  %%xmm0, (%%rdi)\n\t"
> > > +     "addq   $16, %%rsi\n\t"
> > > +     "addq   $16, %%rdi\n\t"
> > > +     "subq   $16, %%rdx\n\t"
> > > +     "jz     %l[done]\n\t"
> > > +     "205:\n\t"
> > > +     "cmpb   $2, %%dl\n\t"
> > > +     "jb     208f\n\t"
> > > +     "cmpb   $4, %%dl\n\t"
> > > +     "jbe    207f\n\t"
> > > +     "cmpb   $8, %%dl\n\t"
> > > +     "jbe    206f\n\t"
> > > +     "movq   -8(%%rsi,%%rdx), %%rcx\n\t"
> > > +     "movq   (%%rsi), %%rsi\n\t"
> > > +     "movq   %%rcx, -8(%%rdi,%%rdx)\n\t"
> > > +     "movq   %%rsi, (%%rdi)\n\t"
> > > +     "jmp    %l[done]\n\t"
> > > +     "206:\n\t"
> > > +     "movl   -4(%%rsi,%%rdx), %%ecx\n\t"
> > > +     "movl   (%%rsi), %%esi\n\t"
> > > +     "movl   %%ecx, -4(%%rdi,%%rdx)\n\t"
> > > +     "movl   %%esi, (%%rdi)\n\t"
> > > +     "jmp    %l[done]\n\t"
> > > +     "207:\n\t"
> > > +     "movzwl -2(%%rsi,%%rdx), %%ecx\n\t"
> > > +     "movzwl (%%rsi), %%esi\n\t"
> > > +     "movw   %%cx, -2(%%rdi,%%rdx)\n\t"
> > > +     "movw   %%si, (%%rdi)\n\t"
> > > +     "jmp    %l[done]\n\t"
> > > +     "208:\n\t"
> > > +     "movzbl (%%rsi), %%ecx\n\t"
> > > +     "movb   %%cl, (%%rdi)"
> > > +     :
> > > +     : "r"(src), "r"(dst), "r"(size)
> > > +     : "rcx", "rdx", "rsi", "rdi", "ymm0", "ymm1", "ymm2", "ymm3", "memory"
> > > +     : done
> > > +     );
> > > +done:
> > > +     return dst;
> > > +}
> > > +
> > > +static __rte_always_inline void * rte_memcpy_generic(void *dst,
> > > +const void *src, size_t len) {
> > > +     asm goto("movq  %0, %%rsi\n\t"
> > > +     "movq   %1, %%rdi\n\t"
> > > +     "movq   %2, %%rdx\n\t"
> > > +     "movq    %%rdi, %%rax\n\t"
> > > +     "cmp     $32, %%rdx\n\t"
> > > +     "jb      101f\n\t"
> > > +     "cmp     $(32 * 2), %%rdx\n\t"
> > > +     "ja      108f\n\t"
> > > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm1\n\t"
> > > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm1, -32(%%rdi,%%rdx)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "101:\n\t"
> > > +     /* Less than 1 VEC.  */
> > > +     "cmpb    $32, %%dl\n\t"
> > > +     "jae     103f\n\t"
> > > +     "cmpb    $16, %%dl\n\t"
> > > +     "jae     104f\n\t"
> > > +     "cmpb    $8, %%dl\n\t"
> > > +     "jae     105f\n\t"
> > > +     "cmpb    $4, %%dl\n\t"
> > > +     "jae     106f\n\t"
> > > +     "cmpb    $1, %%dl\n\t"
> > > +     "ja      107f\n\t"
> > > +     "jb      102f\n\t"
> > > +     "movzbl  (%%rsi), %%ecx\n\t"
> > > +     "movb    %%cl, (%%rdi)\n\t"
> > > +     "102:\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "103:\n\t"
> > > +     /* From 32 to 63.  No branch when size == 32.  */
> > > +     "vmovdqu (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
> > > +     "vmovdqu %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     /* From 16 to 31.  No branch when size == 16.  */
> > > +     "104:\n\t"
> > > +     "vmovdqu (%%rsi), %%xmm0\n\t"
> > > +     "vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
> > > +     "vmovdqu %%xmm0, (%%rdi)\n\t"
> > > +     "vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "105:\n\t"
> > > +     /* From 8 to 15.  No branch when size == 8.  */
> > > +     "movq    -8(%%rsi,%%rdx), %%rcx\n\t"
> > > +     "movq    (%%rsi), %%rsi\n\t"
> > > +     "movq    %%rcx, -8(%%rdi,%%rdx)\n\t"
> > > +     "movq    %%rsi, (%%rdi)\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "106:\n\t"
> > > +     /* From 4 to 7.  No branch when size == 4.  */
> > > +     "movl    -4(%%rsi,%%rdx), %%ecx\n\t"
> > > +     "movl    (%%rsi), %%esi\n\t"
> > > +     "movl    %%ecx, -4(%%rdi,%%rdx)\n\t"
> > > +     "movl    %%esi, (%%rdi)\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "107:\n\t"
> > > +     /* From 2 to 3.  No branch when size == 2.  */
> > > +     "movzwl  -2(%%rsi,%%rdx), %%ecx\n\t"
> > > +     "movzwl  (%%rsi), %%esi\n\t"
> > > +     "movw    %%cx, -2(%%rdi,%%rdx)\n\t"
> > > +     "movw    %%si, (%%rdi)\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "108:\n\t"
> > > +     /* More than 2 * VEC and there may be overlap between destination */
> > > +     /* and source.  */
> > > +     "cmpq    $(32 * 8), %%rdx\n\t"
> > > +     "ja      111f\n\t"
> > > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > > +     "jb      109f\n\t"
> > > +     /* Copy from 4 * VEC to 8 * VEC, inclusively. */
> > > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm4\n\t"
> > > +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
> > > +     "vmovdqu   -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
> > > +     "vmovdqu   -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
> > > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm2, (32 * 2)(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm3, (32 * 3)(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm4, -32(%%rdi,%%rdx)\n\t"
> > > +     "vmovdqu   %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
> > > +     "vmovdqu   %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
> > > +     "vmovdqu   %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "109:\n\t"
> > > +     /* Copy from 2 * VEC to 4 * VEC. */
> > > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm2\n\t"
> > > +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
> > > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm2, -32(%%rdi,%%rdx)\n\t"
> > > +     "vmovdqu   %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "110:\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "111:\n\t"
> > > +     "cmpq    %%rsi, %%rdi\n\t"
> > > +     "ja      113f\n\t"
> > > +     /* Source == destination is less common.  */
> > > +     "je      110b\n\t"
> > > +     /* Load the first VEC and last 4 * VEC to
> > > +      * support overlapping addresses.
> > > +      */
> > > +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> > > +     "vmovdqu   -32(%%rsi, %%rdx), %%ymm5\n\t"
> > > +     "vmovdqu   -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
> > > +     "vmovdqu   -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
> > > +     "vmovdqu   -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
> > > +     /* Save start and stop of the destination buffer.  */
> > > +     "movq    %%rdi, %%r11\n\t"
> > > +     "leaq    -32(%%rdi, %%rdx), %%rcx\n\t"
> > > +     /* Align destination for aligned stores in the loop.  Compute */
> > > +     /* how much destination is misaligned.  */
> > > +     "movq    %%rdi, %%r8\n\t"
> > > +     "andq    $(32 - 1), %%r8\n\t"
> > > +     /* Get the negative of offset for alignment.  */
> > > +     "subq    $32, %%r8\n\t"
> > > +     /* Adjust source.  */
> > > +     "subq    %%r8, %%rsi\n\t"
> > > +     /* Adjust destination which should be aligned now.  */
> > > +     "subq    %%r8, %%rdi\n\t"
> > > +     /* Adjust length.  */
> > > +     "addq    %%r8, %%rdx\n\t"
> > > +     /* Check non-temporal store threshold.  */
> > > +     "cmpq    $(1024*1024), %%rdx\n\t"
> > > +     "ja      115f\n\t"
> > > +     "112:\n\t"
> > > +     /* Copy 4 * VEC a time forward.  */
> > > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > > +     "addq    $(32 * 4), %%rsi\n\t"
> > > +     "subq    $(32 * 4), %%rdx\n\t"
> > > +     "vmovdqa   %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqa   %%ymm1, 32(%%rdi)\n\t"
> > > +     "vmovdqa   %%ymm2, (32 * 2)(%%rdi)\n\t"
> > > +     "vmovdqa   %%ymm3, (32 * 3)(%%rdi)\n\t"
> > > +     "addq    $(32 * 4), %%rdi\n\t"
> > > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > > +     "ja      112b\n\t"
> > > +     /* Store the last 4 * VEC.  */
> > > +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> > > +     /* Store the first VEC.  */
> > > +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "113:\n\t"
> > > +     /* Load the first 4*VEC and last VEC to support overlapping addresses.*/
> > > +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> > > +     "vmovdqu   32(%%rsi), %%ymm5\n\t"
> > > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm6\n\t"
> > > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm7\n\t"
> > > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm8\n\t"
> > > +     /* Save stop of the destination buffer.  */
> > > +     "leaq    -32(%%rdi, %%rdx), %%r11\n\t"
> > > +     /* Align destination end for aligned stores in the loop.  Compute */
> > > +     /* how much destination end is misaligned.  */
> > > +     "leaq    -32(%%rsi, %%rdx), %%rcx\n\t"
> > > +     "movq    %%r11, %%r9\n\t"
> > > +     "movq    %%r11, %%r8\n\t"
> > > +     "andq    $(32 - 1), %%r8\n\t"
> > > +     /* Adjust source.  */
> > > +     "subq    %%r8, %%rcx\n\t"
> > > +     /* Adjust the end of destination which should be aligned now.  */
> > > +     "subq    %%r8, %%r9\n\t"
> > > +     /* Adjust length.  */
> > > +     "subq    %%r8, %%rdx\n\t"
> > > +      /* Check non-temporal store threshold.  */
> > > +     "cmpq    $(1024*1024), %%rdx\n\t"
> > > +     "ja      117f\n\t"
> > > +     "114:\n\t"
> > > +     /* Copy 4 * VEC a time backward.  */
> > > +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> > > +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> > > +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> > > +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> > > +     "subq    $(32 * 4), %%rcx\n\t"
> > > +     "subq    $(32 * 4), %%rdx\n\t"
> > > +     "vmovdqa   %%ymm0, (%%r9)\n\t"
> > > +     "vmovdqa   %%ymm1, -32(%%r9)\n\t"
> > > +     "vmovdqa   %%ymm2, -(32 * 2)(%%r9)\n\t"
> > > +     "vmovdqa   %%ymm3, -(32 * 3)(%%r9)\n\t"
> > > +     "subq    $(32 * 4), %%r9\n\t"
> > > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > > +     "ja      114b\n\t"
> > > +     /* Store the first 4 * VEC. */
> > > +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> > > +     /* Store the last VEC. */
> > > +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +
> > > +     "115:\n\t"
> > > +     /* Don't use non-temporal store if there is overlap between */
> > > +     /* destination and source since destination may be in cache */
> > > +     /* when source is loaded. */
> > > +     "leaq    (%%rdi, %%rdx), %%r10\n\t"
> > > +     "cmpq    %%r10, %%rsi\n\t"
> > > +     "jb      112b\n\t"
> > > +     "116:\n\t"
> > > +     /* Copy 4 * VEC a time forward with non-temporal stores.  */
> > > +     "prefetcht0 (32*4*2)(%%rsi)\n\t"
> > > +     "prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
> > > +     "prefetcht0 (32*4*3)(%%rsi)\n\t"
> > > +     "prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
> > > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > > +     "addq    $(32*4), %%rsi\n\t"
> > > +     "subq    $(32*4), %%rdx\n\t"
> > > +     "vmovntdq  %%ymm0, (%%rdi)\n\t"
> > > +     "vmovntdq  %%ymm1, 32(%%rdi)\n\t"
> > > +     "vmovntdq  %%ymm2, (32 * 2)(%%rdi)\n\t"
> > > +     "vmovntdq  %%ymm3, (32 * 3)(%%rdi)\n\t"
> > > +     "addq    $(32*4), %%rdi\n\t"
> > > +     "cmpq    $(32*4), %%rdx\n\t"
> > > +     "ja      116b\n\t"
> > > +     "sfence\n\t"
> > > +     /* Store the last 4 * VEC.  */
> > > +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> > > +     /* Store the first VEC.  */
> > > +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "117:\n\t"
> > > +     /* Don't use non-temporal store if there is overlap between */
> > > +     /* destination and source since destination may be in cache */
> > > +     /* when source is loaded.  */
> > > +     "leaq    (%%rcx, %%rdx), %%r10\n\t"
> > > +     "cmpq    %%r10, %%r9\n\t"
> > > +     "jb      114b\n\t"
> > > +     "118:\n\t"
> > > +     /* Copy 4 * VEC a time backward with non-temporal stores. */
> > > +     "prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
> > > +     "prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
> > > +     "prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
> > > +     "prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
> > > +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> > > +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> > > +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> > > +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> > > +     "subq    $(32*4), %%rcx\n\t"
> > > +     "subq    $(32*4), %%rdx\n\t"
> > > +     "vmovntdq  %%ymm0, (%%r9)\n\t"
> > > +     "vmovntdq  %%ymm1, -32(%%r9)\n\t"
> > > +     "vmovntdq  %%ymm2, -(32 * 2)(%%r9)\n\t"
> > > +     "vmovntdq  %%ymm3, -(32 * 3)(%%r9)\n\t"
> > > +     "subq    $(32 * 4), %%r9\n\t"
> > > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > > +     "ja      118b\n\t"
> > > +     "sfence\n\t"
> > > +     /* Store the first 4 * VEC.  */
> > > +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> > > +     /* Store the last VEC.  */
> > > +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]"
> > > +     :
> > > +     : "r"(src), "r"(dst), "r"(len)
> > > +     : "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "ymm0",
> > > +     "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
> > > +     : done
> > > +     );
> > > +done:
> > > +     return dst;
> > > +}
> >
> >
>
>






^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal
  2021-10-21 19:03             ` Song, Keesang
@ 2021-10-21 19:50               ` Thomas Monjalon
  0 siblings, 0 replies; 43+ messages in thread
From: Thomas Monjalon @ 2021-10-21 19:50 UTC (permalink / raw)
  To: Aman Kumar, Song, Keesang
  Cc: Ananyev, Konstantin, dev, rasland, asafp, shys, viacheslavo,
	akozyrev, matan, Burakov, Anatoly, aman.kumar, jerinjacobk,
	Richardson, Bruce, david.marchand

21/10/2021 21:03, Song, Keesang:
> From: Thomas Monjalon <thomas@monjalon.net>
> > 21/10/2021 20:12, Song, Keesang:
> > > From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > > > 21/10/2021 19:10, Song, Keesang:
> > > > > 19/10/2021 17:35, Stephen Hemminger:
> > > > > > From: Thomas Monjalon <thomas@monjalon.net>
> > > > > > > 19/10/2021 12:47, Aman Kumar:
> > > > > > > > This patch provides rte_memcpy* calls optimized for AMD EPYC
> > > > > > > > platforms. Use config/x86/x86_amd_epyc_linux_gcc as cross-file
> > > > > > > > with meson to build dpdk for AMD EPYC platforms.
> > > > > > > 
> > > > > > > Please split in 2 patches: platform & memcpy.
> > > > > > > 
> > > > > > > What optimization is specific to EPYC?
> > > > > > > 
> > > > > > > I dislike the asm code below.
> > > > > > > What is AMD specific inside?
> > > > > > > Can it use compiler intrinsics as it is done elsewhere?
> > > > > > 
> > > > > > And why is this not done by Gcc?
> > > > >
> > > > > I hope this can make some explanation to your question.
> > > > > We(AMD Linux library support team) have implemented the custom
> > > > > tailored memcpy solution which is a close match with DPDK use case
> > > > > requirements like the below.
> > > > > 1)      Min 64B length data packet with cache aligned
> > > > > Source and Destination.
> > > > > 2)      Non-Temporal load and temporal store for cache aligned
> > > > > source for both RX and TX paths.
> > > > > Could not implement the non-temporal store for TX_PATH,
> > > > > as non-Temporal load/stores works only with 32B aligned addresses
> > > > > for AVX2
> > > > > 3)      This solution works for all AVX2 supported AMD machines.
> > > > > 
> > > > > Internally we have completed the integrity testing and benchmarking
> > > > > of the solution and found gains of 8.4% to 14.5% specifically on
> > > > > Milan CPU(3rd Gen of EPYC Processor)
> > > > 
> > > > It still not clear to me why it has to be written in assembler.
> > > > Why similar stuff can't be written in C with instincts, as rest of
> > > > rte_memcpy.h does?
> > > 
> > > The current memcpy implementation in Glibc is based out of assembly
> > > coding.
> > > Although memcpy could have been implemented with intrinsic,
> > > but since our AMD library developers are working on the Glibc
> > > functions, they have provided a tailored implementation based
> > > out of inline assembly coding.
> > 
> > Please convert it to C code, thanks.
> 
> I've already asked our AMD tools team, but they're saying
> they are not really familiar with C code implementation.
> We need your approval for now since we really need to get
> this patch submitted to 21.11 LTS.

Not sure it is urgent given that v2 came after the planned -rc1 date,
after 6 weeks of silence.
About the approval, there are already 3 technical board members
(Konstantin, Stephen and me) objecting against this patch.
Not being familiar with C code when working on CPU optimization
in 2021 is a strange argument.

In general, I don't really understand why we should maintain memcpy
functions in DPDK instead of relying on libc optimizations.
Having big asm code to maintain and debug is not helping.

I think this case shows that AMD needs to become more familiar
with DPDK schedule and expectations.
I would encourage you to contribute more in the project,
so such misunderstanding won't happen in future.

Hope that's all understandable


PS: discussion is more readable with replies below



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal
  2021-10-19 10:47 ` [dpdk-dev] [PATCH v2 " Aman Kumar
  2021-10-19 10:47   ` [dpdk-dev] [PATCH v2 2/2] net/mlx5: optimize mprq memcpy for AMD EPYC2 platform Aman Kumar
  2021-10-19 12:31   ` [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal Thomas Monjalon
@ 2021-10-21 20:14   ` Thomas Monjalon
  2021-10-22  8:45     ` Bruce Richardson
  2021-10-26 15:56   ` [dpdk-dev] [PATCH v3 1/3] config/x86: add support for AMD platform Aman Kumar
  3 siblings, 1 reply; 43+ messages in thread
From: Thomas Monjalon @ 2021-10-21 20:14 UTC (permalink / raw)
  To: keesang.song, Aman Kumar
  Cc: dev, rasland, asafp, shys, viacheslavo, akozyrev, matan,
	anatoly.burakov, jerinjacobk, bruce.richardson

19/10/2021 12:47, Aman Kumar:
> This patch provides rte_memcpy* calls optimized for
> AMD EPYC platforms. Use config/x86/x86_amd_epyc_linux_gcc
> as cross-file with meson to build dpdk for AMD EPYC platforms.
[...]
> --- a/config/x86/meson.build
> +++ b/config/x86/meson.build
> @@ -72,3 +72,10 @@ endif
>  dpdk_conf.set('RTE_CACHE_LINE_SIZE', 64)
>  dpdk_conf.set('RTE_MAX_LCORE', 128)
>  dpdk_conf.set('RTE_MAX_NUMA_NODES', 32)
> +
> +if meson.is_cross_build()
> +	if meson.get_cross_property('platform') == 'amd-epyc'
> +	    dpdk_conf.set('RTE_MAX_LCORE', 512)
> +	    dpdk_conf.set('RTE_MEMCPY_AMDEPYC', 1)
> +	endif
> +endif

Thinking again about the cross file.
Why not using the meson option "cpu_instruction_set"
to define RTE_MACHINE as "epyc" and tune other compilation options
without using artificial cross build?

Reminder, the default in config/meson.build is:
if cpu_instruction_set == 'generic'
    if host_machine.cpu_family().startswith('x86')
        cpu_instruction_set = 'corei7'

Cc Bruce who maintains this meson code.



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal
  2021-10-21 20:14   ` Thomas Monjalon
@ 2021-10-22  8:45     ` Bruce Richardson
  0 siblings, 0 replies; 43+ messages in thread
From: Bruce Richardson @ 2021-10-22  8:45 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: keesang.song, Aman Kumar, dev, rasland, asafp, shys, viacheslavo,
	akozyrev, matan, anatoly.burakov, jerinjacobk

On Thu, Oct 21, 2021 at 10:14:47PM +0200, Thomas Monjalon wrote:
> 19/10/2021 12:47, Aman Kumar:
> > This patch provides rte_memcpy* calls optimized for
> > AMD EPYC platforms. Use config/x86/x86_amd_epyc_linux_gcc
> > as cross-file with meson to build dpdk for AMD EPYC platforms.
> [...]
> > --- a/config/x86/meson.build
> > +++ b/config/x86/meson.build
> > @@ -72,3 +72,10 @@ endif
> >  dpdk_conf.set('RTE_CACHE_LINE_SIZE', 64)
> >  dpdk_conf.set('RTE_MAX_LCORE', 128)
> >  dpdk_conf.set('RTE_MAX_NUMA_NODES', 32)
> > +
> > +if meson.is_cross_build()
> > +	if meson.get_cross_property('platform') == 'amd-epyc'
> > +	    dpdk_conf.set('RTE_MAX_LCORE', 512)
> > +	    dpdk_conf.set('RTE_MEMCPY_AMDEPYC', 1)
> > +	endif
> > +endif
> 
> Thinking again about the cross file.
> Why not using the meson option "cpu_instruction_set"
> to define RTE_MACHINE as "epyc" and tune other compilation options
> without using artificial cross build?
> 
> Reminder, the default in config/meson.build is:
> if cpu_instruction_set == 'generic'
>     if host_machine.cpu_family().startswith('x86')
>         cpu_instruction_set = 'corei7'
> 
> Cc Bruce who maintains this meson code.
>
Yes, that is a good suggestion. You could detect a particular instruction
set value and set additional defines based on it.

/Bruce

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [dpdk-dev] [PATCH v3 1/3] config/x86: add support for AMD platform
  2021-10-19 10:47 ` [dpdk-dev] [PATCH v2 " Aman Kumar
                     ` (2 preceding siblings ...)
  2021-10-21 20:14   ` Thomas Monjalon
@ 2021-10-26 15:56   ` Aman Kumar
  2021-10-26 15:56     ` [dpdk-dev] [PATCH v3 2/3] doc/guides: add dpdk build instruction for AMD platforms Aman Kumar
                       ` (3 more replies)
  3 siblings, 4 replies; 43+ messages in thread
From: Aman Kumar @ 2021-10-26 15:56 UTC (permalink / raw)
  To: dev
  Cc: thomas, viacheslavo, anatoly.burakov, keesang.song, aman.kumar,
	jerinjacobk, konstantin.ananyev, bruce.richardson

-Dcpu_instruction_set=znverX meson option can be used
to build dpdk for AMD platform. Supported options are
znver1, znver2 and znver3.

Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
---
 config/x86/meson.build | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/config/x86/meson.build b/config/x86/meson.build
index 29f3dea181..abb1bafb6e 100644
--- a/config/x86/meson.build
+++ b/config/x86/meson.build
@@ -72,3 +72,12 @@ endif
 dpdk_conf.set('RTE_CACHE_LINE_SIZE', 64)
 dpdk_conf.set('RTE_MAX_LCORE', 128)
 dpdk_conf.set('RTE_MAX_NUMA_NODES', 32)
+
+# AMD platform support
+if get_option('cpu_instruction_set') == 'znver3'
+    dpdk_conf.set('RTE_MAX_LCORE', 512)
+elif get_option('cpu_instruction_set') == 'znver2'
+    dpdk_conf.set('RTE_MAX_LCORE', 512)
+elif get_option('cpu_instruction_set') == 'znver1'
+    dpdk_conf.set('RTE_MAX_LCORE', 256)
+endif
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [dpdk-dev] [PATCH v3 2/3] doc/guides: add dpdk build instruction for AMD platforms
  2021-10-26 15:56   ` [dpdk-dev] [PATCH v3 1/3] config/x86: add support for AMD platform Aman Kumar
@ 2021-10-26 15:56     ` Aman Kumar
  2021-10-26 16:07       ` Thomas Monjalon
  2021-10-26 15:56     ` [dpdk-dev] [PATCH v3 3/3] lib/eal: add temporal store memcpy support on AMD platform Aman Kumar
                       ` (2 subsequent siblings)
  3 siblings, 1 reply; 43+ messages in thread
From: Aman Kumar @ 2021-10-26 15:56 UTC (permalink / raw)
  To: dev
  Cc: thomas, viacheslavo, anatoly.burakov, keesang.song, aman.kumar,
	jerinjacobk, konstantin.ananyev, bruce.richardson

linux guide updated with AMD platform related build
instructions.

Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
---
 doc/guides/linux_gsg/build_dpdk.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/guides/linux_gsg/build_dpdk.rst b/doc/guides/linux_gsg/build_dpdk.rst
index 0b08492ca2..3110448467 100644
--- a/doc/guides/linux_gsg/build_dpdk.rst
+++ b/doc/guides/linux_gsg/build_dpdk.rst
@@ -113,6 +113,10 @@ The instruction set will be set automatically by default according to these rule
 To override what instruction set will be used, set the ``cpu_instruction_set``
 parameter to the instruction set of your choice (such as ``corei7``, ``power8``, etc.).
 
+To build dpdk for AMD Zen CPU based platforms, pass ``-Dcpu_instruction_set=<znverX>``
+to meson when configuring the build folder initially. Supported options are znver1,
+znver2 and znver3.
+
 ``cpu_instruction_set`` is not used in Arm builds, as setting the instruction set
 without other parameters leads to inferior builds. The way to tailor Arm builds
 is to build for a SoC using ``-Dplatform=<SoC>`` mentioned above.
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [dpdk-dev] [PATCH v3 3/3] lib/eal: add temporal store memcpy support on AMD platform
  2021-10-26 15:56   ` [dpdk-dev] [PATCH v3 1/3] config/x86: add support for AMD platform Aman Kumar
  2021-10-26 15:56     ` [dpdk-dev] [PATCH v3 2/3] doc/guides: add dpdk build instruction for AMD platforms Aman Kumar
@ 2021-10-26 15:56     ` Aman Kumar
  2021-10-26 16:14       ` Thomas Monjalon
  2021-10-26 21:10       ` Stephen Hemminger
  2021-10-26 16:01     ` [dpdk-dev] [PATCH v3 1/3] config/x86: add support for " Thomas Monjalon
  2021-10-27  7:28     ` [dpdk-dev] [PATCH v4 1/2] " Aman Kumar
  3 siblings, 2 replies; 43+ messages in thread
From: Aman Kumar @ 2021-10-26 15:56 UTC (permalink / raw)
  To: dev
  Cc: thomas, viacheslavo, anatoly.burakov, keesang.song, aman.kumar,
	jerinjacobk, konstantin.ananyev, bruce.richardson

This patch provides a rte_memcpy* call with temporal stores.
Use -Dcpu_instruction_set=znverX with build to enable this API.

Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
---
 config/x86/meson.build           |   2 +
 lib/eal/x86/include/rte_memcpy.h | 114 +++++++++++++++++++++++++++++++
 2 files changed, 116 insertions(+)

diff --git a/config/x86/meson.build b/config/x86/meson.build
index abb1bafb6e..7ef7f32cd4 100644
--- a/config/x86/meson.build
+++ b/config/x86/meson.build
@@ -76,8 +76,10 @@ dpdk_conf.set('RTE_MAX_NUMA_NODES', 32)
 # AMD platform support
 if get_option('cpu_instruction_set') == 'znver3'
     dpdk_conf.set('RTE_MAX_LCORE', 512)
+    dpdk_conf.set('RTE_MEMCPY_AMDEPYC', 1)
 elif get_option('cpu_instruction_set') == 'znver2'
     dpdk_conf.set('RTE_MAX_LCORE', 512)
+    dpdk_conf.set('RTE_MEMCPY_AMDEPYC', 1)
 elif get_option('cpu_instruction_set') == 'znver1'
     dpdk_conf.set('RTE_MAX_LCORE', 256)
 endif
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 1b6c6e585f..8fe7822cb4 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -376,6 +376,120 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+#if defined RTE_MEMCPY_AMDEPYC
+
+/**
+ * Copy 16 bytes from one location to another,
+ * with temporal stores
+ */
+static __rte_always_inline void
+rte_copy16_ts(uint8_t *dst, uint8_t *src)
+{
+	__m128i var128;
+
+	var128 = _mm_stream_load_si128((__m128i *)src);
+	_mm_storeu_si128((__m128i *)dst, var128);
+}
+
+/**
+ * Copy 32 bytes from one location to another,
+ * with temporal stores
+ */
+static __rte_always_inline void
+rte_copy32_ts(uint8_t *dst, uint8_t *src)
+{
+	__m256i ymm0;
+
+	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+}
+
+/**
+ * Copy 64 bytes from one location to another,
+ * with temporal stores
+ */
+static __rte_always_inline void
+rte_copy64_ts(uint8_t *dst, uint8_t *src)
+{
+	rte_copy32_ts(dst + 0 * 32, src + 0 * 32);
+	rte_copy32_ts(dst + 1 * 32, src + 1 * 32);
+}
+
+/**
+ * Copy 128 bytes from one location to another,
+ * with temporal stores
+ */
+static __rte_always_inline void
+rte_copy128_ts(uint8_t *dst, uint8_t *src)
+{
+	rte_copy32_ts(dst + 0 * 32, src + 0 * 32);
+	rte_copy32_ts(dst + 1 * 32, src + 1 * 32);
+	rte_copy32_ts(dst + 2 * 32, src + 2 * 32);
+	rte_copy32_ts(dst + 3 * 32, src + 3 * 32);
+}
+
+/**
+ * Copy len bytes from one location to another,
+ * with temporal stores 16B aligned
+ */
+static __rte_always_inline void *
+rte_memcpy_aligned_tstore16_generic(void *dst, void *src, int len)
+{
+	void *dest = dst;
+
+	while (len >= 128) {
+		rte_copy128_ts((uint8_t *)dst, (uint8_t *)src);
+		dst = (uint8_t *)dst + 128;
+		src = (uint8_t *)src + 128;
+		len -= 128;
+	}
+	while (len >= 64) {
+		rte_copy64_ts((uint8_t *)dst, (uint8_t *)src);
+		dst = (uint8_t *)dst + 64;
+		src = (uint8_t *)src + 64;
+		len -= 64;
+	}
+	while (len >= 32) {
+		rte_copy32_ts((uint8_t *)dst, (uint8_t *)src);
+		dst = (uint8_t *)dst + 32;
+		src = (uint8_t *)src + 32;
+		len -= 32;
+	}
+	if (len >= 16) {
+		rte_copy16_ts((uint8_t *)dst, (uint8_t *)src);
+		dst = (uint8_t *)dst + 16;
+		src = (uint8_t *)src + 16;
+		len -= 16;
+	}
+	if (len >= 8) {
+		*(uint64_t *)dst = *(const uint64_t *)src;
+		dst = (uint8_t *)dst + 8;
+		src = (uint8_t *)src + 8;
+		len -= 8;
+	}
+	if (len >= 4) {
+		*(uint32_t *)dst = *(const uint32_t *)src;
+		dst = (uint8_t *)dst + 4;
+		src = (uint8_t *)src + 4;
+		len -= 4;
+	}
+	if (len != 0) {
+		dst = (uint8_t *)dst - (4 - len);
+		src = (uint8_t *)src - (4 - len);
+		*(uint32_t *)dst = *(const uint32_t *)src;
+	}
+
+	return dest;
+}
+
+static __rte_always_inline void *
+rte_memcpy_aligned_tstore16(void *dst, void *src, int len)
+{
+	return rte_memcpy_aligned_tstore16_generic(dst, src, len);
+}
+
+#endif /* RTE_MEMCPY_AMDEPYC */
+
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/3] config/x86: add support for AMD platform
  2021-10-26 15:56   ` [dpdk-dev] [PATCH v3 1/3] config/x86: add support for AMD platform Aman Kumar
  2021-10-26 15:56     ` [dpdk-dev] [PATCH v3 2/3] doc/guides: add dpdk build instruction for AMD platforms Aman Kumar
  2021-10-26 15:56     ` [dpdk-dev] [PATCH v3 3/3] lib/eal: add temporal store memcpy support on AMD platform Aman Kumar
@ 2021-10-26 16:01     ` Thomas Monjalon
  2021-10-27  6:26       ` Aman Kumar
  2021-10-27  7:28     ` [dpdk-dev] [PATCH v4 1/2] " Aman Kumar
  3 siblings, 1 reply; 43+ messages in thread
From: Thomas Monjalon @ 2021-10-26 16:01 UTC (permalink / raw)
  To: Aman Kumar
  Cc: dev, viacheslavo, anatoly.burakov, keesang.song, jerinjacobk,
	konstantin.ananyev, bruce.richardson

26/10/2021 17:56, Aman Kumar:
> -Dcpu_instruction_set=znverX meson option can be used
> to build dpdk for AMD platform. Supported options are
> znver1, znver2 and znver3.

OK that approach looks good.

> +# AMD platform support
> +if get_option('cpu_instruction_set') == 'znver3'
> +    dpdk_conf.set('RTE_MAX_LCORE', 512)
> +elif get_option('cpu_instruction_set') == 'znver2'
> +    dpdk_conf.set('RTE_MAX_LCORE', 512)
> +elif get_option('cpu_instruction_set') == 'znver1'
> +    dpdk_conf.set('RTE_MAX_LCORE', 256)
> +endif

Maybe sort it in the reverse order, starting with 1?



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v3 2/3] doc/guides: add dpdk build instruction for AMD platforms
  2021-10-26 15:56     ` [dpdk-dev] [PATCH v3 2/3] doc/guides: add dpdk build instruction for AMD platforms Aman Kumar
@ 2021-10-26 16:07       ` Thomas Monjalon
  2021-10-27  6:30         ` Aman Kumar
  0 siblings, 1 reply; 43+ messages in thread
From: Thomas Monjalon @ 2021-10-26 16:07 UTC (permalink / raw)
  To: Aman Kumar
  Cc: dev, viacheslavo, anatoly.burakov, keesang.song, jerinjacobk,
	konstantin.ananyev, bruce.richardson

26/10/2021 17:56, Aman Kumar:
> linux guide updated with AMD platform related build
> instructions.
> 
> Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
> ---
>  doc/guides/linux_gsg/build_dpdk.rst | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/doc/guides/linux_gsg/build_dpdk.rst b/doc/guides/linux_gsg/build_dpdk.rst
> index 0b08492ca2..3110448467 100644
> --- a/doc/guides/linux_gsg/build_dpdk.rst
> +++ b/doc/guides/linux_gsg/build_dpdk.rst
> @@ -113,6 +113,10 @@ The instruction set will be set automatically by default according to these rule
>  To override what instruction set will be used, set the ``cpu_instruction_set``
>  parameter to the instruction set of your choice (such as ``corei7``, ``power8``, etc.).

All possible values are not documented here.
If you want to mention znverX, I suggest just adding znver3 after corei7 above,
and remove the lines below.
Note: such change should be squashed with the meson change.

> +To build dpdk for AMD Zen CPU based platforms, pass ``-Dcpu_instruction_set=<znverX>``

s/dpdk/DPDK/

> +to meson when configuring the build folder initially. Supported options are znver1,
> +znver2 and znver3.

You can start a new line when starting a new sentence.

If you want to document AMD platform, you can add a file in
	doc/guides/platform/
For describing tuning on Linux you can do the same as
	doc/guides/linux_gsg/nic_perf_intel_platform.rst



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v3 3/3] lib/eal: add temporal store memcpy support on AMD platform
  2021-10-26 15:56     ` [dpdk-dev] [PATCH v3 3/3] lib/eal: add temporal store memcpy support on AMD platform Aman Kumar
@ 2021-10-26 16:14       ` Thomas Monjalon
  2021-10-27  6:34         ` Aman Kumar
  2021-10-26 21:10       ` Stephen Hemminger
  1 sibling, 1 reply; 43+ messages in thread
From: Thomas Monjalon @ 2021-10-26 16:14 UTC (permalink / raw)
  To: Aman Kumar
  Cc: dev, viacheslavo, anatoly.burakov, keesang.song, jerinjacobk,
	konstantin.ananyev, bruce.richardson

26/10/2021 17:56, Aman Kumar:
> This patch provides a rte_memcpy* call with temporal stores.
> Use -Dcpu_instruction_set=znverX with build to enable this API.
> 
> Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
> ---
>  config/x86/meson.build           |   2 +
>  lib/eal/x86/include/rte_memcpy.h | 114 +++++++++++++++++++++++++++++++

It looks better as C code.
Do you achieve the same performance as the asm version?

> +#if defined RTE_MEMCPY_AMDEPYC
[...]
> +static __rte_always_inline void *
> +rte_memcpy_aligned_tstore16_generic(void *dst, void *src, int len)

So to be clear, an application will benefit of this optimization if
1/ DPDK is specifically compiled for AMD
2/ the application is compiled with above DPDK build (because of inlining)

I guess there is no good way to benefit from the optimization
without specific compilation, because of inlining constraint.
Another design, with less constraint but less performance,
would be to have a function pointer assigned at runtime based on the CPU.



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v3 3/3] lib/eal: add temporal store memcpy support on AMD platform
  2021-10-26 15:56     ` [dpdk-dev] [PATCH v3 3/3] lib/eal: add temporal store memcpy support on AMD platform Aman Kumar
  2021-10-26 16:14       ` Thomas Monjalon
@ 2021-10-26 21:10       ` Stephen Hemminger
  2021-10-27  6:43         ` Aman Kumar
  1 sibling, 1 reply; 43+ messages in thread
From: Stephen Hemminger @ 2021-10-26 21:10 UTC (permalink / raw)
  To: Aman Kumar
  Cc: dev, thomas, viacheslavo, anatoly.burakov, keesang.song,
	jerinjacobk, konstantin.ananyev, bruce.richardson

On Tue, 26 Oct 2021 21:26:45 +0530
Aman Kumar <aman.kumar@vvdntech.in> wrote:

> This patch provides a rte_memcpy* call with temporal stores.
> Use -Dcpu_instruction_set=znverX with build to enable this API.
> 
> Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>

Ok, but would be better to get it into glibc.
Would benefit wider array of platforms and get more testing.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/3] config/x86: add support for AMD platform
  2021-10-26 16:01     ` [dpdk-dev] [PATCH v3 1/3] config/x86: add support for " Thomas Monjalon
@ 2021-10-27  6:26       ` Aman Kumar
  0 siblings, 0 replies; 43+ messages in thread
From: Aman Kumar @ 2021-10-27  6:26 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: dpdk-dev, Slava Ovsiienko, Anatoly Burakov, Song, Keesang,
	Jerin Jacob, konstantin.ananyev, bruce.richardson

On Tue, Oct 26, 2021 at 9:31 PM Thomas Monjalon <thomas@monjalon.net> wrote:

> 26/10/2021 17:56, Aman Kumar:
> > -Dcpu_instruction_set=znverX meson option can be used
> > to build dpdk for AMD platform. Supported options are
> > znver1, znver2 and znver3.
>
> OK that approach looks good.
>
> > +# AMD platform support
> > +if get_option('cpu_instruction_set') == 'znver3'
> > +    dpdk_conf.set('RTE_MAX_LCORE', 512)
> > +elif get_option('cpu_instruction_set') == 'znver2'
> > +    dpdk_conf.set('RTE_MAX_LCORE', 512)
> > +elif get_option('cpu_instruction_set') == 'znver1'
> > +    dpdk_conf.set('RTE_MAX_LCORE', 256)
> > +endif
>
> Maybe sort it in the reverse order, starting with 1?
>
> OK. Will update this.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v3 2/3] doc/guides: add dpdk build instruction for AMD platforms
  2021-10-26 16:07       ` Thomas Monjalon
@ 2021-10-27  6:30         ` Aman Kumar
  0 siblings, 0 replies; 43+ messages in thread
From: Aman Kumar @ 2021-10-27  6:30 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: dpdk-dev, Slava Ovsiienko, Anatoly Burakov, Song, Keesang,
	Jerin Jacob, konstantin.ananyev, bruce.richardson

On Tue, Oct 26, 2021 at 9:37 PM Thomas Monjalon <thomas@monjalon.net> wrote:

> 26/10/2021 17:56, Aman Kumar:
> > linux guide updated with AMD platform related build
> > instructions.
> >
> > Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
> > ---
> >  doc/guides/linux_gsg/build_dpdk.rst | 4 ++++
> >  1 file changed, 4 insertions(+)
> >
> > diff --git a/doc/guides/linux_gsg/build_dpdk.rst
> b/doc/guides/linux_gsg/build_dpdk.rst
> > index 0b08492ca2..3110448467 100644
> > --- a/doc/guides/linux_gsg/build_dpdk.rst
> > +++ b/doc/guides/linux_gsg/build_dpdk.rst
> > @@ -113,6 +113,10 @@ The instruction set will be set automatically by
> default according to these rule
> >  To override what instruction set will be used, set the
> ``cpu_instruction_set``
> >  parameter to the instruction set of your choice (such as ``corei7``,
> ``power8``, etc.).
>
> All possible values are not documented here.
> If you want to mention znverX, I suggest just adding znver3 after corei7
> above,
> and remove the lines below.
> Note: such change should be squashed with the meson change.
>

Noted. Will squash this with meson changes.

>
> > +To build dpdk for AMD Zen CPU based platforms, pass
> ``-Dcpu_instruction_set=<znverX>``
>
> s/dpdk/DPDK/
>
> > +to meson when configuring the build folder initially. Supported options
> are znver1,
> > +znver2 and znver3.
>
> You can start a new line when starting a new sentence.
>
> If you want to document AMD platform, you can add a file in
>         doc/guides/platform/
> For describing tuning on Linux you can do the same as
>         doc/guides/linux_gsg/nic_perf_intel_platform.rst
>
> For now, the above entry in the build_dpdk.rst will suffice.
We will add a separate platform doc once we have more platform specific
items.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v3 3/3] lib/eal: add temporal store memcpy support on AMD platform
  2021-10-26 16:14       ` Thomas Monjalon
@ 2021-10-27  6:34         ` Aman Kumar
  2021-10-27  7:59           ` Thomas Monjalon
  0 siblings, 1 reply; 43+ messages in thread
From: Aman Kumar @ 2021-10-27  6:34 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: dpdk-dev, Slava Ovsiienko, Anatoly Burakov, Song, Keesang,
	Jerin Jacob, konstantin.ananyev, bruce.richardson

On Tue, Oct 26, 2021 at 9:44 PM Thomas Monjalon <thomas@monjalon.net> wrote:

> 26/10/2021 17:56, Aman Kumar:
> > This patch provides a rte_memcpy* call with temporal stores.
> > Use -Dcpu_instruction_set=znverX with build to enable this API.
> >
> > Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
> > ---
> >  config/x86/meson.build           |   2 +
> >  lib/eal/x86/include/rte_memcpy.h | 114 +++++++++++++++++++++++++++++++
>
> It looks better as C code.
> Do you achieve the same performance as the asm version?
>

In a few corner cases assembly performed better, but overall we have very
similar perf observations.

> > +#if defined RTE_MEMCPY_AMDEPYC
> [...]
> > +static __rte_always_inline void *
> > +rte_memcpy_aligned_tstore16_generic(void *dst, void *src, int len)
>
> So to be clear, an application will benefit of this optimization if
> 1/ DPDK is specifically compiled for AMD
> 2/ the application is compiled with above DPDK build (because of
> > inlining)
>
> I guess there is no good way to benefit from the optimization
> without specific compilation, because of inlining constraint.
> Another design, with less constraint but less performance,
> would be to have a function pointer assigned at runtime based on the CPU.
>

You're right. We need to build DPDK and apps with this flag enabled to get
the benefit.
In future versions, we will try to adapt in a more dynamic way. Thanks.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v3 3/3] lib/eal: add temporal store memcpy support on AMD platform
  2021-10-26 21:10       ` Stephen Hemminger
@ 2021-10-27  6:43         ` Aman Kumar
  0 siblings, 0 replies; 43+ messages in thread
From: Aman Kumar @ 2021-10-27  6:43 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: dpdk-dev, Thomas Monjalon, Slava Ovsiienko, Anatoly Burakov,
	Song, Keesang, Jerin Jacob, konstantin.ananyev, bruce.richardson

On Wed, Oct 27, 2021 at 2:41 AM Stephen Hemminger <
stephen@networkplumber.org> wrote:

> On Tue, 26 Oct 2021 21:26:45 +0530
> Aman Kumar <aman.kumar@vvdntech.in> wrote:
>
> > This patch provides a rte_memcpy* call with temporal stores.
> > Use -Dcpu_instruction_set=znverX with build to enable this API.
> >
> > Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
>
> Ok, but would be better to get it into glibc.
> Would benefit wider array of platforms and get more testing.
>

Yes, we've considered this. This may go into glibc in future.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [dpdk-dev] [PATCH v4 1/2] config/x86: add support for AMD platform
  2021-10-26 15:56   ` [dpdk-dev] [PATCH v3 1/3] config/x86: add support for AMD platform Aman Kumar
                       ` (2 preceding siblings ...)
  2021-10-26 16:01     ` [dpdk-dev] [PATCH v3 1/3] config/x86: add support for " Thomas Monjalon
@ 2021-10-27  7:28     ` Aman Kumar
  2021-10-27  7:28       ` [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy " Aman Kumar
  3 siblings, 1 reply; 43+ messages in thread
From: Aman Kumar @ 2021-10-27  7:28 UTC (permalink / raw)
  To: dev
  Cc: thomas, viacheslavo, anatoly.burakov, keesang.song, aman.kumar,
	jerinjacobk, konstantin.ananyev, bruce.richardson

-Dcpu_instruction_set=znverX meson option can be used
to build dpdk for AMD platforms. Supported options are
znver1, znver2 and znver3.

Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
---
 config/x86/meson.build              | 9 +++++++++
 doc/guides/linux_gsg/build_dpdk.rst | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/config/x86/meson.build b/config/x86/meson.build
index 29f3dea181..21cda6fd33 100644
--- a/config/x86/meson.build
+++ b/config/x86/meson.build
@@ -72,3 +72,12 @@ endif
 dpdk_conf.set('RTE_CACHE_LINE_SIZE', 64)
 dpdk_conf.set('RTE_MAX_LCORE', 128)
 dpdk_conf.set('RTE_MAX_NUMA_NODES', 32)
+
+# AMD platform support
+if get_option('cpu_instruction_set') == 'znver1'
+    dpdk_conf.set('RTE_MAX_LCORE', 256)
+elif get_option('cpu_instruction_set') == 'znver2'
+    dpdk_conf.set('RTE_MAX_LCORE', 512)
+elif get_option('cpu_instruction_set') == 'znver3'
+    dpdk_conf.set('RTE_MAX_LCORE', 512)
+endif
diff --git a/doc/guides/linux_gsg/build_dpdk.rst b/doc/guides/linux_gsg/build_dpdk.rst
index 0b08492ca2..e224a06cbd 100644
--- a/doc/guides/linux_gsg/build_dpdk.rst
+++ b/doc/guides/linux_gsg/build_dpdk.rst
@@ -111,7 +111,7 @@ The instruction set will be set automatically by default according to these rule
   a common minimal baseline needed for DPDK.
 
 To override what instruction set will be used, set the ``cpu_instruction_set``
-parameter to the instruction set of your choice (such as ``corei7``, ``power8``, etc.).
+parameter to the instruction set of your choice (such as ``corei7``, ``power8``, ``znver3``, etc.).
 
 ``cpu_instruction_set`` is not used in Arm builds, as setting the instruction set
 without other parameters leads to inferior builds. The way to tailor Arm builds
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy support for AMD platform
  2021-10-27  7:28     ` [dpdk-dev] [PATCH v4 1/2] " Aman Kumar
@ 2021-10-27  7:28       ` Aman Kumar
  2021-10-27  8:13         ` Thomas Monjalon
  2021-10-27 11:33         ` Mattias Rönnblom
  0 siblings, 2 replies; 43+ messages in thread
From: Aman Kumar @ 2021-10-27  7:28 UTC (permalink / raw)
  To: dev
  Cc: thomas, viacheslavo, anatoly.burakov, keesang.song, aman.kumar,
	jerinjacobk, konstantin.ananyev, bruce.richardson

This patch provides a rte_memcpy* call with temporal stores.
Use -Dcpu_instruction_set=znverX with build to enable this API.

Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
---
 config/x86/meson.build           |   2 +
 lib/eal/x86/include/rte_memcpy.h | 114 +++++++++++++++++++++++++++++++
 2 files changed, 116 insertions(+)

diff --git a/config/x86/meson.build b/config/x86/meson.build
index 21cda6fd33..56dae4aca7 100644
--- a/config/x86/meson.build
+++ b/config/x86/meson.build
@@ -78,6 +78,8 @@ if get_option('cpu_instruction_set') == 'znver1'
     dpdk_conf.set('RTE_MAX_LCORE', 256)
 elif get_option('cpu_instruction_set') == 'znver2'
     dpdk_conf.set('RTE_MAX_LCORE', 512)
+    dpdk_conf.set('RTE_MEMCPY_AMDEPYC', 1)
 elif get_option('cpu_instruction_set') == 'znver3'
     dpdk_conf.set('RTE_MAX_LCORE', 512)
+    dpdk_conf.set('RTE_MEMCPY_AMDEPYC', 1)
 endif
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 1b6c6e585f..8fe7822cb4 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -376,6 +376,120 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+#if defined RTE_MEMCPY_AMDEPYC
+
+/**
+ * Copy 16 bytes from one location to another,
+ * with temporal stores
+ */
+static __rte_always_inline void
+rte_copy16_ts(uint8_t *dst, uint8_t *src)
+{
+	__m128i var128;
+
+	var128 = _mm_stream_load_si128((__m128i *)src);
+	_mm_storeu_si128((__m128i *)dst, var128);
+}
+
+/**
+ * Copy 32 bytes from one location to another,
+ * with temporal stores
+ */
+static __rte_always_inline void
+rte_copy32_ts(uint8_t *dst, uint8_t *src)
+{
+	__m256i ymm0;
+
+	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+}
+
+/**
+ * Copy 64 bytes from one location to another,
+ * with temporal stores
+ */
+static __rte_always_inline void
+rte_copy64_ts(uint8_t *dst, uint8_t *src)
+{
+	rte_copy32_ts(dst + 0 * 32, src + 0 * 32);
+	rte_copy32_ts(dst + 1 * 32, src + 1 * 32);
+}
+
+/**
+ * Copy 128 bytes from one location to another,
+ * with temporal stores
+ */
+static __rte_always_inline void
+rte_copy128_ts(uint8_t *dst, uint8_t *src)
+{
+	rte_copy32_ts(dst + 0 * 32, src + 0 * 32);
+	rte_copy32_ts(dst + 1 * 32, src + 1 * 32);
+	rte_copy32_ts(dst + 2 * 32, src + 2 * 32);
+	rte_copy32_ts(dst + 3 * 32, src + 3 * 32);
+}
+
+/**
+ * Copy len bytes from one location to another,
+ * with temporal stores 16B aligned
+ */
+static __rte_always_inline void *
+rte_memcpy_aligned_tstore16_generic(void *dst, void *src, int len)
+{
+	void *dest = dst;
+
+	while (len >= 128) {
+		rte_copy128_ts((uint8_t *)dst, (uint8_t *)src);
+		dst = (uint8_t *)dst + 128;
+		src = (uint8_t *)src + 128;
+		len -= 128;
+	}
+	while (len >= 64) {
+		rte_copy64_ts((uint8_t *)dst, (uint8_t *)src);
+		dst = (uint8_t *)dst + 64;
+		src = (uint8_t *)src + 64;
+		len -= 64;
+	}
+	while (len >= 32) {
+		rte_copy32_ts((uint8_t *)dst, (uint8_t *)src);
+		dst = (uint8_t *)dst + 32;
+		src = (uint8_t *)src + 32;
+		len -= 32;
+	}
+	if (len >= 16) {
+		rte_copy16_ts((uint8_t *)dst, (uint8_t *)src);
+		dst = (uint8_t *)dst + 16;
+		src = (uint8_t *)src + 16;
+		len -= 16;
+	}
+	if (len >= 8) {
+		*(uint64_t *)dst = *(const uint64_t *)src;
+		dst = (uint8_t *)dst + 8;
+		src = (uint8_t *)src + 8;
+		len -= 8;
+	}
+	if (len >= 4) {
+		*(uint32_t *)dst = *(const uint32_t *)src;
+		dst = (uint8_t *)dst + 4;
+		src = (uint8_t *)src + 4;
+		len -= 4;
+	}
+	if (len != 0) {
+		dst = (uint8_t *)dst - (4 - len);
+		src = (uint8_t *)src - (4 - len);
+		*(uint32_t *)dst = *(const uint32_t *)src;
+	}
+
+	return dest;
+}
+
+static __rte_always_inline void *
+rte_memcpy_aligned_tstore16(void *dst, void *src, int len)
+{
+	return rte_memcpy_aligned_tstore16_generic(dst, src, len);
+}
+
+#endif /* RTE_MEMCPY_AMDEPYC */
+
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v3 3/3] lib/eal: add temporal store memcpy support on AMD platform
  2021-10-27  6:34         ` Aman Kumar
@ 2021-10-27  7:59           ` Thomas Monjalon
  0 siblings, 0 replies; 43+ messages in thread
From: Thomas Monjalon @ 2021-10-27  7:59 UTC (permalink / raw)
  To: Aman Kumar
  Cc: dpdk-dev, Slava Ovsiienko, Anatoly Burakov, Song, Keesang,
	Jerin Jacob, konstantin.ananyev, bruce.richardson

27/10/2021 08:34, Aman Kumar:
> On Tue, Oct 26, 2021 at 9:44 PM Thomas Monjalon <thomas@monjalon.net> wrote:
> 
> > 26/10/2021 17:56, Aman Kumar:
> > > This patch provides a rte_memcpy* call with temporal stores.
> > > Use -Dcpu_instruction_set=znverX with build to enable this API.
> > >
> > > Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
> > > ---
> > >  config/x86/meson.build           |   2 +
> > >  lib/eal/x86/include/rte_memcpy.h | 114 +++++++++++++++++++++++++++++++
> >
> > It looks better as C code.
> > Do you achieve the same performance as the asm version?
> >
> 
> In a few corner cases assembly performed better, but overall we have very
> similar perf observations.
> 
> > > +#if defined RTE_MEMCPY_AMDEPYC
> > [...]
> > > +static __rte_always_inline void *
> > > +rte_memcpy_aligned_tstore16_generic(void *dst, void *src, int len)
> >
> > So to be clear, an application will benefit of this optimization if
> > 1/ DPDK is specifically compiled for AMD
> > 2/ the application is compiled with above DPDK build (because of
> > > inlining)
> >
> > I guess there is no good way to benefit from the optimization
> > without specific compilation, because of inlining constraint.
> > Another design, with less constraint but less performance,
> > would be to have a function pointer assigned at runtime based on the CPU.
> >
> 
> You're right. We need to build DPDK and apps with this flag enabled to get
> the benefit.

So the x86 packages, as in Linux distributions, won't have this optimization.

> In future versions, we will try to adapt in a more dynamic way. Thanks.

No, I was trying to say that unfortunately there is probably no solution.






^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy support for AMD platform
  2021-10-27  7:28       ` [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy " Aman Kumar
@ 2021-10-27  8:13         ` Thomas Monjalon
  2021-10-27 11:03           ` Van Haaren, Harry
  2021-10-27 11:33         ` Mattias Rönnblom
  1 sibling, 1 reply; 43+ messages in thread
From: Thomas Monjalon @ 2021-10-27  8:13 UTC (permalink / raw)
  To: Aman Kumar
  Cc: dev, viacheslavo, anatoly.burakov, keesang.song, aman.kumar,
	jerinjacobk, konstantin.ananyev, bruce.richardson,
	honnappa.nagarahalli, Ruifeng Wang, David Christensen,
	david.marchand, stephen

27/10/2021 09:28, Aman Kumar:
> This patch provides a rte_memcpy* call with temporal stores.
> Use -Dcpu_instruction_set=znverX with build to enable this API.
> 
> Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>

For the series, Acked-by: Thomas Monjalon <thomas@monjalon.net>
With the hope that such optimization will go in libc in a near future.

If there is no objection, I will merge this AMD-specific series in 21.11-rc2.
It should not affect other platforms.



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy support for AMD platform
  2021-10-27  8:13         ` Thomas Monjalon
@ 2021-10-27 11:03           ` Van Haaren, Harry
  2021-10-27 11:41             ` Mattias Rönnblom
  0 siblings, 1 reply; 43+ messages in thread
From: Van Haaren, Harry @ 2021-10-27 11:03 UTC (permalink / raw)
  To: Thomas Monjalon, Aman Kumar
  Cc: dev, viacheslavo, Burakov, Anatoly, keesang.song, aman.kumar,
	jerinjacobk, Ananyev, Konstantin, Richardson, Bruce,
	honnappa.nagarahalli, Ruifeng Wang, David Christensen,
	david.marchand, stephen

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Thomas Monjalon
> Sent: Wednesday, October 27, 2021 9:13 AM
> To: Aman Kumar <aman.kumar@vvdntech.in>
> Cc: dev@dpdk.org; viacheslavo@nvidia.com; Burakov, Anatoly
> <anatoly.burakov@intel.com>; keesang.song@amd.com;
> aman.kumar@vvdntech.in; jerinjacobk@gmail.com; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; honnappa.nagarahalli@arm.com; Ruifeng Wang
> <ruifeng.wang@arm.com>; David Christensen <drc@linux.vnet.ibm.com>;
> david.marchand@redhat.com; stephen@networkplumber.org
> Subject: Re: [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy
> support for AMD platform
> 
> 27/10/2021 09:28, Aman Kumar:
> > This patch provides a rte_memcpy* call with temporal stores.
> > Use -Dcpu_instruction_set=znverX with build to enable this API.
> >
> > Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
> 
> For the series, Acked-by: Thomas Monjalon <thomas@monjalon.net>
> With the hope that such optimization will go in libc in a near future.
> 
> If there is no objection, I will merge this AMD-specific series in 21.11-rc2.
> It should not affect other platforms.

Hi Folks,

This patchset was brought to my attention, and I have a few concerns.
I'll add short snippets of context from the patch here so I can refer to it below;

+/**
+ * Copy 16 bytes from one location to another,
+ * with temporal stores
+ */
+static __rte_always_inline void
+rte_copy16_ts(uint8_t *dst, uint8_t *src)
+{
+	__m128i var128;
+
+	var128 = _mm_stream_load_si128((__m128i *)src);
+	_mm_storeu_si128((__m128i *)dst, var128);
+}

1) What is fundamentally specific to the znverX CPU? Is there any reason this can not just be enabled for x86-64 generic with SSE4.1 ISA requirements?
_mm_stream_load_si128() is part of SSE4.1
_mm_storeu_si128() is SSE2. 
Using the intrinsics guide for lookup of intrinsics to ISA level: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html?wapkw=intrinsics%20guide#text=_mm_stream_load&ig_expand=6884

2) Are -D options allowed to change/break API/ABI?
By allowing -Dcpu_instruction_set= to change available functions, any application using it is no longer source-code (API) compatible with "DPDK" proper.
This patch essentially splits a "DPDK" app to depend on "DPDK + CPU version -D flag", in an incompatible way (no fallback?).

3) The stream load instruction used here *requires* 16-byte alignment for its operand.
This is not documented, and worse, a uint8_t* is accepted, which is cast to (__m128i *).
This cast hides the compiler warning for expanding type-alignments.
And the code itself is broken - passing a "src" parameter that is not 16-byte aligned will segfault.

4) Temporal and Non-temporal are not logically presented here.
Temporal loads/stores are normal loads/stores. They use the L1/L2 caches.
Non-temporal loads/stores indicate that the data will *not* be used again in a short space of time.
Non-temporal means "having no relation to time" according to my internet search.

5) The *store* here uses a normal store (temporal, targets cache). The *load* however is a streaming (non-temporal, no cache) load.
It is not clearly documented that A) stream load will be used.
The inverse is documented "copy with ts" aka, copy with temporal store.
Is documenting the store as temporal meant to imply that the load is non-temporal?

6) What is the use-case for this? When would a user *want* to use this instead of rte_memcpy()?
If the data being loaded is relevant to datapath/packets, presumably other packets might require the
loaded data, so temporal (normal) loads should be used to cache the source data?

7) Why is streaming (non-temporal) loads & stores not used? I guess maybe this is regarding the use-case,
but its not clear to me right now why loads are NT, and stores are T.

All in all, I do not think merging this patch is a good idea. I would like to understand the motivation for adding
this type of function, and then see it being done in a way that is clearly documented regarding temporal loads/stores,
and not changing/adding APIs for specific CPUs.

So apologies for late feedback, but this is not of high enough quality to be merged to DPDK right now, NACK.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy support for AMD platform
  2021-10-27  7:28       ` [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy " Aman Kumar
  2021-10-27  8:13         ` Thomas Monjalon
@ 2021-10-27 11:33         ` Mattias Rönnblom
  1 sibling, 0 replies; 43+ messages in thread
From: Mattias Rönnblom @ 2021-10-27 11:33 UTC (permalink / raw)
  To: Aman Kumar, dev
  Cc: thomas, viacheslavo, anatoly.burakov, Song, Keesang, jerinjacobk,
	konstantin.ananyev, bruce.richardson

On 2021-10-27 09:28, Aman Kumar wrote:
> This patch provides a rte_memcpy* call with temporal stores.
> Use -Dcpu_instruction_set=znverX with build to enable this API.
>
> Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
> ---
>   config/x86/meson.build           |   2 +
>   lib/eal/x86/include/rte_memcpy.h | 114 +++++++++++++++++++++++++++++++
>   2 files changed, 116 insertions(+)
>
> diff --git a/config/x86/meson.build b/config/x86/meson.build
> index 21cda6fd33..56dae4aca7 100644
> --- a/config/x86/meson.build
> +++ b/config/x86/meson.build
> @@ -78,6 +78,8 @@ if get_option('cpu_instruction_set') == 'znver1'
>       dpdk_conf.set('RTE_MAX_LCORE', 256)
>   elif get_option('cpu_instruction_set') == 'znver2'
>       dpdk_conf.set('RTE_MAX_LCORE', 512)
> +    dpdk_conf.set('RTE_MEMCPY_AMDEPYC', 1)
>   elif get_option('cpu_instruction_set') == 'znver3'
>       dpdk_conf.set('RTE_MAX_LCORE', 512)
> +    dpdk_conf.set('RTE_MEMCPY_AMDEPYC', 1)
>   endif
> diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
> index 1b6c6e585f..8fe7822cb4 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h
> @@ -376,6 +376,120 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
>   	}
>   }
>   
> +#if defined RTE_MEMCPY_AMDEPYC
> +
> +/**
> + * Copy 16 bytes from one location to another,
> + * with temporal stores
> + */
> +static __rte_always_inline void
> +rte_copy16_ts(uint8_t *dst, uint8_t *src)
> +{
> +	__m128i var128;
> +
> +	var128 = _mm_stream_load_si128((__m128i *)src);
> +	_mm_storeu_si128((__m128i *)dst, var128);
> +}
> +
> +/**
> + * Copy 32 bytes from one location to another,
> + * with temporal stores
> + */
> +static __rte_always_inline void
> +rte_copy32_ts(uint8_t *dst, uint8_t *src)
> +{
> +	__m256i ymm0;
> +
> +	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
> +	_mm256_storeu_si256((__m256i *)dst, ymm0);
> +}
> +
> +/**
> + * Copy 64 bytes from one location to another,
> + * with temporal stores
> + */
> +static __rte_always_inline void
> +rte_copy64_ts(uint8_t *dst, uint8_t *src)
> +{
> +	rte_copy32_ts(dst + 0 * 32, src + 0 * 32);
> +	rte_copy32_ts(dst + 1 * 32, src + 1 * 32);
> +}
> +
> +/**
> + * Copy 128 bytes from one location to another,
> + * with temporal stores
> + */
> +static __rte_always_inline void
> +rte_copy128_ts(uint8_t *dst, uint8_t *src)
> +{
> +	rte_copy32_ts(dst + 0 * 32, src + 0 * 32);
> +	rte_copy32_ts(dst + 1 * 32, src + 1 * 32);
> +	rte_copy32_ts(dst + 2 * 32, src + 2 * 32);
> +	rte_copy32_ts(dst + 3 * 32, src + 3 * 32);
> +}
> +
> +/**
> + * Copy len bytes from one location to another,
> + * with temporal stores 16B aligned
> + */
> +static __rte_always_inline void *
> +rte_memcpy_aligned_tstore16_generic(void *dst, void *src, int len)
> +{
> +	void *dest = dst;
> +
> +	while (len >= 128) {
> +		rte_copy128_ts((uint8_t *)dst, (uint8_t *)src);
> +		dst = (uint8_t *)dst + 128;
> +		src = (uint8_t *)src + 128;
> +		len -= 128;
> +	}
> +	while (len >= 64) {
> +		rte_copy64_ts((uint8_t *)dst, (uint8_t *)src);
> +		dst = (uint8_t *)dst + 64;
> +		src = (uint8_t *)src + 64;
> +		len -= 64;
> +	}
> +	while (len >= 32) {
> +		rte_copy32_ts((uint8_t *)dst, (uint8_t *)src);
> +		dst = (uint8_t *)dst + 32;
> +		src = (uint8_t *)src + 32;
> +		len -= 32;
> +	}
> +	if (len >= 16) {
> +		rte_copy16_ts((uint8_t *)dst, (uint8_t *)src);
> +		dst = (uint8_t *)dst + 16;
> +		src = (uint8_t *)src + 16;
> +		len -= 16;
> +	}
> +	if (len >= 8) {
> +		*(uint64_t *)dst = *(const uint64_t *)src;
> +		dst = (uint8_t *)dst + 8;
> +		src = (uint8_t *)src + 8;
> +		len -= 8;
> +	}
> +	if (len >= 4) {
> +		*(uint32_t *)dst = *(const uint32_t *)src;
> +		dst = (uint8_t *)dst + 4;
> +		src = (uint8_t *)src + 4;
> +		len -= 4;
> +	}
> +	if (len != 0) {
> +		dst = (uint8_t *)dst - (4 - len);
> +		src = (uint8_t *)src - (4 - len);
> +		*(uint32_t *)dst = *(const uint32_t *)src;
> +	}
> +
> +	return dest;


You don't need a _mm_sfence after the NT stores to avoid surprises (e.g.,
if you use this NT memcpy() in combination with DPDK rings)? NT stores 
are weakly ordered on x86_64, from what I understand.


> +}
> +
> +static __rte_always_inline void *
> +rte_memcpy_aligned_tstore16(void *dst, void *src, int len)


Shouldn't both dst and src be marked __restrict? Goes for all these 
functions.

> +{
> +	return rte_memcpy_aligned_tstore16_generic(dst, src, len);
> +}
> +
> +#endif /* RTE_MEMCPY_AMDEPYC */


What does x86_64 NT stores have to do with EPYC?


> +
>   static __rte_always_inline void *
>   rte_memcpy_generic(void *dst, const void *src, size_t n)
>   {



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy support for AMD platform
  2021-10-27 11:03           ` Van Haaren, Harry
@ 2021-10-27 11:41             ` Mattias Rönnblom
  2021-10-27 12:15               ` Van Haaren, Harry
  0 siblings, 1 reply; 43+ messages in thread
From: Mattias Rönnblom @ 2021-10-27 11:41 UTC (permalink / raw)
  To: Van Haaren, Harry, Thomas Monjalon, Aman Kumar
  Cc: dev, viacheslavo, Burakov, Anatoly, Song, Keesang, jerinjacobk,
	Ananyev, Konstantin, Richardson, Bruce, honnappa.nagarahalli,
	Ruifeng Wang, David Christensen, david.marchand, stephen

On 2021-10-27 13:03, Van Haaren, Harry wrote:
>> -----Original Message-----
>> From: dev <dev-bounces@dpdk.org> On Behalf Of Thomas Monjalon
>> Sent: Wednesday, October 27, 2021 9:13 AM
>> To: Aman Kumar <aman.kumar@vvdntech.in>
>> Cc: dev@dpdk.org; viacheslavo@nvidia.com; Burakov, Anatoly
>> <anatoly.burakov@intel.com>; keesang.song@amd.com;
>> aman.kumar@vvdntech.in; jerinjacobk@gmail.com; Ananyev, Konstantin
>> <konstantin.ananyev@intel.com>; Richardson, Bruce
>> <bruce.richardson@intel.com>; honnappa.nagarahalli@arm.com; Ruifeng Wang
>> <ruifeng.wang@arm.com>; David Christensen <drc@linux.vnet.ibm.com>;
>> david.marchand@redhat.com; stephen@networkplumber.org
>> Subject: Re: [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy
>> support for AMD platform
>>
>> 27/10/2021 09:28, Aman Kumar:
>>> This patch provides a rte_memcpy* call with temporal stores.
>>> Use -Dcpu_instruction_set=znverX with build to enable this API.
>>>
>>> Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
>> For the series, Acked-by: Thomas Monjalon <thomas@monjalon.net>
>> With the hope that such optimization will go in libc in a near future.
>>
>> If there is no objection, I will merge this AMD-specific series in 21.11-rc2.
>> It should not affect other platforms.
> Hi Folks,
>
> This patchset was brought to my attention, and I have a few concerns.
> I'll add short snippets of context from the patch here so I can refer to it below;
>
> +/**
> + * Copy 16 bytes from one location to another,
> + * with temporal stores
> + */
> +static __rte_always_inline void
> +rte_copy16_ts(uint8_t *dst, uint8_t *src)
> +{
> +	__m128i var128;
> +
> +	var128 = _mm_stream_load_si128((__m128i *)src);
> +	_mm_storeu_si128((__m128i *)dst, var128);
> +}
>
> 1) What is fundamentally specific to the znverX CPU? Is there any reason this can not just be enabled for x86-64 generic with SSE4.1 ISA requirements?
> _mm_stream_load_si128() is part of SSE4.1
> _mm_storeu_si128() is SSE2.
> Using the intrinsics guide for lookup of intrinsics to ISA level: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html?wapkw=intrinsics%20guide#text=_mm_stream_load&ig_expand=6884
>
> 2) Are -D options allowed to change/break API/ABI?
> By allowing -Dcpu_instruction_set= to change available functions, any application using it is no longer source-code (API) compatible with "DPDK" proper.
> This patch essentially splits a "DPDK" app to depend on "DPDK + CPU version -D flag", in an incompatible way (no fallback?).
>
> 3) The stream load instruction used here *requires* 16-byte alignment for its operand.
> This is not documented, and worse, a uint8_t* is accepted, which is cast to (__m128i *).
> This cast hides the compiler warning for expanding type-alignments.
> And the code itself is broken - passing a "src" parameter that is not 16-byte aligned will segfault.
>
> 4) Temporal and Non-temporal are not logically presented here.
> Temporal loads/stores are normal loads/stores. They use the L1/L2 caches.
> Non-temporal loads/stores indicate that the data will *not* be used again in a short space of time.
> Non-temporal means "having no relation to time" according to my internet search.
>
> 5) The *store* here uses a normal store (temporal, targets cache). The *load* however is a streaming (non-temporal, no cache) load.
> It is not clearly documented that A) stream load will be used.
> The inverse is documented "copy with ts" aka, copy with temporal store.
> Is documenting the store as temporal meant to imply that the load is non-temporal?
>
> 6) What is the use-case for this? When would a user *want* to use this instead of rte_memcpy()?
> If the data being loaded is relevant to datapath/packets, presumably other packets might require the
> loaded data, so temporal (normal) loads should be used to cache the source data?


I'm not sure if your first question is rhetorical or not, but a memcpy() 
in a NT variant is certainly useful. One use case for a memcpy() with 
temporal loads and non-temporal stores is if you need to archive packet 
payload for (distant, potential) future use, and want to avoid causing 
unnecessary LLC evictions while doing so.


> 7) Why is streaming (non-temporal) loads & stores not used? I guess maybe this is regarding the use-case,
> but its not clear to me right now why loads are NT, and stores are T.
>
> All in all, I do not think merging this patch is a good idea. I would like to understand the motivation for adding
> this type of function, and then see it being done in a way that is clearly documented regarding temporal loads/stores,
> and not changing/adding APIs for specific CPUs.
>
> So apologies for late feedback, but this is not of high enough quality to be merged to DPDK right now, NACK.



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy support for AMD platform
  2021-10-27 11:41             ` Mattias Rönnblom
@ 2021-10-27 12:15               ` Van Haaren, Harry
  2021-10-27 12:22                 ` Ananyev, Konstantin
  0 siblings, 1 reply; 43+ messages in thread
From: Van Haaren, Harry @ 2021-10-27 12:15 UTC (permalink / raw)
  To: mattias.ronnblom, Thomas Monjalon, Aman Kumar
  Cc: dev, viacheslavo, Burakov, Anatoly, Song, Keesang, jerinjacobk,
	Ananyev, Konstantin, Richardson, Bruce, honnappa.nagarahalli,
	Ruifeng Wang, David Christensen, david.marchand, stephen

> -----Original Message-----
> From: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> Sent: Wednesday, October 27, 2021 12:42 PM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>; Thomas Monjalon
> <thomas@monjalon.net>; Aman Kumar <aman.kumar@vvdntech.in>
> Cc: dev@dpdk.org; viacheslavo@nvidia.com; Burakov, Anatoly
> <anatoly.burakov@intel.com>; Song, Keesang <Keesang.Song@amd.com>;
> jerinjacobk@gmail.com; Ananyev, Konstantin <konstantin.ananyev@intel.com>;
> Richardson, Bruce <bruce.richardson@intel.com>;
> honnappa.nagarahalli@arm.com; Ruifeng Wang <ruifeng.wang@arm.com>;
> David Christensen <drc@linux.vnet.ibm.com>; david.marchand@redhat.com;
> stephen@networkplumber.org
> Subject: Re: [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy
> support for AMD platform
> 
> On 2021-10-27 13:03, Van Haaren, Harry wrote:
> >> -----Original Message-----

<snip>

Hi Mattias,

> > 6) What is the use-case for this? When would a user *want* to use this instead
> of rte_memcpy()?
> > If the data being loaded is relevant to datapath/packets, presumably other
> packets might require the
> > loaded data, so temporal (normal) loads should be used to cache the source
> data?
> 
> 
> I'm not sure if your first question is rhetorical or not, but a memcpy()
> in a NT variant is certainly useful. One use case for a memcpy() with
> temporal loads and non-temporal stores is if you need to archive packet
> payload for (distant, potential) future use, and want to avoid causing
> unnecessary LLC evictions while doing so.

Yes I agree that there are certainly benefits in using cache-locality hints.
There is an open question around if the src or dst or both are non-temporal.

In the implementation of this patch, the NT/T type of store is reversed from your use-case:
1) Loads are NT (so loaded data is not cached for future packets)
2) Stores are T (so copied/dst data is now resident in L1/L2)

In theory there might even be valid uses for this type of memcpy where loaded
data is not needed again soon and stored data is referenced again soon,
although I cannot think of any here while typing this mail..

I think some use-case examples, and clear documentation on when/how to choose
between rte_memcpy() or any (potential future) rte_memcpy_nt() variants is required
to progress this patch.

Assuming a strong use-case exists, and it can be clearly indicated to users of DPDK APIs which
rte_memcpy() to use, we can look at technical details around enabling the implementation.

-Harry

<snip remaining points>


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy support for AMD platform
  2021-10-27 12:15               ` Van Haaren, Harry
@ 2021-10-27 12:22                 ` Ananyev, Konstantin
  2021-10-27 13:34                   ` Aman Kumar
  0 siblings, 1 reply; 43+ messages in thread
From: Ananyev, Konstantin @ 2021-10-27 12:22 UTC (permalink / raw)
  To: Van Haaren, Harry, mattias.ronnblom, Thomas Monjalon, Aman Kumar
  Cc: dev, viacheslavo, Burakov, Anatoly, Song, Keesang, jerinjacobk,
	Richardson, Bruce, honnappa.nagarahalli, Ruifeng Wang,
	David Christensen, david.marchand, stephen


 
> 
> Hi Mattias,
> 
> > > 6) What is the use-case for this? When would a user *want* to use this instead
> > of rte_memcpy()?
> > > If the data being loaded is relevant to datapath/packets, presumably other
> > packets might require the
> > > loaded data, so temporal (normal) loads should be used to cache the source
> > data?
> >
> >
> > I'm not sure if your first question is rhetorical or not, but a memcpy()
> > in a NT variant is certainly useful. One use case for a memcpy() with
> > temporal loads and non-temporal stores is if you need to archive packet
> > payload for (distant, potential) future use, and want to avoid causing
> > unnecessary LLC evictions while doing so.
> 
> Yes I agree that there are certainly benefits in using cache-locality hints.
> There is an open question around if the src or dst or both are non-temporal.
> 
> In the implementation of this patch, the NT/T type of store is reversed from your use-case:
> 1) Loads are NT (so loaded data is not cached for future packets)
> 2) Stores are T (so copied/dst data is now resident in L1/L2)
> 
> In theory there might even be valid uses for this type of memcpy where loaded
> data is not needed again soon and stored data is referenced again soon,
> although I cannot think of any here while typing this mail..
> 
> I think some use-case examples, and clear documentation on when/how to choose
> between rte_memcpy() or any (potential future) rte_memcpy_nt() variants is required
> to progress this patch.
> 
> Assuming a strong use-case exists, and it can be clearly indicators to users of DPDK APIs which
> rte_memcpy() to use, we can look at technical details around enabling the implementation.
> 

+1 here.
Function behaviour and restrictions (src parameter needs to be 16/32 B aligned, etc.),
along with expected usage scenarios have to be documented properly.
Again, as Harry pointed out, I don't see any AMD specific instructions in this function,
so presumably such function can go into __AVX2__ code block and no new defines will
be required. 

 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy support for AMD platform
  2021-10-27 12:22                 ` Ananyev, Konstantin
@ 2021-10-27 13:34                   ` Aman Kumar
  2021-10-27 14:10                     ` Van Haaren, Harry
  2021-10-27 14:26                     ` Ananyev, Konstantin
  0 siblings, 2 replies; 43+ messages in thread
From: Aman Kumar @ 2021-10-27 13:34 UTC (permalink / raw)
  To: Ananyev, Konstantin
  Cc: Van Haaren, Harry, mattias.ronnblom, Thomas Monjalon, dev,
	viacheslavo, Burakov, Anatoly, Song, Keesang, jerinjacobk,
	Richardson, Bruce, honnappa.nagarahalli, Ruifeng Wang,
	David Christensen, david.marchand, stephen

On Wed, Oct 27, 2021 at 5:53 PM Ananyev, Konstantin <
konstantin.ananyev@intel.com> wrote

> >
> > Hi Mattias,
> >
> > > > 6) What is the use-case for this? When would a user *want* to use
> this instead
> > > of rte_memcpy()?
> > > > If the data being loaded is relevant to datapath/packets, presumably
> other
> > > packets might require the
> > > > loaded data, so temporal (normal) loads should be used to cache the
> source
> > > data?
> > >
> > >
> > > I'm not sure if your first question is rhetorical or not, but a
> memcpy()
> > > in a NT variant is certainly useful. One use case for a memcpy() with
> > > temporal loads and non-temporal stores is if you need to archive packet
> > > payload for (distant, potential) future use, and want to avoid causing
> > > unnecessary LLC evictions while doing so.
> >
> > Yes I agree that there are certainly benefits in using cache-locality
> hints.
> > There is an open question around if the src or dst or both are
> non-temporal.
> >
> > In the implementation of this patch, the NT/T type of store is reversed
> from your use-case:
> > 1) Loads are NT (so loaded data is not cached for future packets)
> > 2) Stores are T (so copied/dst data is now resident in L1/L2)
> >
> > In theory there might even be valid uses for this type of memcpy where
> loaded
> > data is not needed again soon and stored data is referenced again soon,
> > although I cannot think of any here while typing this mail..
> >
> > I think some use-case examples, and clear documentation on when/how to
> choose
> > between rte_memcpy() or any (potential future) rte_memcpy_nt() variants
> is required
> > to progress this patch.
> >
> > Assuming a strong use-case exists, and it can be clearly indicators to
> users of DPDK APIs which
> > rte_memcpy() to use, we can look at technical details around enabling
> the implementation.
> >
>
> +1 here.
> Function behaviour and restrictions (src parameter needs to be 16/32 B
> aligned, etc.),
> along with expected usage scenarios have to be documented properly.
> Again, as Harry pointed out, I don't see any AMD specific instructions in
> this function,
> so presumably such function can go into __AVX2__ code block and no new
> defines will
> be required.
>
> Agreed that APIs are generic but we've kept under an AMD flag for a
simple reason that it is NOT tested on any other platform.
A use-case on how to use this was planned earlier for mlx5 pmd but dropped
in this version of patch as the data path of mlx5 is going to be refactored
soon and may not be useful for future versions of mlx5 (>22.02).
Ref link: adaptation to mlx5 mprq
<https://patchwork.dpdk.org/project/dpdk/patch/20211019104724.19416-2-aman.kumar@vvdntech.in/>
(*we plan to adapt this in a future version*)
The patch in the link basically enhances mlx5 mprq implementation for our
specific use-case and with 128B packet size, we achieve ~60% better perf.
We understand the use of this copy function should be documented which we
shall plan along with few other platform specific optimizations in future
versions of DPDK. As this does not conflict with other platforms, can we still
keep under AMD flag for now as suggested by Thomas?

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy support for AMD platform
  2021-10-27 13:34                   ` Aman Kumar
@ 2021-10-27 14:10                     ` Van Haaren, Harry
  2021-10-27 14:31                       ` Thomas Monjalon
  2021-10-27 14:26                     ` Ananyev, Konstantin
  1 sibling, 1 reply; 43+ messages in thread
From: Van Haaren, Harry @ 2021-10-27 14:10 UTC (permalink / raw)
  To: Aman Kumar, Ananyev, Konstantin
  Cc: mattias.ronnblom, Thomas Monjalon, dev, viacheslavo, Burakov,
	Anatoly, Song, Keesang, jerinjacobk, Richardson, Bruce,
	honnappa.nagarahalli, Ruifeng Wang, David Christensen,
	david.marchand, stephen

From: Aman Kumar <aman.kumar@vvdntech.in> 
Sent: Wednesday, October 27, 2021 2:35 PM
To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
Cc: Van Haaren, Harry <harry.van.haaren@intel.com>; mattias.ronnblom <mattias.ronnblom@ericsson.com>; Thomas Monjalon <thomas@monjalon.net>; dev@dpdk.org; viacheslavo@nvidia.com; Burakov, Anatoly <anatoly.burakov@intel.com>; Song, Keesang <Keesang.Song@amd.com>; jerinjacobk@gmail.com; Richardson, Bruce <bruce.richardson@intel.com>; honnappa.nagarahalli@arm.com; Ruifeng Wang <ruifeng.wang@arm.com>; David Christensen <drc@linux.vnet.ibm.com>; david.marchand@redhat.com; stephen@networkplumber.org
Subject: Re: [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy support for AMD platform

Hi Aman,

Please sent plain-text email, converting to other formats it makes writing inline replies difficult.
I've converted this reply email back to plain-text, and will annotate email below with [<author> wrote]:

On Wed, Oct 27, 2021 at 5:53 PM Ananyev, Konstantin <mailto:konstantin.ananyev@intel.com> wrote
> 
> Hi Mattias,
> 
> > > 6) What is the use-case for this? When would a user *want* to use this instead
> > of rte_memcpy()?
> > > If the data being loaded is relevant to datapath/packets, presumably other
> > packets might require the
> > > loaded data, so temporal (normal) loads should be used to cache the source
> > data?
> >
> >
> > I'm not sure if your first question is rhetorical or not, but a memcpy()
> > in a NT variant is certainly useful. One use case for a memcpy() with
> > temporal loads and non-temporal stores is if you need to archive packet
> > payload for (distant, potential) future use, and want to avoid causing
> > unnecessary LLC evictions while doing so.
> 
> Yes I agree that there are certainly benefits in using cache-locality hints.
> There is an open question around if the src or dst or both are non-temporal.
> 
> In the implementation of this patch, the NT/T type of store is reversed from your use-case:
> 1) Loads are NT (so loaded data is not cached for future packets)
> 2) Stores are T (so copied/dst data is now resident in L1/L2)
> 
> In theory there might even be valid uses for this type of memcpy where loaded
> data is not needed again soon and stored data is referenced again soon,
> although I cannot think of any here while typing this mail..
> 
> I think some use-case examples, and clear documentation on when/how to choose
> between rte_memcpy() or any (potential future) rte_memcpy_nt() variants is required
> to progress this patch.
> 
> Assuming a strong use-case exists, and it can be clearly indicators to users of DPDK APIs which
> rte_memcpy() to use, we can look at technical details around enabling the implementation.
> 

[Konstantin wrote]:
+1 here.
Function behaviour and restrictions (src parameter needs to be 16/32 B aligned, etc.),
along with expected usage scenarios have to be documented properly.
Again, as Harry pointed out, I don't see any AMD specific instructions in this function,
so presumably such function can go into __AVX2__ code block and no new defines will
be required. 


[Aman wrote]:
Agreed that APIs are generic but we've kept under an AMD flag for a simple reason that it is NOT tested on any other platform.
A use-case on how to use this was planned earlier for mlx5 pmd but dropped in this version of patch as the data path of mlx5 is going to be refactored soon and may not be useful for future versions of mlx5 (>22.02). 
Ref link: https://patchwork.dpdk.org/project/dpdk/patch/20211019104724.19416-2-aman.kumar@vvdntech.in/ (we plan to adapt this in a future version)
The patch in the link basically enhances mlx5 mprq implementation for our specific use-case and with 128B packet size, we achieve ~60% better perf. We understand the use of this copy function should be documented which we shall plan along with few other platform specific optimizations in future versions of DPDK. As this does not conflict with other platforms, can we still keep under AMD flag for now as suggested by Thomas?


[HvH wrote]:
As an open-source community, any contributions should aim to improve the whole.
In the past, numerous improvements have been merged to DPDK that improve performance.
Sometimes these are architecture specific (x86/arm/ppc) sometimes the are ISA specific (SSE, AVX512, NEON).

I am not familiar with any cases in DPDK, where there is a #ifdef based on a *specific platform*.
A quick "grep" through the "dpdk/lib" directory does not show any place where PMD or generic code
has been explicitly optimized for a *specific platform*.

Obviously, in cases where ISA either exists or does not exist, yes there is an optimization to enable it.
But this is not exposed as a top-level compile-time option, it uses runtime CPU ISA detection.

Please take a step back from the code, and look at what this patch asks of DPDK:
"Please accept & maintain these changes upstream, which benefit only platform X, even though these ISA features are also available on other platforms".

Other patches that enhance performance of DPDK ask this:
"Please accept & maintain these changes upstream, which benefit all platforms which have ISA capability X".


=== Question "As this does not conflict with other platforms, can we still keep under AMD flag for now"?
I feel the contribution is too specific to a platform. Make it generic by enabling it at an ISA capability level.

Please yes, contribute to the DPDK community by improving performance of a PMD by enabling/leveraging ISA.
But do so in a way that does not benefit only a specific platform - do so in a way that enhances all of DPDK, as
other patches have done for the DPDK that this patch is built on.

If you have concerns that the PMD maintainers will not accept the changes due to potential regressions on
other platforms, then discuss those, make a plan on how to performance validate, and work to a solution.


=== Regarding specifically the request for "can we still keep under AMD flag for now"?
I do not believe we should introduce APIs for specific platforms. DPDK's EAL is an abstraction layer.
The value of EAL is to provide a common abstraction. This platform-specific flag breaks the abstraction,
and results in packaging issues, as well as API/ABI instability based on -Dcpu_instruction_set choice.
So, no, we should not introduce APIs based on any compile-time flag.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy support for AMD platform
  2021-10-27 13:34                   ` Aman Kumar
  2021-10-27 14:10                     ` Van Haaren, Harry
@ 2021-10-27 14:26                     ` Ananyev, Konstantin
  1 sibling, 0 replies; 43+ messages in thread
From: Ananyev, Konstantin @ 2021-10-27 14:26 UTC (permalink / raw)
  To: Aman Kumar
  Cc: Van Haaren, Harry, mattias.ronnblom, Thomas Monjalon, dev,
	viacheslavo, Burakov, Anatoly, Song, Keesang, jerinjacobk,
	Richardson, Bruce, honnappa.nagarahalli, Ruifeng Wang,
	David Christensen, david.marchand, stephen



From: Aman Kumar <aman.kumar@vvdntech.in> 
Sent: Wednesday, October 27, 2021 2:35 PM
To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
Cc: Van Haaren, Harry <harry.van.haaren@intel.com>; mattias.ronnblom <mattias.ronnblom@ericsson.com>; Thomas Monjalon <thomas@monjalon.net>; dev@dpdk.org; viacheslavo@nvidia.com; Burakov, Anatoly <anatoly.burakov@intel.com>; Song, Keesang <Keesang.Song@amd.com>; jerinjacobk@gmail.com; Richardson, Bruce <bruce.richardson@intel.com>; honnappa.nagarahalli@arm.com; Ruifeng Wang <ruifeng.wang@arm.com>; David Christensen <drc@linux.vnet.ibm.com>; david.marchand@redhat.com; stephen@networkplumber.org
Subject: Re: [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy support for AMD platform

>> 
>> Hi Mattias,
>> 
>> > > 6) What is the use-case for this? When would a user *want* to use this instead
>> > of rte_memcpy()?
>> > > If the data being loaded is relevant to datapath/packets, presumably other
>> > packets might require the
>> > > loaded data, so temporal (normal) loads should be used to cache the source
>> > data?
>> >
>> >
>> > I'm not sure if your first question is rhetorical or not, but a memcpy()
>> > in a NT variant is certainly useful. One use case for a memcpy() with
>> > temporal loads and non-temporal stores is if you need to archive packet
>> > payload for (distant, potential) future use, and want to avoid causing
>> > unnecessary LLC evictions while doing so.
>> 
>> Yes I agree that there are certainly benefits in using cache-locality hints.
>> There is an open question around if the src or dst or both are non-temporal.
>> 
>> In the implementation of this patch, the NT/T type of store is reversed from your use-case:
>> 1) Loads are NT (so loaded data is not cached for future packets)
>> 2) Stores are T (so copied/dst data is now resident in L1/L2)
>> 
>> In theory there might even be valid uses for this type of memcpy where loaded
>> data is not needed again soon and stored data is referenced again soon,
>> although I cannot think of any here while typing this mail..
>> 
>> I think some use-case examples, and clear documentation on when/how to choose
>> between rte_memcpy() or any (potential future) rte_memcpy_nt() variants is required
>> to progress this patch.
>> 
>> Assuming a strong use-case exists, and it can be clearly indicators to users of DPDK APIs which
>> rte_memcpy() to use, we can look at technical details around enabling the implementation.
>> 
>
> +1 here.
> Function behaviour and restrictions (src parameter needs to be 16/32 B aligned, etc.),
> along with expected usage scenarios have to be documented properly.
> Again, as Harry pointed out, I don't see any AMD specific instructions in this function,
> so presumably such function can go into __AVX2__ code block and no new defines will
> be required. 
> Agreed that APIs are generic but we've kept under an AMD flag for a simple reason that it is NOT tested on any other platform.
> A use-case on how to use this was planned earlier for mlx5 pmd but dropped in this version of patch as the data path of mlx5 is going to be refactored soon and may not be useful for > future versions of mlx5 (>22.02). 
> Ref link: https://patchwork.dpdk.org/project/dpdk/patch/20211019104724.19416-2-aman.kumar@vvdntech.in/(we've plan to adapt this into future version)
> The patch in the link basically enhances mlx5 mprq implementation for our specific use-case and with 128B packet size, we achieve ~60% better perf. We understand the use of this
> copy function should be documented which we shall plan along with few other platform specific optimizations in future versions of DPDK. As this does not conflict with other  >platforms, can we still keep under AMD flag for now as suggested by Thomas?

From what I read above the patch is sort of in the half-ready stage.
Why rush here and try to push into DPDK things that don't fulfill DPDK policy?
Probably better to do all missing parts first (docs, tests, etc.) and then come up with an updated version.
 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy support for AMD platform
  2021-10-27 14:10                     ` Van Haaren, Harry
@ 2021-10-27 14:31                       ` Thomas Monjalon
  2021-10-29 16:01                         ` Song, Keesang
  0 siblings, 1 reply; 43+ messages in thread
From: Thomas Monjalon @ 2021-10-27 14:31 UTC (permalink / raw)
  To: Aman Kumar, Ananyev, Konstantin, Van Haaren, Harry
  Cc: mattias. ronnblom, dev, viacheslavo, Burakov, Anatoly, Song,
	Keesang, jerinjacobk, Richardson, Bruce, honnappa.nagarahalli,
	Ruifeng Wang, David Christensen, david.marchand, stephen

27/10/2021 16:10, Van Haaren, Harry:
> From: Aman Kumar <aman.kumar@vvdntech.in> 
> On Wed, Oct 27, 2021 at 5:53 PM Ananyev, Konstantin <mailto:konstantin.ananyev@intel.com> wrote
> > 
> > Hi Mattias,
> > 
> > > > 6) What is the use-case for this? When would a user *want* to use this instead
> > > of rte_memcpy()?
> > > > If the data being loaded is relevant to datapath/packets, presumably other
> > > packets might require the
> > > > loaded data, so temporal (normal) loads should be used to cache the source
> > > data?
> > >
> > >
> > > I'm not sure if your first question is rhetorical or not, but a memcpy()
> > > in a NT variant is certainly useful. One use case for a memcpy() with
> > > temporal loads and non-temporal stores is if you need to archive packet
> > > payload for (distant, potential) future use, and want to avoid causing
> > > unnecessary LLC evictions while doing so.
> > 
> > Yes I agree that there are certainly benefits in using cache-locality hints.
> > There is an open question around if the src or dst or both are non-temporal.
> > 
> > In the implementation of this patch, the NT/T type of store is reversed from your use-case:
> > 1) Loads are NT (so loaded data is not cached for future packets)
> > 2) Stores are T (so copied/dst data is now resident in L1/L2)
> > 
> > In theory there might even be valid uses for this type of memcpy where loaded
> > data is not needed again soon and stored data is referenced again soon,
> > although I cannot think of any here while typing this mail..
> > 
> > I think some use-case examples, and clear documentation on when/how to choose
> > between rte_memcpy() or any (potential future) rte_memcpy_nt() variants is required
> > to progress this patch.
> > 
> > Assuming a strong use-case exists, and it can be clearly indicators to users of DPDK APIs which
> > rte_memcpy() to use, we can look at technical details around enabling the implementation.
> > 
> 
> [Konstantin wrote]:
> +1 here.
> Function behaviour and restrictions (src parameter needs to be 16/32 B aligned, etc.),
> along with expected usage scenarios have to be documented properly.
> Again, as Harry pointed out, I don't see any AMD specific instructions in this function,
> so presumably such function can go into __AVX2__ code block and no new defines will
> be required. 
> 
> 
> [Aman wrote]:
> Agreed that APIs are generic but we've kept under an AMD flag for a simple reason that it is NOT tested on any other platform.
> A use-case on how to use this was planned earlier for mlx5 pmd but dropped in this version of patch as the data path of mlx5 is going to be refactored soon and may not be useful for future versions of mlx5 (>22.02). 
> Ref link: https://patchwork.dpdk.org/project/dpdk/patch/20211019104724.19416-2-aman.kumar@vvdntech.in/(we've plan to adapt this into future version)
> The patch in the link basically enhances mlx5 mprq implementation for our specific use-case and with 128B packet size, we achieve ~60% better perf. We understand the use of this copy function should be documented which we shall plan along with few other platform specific optimizations in future versions of DPDK. As this does not conflict with other platforms, can we still keep under AMD flag for now as suggested by Thomas?

I said I could merge if there is no objection.
I've overlooked that it's adding completely new functions in the API.
And the comments go in the direction of what I asked in previous version:
what is specific to AMD here?
Now seeing the valid objections, I agree it should be reworked.
We must provide API to applications which is generic, stable and well documented.


> [HvH wrote]:
> As an open-source community, any contributions should aim to improve the whole.
> In the past, numerous improvements have been merged to DPDK that improve performance.
> Sometimes these are architecture specific (x86/arm/ppc) sometimes the are ISA specific (SSE, AVX512, NEON).
> 
> I am not familiar with any cases in DPDK, where there is a #ifdef based on a *specific platform*.
> A quick "grep" through the "dpdk/lib" directory does not show any place where PMD or generic code
> has been explicitly optimized for a *specific platform*.
> 
> Obviously, in cases where ISA either exists or does not exist, yes there is an optimization to enable it.
> But this is not exposed as a top-level compile-time option, it uses runtime CPU ISA detection.
> 
> Please take a step back from the code, and look at what this patch asks of DPDK:
> "Please accept & maintain these changes upstream, which benefit only platform X, even though these ISA features are also available on other platforms".
> 
> Other patches that enhance performance of DPDK ask this:
> "Please accept & maintain these changes upstream, which benefit all platforms which have ISA capability X".
> 
> 
> === Question "As this does not conflict with other platforms, can we still keep under AMD flag for now"?
> I feel the contribution is too specific to a platform. Make it generic by enabling it at an ISA capability level.
> 
> Please yes, contribute to the DPDK community by improving performance of a PMD by enabling/leveraging ISA.
> But do so in a way that does not benefit only a specific platform - do so in a way that enhances all of DPDK, as
> other patches have done for the DPDK that this patch is built on.
> 
> If you have concerns that the PMD maintainers will not accept the changes due to potential regressions on
> other platforms, then discuss those, make a plan on how to performance validate, and work to a solution.
> 
> 
> === Regarding specifically the request for "can we still keep under AMD flag for now"?
> I do not believe we should introduce APIs for specific platforms. DPDK's EAL is an abstraction layer.
> The value of EAL is to provide a common abstraction. This platform-specific flag breaks the abstraction,
> and results in packaging issues, as well as API/ABI instability based on -Dcpu_instruction_set choice.
> So, no, we should not introduce APIs based on any compile-time flag.

I agree



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy support for AMD platform
  2021-10-27 14:31                       ` Thomas Monjalon
@ 2021-10-29 16:01                         ` Song, Keesang
  0 siblings, 0 replies; 43+ messages in thread
From: Song, Keesang @ 2021-10-29 16:01 UTC (permalink / raw)
  To: Thomas Monjalon, Aman Kumar, Ananyev, Konstantin, Van Haaren, Harry
  Cc: Mattias Rönnblom, dev, viacheslavo, Burakov, Anatoly,
	jerinjacobk, Richardson, Bruce, honnappa.nagarahalli,
	Ruifeng Wang, David Christensen, david.marchand, stephen

[AMD Official Use Only]

Hi Thomas,

There are some gaps among us, so I think we really need another quick meeting call to discuss. I will set up a call like the last time on Monday.
Please join in the call if possible.

Thanks,
Keesang

-----Original Message-----
From: Thomas Monjalon <thomas@monjalon.net>
Sent: Wednesday, October 27, 2021 7:31 AM
To: Aman Kumar <aman.kumar@vvdntech.in>; Ananyev, Konstantin <konstantin.ananyev@intel.com>; Van Haaren, Harry <harry.van.haaren@intel.com>
Cc: mattias. ronnblom <mattias.ronnblom@ericsson.com>; dev@dpdk.org; viacheslavo@nvidia.com; Burakov, Anatoly <anatoly.burakov@intel.com>; Song, Keesang <Keesang.Song@amd.com>; jerinjacobk@gmail.com; Richardson, Bruce <bruce.richardson@intel.com>; honnappa.nagarahalli@arm.com; Ruifeng Wang <ruifeng.wang@arm.com>; David Christensen <drc@linux.vnet.ibm.com>; david.marchand@redhat.com; stephen@networkplumber.org
Subject: Re: [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy support for AMD platform

[CAUTION: External Email]

27/10/2021 16:10, Van Haaren, Harry:
> From: Aman Kumar <aman.kumar@vvdntech.in> On Wed, Oct 27, 2021 at 5:53
> PM Ananyev, Konstantin <mailto:konstantin.ananyev@intel.com> wrote
> >
> > Hi Mattias,
> >
> > > > 6) What is the use-case for this? When would a user *want* to
> > > > use this instead
> > > of rte_memcpy()?
> > > > If the data being loaded is relevant to datapath/packets,
> > > > presumably other
> > > packets might require the
> > > > loaded data, so temporal (normal) loads should be used to cache
> > > > the source
> > > data?
> > >
> > >
> > > I'm not sure if your first question is rhetorical or not, but a
> > > memcpy() in a NT variant is certainly useful. One use case for a
> > > memcpy() with temporal loads and non-temporal stores is if you
> > > need to archive packet payload for (distant, potential) future
> > > use, and want to avoid causing unnecessary LLC evictions while doing so.
> >
> > Yes I agree that there are certainly benefits in using cache-locality hints.
> > There is an open question around if the src or dst or both are non-temporal.
> >
> > In the implementation of this patch, the NT/T type of store is reversed from your use-case:
> > 1) Loads are NT (so loaded data is not cached for future packets)
> > 2) Stores are T (so copied/dst data is now resident in L1/L2)
> >
> > In theory there might even be valid uses for this type of memcpy
> > where loaded data is not needed again soon and stored data is
> > referenced again soon, although I cannot think of any here while typing this mail..
> >
> > I think some use-case examples, and clear documentation on when/how
> > to choose between rte_memcpy() or any (potential future)
> > rte_memcpy_nt() variants is required to progress this patch.
> >
> > Assuming a strong use-case exists, and it can be clearly indicators
> > to users of DPDK APIs which
> > rte_memcpy() to use, we can look at technical details around enabling the implementation.
> >
>
> [Konstantin wrote]:
> +1 here.
> Function behaviour and restrictions (src parameter needs to be 16/32 B
> aligned, etc.), along with expected usage scenarios have to be documented properly.
> Again, as Harry pointed out, I don't see any AMD specific instructions
> in this function, so presumably such function can go into __AVX2__
> code block and no new defines will be required.
>
>
> [Aman wrote]:
> Agreed that APIs are generic but we've kept under an AMD flag for a simple reason that it is NOT tested on any other platform.
> A use-case on how to use this was planned earlier for mlx5 pmd but dropped in this version of patch as the data path of mlx5 is going to be refactored soon and may not be useful for future versions of mlx5 (>22.02).
> Ref link:
> https://patchwork.dpdk.org/project/dpdk/patch/20211019104724.19416-2-aman.kumar@vvdntech.in/ (we plan to adapt this in a future version). The patch in the link basically enhances the mlx5 mprq implementation for our specific use-case, and with 128B packet size we achieve ~60% better perf. We understand the use of this copy function should be documented, which we shall plan along with a few other platform-specific optimizations in future versions of DPDK. As this does not conflict with other platforms, can we still keep it under an AMD flag for now, as suggested by Thomas?

I said I could merge if there is no objection.
I've overlooked that it's adding completely new functions in the API.
And the comments go in the direction of what I asked in previous version:
what is specific to AMD here?
Now seeing the valid objections, I agree it should be reworked.
We must provide API to applications which is generic, stable and well documented.


> [HvH wrote]:
> As an open-source community, any contributions should aim to improve the whole.
> In the past, numerous improvements have been merged to DPDK that improve performance.
> Sometimes these are architecture specific (x86/arm/ppc) sometimes the are ISA specific (SSE, AVX512, NEON).
>
> I am not familiar with any cases in DPDK, where there is a #ifdef based on a *specific platform*.
> A quick "grep" through the "dpdk/lib" directory does not show any
> place where PMD or generic code has been explicitly optimized for a *specific platform*.
>
> Obviously, in cases where ISA either exists or does not exist, yes there is an optimization to enable it.
> But this is not exposed as a top-level compile-time option, it uses runtime CPU ISA detection.
>
> Please take a step back from the code, and look at what this patch asks of DPDK:
> "Please accept & maintain these changes upstream, which benefit only platform X, even though these ISA features are also available on other platforms".
>
> Other patches that enhance performance of DPDK ask this:
> "Please accept & maintain these changes upstream, which benefit all platforms which have ISA capability X".
>
>
> === Question "As this does not conflict with other platforms, can we still keep under AMD flag for now"?
> I feel the contribution is too specific to a platform. Make it generic by enabling it at an ISA capability level.
>
> Please yes, contribute to the DPDK community by improving performance of a PMD by enabling/leveraging ISA.
> But do so in a way that does not benefit only a specific platform - do
> so in a way that enhances all of DPDK, as other patches have done for the DPDK that this patch is built on.
>
> If you have concerns that the PMD maintainers will not accept the
> changes due to potential regressions on other platforms, then discuss those, make a plan on how to performance validate, and work to a solution.
>
>
> === Regarding specifically the request for "can we still keep under AMD flag for now"?
> I do not believe we should introduce APIs for specific platforms. DPDK's EAL is an abstraction layer.
> The value of EAL is to provide a common abstraction. This
> platform-specific flag breaks the abstraction, and results in packaging issues, as well as API/ABI instability based on -Dcpu_instruction_set choice.
> So, no, we should not introduce APIs based on any compile-time flag.

I agree



^ permalink raw reply	[flat|nested] 43+ messages in thread

end of thread, other threads:[~2021-11-01 10:28 UTC | newest]

Thread overview: 43+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-23  8:44 [dpdk-dev] [PATCH 1/2] lib/eal: add amd epyc2 memcpy routine to eal Aman Kumar
2021-08-23  8:44 ` [dpdk-dev] [PATCH 2/2] net/mlx5: optimize mprq memcpy for AMD EPYC2 platforms Aman Kumar
2021-10-13 16:53   ` Thomas Monjalon
2021-10-19 10:52     ` Aman Kumar
2021-08-23 15:21 ` [dpdk-dev] [PATCH 1/2] lib/eal: add amd epyc2 memcpy routine to eal Jerin Jacob
2021-08-30  9:39   ` Aman Kumar
2021-10-19 10:47 ` [dpdk-dev] [PATCH v2 " Aman Kumar
2021-10-19 10:47   ` [dpdk-dev] [PATCH v2 2/2] net/mlx5: optimize mprq memcpy for AMD EPYC2 platform Aman Kumar
2021-10-19 12:31   ` [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal Thomas Monjalon
2021-10-19 15:35     ` Stephen Hemminger
2021-10-21 17:10     ` Song, Keesang
2021-10-21 17:40       ` Ananyev, Konstantin
2021-10-21 18:12         ` Song, Keesang
2021-10-21 18:41           ` Thomas Monjalon
2021-10-21 19:03             ` Song, Keesang
2021-10-21 19:50               ` Thomas Monjalon
2021-10-21 20:14   ` Thomas Monjalon
2021-10-22  8:45     ` Bruce Richardson
2021-10-26 15:56   ` [dpdk-dev] [PATCH v3 1/3] config/x86: add support for AMD platform Aman Kumar
2021-10-26 15:56     ` [dpdk-dev] [PATCH v3 2/3] doc/guides: add dpdk build instruction for AMD platforms Aman Kumar
2021-10-26 16:07       ` Thomas Monjalon
2021-10-27  6:30         ` Aman Kumar
2021-10-26 15:56     ` [dpdk-dev] [PATCH v3 3/3] lib/eal: add temporal store memcpy support on AMD platform Aman Kumar
2021-10-26 16:14       ` Thomas Monjalon
2021-10-27  6:34         ` Aman Kumar
2021-10-27  7:59           ` Thomas Monjalon
2021-10-26 21:10       ` Stephen Hemminger
2021-10-27  6:43         ` Aman Kumar
2021-10-26 16:01     ` [dpdk-dev] [PATCH v3 1/3] config/x86: add support for " Thomas Monjalon
2021-10-27  6:26       ` Aman Kumar
2021-10-27  7:28     ` [dpdk-dev] [PATCH v4 1/2] " Aman Kumar
2021-10-27  7:28       ` [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy " Aman Kumar
2021-10-27  8:13         ` Thomas Monjalon
2021-10-27 11:03           ` Van Haaren, Harry
2021-10-27 11:41             ` Mattias Rönnblom
2021-10-27 12:15               ` Van Haaren, Harry
2021-10-27 12:22                 ` Ananyev, Konstantin
2021-10-27 13:34                   ` Aman Kumar
2021-10-27 14:10                     ` Van Haaren, Harry
2021-10-27 14:31                       ` Thomas Monjalon
2021-10-29 16:01                         ` Song, Keesang
2021-10-27 14:26                     ` Ananyev, Konstantin
2021-10-27 11:33         ` Mattias Rönnblom

DPDK patches and discussions

This inbox may be cloned and mirrored by anyone:

	git clone --mirror http://inbox.dpdk.org/dev/0 dev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 dev dev/ http://inbox.dpdk.org/dev \
		dev@dpdk.org
	public-inbox-index dev

Example config snippet for mirrors.
Newsgroup available over NNTP:
	nntp://inbox.dpdk.org/inbox.dpdk.dev


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git