Calls to rte_memcpy_generic could result in unaligned loads/stores for 1 < n < 16. This is undefined behavior according to the C standard, and it gets flagged by the clang undefined behavior sanitizer. rte_memcpy_generic is called with unaligned src and dst addresses. When 1 < n < 16, the code would cast both src and dst to a qword, dword or word pointer, without verifying the alignment of src/dst. The code was changed to use inline assembly for the load/store operations. Unaligned load/store operations are permitted in x64 assembly. Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time") Cc: Xiaoyun Li <xiaoyun.li@intel.com> Cc: stable@dpdk.org Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com> --- I have just submitted another patch related to rte_memcpy. The changes in the other patch are straightforward. The changes in this patch are likely to require more discussion, so I've decided to keep both patches completely separate. Please let me know if I should have done otherwise. Based on the benchmarks of rte_memcpy that are in the dpdk tree, the changes in this patch do not negatively affect performance. lib/eal/x86/include/rte_memcpy.h | 106 +++++++++++++------------------ 1 file changed, 44 insertions(+), 62 deletions(-) diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h index 1b6c6e585f..d2a6099200 100644 --- a/lib/eal/x86/include/rte_memcpy.h +++ b/lib/eal/x86/include/rte_memcpy.h @@ -45,6 +45,47 @@ extern "C" { static __rte_always_inline void * rte_memcpy(void *dst, const void *src, size_t n); +/** + * Copy bytes from one location to another, + * locations should not overlap. + * Use with unaligned src/dst, and n <= 15. 
+ */ +static __rte_always_inline void * +rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n) +{ + void *ret = dst; + if (n & 0x08) { + asm ("movq (%[src]), %%rax\n" + "movq %%rax, (%[dst])" + : + : [dst] "r" (dst), [src] "r" (src) + : "rax", "memory"); + src = (const uint64_t *)src + 1; + dst = (uint64_t *)dst + 1; + } + if (n & 0x04) { + asm ("movl (%[src]), %%eax\n" + "movl %%eax, (%[dst])" + : + : [dst] "r" (dst), [src] "r" (src) + : "eax", "memory"); + src = (const uint32_t *)src + 1; + dst = (uint32_t *)dst + 1; + } + if (n & 0x02) { + asm ("movw (%[src]), %%ax\n" + "movw %%ax, (%[dst])" + : + : [dst] "r" (dst), [src] "r" (src) + : "ax", "memory"); + src = (const uint16_t *)src + 1; + dst = (uint16_t *)dst + 1; + } + if (n & 0x01) + *(uint8_t *)dst = *(const uint8_t *)src; + return ret; +} + #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 #define ALIGNMENT_MASK 0x3F @@ -171,8 +212,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n) static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t bits; @@ -181,24 +220,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) - *(uint64_t *)dstu = *(const uint64_t *)srcu; - return ret; + return rte_mov15_or_less_unaligned(dst, src, n); } /** @@ -379,8 +401,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t 
*src, size_t n) static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t bits; @@ -389,25 +409,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) { - *(uint64_t *)dstu = *(const uint64_t *)srcu; - } - return ret; + return rte_mov15_or_less_unaligned(dst, src, n); } /** @@ -672,8 +674,6 @@ static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t srcofs; @@ -682,25 +682,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) { - *(uint64_t *)dstu = *(const uint64_t *)srcu; - } - return ret; + return rte_mov15_or_less_unaligned(dst, src, 
n); } /** -- 2.25.1
Calls to rte_memcpy_generic could result in unaligned loads/stores for 1 < n < 16. This is undefined behavior according to the C standard, and it gets flagged by the clang undefined behavior sanitizer. rte_memcpy_generic is called with unaligned src and dst addresses. When 1 < n < 16, the code would cast both src and dst to a qword, dword or word pointer, without verifying the alignment of src/dst. The code was changed to use a for loop to copy the bytes one by one. Experimentation on compiler explorer indicates that gcc 7+ (released in 2017) and clang 7+ (released in 2018) both optimize out the for loop with the least number of memory loads and stores, if n is known at compile-time. If n is only known at compile-time, gcc and clang have different behaviour but they both seem to recognize that a memcpy is being done. More recent versions of both gcc/clang seem to also produce even more optimized results. Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time") Cc: Xiaoyun Li <xiaoyun.li@intel.com> Cc: stable@dpdk.org Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com> --- I forgot that code under x86 also needs to compile for 32-bit (obviously). So, I did some more experimentation and replaced the assembly code with a regular for loop. Explanations are in the updated commit message. Experimentation was done on compiler explorer here: https://godbolt.org/z/zK54rzPEn lib/eal/x86/include/rte_memcpy.h | 82 ++++++++------------------------ 1 file changed, 20 insertions(+), 62 deletions(-) diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h index 1b6c6e585f..e422397e49 100644 --- a/lib/eal/x86/include/rte_memcpy.h +++ b/lib/eal/x86/include/rte_memcpy.h @@ -45,6 +45,23 @@ extern "C" { static __rte_always_inline void * rte_memcpy(void *dst, const void *src, size_t n); +/** + * Copy bytes from one location to another, + * locations should not overlap. + * Use with unaligned src/dst, and n <= 15. 
+ */ +static __rte_always_inline void * +rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n) +{ + void *ret = dst; + for (; n; n--) { + *((char *)dst) = *((const char *) src); + dst = ((char *)dst) + 1; + src = ((const char *)src) + 1; + } + return ret; +} + #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 #define ALIGNMENT_MASK 0x3F @@ -171,8 +188,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n) static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t bits; @@ -181,24 +196,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) - *(uint64_t *)dstu = *(const uint64_t *)srcu; - return ret; + return rte_mov15_or_less_unaligned(dst, src, n); } /** @@ -379,8 +377,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t bits; @@ -389,25 +385,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = 
*(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) { - *(uint64_t *)dstu = *(const uint64_t *)srcu; - } - return ret; + return rte_mov15_or_less_unaligned(dst, src, n); } /** @@ -672,8 +650,6 @@ static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t srcofs; @@ -682,25 +658,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) { - *(uint64_t *)dstu = *(const uint64_t *)srcu; - } - return ret; + return rte_mov15_or_less_unaligned(dst, src, n); } /** -- 2.25.1
On Sat, 15 Jan 2022 16:39:50 -0500
Luc Pelletier <lucp.at.work@gmail.com> wrote:
> diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
> index 1b6c6e585f..e422397e49 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h
> @@ -45,6 +45,23 @@ extern "C" {
> static __rte_always_inline void *
> rte_memcpy(void *dst, const void *src, size_t n);
>
> +/**
> + * Copy bytes from one location to another,
> + * locations should not overlap.
> + * Use with unaligned src/dst, and n <= 15.
> + */
> +static __rte_always_inline void *
> +rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n)
> +{
> + void *ret = dst;
> + for (; n; n--) {
> + *((char *)dst) = *((const char *) src);
> + dst = ((char *)dst) + 1;
> + src = ((const char *)src) + 1;
> + }
> + return ret;
> +}
X86 always allows unaligned access. Regardless of what tools say.
Why impose additional overhead in performance critical code.
> X86 always allows unaligned access. Irregardless of what tools say. > Why impose additional overhead in performance critical code. Let me preface my response by saying that I'm not a C compiler developer. Hopefully someone who is will read this and chime in. I agree that X86 allows unaligned store/load. However, the C standard doesn't, and says that it's undefined behavior. This means that the code relies on undefined behavior. It may do the right thing all the time, almost all the time, some of the time... it's undefined. It may work now but it may stop working in the future. Here's a good discussion on SO about unaligned accesses in C on x86: https://stackoverflow.com/questions/46790550/c-undefined-behavior-strict-aliasing-rule-or-incorrect-alignment/46790815#46790815 There's no way to do the unaligned store/load in C (that I know of) without invoking undefined behavior. I can see 2 options, either write the code in assembly, or use some other C construct that doesn't rely on undefined behavior. While the for loop may seem slower than the other options, it surprisingly results in fewer load/store operations in certain scenarios. For example, if n == 15 and it's known at compile-time, the compiler will generate 2 overlapping qword load/store operations (rather than the 4 that are currently being done with the current code). All that being said, I can go back to something similar to my first patch. Using inline assembly, and making sure this time that it works for 32-bit too. I will post a patch in a few minutes that does exactly that. Maintainers can then chime in with their preferred option.
Calls to rte_memcpy_generic could result in unaligned loads/stores for 1 < n < 16. This is undefined behavior according to the C standard, and it gets flagged by the clang undefined behavior sanitizer. rte_memcpy_generic is called with unaligned src and dst addresses. When 1 < n < 16, the code would cast both src and dst to a qword, dword or word pointer, without verifying the alignment of src/dst. The code was changed to use inline assembly for the load/store operations. Unaligned load/store operations are permitted in x86/x64 assembly. Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time") Cc: Xiaoyun Li <xiaoyun.li@intel.com> Cc: stable@dpdk.org Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com> --- Please note that I didn't write the entire function in inline assembly. The reason why I kept the bitwise ands as C code is so the optimizer can remove the branches when n is known at compile-time. lib/eal/x86/include/rte_memcpy.h | 134 +++++++++++++++++-------------- 1 file changed, 72 insertions(+), 62 deletions(-) diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h index 1b6c6e585f..b99c1b2ca5 100644 --- a/lib/eal/x86/include/rte_memcpy.h +++ b/lib/eal/x86/include/rte_memcpy.h @@ -45,6 +45,75 @@ extern "C" { static __rte_always_inline void * rte_memcpy(void *dst, const void *src, size_t n); +#if defined(__i386__) + #define RTE_ACCUMULATOR_REGISTER_NAME "eax" +#elif defined(__x86_64__) + #define RTE_ACCUMULATOR_REGISTER_NAME "rax" +#endif + +/** + * Copy bytes from one location to another, + * locations should not overlap. + * Use with unaligned src/dst, and n <= 15. 
+ */ +static __rte_always_inline void * +rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n) +{ + void *ret = dst; + if (n & 8) { + asm ( +#if defined(__i386__) + "movl (%[src]), %%eax\n" + "movl %%eax, (%[dst])\n" + "add $4, %[src]\n" + "add $4, %[dst]\n" + "movl (%[src]), %%eax\n" + "movl %%eax, (%[dst])\n" + "add $4, %[src]\n" + "add $4, %[dst]\n" +#elif defined(__x86_64__) + "movq (%[src]), %%rax\n" + "movq %%rax, (%[dst])\n" + "add $8, %[src]\n" + "add $8, %[dst]\n" +#else + #error Unsupported architecture +#endif + : [dst] "+r" (dst), [src] "+r" (src) + : + : RTE_ACCUMULATOR_REGISTER_NAME, "memory"); + } + if (n & 4) { + asm ( + "movl (%[src]), %%eax\n" + "movl %%eax, (%[dst])\n" + "add $4, %[src]\n" + "add $4, %[dst]\n" + : [dst] "+r" (dst), [src] "+r" (src) + : + : RTE_ACCUMULATOR_REGISTER_NAME, "memory"); + } + if (n & 2) { + asm ( + "movw (%[src]), %%ax\n" + "movw %%ax, (%[dst])\n" + "add $2, %[src]\n" + "add $2, %[dst]\n" + : [dst] "+r" (dst), [src] "+r" (src) + : + : RTE_ACCUMULATOR_REGISTER_NAME, "memory"); + } + if (n & 1) { + asm ( + "movb (%[src]), %%al\n" + "movb %%al, (%[dst])\n" + : [dst] "+r" (dst), [src] "+r" (src) + : + : RTE_ACCUMULATOR_REGISTER_NAME, "memory"); + } + return ret; +} + #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 #define ALIGNMENT_MASK 0x3F @@ -171,8 +240,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n) static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t bits; @@ -181,24 +248,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t 
*)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) - *(uint64_t *)dstu = *(const uint64_t *)srcu; - return ret; + return rte_mov15_or_less_unaligned(dst, src, n); } /** @@ -379,8 +429,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t bits; @@ -389,25 +437,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) { - *(uint64_t *)dstu = *(const uint64_t *)srcu; - } - return ret; + return rte_mov15_or_less_unaligned(dst, src, n); } /** @@ -672,8 +702,6 @@ static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t srcofs; @@ -682,25 +710,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t 
*)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) { - *(uint64_t *)dstu = *(const uint64_t *)srcu; - } - return ret; + return rte_mov15_or_less_unaligned(dst, src, n); } /** -- 2.25.1
As a side note, and to follow up on Stephen's indication that this is 'performance critical code', I think it might be worthwhile to revisit/revalidate the current implementation of rte_memcpy. There's a good thread here that mentions rte_memcpy, and its performance on at least one platform/architecture combination is far from being the best: https://github.com/microsoft/mimalloc/issues/201 It seems like enhanced rep movsb could be faster on more recent CPUs, but that's currently not being used in the current implementation of rte_memcpy. I understand some of this may not be directly related to this patch, but whoever looks at this patch might want to provide their thoughts on whether updating rte_memcpy would be worthwhile? I suspect looking at all current public implementations of memcpy (libc, microsoft, compilers builtin implementations, etc.) might help in making improvements. Le dim. 16 janv. 2022 à 09:15, Luc Pelletier <lucp.at.work@gmail.com> a écrit : > > Calls to rte_memcpy_generic could result in unaligned loads/stores for > 1 < n < 16. This is undefined behavior according to the C standard, > and it gets flagged by the clang undefined behavior sanitizer. > > rte_memcpy_generic is called with unaligned src and dst addresses. > When 1 < n < 16, the code would cast both src and dst to a qword, > dword or word pointer, without verifying the alignment of src/dst. The > code was changed to use inline assembly for the load/store operations. > Unaligned load/store operations are permitted in x86/x64 assembly. > > Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time") > Cc: Xiaoyun Li <xiaoyun.li@intel.com> > Cc: stable@dpdk.org > > Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com> > --- > > Please note that I didn't write the entire function in inline assembly. > The reason why I kept the bitwise ands as C code is so the optimizer can > remove the branches when n is known at compile-time. 
> > lib/eal/x86/include/rte_memcpy.h | 134 +++++++++++++++++-------------- > 1 file changed, 72 insertions(+), 62 deletions(-) > > diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h > index 1b6c6e585f..b99c1b2ca5 100644 > --- a/lib/eal/x86/include/rte_memcpy.h > +++ b/lib/eal/x86/include/rte_memcpy.h > @@ -45,6 +45,75 @@ extern "C" { > static __rte_always_inline void * > rte_memcpy(void *dst, const void *src, size_t n); > > +#if defined(__i386__) > + #define RTE_ACCUMULATOR_REGISTER_NAME "eax" > +#elif defined(__x86_64__) > + #define RTE_ACCUMULATOR_REGISTER_NAME "rax" > +#endif > + > +/** > + * Copy bytes from one location to another, > + * locations should not overlap. > + * Use with unaligned src/dst, and n <= 15. > + */ > +static __rte_always_inline void * > +rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n) > +{ > + void *ret = dst; > + if (n & 8) { > + asm ( > +#if defined(__i386__) > + "movl (%[src]), %%eax\n" > + "movl %%eax, (%[dst])\n" > + "add $4, %[src]\n" > + "add $4, %[dst]\n" > + "movl (%[src]), %%eax\n" > + "movl %%eax, (%[dst])\n" > + "add $4, %[src]\n" > + "add $4, %[dst]\n" > +#elif defined(__x86_64__) > + "movq (%[src]), %%rax\n" > + "movq %%rax, (%[dst])\n" > + "add $8, %[src]\n" > + "add $8, %[dst]\n" > +#else > + #error Unsupported architecture > +#endif > + : [dst] "+r" (dst), [src] "+r" (src) > + : > + : RTE_ACCUMULATOR_REGISTER_NAME, "memory"); > + } > + if (n & 4) { > + asm ( > + "movl (%[src]), %%eax\n" > + "movl %%eax, (%[dst])\n" > + "add $4, %[src]\n" > + "add $4, %[dst]\n" > + : [dst] "+r" (dst), [src] "+r" (src) > + : > + : RTE_ACCUMULATOR_REGISTER_NAME, "memory"); > + } > + if (n & 2) { > + asm ( > + "movw (%[src]), %%ax\n" > + "movw %%ax, (%[dst])\n" > + "add $2, %[src]\n" > + "add $2, %[dst]\n" > + : [dst] "+r" (dst), [src] "+r" (src) > + : > + : RTE_ACCUMULATOR_REGISTER_NAME, "memory"); > + } > + if (n & 1) { > + asm ( > + "movb (%[src]), %%al\n" > + "movb %%al, (%[dst])\n" > + : 
[dst] "+r" (dst), [src] "+r" (src) > + : > + : RTE_ACCUMULATOR_REGISTER_NAME, "memory"); > + } > + return ret; > +} > + > #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 > > #define ALIGNMENT_MASK 0x3F > @@ -171,8 +240,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n) > static __rte_always_inline void * > rte_memcpy_generic(void *dst, const void *src, size_t n) > { > - uintptr_t dstu = (uintptr_t)dst; > - uintptr_t srcu = (uintptr_t)src; > void *ret = dst; > size_t dstofss; > size_t bits; > @@ -181,24 +248,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) > * Copy less than 16 bytes > */ > if (n < 16) { > - if (n & 0x01) { > - *(uint8_t *)dstu = *(const uint8_t *)srcu; > - srcu = (uintptr_t)((const uint8_t *)srcu + 1); > - dstu = (uintptr_t)((uint8_t *)dstu + 1); > - } > - if (n & 0x02) { > - *(uint16_t *)dstu = *(const uint16_t *)srcu; > - srcu = (uintptr_t)((const uint16_t *)srcu + 1); > - dstu = (uintptr_t)((uint16_t *)dstu + 1); > - } > - if (n & 0x04) { > - *(uint32_t *)dstu = *(const uint32_t *)srcu; > - srcu = (uintptr_t)((const uint32_t *)srcu + 1); > - dstu = (uintptr_t)((uint32_t *)dstu + 1); > - } > - if (n & 0x08) > - *(uint64_t *)dstu = *(const uint64_t *)srcu; > - return ret; > + return rte_mov15_or_less_unaligned(dst, src, n); > } > > /** > @@ -379,8 +429,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) > static __rte_always_inline void * > rte_memcpy_generic(void *dst, const void *src, size_t n) > { > - uintptr_t dstu = (uintptr_t)dst; > - uintptr_t srcu = (uintptr_t)src; > void *ret = dst; > size_t dstofss; > size_t bits; > @@ -389,25 +437,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) > * Copy less than 16 bytes > */ > if (n < 16) { > - if (n & 0x01) { > - *(uint8_t *)dstu = *(const uint8_t *)srcu; > - srcu = (uintptr_t)((const uint8_t *)srcu + 1); > - dstu = (uintptr_t)((uint8_t *)dstu + 1); > - } > - if (n & 0x02) { > - *(uint16_t *)dstu = *(const uint16_t *)srcu; > - srcu = 
(uintptr_t)((const uint16_t *)srcu + 1); > - dstu = (uintptr_t)((uint16_t *)dstu + 1); > - } > - if (n & 0x04) { > - *(uint32_t *)dstu = *(const uint32_t *)srcu; > - srcu = (uintptr_t)((const uint32_t *)srcu + 1); > - dstu = (uintptr_t)((uint32_t *)dstu + 1); > - } > - if (n & 0x08) { > - *(uint64_t *)dstu = *(const uint64_t *)srcu; > - } > - return ret; > + return rte_mov15_or_less_unaligned(dst, src, n); > } > > /** > @@ -672,8 +702,6 @@ static __rte_always_inline void * > rte_memcpy_generic(void *dst, const void *src, size_t n) > { > __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; > - uintptr_t dstu = (uintptr_t)dst; > - uintptr_t srcu = (uintptr_t)src; > void *ret = dst; > size_t dstofss; > size_t srcofs; > @@ -682,25 +710,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) > * Copy less than 16 bytes > */ > if (n < 16) { > - if (n & 0x01) { > - *(uint8_t *)dstu = *(const uint8_t *)srcu; > - srcu = (uintptr_t)((const uint8_t *)srcu + 1); > - dstu = (uintptr_t)((uint8_t *)dstu + 1); > - } > - if (n & 0x02) { > - *(uint16_t *)dstu = *(const uint16_t *)srcu; > - srcu = (uintptr_t)((const uint16_t *)srcu + 1); > - dstu = (uintptr_t)((uint16_t *)dstu + 1); > - } > - if (n & 0x04) { > - *(uint32_t *)dstu = *(const uint32_t *)srcu; > - srcu = (uintptr_t)((const uint32_t *)srcu + 1); > - dstu = (uintptr_t)((uint32_t *)dstu + 1); > - } > - if (n & 0x08) { > - *(uint64_t *)dstu = *(const uint64_t *)srcu; > - } > - return ret; > + return rte_mov15_or_less_unaligned(dst, src, n); > } > > /** > -- > 2.25.1 >
On Sun, 16 Jan 2022 09:09:49 -0500
Luc Pelletier <lucp.at.work@gmail.com> wrote:
> > X86 always allows unaligned access. Irregardless of what tools say.
> > Why impose additional overhead in performance critical code.
>
> Let me preface my response by saying that I'm not a C compiler developer.
> Hopefully someone who is will read this and chime in.
>
> I agree that X86 allows unaligned store/load. However, the C standard doesn't,
> and says that it's undefined behavior. This means that the code relies on
> undefined behavior. It may do the right thing all the time, almost all the time,
> some of the time... it's undefined. It may work now but it may stop
> working in the future.
> Here's a good discussion on SO about unaligned accesses in C on x86:
>
> https://stackoverflow.com/questions/46790550/c-undefined-behavior-strict-aliasing-rule-or-incorrect-alignment/46790815#46790815
>
> There's no way to do the unaligned store/load in C (that I know of)
> without invoking
> undefined behavior. I can see 2 options, either write the code in
> assembly, or use
> some other C construct that doesn't rely on undefined behavior.
>
> While the for loop may seem slower than the other options, it
> surprisingly results in
> fewer load/store operations in certain scenarios. For example, if n ==
> 15 and it's
> known at compile-time, the compiler will generate 2 overlapping qword load/store
> operations (rather than the 4 that are currently being done with the
> current code).
>
> All that being said, I can go back to something similar to my first
> patch. Using inline
> assembly, and making sure this time that it works for 32-bit too. I
> will post a patch in
> a few minutes that does exactly that. Maintainers can then chime in
> with their preferred
> option.
I would propose that DPDK have the same kind of define as the kernel
for SAFE_UNALIGNED_ACCESS. The C standard has to apply to all architectures
but DPDK will make the choice to be fast rather than standards conformant.
On Sun, 16 Jan 2022 09:33:19 -0500
Luc Pelletier <lucp.at.work@gmail.com> wrote:
> As a side note, and to follow up on Stephen's indication that this is
> 'performance critical code', I think it might be worthwhile to
> revisit/revalidate the current implementation of rte_memcpy. There's a
> good thread here that mentions rte_memcpy, and its performance on at
> least one platform/architecture combination is far from being the
> best:
>
> https://github.com/microsoft/mimalloc/issues/201
>
> It seems like enhanced rep movsb could be faster on more recent CPUs,
> but that's currently not being used in the current implementation of
> rte_memcpy.
>
> I understand some of this may not be directly related to this patch,
> but whoever looks at this patch might want to provide their thoughts
> on whether updating rte_memcpy would be worthwhile? I suspect looking
> at all current public implementations of memcpy (libc, microsoft,
> compilers builtin implementations, etc.) might help in making
> improvements.
I would prefer that rte_memcpy did not exist at all.
Instead the system library should always be used.
It only exists because some architectures have slower code
in glibc.
> From: Stephen Hemminger [mailto:stephen@networkplumber.org] > Sent: Sunday, 16 January 2022 17.34 > > On Sun, 16 Jan 2022 09:33:19 -0500 > Luc Pelletier <lucp.at.work@gmail.com> wrote: > > > As a side note, and to follow up on Stephen's indication that this is > > 'performance critical code', I think it might be worthwhile to > > revisit/revalidate the current implementation of rte_memcpy. There's > a > > good thread here that mentions rte_memcpy, and its performance on at > > least one platform/architecture combination is far from being the > > best: > > > > https://github.com/microsoft/mimalloc/issues/201 > > > > It seems like enhanced rep movsb could be faster on more recent CPUs, > > but that's currently not being used in the current implementation of > > rte_memcpy. > > > > I understand some of this may not be directly related to this patch, > > but whoever looks at this patch might want to provide their thoughts > > on whether updating rte_memcpy would be worthwhile? I suspect looking > > at all current public implementations of memcpy (libc, microsoft, > > compilers builtin implementations, etc.) might help in making > > improvements. > > I would prefer that rte_memcpy did not exist at all. > Instead the system library should always be used. > > It is only exists because some architectures have slower code > in glibc. I wonder if that is still the case? Otherwise, DPDK is probably full of obsolete optimizations, which should be eliminated like this: http://inbox.dpdk.org/dev/20210918114930.245387-1-mail@gms.tf/
Calls to rte_memcpy_generic could result in unaligned loads/stores for 1 < n < 16. This is undefined behavior according to the C standard, and it gets flagged by the clang undefined behavior sanitizer. rte_memcpy_generic is called with unaligned src and dst addresses. When 1 < n < 16, the code would cast both src and dst to a qword, dword or word pointer, without verifying the alignment of src/dst. The code was changed to use inline assembly for the load/store operations. Unaligned load/store operations are permitted in x86/x64 assembly. Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time") Cc: Xiaoyun Li <xiaoyun.li@intel.com> Cc: stable@dpdk.org Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com> --- Added volatile for asm statements. I had been testing with clang and it was working but it seems gcc requires this, otherwise the asm is completely discarded by the optimizer. The following note in the gcc docs seems to explain why: "If the C code that follows the asm makes no use of any of the output operands, use volatile for the asm statement to prevent the optimizers from discarding the asm statement as unneeded (see Volatile)." lib/eal/x86/include/rte_memcpy.h | 134 +++++++++++++++++-------------- 1 file changed, 72 insertions(+), 62 deletions(-) diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h index 1b6c6e585f..5af5c2ed61 100644 --- a/lib/eal/x86/include/rte_memcpy.h +++ b/lib/eal/x86/include/rte_memcpy.h @@ -45,6 +45,75 @@ extern "C" { static __rte_always_inline void * rte_memcpy(void *dst, const void *src, size_t n); +#if defined(__i386__) + #define RTE_ACCUMULATOR_REGISTER_NAME "eax" +#elif defined(__x86_64__) + #define RTE_ACCUMULATOR_REGISTER_NAME "rax" +#endif + +/** + * Copy bytes from one location to another, + * locations should not overlap. + * Use with unaligned src/dst, and n <= 15. 
+ */ +static __rte_always_inline void * +rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n) +{ + void *ret = dst; + if (n & 8) { + asm volatile( +#if defined(__i386__) + "movl (%[src]), %%eax\n" + "movl %%eax, (%[dst])\n" + "add $4, %[src]\n" + "add $4, %[dst]\n" + "movl (%[src]), %%eax\n" + "movl %%eax, (%[dst])\n" + "add $4, %[src]\n" + "add $4, %[dst]\n" +#elif defined(__x86_64__) + "movq (%[src]), %%rax\n" + "movq %%rax, (%[dst])\n" + "add $8, %[src]\n" + "add $8, %[dst]\n" +#else + #error Unsupported architecture +#endif + : [dst] "+r" (dst), [src] "+r" (src) + : + : RTE_ACCUMULATOR_REGISTER_NAME, "memory"); + } + if (n & 4) { + asm volatile( + "movl (%[src]), %%eax\n" + "movl %%eax, (%[dst])\n" + "add $4, %[src]\n" + "add $4, %[dst]\n" + : [dst] "+r" (dst), [src] "+r" (src) + : + : RTE_ACCUMULATOR_REGISTER_NAME, "memory"); + } + if (n & 2) { + asm volatile( + "movw (%[src]), %%ax\n" + "movw %%ax, (%[dst])\n" + "add $2, %[src]\n" + "add $2, %[dst]\n" + : [dst] "+r" (dst), [src] "+r" (src) + : + : RTE_ACCUMULATOR_REGISTER_NAME, "memory"); + } + if (n & 1) { + asm volatile( + "movb (%[src]), %%al\n" + "movb %%al, (%[dst])\n" + : [dst] "+r" (dst), [src] "+r" (src) + : + : RTE_ACCUMULATOR_REGISTER_NAME, "memory"); + } + return ret; +} + #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 #define ALIGNMENT_MASK 0x3F @@ -171,8 +240,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n) static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t bits; @@ -181,24 +248,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = 
(uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) - *(uint64_t *)dstu = *(const uint64_t *)srcu; - return ret; + return rte_mov15_or_less_unaligned(dst, src, n); } /** @@ -379,8 +429,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t bits; @@ -389,25 +437,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) { - *(uint64_t *)dstu = *(const uint64_t *)srcu; - } - return ret; + return rte_mov15_or_less_unaligned(dst, src, n); } /** @@ -672,8 +702,6 @@ static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t srcofs; @@ -682,25 +710,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if 
(n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) { - *(uint64_t *)dstu = *(const uint64_t *)srcu; - } - return ret; + return rte_mov15_or_less_unaligned(dst, src, n); } /** -- 2.25.1
Calls to rte_memcpy_generic could result in unaligned loads/stores for 1 < n < 16. This is undefined behavior according to the C standard, and it gets flagged by the clang undefined behavior sanitizer. rte_memcpy_generic is called with unaligned src and dst addresses. When 1 < n < 16, the code would cast both src and dst to a qword, dword or word pointer, without verifying the alignment of src/dst. The code was changed to use a packed structure to perform the unaligned load/store operations. This results in unaligned load/store operations to be C standards-compliant. Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time") Cc: Xiaoyun Li <xiaoyun.li@intel.com> Cc: stable@dpdk.org Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com> --- Thanks to Stephen's pointer to look at the linux kernel, I was able to find a way to perform the unaligned load/store using pure C code. The new functions added to perform the load/store could likely be moved to a different file and the code duplication could likely be eliminated by using a macro. However, I will hold off on making these changes until I get confirmation from maintainers that this technique is acceptable and this is what we want to move forward with. 
lib/eal/x86/include/rte_memcpy.h | 142 +++++++++++++++++-------------- 1 file changed, 80 insertions(+), 62 deletions(-) diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h index 1b6c6e585f..4e876d39eb 100644 --- a/lib/eal/x86/include/rte_memcpy.h +++ b/lib/eal/x86/include/rte_memcpy.h @@ -45,6 +45,83 @@ extern "C" { static __rte_always_inline void * rte_memcpy(void *dst, const void *src, size_t n); +static __rte_always_inline uint64_t +rte_load_unaligned_uint64(const void *ptr) +{ + struct unaligned_uint64 { uint64_t val; } __rte_packed; + return ((const struct unaligned_uint64 *)ptr)->val; +} + +static __rte_always_inline uint32_t +rte_load_unaligned_uint32(const void *ptr) +{ + struct unaligned_uint32 { uint32_t val; } __rte_packed; + return ((const struct unaligned_uint32 *)ptr)->val; +} + +static __rte_always_inline uint16_t +rte_load_unaligned_uint16(const void *ptr) +{ + struct unaligned_uint16 { uint16_t val; } __rte_packed; + return ((const struct unaligned_uint16 *)ptr)->val; +} + +static __rte_always_inline void +rte_store_unaligned_uint64(void *ptr, uint64_t val) +{ + struct unaligned_uint64 { uint64_t val; } __rte_packed; + ((struct unaligned_uint64 *)ptr)->val = val; +} + +static __rte_always_inline void +rte_store_unaligned_uint32(void *ptr, uint32_t val) +{ + struct unaligned_uint32 { uint32_t val; } __rte_packed; + ((struct unaligned_uint32 *)ptr)->val = val; +} + +static __rte_always_inline void +rte_store_unaligned_uint16(void *ptr, uint16_t val) +{ + struct unaligned_uint16 { uint16_t val; } __rte_packed; + ((struct unaligned_uint16 *)ptr)->val = val; +} + +/** + * Copy bytes from one location to another, + * locations should not overlap. + * Use with unaligned src/dst, and n <= 15. 
+ */ +static __rte_always_inline void * +rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n) +{ + void *ret = dst; + if (n & 8) { + rte_store_unaligned_uint64( + dst, + rte_load_unaligned_uint64(src)); + src = ((const uint64_t *)src + 1); + dst = ((uint64_t *)dst + 1); + } + if (n & 4) { + rte_store_unaligned_uint32( + dst, + rte_load_unaligned_uint32(src)); + src = ((const uint32_t *)src + 1); + dst = ((uint32_t *)dst + 1); + } + if (n & 2) { + rte_store_unaligned_uint16( + dst, + rte_load_unaligned_uint16(src)); + src = ((const uint16_t *)src + 1); + dst = ((uint16_t *)dst + 1); + } + if (n & 1) + *(uint8_t *)dst = *(const uint8_t *)src; + return ret; +} + #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 #define ALIGNMENT_MASK 0x3F @@ -171,8 +248,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n) static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t bits; @@ -181,24 +256,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) - *(uint64_t *)dstu = *(const uint64_t *)srcu; - return ret; + return rte_mov15_or_less_unaligned(dst, src, n); } /** @@ -379,8 +437,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { - uintptr_t dstu = 
(uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t bits; @@ -389,25 +445,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) { - *(uint64_t *)dstu = *(const uint64_t *)srcu; - } - return ret; + return rte_mov15_or_less_unaligned(dst, src, n); } /** @@ -672,8 +710,6 @@ static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t srcofs; @@ -682,25 +718,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) { - *(uint64_t *)dstu = *(const uint64_t *)srcu; - } - return ret; + return rte_mov15_or_less_unaligned(dst, src, n); } /** -- 2.25.1
Hi, It's been several days and I haven't received any additional feedback on this patch (and the other patch related to this one: http://mails.dpdk.org/archives/dev/2022-January/232491.html). I would welcome any kind of feedback. Ideally, I'm hoping an authoritative voice would indicate if there's any interest in updating rte_memcpy and if so, comment on the different possible solutions presented thus far in this thread. Thanks
> Calls to rte_memcpy_generic could result in unaligned loads/stores for > 1 < n < 16. This is undefined behavior according to the C standard, > and it gets flagged by the clang undefined behavior sanitizer. > > rte_memcpy_generic is called with unaligned src and dst addresses. > When 1 < n < 16, the code would cast both src and dst to a qword, > dword or word pointer, without verifying the alignment of src/dst. The > code was changed to use a packed structure to perform the unaligned > load/store operations. This results in unaligned load/store operations > to be C standards-compliant. Still not sure we need to fix that: This is x86 specific code-path, and as I remember on x86 there are no penalties for unaligned access to 2/4/8 byte values. Though I like introduction of rte_mov15_or_less() function -t helps with code dedup. > > Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time") > Cc: Xiaoyun Li <xiaoyun.li@intel.com> > Cc: stable@dpdk.org > > Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com> > --- > > Thanks to Stephen's pointer to look at the linux kernel, I was able to > find a way to perform the unaligned load/store using pure C code. The > new functions added to perform the load/store could likely be moved to a > different file and the code duplication could likely be eliminated by > using a macro. However, I will hold off on making these changes until I > get confirmation from maintainers that this technique is acceptable and > this is what we want to move forward with. 
> > lib/eal/x86/include/rte_memcpy.h | 142 +++++++++++++++++-------------- > 1 file changed, 80 insertions(+), 62 deletions(-) > > diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h > index 1b6c6e585f..4e876d39eb 100644 > --- a/lib/eal/x86/include/rte_memcpy.h > +++ b/lib/eal/x86/include/rte_memcpy.h > @@ -45,6 +45,83 @@ extern "C" { > static __rte_always_inline void * > rte_memcpy(void *dst, const void *src, size_t n); > > +static __rte_always_inline uint64_t > +rte_load_unaligned_uint64(const void *ptr) > +{ > + struct unaligned_uint64 { uint64_t val; } __rte_packed; > + return ((const struct unaligned_uint64 *)ptr)->val; > +} > + > +static __rte_always_inline uint32_t > +rte_load_unaligned_uint32(const void *ptr) > +{ > + struct unaligned_uint32 { uint32_t val; } __rte_packed; > + return ((const struct unaligned_uint32 *)ptr)->val; > +} > + > +static __rte_always_inline uint16_t > +rte_load_unaligned_uint16(const void *ptr) > +{ > + struct unaligned_uint16 { uint16_t val; } __rte_packed; > + return ((const struct unaligned_uint16 *)ptr)->val; > +} > + > +static __rte_always_inline void > +rte_store_unaligned_uint64(void *ptr, uint64_t val) > +{ > + struct unaligned_uint64 { uint64_t val; } __rte_packed; > + ((struct unaligned_uint64 *)ptr)->val = val; > +} > + > +static __rte_always_inline void > +rte_store_unaligned_uint32(void *ptr, uint32_t val) > +{ > + struct unaligned_uint32 { uint32_t val; } __rte_packed; > + ((struct unaligned_uint32 *)ptr)->val = val; > +} > + > +static __rte_always_inline void > +rte_store_unaligned_uint16(void *ptr, uint16_t val) > +{ > + struct unaligned_uint16 { uint16_t val; } __rte_packed; > + ((struct unaligned_uint16 *)ptr)->val = val; > +} > + > +/** > + * Copy bytes from one location to another, > + * locations should not overlap. > + * Use with unaligned src/dst, and n <= 15. 
> + */ > +static __rte_always_inline void * > +rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n) > +{ > + void *ret = dst; > + if (n & 8) { > + rte_store_unaligned_uint64( > + dst, > + rte_load_unaligned_uint64(src)); > + src = ((const uint64_t *)src + 1); > + dst = ((uint64_t *)dst + 1); > + } > + if (n & 4) { > + rte_store_unaligned_uint32( > + dst, > + rte_load_unaligned_uint32(src)); > + src = ((const uint32_t *)src + 1); > + dst = ((uint32_t *)dst + 1); > + } > + if (n & 2) { > + rte_store_unaligned_uint16( > + dst, > + rte_load_unaligned_uint16(src)); > + src = ((const uint16_t *)src + 1); > + dst = ((uint16_t *)dst + 1); > + } > + if (n & 1) > + *(uint8_t *)dst = *(const uint8_t *)src; > + return ret; > +} > + > #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 > > #define ALIGNMENT_MASK 0x3F > @@ -171,8 +248,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n) > static __rte_always_inline void * > rte_memcpy_generic(void *dst, const void *src, size_t n) > { > - uintptr_t dstu = (uintptr_t)dst; > - uintptr_t srcu = (uintptr_t)src; > void *ret = dst; > size_t dstofss; > size_t bits; > @@ -181,24 +256,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) > * Copy less than 16 bytes > */ > if (n < 16) { > - if (n & 0x01) { > - *(uint8_t *)dstu = *(const uint8_t *)srcu; > - srcu = (uintptr_t)((const uint8_t *)srcu + 1); > - dstu = (uintptr_t)((uint8_t *)dstu + 1); > - } > - if (n & 0x02) { > - *(uint16_t *)dstu = *(const uint16_t *)srcu; > - srcu = (uintptr_t)((const uint16_t *)srcu + 1); > - dstu = (uintptr_t)((uint16_t *)dstu + 1); > - } > - if (n & 0x04) { > - *(uint32_t *)dstu = *(const uint32_t *)srcu; > - srcu = (uintptr_t)((const uint32_t *)srcu + 1); > - dstu = (uintptr_t)((uint32_t *)dstu + 1); > - } > - if (n & 0x08) > - *(uint64_t *)dstu = *(const uint64_t *)srcu; > - return ret; > + return rte_mov15_or_less_unaligned(dst, src, n); > } > > /** > @@ -379,8 +437,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t 
*src, size_t n) > static __rte_always_inline void * > rte_memcpy_generic(void *dst, const void *src, size_t n) > { > - uintptr_t dstu = (uintptr_t)dst; > - uintptr_t srcu = (uintptr_t)src; > void *ret = dst; > size_t dstofss; > size_t bits; > @@ -389,25 +445,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) > * Copy less than 16 bytes > */ > if (n < 16) { > - if (n & 0x01) { > - *(uint8_t *)dstu = *(const uint8_t *)srcu; > - srcu = (uintptr_t)((const uint8_t *)srcu + 1); > - dstu = (uintptr_t)((uint8_t *)dstu + 1); > - } > - if (n & 0x02) { > - *(uint16_t *)dstu = *(const uint16_t *)srcu; > - srcu = (uintptr_t)((const uint16_t *)srcu + 1); > - dstu = (uintptr_t)((uint16_t *)dstu + 1); > - } > - if (n & 0x04) { > - *(uint32_t *)dstu = *(const uint32_t *)srcu; > - srcu = (uintptr_t)((const uint32_t *)srcu + 1); > - dstu = (uintptr_t)((uint32_t *)dstu + 1); > - } > - if (n & 0x08) { > - *(uint64_t *)dstu = *(const uint64_t *)srcu; > - } > - return ret; > + return rte_mov15_or_less_unaligned(dst, src, n); > } > > /** > @@ -672,8 +710,6 @@ static __rte_always_inline void * > rte_memcpy_generic(void *dst, const void *src, size_t n) > { > __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; > - uintptr_t dstu = (uintptr_t)dst; > - uintptr_t srcu = (uintptr_t)src; > void *ret = dst; > size_t dstofss; > size_t srcofs; > @@ -682,25 +718,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) > * Copy less than 16 bytes > */ > if (n < 16) { > - if (n & 0x01) { > - *(uint8_t *)dstu = *(const uint8_t *)srcu; > - srcu = (uintptr_t)((const uint8_t *)srcu + 1); > - dstu = (uintptr_t)((uint8_t *)dstu + 1); > - } > - if (n & 0x02) { > - *(uint16_t *)dstu = *(const uint16_t *)srcu; > - srcu = (uintptr_t)((const uint16_t *)srcu + 1); > - dstu = (uintptr_t)((uint16_t *)dstu + 1); > - } > - if (n & 0x04) { > - *(uint32_t *)dstu = *(const uint32_t *)srcu; > - srcu = (uintptr_t)((const uint32_t *)srcu + 1); > - dstu = (uintptr_t)((uint32_t *)dstu + 1); > - } > - 
if (n & 0x08) { > - *(uint64_t *)dstu = *(const uint64_t *)srcu; > - } > - return ret; > + return rte_mov15_or_less_unaligned(dst, src, n); > } > > /** > -- > 2.25.1
04/02/2022 18:16, Ananyev, Konstantin:
>
> > Calls to rte_memcpy_generic could result in unaligned loads/stores for
> > 1 < n < 16. This is undefined behavior according to the C standard,
> > and it gets flagged by the clang undefined behavior sanitizer.
> >
> > rte_memcpy_generic is called with unaligned src and dst addresses.
> > When 1 < n < 16, the code would cast both src and dst to a qword,
> > dword or word pointer, without verifying the alignment of src/dst. The
> > code was changed to use a packed structure to perform the unaligned
> > load/store operations. This results in unaligned load/store operations
> > to be C standards-compliant.
>
> Still not sure we need to fix that:
> This is x86 specific code-path, and as I remember on x86 there are no
> penalties for unaligned access to 2/4/8 byte values.
> Though I like the introduction of the rte_mov15_or_less() function - it helps
> with code dedup.
Any more opinions?
> > Calls to rte_memcpy_generic could result in unaligned loads/stores for
> > 1 < n < 16. This is undefined behavior according to the C standard,
> > and it gets flagged by the clang undefined behavior sanitizer.
> >
> > rte_memcpy_generic is called with unaligned src and dst addresses.
> > When 1 < n < 16, the code would cast both src and dst to a qword,
> > dword or word pointer, without verifying the alignment of src/dst. The
> > code was changed to use a packed structure to perform the unaligned
> > load/store operations. This results in unaligned load/store operations
> > to be C standards-compliant.
>
> Still not sure we need to fix that:
> This is x86 specific code-path, and as I remember on x86 there are no
> penalties for unaligned access to 2/4/8 byte values.
> Though I like the introduction of the rte_mov15_or_less() function - it helps
> with code dedup.
Thanks for your input Konstantin. Much appreciated. Just to make sure
I understand, can you please confirm that we do not want to fix the
fact that unaligned access in C is undefined behaviour? I understand
that X86 allows unaligned accesses but since this isn't assembly code,
we have to go by what the C standard allows.
Also, do you have any opinion on the strict aliasing violation (as was
pointed out by Georg)? I suggested adding __may_alias__ thinking it
would likely fix the issue, but I'd really like to get confirmation
from someone who has better knowledge of how the attribute works
exactly.
Thanks
Hi Luc, > > > Calls to rte_memcpy_generic could result in unaligned loads/stores for > > > 1 < n < 16. This is undefined behavior according to the C standard, > > > and it gets flagged by the clang undefined behavior sanitizer. > > > > > > rte_memcpy_generic is called with unaligned src and dst addresses. > > > When 1 < n < 16, the code would cast both src and dst to a qword, > > > dword or word pointer, without verifying the alignment of src/dst. The > > > code was changed to use a packed structure to perform the unaligned > > > load/store operations. This results in unaligned load/store operations > > > to be C standards-compliant. > > > > Still not sure we need to fix that: > > This is x86 specific code-path, and as I remember on x86 there are no > > penalties for unaligned access to 2/4/8 byte values. > > Though I like introduction of rte_mov15_or_less() function -t helps > > with code dedup. > > Thanks for your input Konstantin. Much appreciated. Just to make sure > I understand, can you please confirm that we do not want to fix the > fact that unaligned access in C is undefined behaviour? Yes, I don't think it is a real problem in that particular case. > I understand > that X86 allows unaligned accesses but since this isn't assembly code, > we have to go by what the C standard allows. > Also, do you have any opinion on the strict aliasing violation (as was > pointed out by Georg)? I suggested adding __may_alias__ thinking it > would likely fix the issue, but I'd really like to get confirmation > from someone who has better knowledge of how the attribute works > exactly. Not sure I understand the problem you are referring to. Are you saying that original rte_memcpy() code breaks strict aliasing? If so, could you point where exactly?
Hi Konstantin, > > Thanks for your input Konstantin. Much appreciated. Just to make sure > > I understand, can you please confirm that we do not want to fix the > > fact that unaligned access in C is undefined behaviour? > > Yes, I don't think it is a real problem in that particular case. Perfect. Thank you. > > Also, do you have any opinion on the strict aliasing violation (as was > > pointed out by Georg)? I suggested adding __may_alias__ thinking it > > would likely fix the issue, but I'd really like to get confirmation > > from someone who has better knowledge of how the attribute works > > exactly. > > Not sure I understand the problem you are referring to. > Are you saying that original rte_memcpy() code breaks strict aliasing? > If so, could you point where exactly? As far as I understand, yes, it does break strict aliasing. For example, in the following line: *(uint64_t *)dstu = *(const uint64_t *)srcu; IIUC, both casts break strict aliasing rules. While the src/dst parameters are void* and can therefore be cast to something else without breaking strict aliasing rules, the type of src/dst in the calling code might be something other than uint64_t*. This can result in src/dst pointers being cast to different unrelated types. AFAICT, the fact that rte_memcpy is "always inline" increases the risk of the compiler making an optimization that results in broken code. I was able to come up with an example where the latest version of GCC produces broken code when strict aliasing is enabled: https://godbolt.org/z/3Yzvjr97c With -fstrict-aliasing, it reorders a write and results in broken code. If you change the compiler flags to -fno-strict-aliasing, it produces the expected result.
> > Not sure I understand the problem you are referring to.
> > Are you saying that original rte_memcpy() code breaks strict aliasing?
> > If so, could you point where exactly?
>
> As far as I understand, yes, it does break strict aliasing. For
> example, in the following line:
>
> *(uint64_t *)dstu = *(const uint64_t *)srcu;
>
> IIUC, both casts break strict aliasing rules. While the src/dst
> parameters are void* and can therefore be cast to something else
> without breaking strict aliasing rules, the type of src/dst in the
> calling code might be something other than uint64_t*. This can result
> in src/dst pointers being cast to different unrelated types. AFAICT,
> the fact that rte_memcpy is "always inline" increases the risk of the
> compiler making an optimization that results in broken code.
>
> I was able to come up with an example where the latest version of GCC
> produces broken code when strict aliasing is enabled:
>
> https://godbolt.org/z/3Yzvjr97c
>
> With -fstrict-aliasing, it reorders a write and results in broken
> code. If you change the compiler flags to -fno-strict-aliasing, it
> produces the expected result.
Indeed it looks like a problem.
Thanks for pointing it out.
Was able to reproduce it with gcc 11 (clang 13 seems fine).
Actually, adding ' __attribute__ ((__may_alias__))' for both dst and src
didn't cure the problem.
To overcome it, I had to either:
add '-fno-strict-aliasing' CC flag (as you mentioned above),
or add:
if (__builtin_constant_p(n))
return memcpy(dst, src, n);
on top of rte_memcpy() code.
Though I suppose the problem might be much wider than just rte_memcpy().
We do have similar inline copying code in other places too.
As I understand, some such cases might also be affected.
Let's say: '_rte_ring_(enqueue|dequeue_elems_*'.
Not sure what would be the best approach in general for such cases:
- always compile DPDK code with '-fno-strict-aliasing'
But that wouldn't prevent people from using our inline functions without that flag.
Also wonder what performance impact it will have.
- Try to fix all such occurrences manually (but it would be hard to catch all of them upfront)
- Something else ...?
Wonder what do people think about it?
Konstantin
Hi Konstantin, > Indeed it looks like a problem. > Thanks for pointing it out. > Was able to reproduce it with gcc 11 (clang 13 seems fine). > Actually, adding ' __attribute__ ((__may_alias__))' for both dst and src > didn't quire the problem. __may_alias__ works if it's applied to a typedef, see the following for a modified version of my original example that works and uses __may_alias__: https://godbolt.org/z/W83zzoePq The documentation I found for __may_alias__ is quite sparse, so I'm a little wary of assuming it'll always work. I'm hoping someone with more experience with the attribute would be able to add more confidence to the assumption that it'll work in all cases.
Hi Luc, > > Indeed it looks like a problem. > > Thanks for pointing it out. > > Was able to reproduce it with gcc 11 (clang 13 seems fine). > > Actually, adding ' __attribute__ ((__may_alias__))' for both dst and src > > didn't quire the problem. > > __may_alias__ works if it's applied to a typedef, see the following > for a modified version of my original example that works and uses > __may_alias__: > > https://godbolt.org/z/W83zzoePq Interesting, I played with '__may_alias__' attribute a bit more, and yes, as you said, it seems to wok properly only with typedefs. If so, then we probably can add a new typedefs (in rte_common.h or so), and use it in rte_memcpy and probably few other places, where we think it will be necessary. > The documentation I found for __may_alias__ is quite sparse, so I'm a > little wary of assuming it'll always work. > I'm hoping someone with > more experience with the attribute would be able to add more > confidence to the assumption that it'll work in all cases. Agree, would like to see more input too. CC-ing to other guys, who seemed to be interested in that discussion before. Thanks Konstantin
Calls to rte_memcpy for 1 < n < 16 could result in unaligned loads/stores, which is undefined behaviour according to the C standard, and strict aliasing violations. The code was changed to use a packed structure that allows aliasing (using the __may_alias__ attribute) to perform the load/store operations. This results in code that has the same performance as the original code and that is also C standard-compliant. Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time") Cc: Xiaoyun Li <xiaoyun.li@intel.com> Cc: stable@dpdk.org Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com> --- v6: * Refocus to fix strict aliasing problems discovered following discussions in this thread. * Modified the code to use __may_alias__ and packed structure. This fixes both the undefined behaviour of unaligned access (which is not the main concern), and also fixes the strict aliasing violations (which can cause major bugs, as demonstrated in a previous message in this thread). * Renamed new function from rte_mov15_or_less_unaligned to rte_mov15_or_less. * Modified code that copies <= 15 bytes to call rte_mov15_or_less. v5: * Replaced assembly with pure C code that uses a packed structure to make unaligned loads conform to C standard. v4: * Added volatile to asm statements, which is required under gcc. v3: * Removed for loop and went back to using assembly. v2: * Replaced assembly with a regular for loop that copies bytes one by one. v1: * Fix undefined behaviour of unaligned stores/loads by using assembly to perform stores/loads. 
lib/eal/x86/include/rte_memcpy.h | 133 ++++++++++++------------------- 1 file changed, 51 insertions(+), 82 deletions(-) diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h index 1b6c6e585f..360b7e069e 100644 --- a/lib/eal/x86/include/rte_memcpy.h +++ b/lib/eal/x86/include/rte_memcpy.h @@ -45,6 +45,52 @@ extern "C" { static __rte_always_inline void * rte_memcpy(void *dst, const void *src, size_t n); +/** + * Copy bytes from one location to another, + * locations should not overlap. + * Use with n <= 15. + */ +static __rte_always_inline void * +rte_mov15_or_less(void *dst, const void *src, size_t n) +{ + /** + * Use the following structs to avoid violating C standard + * alignment requirements and to avoid strict aliasing bugs + */ + struct rte_uint64_alias { + uint64_t val; + } __attribute__((__may_alias__, __packed__)); + struct rte_uint32_alias { + uint32_t val; + } __attribute__((__may_alias__, __packed__)); + struct rte_uint16_alias { + uint16_t val; + } __attribute__((__may_alias__, __packed__)); + + void *ret = dst; + if (n & 8) { + ((struct rte_uint64_alias *)dst)->val = + ((const struct rte_uint64_alias *)src)->val; + src = (const uint64_t *)src + 1; + dst = (uint64_t *)dst + 1; + } + if (n & 4) { + ((struct rte_uint32_alias *)dst)->val = + ((const struct rte_uint32_alias *)src)->val; + src = (const uint32_t *)src + 1; + dst = (uint32_t *)dst + 1; + } + if (n & 2) { + ((struct rte_uint16_alias *)dst)->val = + ((const struct rte_uint16_alias *)src)->val; + src = (const uint16_t *)src + 1; + dst = (uint16_t *)dst + 1; + } + if (n & 1) + *(uint8_t *)dst = *(const uint8_t *)src; + return ret; +} + #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 #define ALIGNMENT_MASK 0x3F @@ -171,8 +217,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n) static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; 
size_t dstofss; size_t bits; @@ -181,24 +225,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) - *(uint64_t *)dstu = *(const uint64_t *)srcu; - return ret; + return rte_mov15_or_less(dst, src, n); } /** @@ -379,8 +406,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t bits; @@ -389,25 +414,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) { - *(uint64_t *)dstu = *(const uint64_t *)srcu; - } - return ret; + return rte_mov15_or_less(dst, src, n); } /** @@ -672,8 +679,6 @@ static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; - uintptr_t dstu = (uintptr_t)dst; - 
uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t srcofs; @@ -682,25 +687,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) { - *(uint64_t *)dstu = *(const uint64_t *)srcu; - } - return ret; + return rte_mov15_or_less(dst, src, n); } /** @@ -818,27 +805,9 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) { void *ret = dst; - /* Copy size <= 16 bytes */ + /* Copy size < 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dst = *(const uint8_t *)src; - src = (const uint8_t *)src + 1; - dst = (uint8_t *)dst + 1; - } - if (n & 0x02) { - *(uint16_t *)dst = *(const uint16_t *)src; - src = (const uint16_t *)src + 1; - dst = (uint16_t *)dst + 1; - } - if (n & 0x04) { - *(uint32_t *)dst = *(const uint32_t *)src; - src = (const uint32_t *)src + 1; - dst = (uint32_t *)dst + 1; - } - if (n & 0x08) - *(uint64_t *)dst = *(const uint64_t *)src; - - return ret; + return rte_mov15_or_less(dst, src, n); } /* Copy 16 <= size <= 32 bytes */ -- 2.35.1
Calls to rte_memcpy for 1 < n < 16 could result in unaligned loads/stores, which is undefined behaviour according to the C standard, and strict aliasing violations. The code was changed to use a packed structure that allows aliasing (using the __may_alias__ attribute) to perform the load/store operations. This results in code that has the same performance as the original code and that is also C standards-compliant. Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time") Cc: Xiaoyun Li <xiaoyun.li@intel.com> Cc: stable@dpdk.org Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com> --- v7: * Fix coding style issue by adding new __rte_may_alias macro rather than directly use __attribute__ v6: * Refocus to fix strict aliasing problems discovered following discussions in this thread. * Modified the code to use __may_alias__ and packed structure. This fixes both the undefined behaviour of unaligned access (which is not the main concern), and also fixes the strict aliasing violations (which can cause major bugs, as demonstrated in a previous message in this thread). * Renamed new function from rte_mov15_or_less_unaligned to rte_mov15_or_less. * Modified code that copies <= 15 bytes to call rte_mov15_or_less. v5: * Replaced assembly with pure C code that uses a packed structure to make unaligned loads conform to C standard. v4: * Added volatile to asm statements, which is required under gcc. v3: * Removed for loop and went back to using assembly. v2: * Replaced assembly with a regular for loop that copies bytes one by one. v1: * Fix undefined behaviour of unaligned stores/loads by using assembly to perform stores/loads. 
lib/eal/include/rte_common.h | 5 ++ lib/eal/x86/include/rte_memcpy.h | 133 ++++++++++++------------------- 2 files changed, 56 insertions(+), 82 deletions(-) diff --git a/lib/eal/include/rte_common.h b/lib/eal/include/rte_common.h index 4a399cc7c8..2f1ec69f3d 100644 --- a/lib/eal/include/rte_common.h +++ b/lib/eal/include/rte_common.h @@ -85,6 +85,11 @@ typedef uint16_t unaligned_uint16_t; */ #define __rte_packed __attribute__((__packed__)) +/** + * Macro to mark a type that is not subject to type-based aliasing rules + */ +#define __rte_may_alias __attribute__((__may_alias__)) + /******* Macro to mark functions and fields scheduled for removal *****/ #define __rte_deprecated __attribute__((__deprecated__)) #define __rte_deprecated_msg(msg) __attribute__((__deprecated__(msg))) diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h index 1b6c6e585f..18aa4e43a7 100644 --- a/lib/eal/x86/include/rte_memcpy.h +++ b/lib/eal/x86/include/rte_memcpy.h @@ -45,6 +45,52 @@ extern "C" { static __rte_always_inline void * rte_memcpy(void *dst, const void *src, size_t n); +/** + * Copy bytes from one location to another, + * locations should not overlap. + * Use with n <= 15. 
+ */ +static __rte_always_inline void * +rte_mov15_or_less(void *dst, const void *src, size_t n) +{ + /** + * Use the following structs to avoid violating C standard + * alignment requirements and to avoid strict aliasing bugs + */ + struct rte_uint64_alias { + uint64_t val; + } __rte_packed __rte_may_alias; + struct rte_uint32_alias { + uint32_t val; + } __rte_packed __rte_may_alias; + struct rte_uint16_alias { + uint16_t val; + } __rte_packed __rte_may_alias; + + void *ret = dst; + if (n & 8) { + ((struct rte_uint64_alias *)dst)->val = + ((const struct rte_uint64_alias *)src)->val; + src = (const uint64_t *)src + 1; + dst = (uint64_t *)dst + 1; + } + if (n & 4) { + ((struct rte_uint32_alias *)dst)->val = + ((const struct rte_uint32_alias *)src)->val; + src = (const uint32_t *)src + 1; + dst = (uint32_t *)dst + 1; + } + if (n & 2) { + ((struct rte_uint16_alias *)dst)->val = + ((const struct rte_uint16_alias *)src)->val; + src = (const uint16_t *)src + 1; + dst = (uint16_t *)dst + 1; + } + if (n & 1) + *(uint8_t *)dst = *(const uint8_t *)src; + return ret; +} + #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 #define ALIGNMENT_MASK 0x3F @@ -171,8 +217,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n) static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t bits; @@ -181,24 +225,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t 
*)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) - *(uint64_t *)dstu = *(const uint64_t *)srcu; - return ret; + return rte_mov15_or_less(dst, src, n); } /** @@ -379,8 +406,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t bits; @@ -389,25 +414,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) { - *(uint64_t *)dstu = *(const uint64_t *)srcu; - } - return ret; + return rte_mov15_or_less(dst, src, n); } /** @@ -672,8 +679,6 @@ static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t srcofs; @@ -682,25 +687,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const 
uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) { - *(uint64_t *)dstu = *(const uint64_t *)srcu; - } - return ret; + return rte_mov15_or_less(dst, src, n); } /** @@ -818,27 +805,9 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) { void *ret = dst; - /* Copy size <= 16 bytes */ + /* Copy size < 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dst = *(const uint8_t *)src; - src = (const uint8_t *)src + 1; - dst = (uint8_t *)dst + 1; - } - if (n & 0x02) { - *(uint16_t *)dst = *(const uint16_t *)src; - src = (const uint16_t *)src + 1; - dst = (uint16_t *)dst + 1; - } - if (n & 0x04) { - *(uint32_t *)dst = *(const uint32_t *)src; - src = (const uint32_t *)src + 1; - dst = (uint32_t *)dst + 1; - } - if (n & 0x08) - *(uint64_t *)dst = *(const uint64_t *)src; - - return ret; + return rte_mov15_or_less(dst, src, n); } /* Copy 16 <= size <= 32 bytes */ -- 2.35.1
> Calls to rte_memcpy for 1 < n < 16 could result in unaligned
> loads/stores, which is undefined behaviour according to the C
> standard, and strict aliasing violations.
>
> The code was changed to use a packed structure that allows aliasing
> (using the __may_alias__ attribute) to perform the load/store
> operations. This results in code that has the same performance as the
> original code and that is also C standards-compliant.
>
> Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
> Cc: Xiaoyun Li <xiaoyun.li@intel.com>
> Cc: stable@dpdk.org
>
> Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com>
> ---
> v7:
> * Fix coding style issue by adding new __rte_may_alias macro rather
> than directly use __attribute__
>
> v6:
> * Refocus to fix strict aliasing problems discovered following
> discussions in this thread.
> * Modified the code to use __may_alias__ and packed structure. This fixes
> both the undefined behaviour of unaligned access (which is not the main
> concern), and also fixes the strict aliasing violations (which can cause
> major bugs, as demonstrated in a previous message in this thread).
> * Renamed new function from rte_mov15_or_less_unaligned to
> rte_mov15_or_less.
> * Modified code that copies <= 15 bytes to call rte_mov15_or_less.
>
> v5:
> * Replaced assembly with pure C code that uses a packed structure to make
> unaligned loads conform to C standard.
>
> v4:
> * Added volatile to asm statements, which is required under gcc.
>
> v3:
> * Removed for loop and went back to using assembly.
>
> v2:
> * Replaced assembly with a regular for loop that copies bytes one by
> one.
>
> v1:
> * Fix undefined behaviour of unaligned stores/loads by using assembly
> to perform stores/loads.
LGTM, thanks for highlighting and fixing the problem.
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Tested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
As a side note, we probably need to check other similar places in DPDK code.
On Thu, Mar 10, 2022 at 3:55 PM Ananyev, Konstantin <konstantin.ananyev@intel.com> wrote: > > Calls to rte_memcpy for 1 < n < 16 could result in unaligned > > loads/stores, which is undefined behaviour according to the C > > standard, and strict aliasing violations. > > > > The code was changed to use a packed structure that allows aliasing > > (using the __may_alias__ attribute) to perform the load/store > > operations. This results in code that has the same performance as the > > original code and that is also C standards-compliant. > > > > Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time") > > Cc: stable@dpdk.org > > > > Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com> > Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com> > Tested-by: Konstantin Ananyev <konstantin.ananyev@intel.com> Thanks, applied. > As a side note, we probably need to check other similar places in DPDK code. What would be the best way to detect those problematic places? I tried UBsan, and it did report some of the issues fixed with this patch. -- David Marchand
On Thu, Apr 7, 2022 at 5:24 PM David Marchand <david.marchand@redhat.com> wrote:
>
> On Thu, Mar 10, 2022 at 3:55 PM Ananyev, Konstantin
> <konstantin.ananyev@intel.com> wrote:
> > > Calls to rte_memcpy for 1 < n < 16 could result in unaligned
> > > loads/stores, which is undefined behaviour according to the C
> > > standard, and strict aliasing violations.
> > >
> > > The code was changed to use a packed structure that allows aliasing
> > > (using the __may_alias__ attribute) to perform the load/store
> > > operations. This results in code that has the same performance as the
> > > original code and that is also C standards-compliant.
> > >
> > > Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
Actually, looking again at the history, it fixes:
Fixes: f5472703c0bd ("eal: optimize aligned memcpy on x86")
I'll change before pushing.
--
David Marchand
On Thu, Apr 7, 2022 at 5:32 PM David Marchand <david.marchand@redhat.com> wrote:
>
> On Thu, Apr 7, 2022 at 5:24 PM David Marchand <david.marchand@redhat.com> wrote:
> >
> > On Thu, Mar 10, 2022 at 3:55 PM Ananyev, Konstantin
> > <konstantin.ananyev@intel.com> wrote:
> > > > Calls to rte_memcpy for 1 < n < 16 could result in unaligned
> > > > loads/stores, which is undefined behaviour according to the C
> > > > standard, and strict aliasing violations.
> > > >
> > > > The code was changed to use a packed structure that allows aliasing
> > > > (using the __may_alias__ attribute) to perform the load/store
> > > > operations. This results in code that has the same performance as the
> > > > original code and that is also C standards-compliant.
> > > >
> > > > Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
>
> Actually, looking again at the history, it fixes:
> Fixes: f5472703c0bd ("eal: optimize aligned memcpy on x86")
Nop, that's probably even older, could you double check?
I'll hold on pushing this fix.
--
David Marchand
Hi David, Le jeu. 7 avr. 2022 à 11:24, David Marchand <david.marchand@redhat.com> a écrit : > > As a side note, we probably need to check other similar places in DPDK code. > > What would be the best way to detect those problematic places? As far as I'm aware, there is no silver bullet to detect all strict aliasing violations. A good summary of the different options are described here: https://gist.github.com/shafik/848ae25ee209f698763cffee272a58f8#catching-strict-aliasing-violations However, this is not 100% and might still miss some strict aliasing violations. The bottom line is that anywhere there's a cast to something other than void* or char*, it could be a strict aliasing violation. So, doing an exhaustive search throughout the code base for casts seems like the only (tedious, time-consuming) solution.
Hi David,
> > Actually, looking again at the history, it fixes:
> > Fixes: f5472703c0bd ("eal: optimize aligned memcpy on x86")
>
> Nop, that's probably even older, could you double check?
> I'll hold on pushing this fix.
It seems you still haven't received a response. I'll take a stab at this.
I think it fixes:
Fixes: 76746eb13f08 ("eal/x86: fix strict aliasing rules")
However, the ordering of the ifs (from 1 to 8, rather than 8 to 1)
seems to date back to the first commit (af75078fece3). So, I'm not
sure how you want to handle this.
Thanks.
Hello Luc,
On Fri, May 13, 2022 at 9:16 PM Luc Pelletier <lucp.at.work@gmail.com> wrote:
> > > Actually, looking again at the history, it fixes:
> > > Fixes: f5472703c0bd ("eal: optimize aligned memcpy on x86")
> >
> > Nop, that's probably even older, could you double check?
> > I'll hold on pushing this fix.
>
> It seems you still haven't received a response. I'll take a stab at this.
>
> I think it fixes:
>
> Fixes: 76746eb13f08 ("eal/x86: fix strict aliasing rules")
>
> However, the ordering of the ifs (from 1 to 8, rather than 8 to 1)
> seems to date back to the first commit (af75078fece3). So, I'm not
> sure how you want to handle this.
My understanding was that the issue was there since "day 1".
I'll go with the latter then.
Backporting will be for all LTS branches in any case.
Thanks!
--
David Marchand
On Thu, Apr 7, 2022 at 5:24 PM David Marchand <david.marchand@redhat.com> wrote: > On Thu, Mar 10, 2022 at 3:55 PM Ananyev, Konstantin > <konstantin.ananyev@intel.com> wrote: > > > Calls to rte_memcpy for 1 < n < 16 could result in unaligned > > > loads/stores, which is undefined behaviour according to the C > > > standard, and strict aliasing violations. > > > > > > The code was changed to use a packed structure that allows aliasing > > > (using the __may_alias__ attribute) to perform the load/store > > > operations. This results in code that has the same performance as the > > > original code and that is also C standards-compliant. > > > Fixes: af75078fece3 ("first public release") > > > Cc: stable@dpdk.org > > > > > > Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com> > > Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com> > > Tested-by: Konstantin Ananyev <konstantin.ananyev@intel.com> > Applied, thanks Luc. -- David Marchand