DPDK patches and discussions
* [PATCH] eal: fix unaligned loads/stores in rte_memcpy_generic
@ 2022-01-15 19:41 Luc Pelletier
  2022-01-15 21:39 ` [PATCH v2] " Luc Pelletier
                   ` (5 more replies)
  0 siblings, 6 replies; 33+ messages in thread
From: Luc Pelletier @ 2022-01-15 19:41 UTC (permalink / raw)
  To: bruce.richardson, konstantin.ananyev
  Cc: dev, Luc Pelletier, Xiaoyun Li, stable

Calls to rte_memcpy_generic could result in unaligned loads/stores for
1 < n < 16. This is undefined behavior according to the C standard,
and it gets flagged by the clang undefined behavior sanitizer.

rte_memcpy_generic is called with unaligned src and dst addresses.
When 1 < n < 16, the code would cast both src and dst to a qword,
dword or word pointer, without verifying the alignment of src/dst. The
code was changed to use inline assembly for the load/store operations.
Unaligned load/store operations are permitted in x64 assembly.

Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
Cc: Xiaoyun Li <xiaoyun.li@intel.com>
Cc: stable@dpdk.org

Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com>
---

I have just submitted another patch related to rte_memcpy. The changes
in the other patch are straightforward. The changes in this patch are
likely to require more discussion, so I've decided to keep both patches
completely separate. Please let me know if I should have done otherwise.
Based on the benchmarks of rte_memcpy that are in the DPDK tree, the
changes in this patch do not negatively affect performance.
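
For reference, the benchmark referred to here is the rte_memcpy perf test
in app/test (memcpy_perf_autotest). Assuming a default meson build, it can
be run roughly like this (the binary path may differ on your setup):

  $ ./build/app/test/dpdk-test
  RTE>>memcpy_perf_autotest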

 lib/eal/x86/include/rte_memcpy.h | 106 +++++++++++++------------------
 1 file changed, 44 insertions(+), 62 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 1b6c6e585f..d2a6099200 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -45,6 +45,47 @@ extern "C" {
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with unaligned src/dst, and n <= 15.
+ */
+static __rte_always_inline void *
+rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n)
+{
+	void *ret = dst;
+	if (n & 0x08) {
+		asm ("movq (%[src]), %%rax\n"
+		"movq %%rax, (%[dst])"
+		:
+		: [dst] "r" (dst), [src] "r" (src)
+		: "rax", "memory");
+		src = (const uint64_t *)src + 1;
+		dst = (uint64_t *)dst + 1;
+	}
+	if (n & 0x04) {
+		asm ("movl (%[src]), %%eax\n"
+		"movl %%eax, (%[dst])"
+		:
+		: [dst] "r" (dst), [src] "r" (src)
+		: "eax", "memory");
+		src = (const uint32_t *)src + 1;
+		dst = (uint32_t *)dst + 1;
+	}
+	if (n & 0x02) {
+		asm ("movw (%[src]), %%ax\n"
+		"movw %%ax, (%[dst])"
+		:
+		: [dst] "r" (dst), [src] "r" (src)
+		: "ax", "memory");
+		src = (const uint16_t *)src + 1;
+		dst = (uint16_t *)dst + 1;
+	}
+	if (n & 0x01)
+		*(uint8_t *)dst = *(const uint8_t *)src;
+	return ret;
+}
+
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 
 #define ALIGNMENT_MASK 0x3F
@@ -171,8 +212,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -181,24 +220,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08)
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
@@ -379,8 +401,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -389,25 +409,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
@@ -672,8 +674,6 @@ static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
@@ -682,25 +682,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
-- 
2.25.1


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH v2] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-01-15 19:41 [PATCH] eal: fix unaligned loads/stores in rte_memcpy_generic Luc Pelletier
@ 2022-01-15 21:39 ` Luc Pelletier
  2022-01-15 22:13   ` Stephen Hemminger
  2022-01-16 14:13 ` [PATCH v3] " Luc Pelletier
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 33+ messages in thread
From: Luc Pelletier @ 2022-01-15 21:39 UTC (permalink / raw)
  To: bruce.richardson, konstantin.ananyev
  Cc: dev, Luc Pelletier, Xiaoyun Li, stable

Calls to rte_memcpy_generic could result in unaligned loads/stores for
1 < n < 16. This is undefined behavior according to the C standard,
and it gets flagged by the clang undefined behavior sanitizer.

rte_memcpy_generic is called with unaligned src and dst addresses.
When 1 < n < 16, the code would cast both src and dst to a qword,
dword or word pointer, without verifying the alignment of src/dst. The
code was changed to use a for loop to copy the bytes one by one.
Experimentation on compiler explorer indicates that gcc 7+
(released in 2017) and clang 7+ (released in 2018) both optimize the
for loop into the minimal number of memory loads and stores when n is
known at compile-time. If n is only known at run-time, gcc and clang
generate different code, but both seem to recognize that a memcpy is
being done. More recent versions of both compilers seem to produce
even more optimized results.

Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
Cc: Xiaoyun Li <xiaoyun.li@intel.com>
Cc: stable@dpdk.org

Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com>
---

I forgot that code under x86 also needs to compile for 32-bit
(obviously). So, I did some more experimentation and replaced the
assembly code with a regular for loop. Explanations are in the updated
commit message. Experimentation was done on compiler explorer here:
https://godbolt.org/z/zK54rzPEn
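
For illustration, this is roughly the kind of probe used on compiler
explorer (the wrapper function name below is made up; only
rte_mov15_or_less_unaligned comes from the patch):

/* with n known at compile time, gcc 7+ / clang 7+ drop the byte loop
 * and emit a small fixed number of wider loads/stores instead */
void *copy7(void *dst, const void *src)
{
	return rte_mov15_or_less_unaligned(dst, src, 7);
}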

 lib/eal/x86/include/rte_memcpy.h | 82 ++++++++------------------------
 1 file changed, 20 insertions(+), 62 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 1b6c6e585f..e422397e49 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -45,6 +45,23 @@ extern "C" {
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with unaligned src/dst, and n <= 15.
+ */
+static __rte_always_inline void *
+rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n)
+{
+	void *ret = dst;
+	for (; n; n--) {
+		*((char *)dst) = *((const char *) src);
+		dst = ((char *)dst) + 1;
+		src = ((const char *)src) + 1;
+	}
+	return ret;
+}
+
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 
 #define ALIGNMENT_MASK 0x3F
@@ -171,8 +188,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -181,24 +196,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08)
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
@@ -379,8 +377,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -389,25 +385,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
@@ -672,8 +650,6 @@ static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
@@ -682,25 +658,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
-- 
2.25.1


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-01-15 21:39 ` [PATCH v2] " Luc Pelletier
@ 2022-01-15 22:13   ` Stephen Hemminger
  2022-01-16 14:09     ` Luc Pelletier
  0 siblings, 1 reply; 33+ messages in thread
From: Stephen Hemminger @ 2022-01-15 22:13 UTC (permalink / raw)
  To: Luc Pelletier
  Cc: bruce.richardson, konstantin.ananyev, dev, Xiaoyun Li, stable

On Sat, 15 Jan 2022 16:39:50 -0500
Luc Pelletier <lucp.at.work@gmail.com> wrote:

> diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
> index 1b6c6e585f..e422397e49 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h
> @@ -45,6 +45,23 @@ extern "C" {
>  static __rte_always_inline void *
>  rte_memcpy(void *dst, const void *src, size_t n);
>  
> +/**
> + * Copy bytes from one location to another,
> + * locations should not overlap.
> + * Use with unaligned src/dst, and n <= 15.
> + */
> +static __rte_always_inline void *
> +rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n)
> +{
> +	void *ret = dst;
> +	for (; n; n--) {
> +		*((char *)dst) = *((const char *) src);
> +		dst = ((char *)dst) + 1;
> +		src = ((const char *)src) + 1;
> +	}
> +	return ret;
> +}

X86 always allows unaligned access. Irregardless of what tools say.
Why impose additional overhead in performance critical code.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-01-15 22:13   ` Stephen Hemminger
@ 2022-01-16 14:09     ` Luc Pelletier
  2022-01-16 16:32       ` Stephen Hemminger
  0 siblings, 1 reply; 33+ messages in thread
From: Luc Pelletier @ 2022-01-16 14:09 UTC (permalink / raw)
  To: stephen
  Cc: bruce.richardson, konstantin.ananyev, dev, Xiaoyun Li, dpdk stable

> X86 always allows unaligned access. Irregardless of what tools say.
> Why impose additional overhead in performance critical code.

Let me preface my response by saying that I'm not a C compiler developer.
Hopefully someone who is will read this and chime in.

I agree that X86 allows unaligned store/load. However, the C standard doesn't,
and says that it's undefined behavior. This means that the code relies on
undefined behavior. It may do the right thing all the time, almost all the time,
some of the time... it's undefined. It may work now but it may stop
working in the future.
Here's a good discussion on SO about unaligned accesses in C on x86:

https://stackoverflow.com/questions/46790550/c-undefined-behavior-strict-aliasing-rule-or-incorrect-alignment/46790815#46790815

There's no way (that I know of) to do the unaligned store/load in C
without invoking undefined behavior. I can see 2 options: either write
the code in assembly, or use some other C construct that doesn't rely
on undefined behavior.

While the for loop may seem slower than the other options, it
surprisingly results in fewer load/store operations in certain
scenarios. For example, if n == 15 and it's known at compile-time, the
compiler will generate 2 overlapping qword load/store operations
(rather than the 4 separate operations generated by the current code).

All that being said, I can go back to something similar to my first
patch, using inline assembly and making sure this time that it works
for 32-bit too. I will post a patch in a few minutes that does exactly
that. Maintainers can then chime in with their preferred option.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH v3] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-01-15 19:41 [PATCH] eal: fix unaligned loads/stores in rte_memcpy_generic Luc Pelletier
  2022-01-15 21:39 ` [PATCH v2] " Luc Pelletier
@ 2022-01-16 14:13 ` Luc Pelletier
  2022-01-16 14:33   ` Luc Pelletier
  2022-01-16 20:33 ` [PATCH v4] " Luc Pelletier
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 33+ messages in thread
From: Luc Pelletier @ 2022-01-16 14:13 UTC (permalink / raw)
  To: bruce.richardson, konstantin.ananyev
  Cc: dev, Luc Pelletier, Xiaoyun Li, stable

Calls to rte_memcpy_generic could result in unaligned loads/stores for
1 < n < 16. This is undefined behavior according to the C standard,
and it gets flagged by the clang undefined behavior sanitizer.

rte_memcpy_generic is called with unaligned src and dst addresses.
When 1 < n < 16, the code would cast both src and dst to a qword,
dword or word pointer, without verifying the alignment of src/dst. The
code was changed to use inline assembly for the load/store operations.
Unaligned load/store operations are permitted in x86/x64 assembly.

Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
Cc: Xiaoyun Li <xiaoyun.li@intel.com>
Cc: stable@dpdk.org

Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com>
---

Please note that I didn't write the entire function in inline assembly.
The reason why I kept the bitwise ands as C code is so the optimizer can
remove the branches when n is known at compile-time.
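
As a quick illustration of that point (the wrapper name is made up):

/* with a compile-time constant n the dead branches are removed; here
 * only the (n & 8) asm block survives */
static __rte_always_inline void *
copy8(void *dst, const void *src)
{
	return rte_mov15_or_less_unaligned(dst, src, 8);
}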

 lib/eal/x86/include/rte_memcpy.h | 134 +++++++++++++++++--------------
 1 file changed, 72 insertions(+), 62 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 1b6c6e585f..b99c1b2ca5 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -45,6 +45,75 @@ extern "C" {
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
+#if defined(__i386__)
+	#define RTE_ACCUMULATOR_REGISTER_NAME "eax"
+#elif defined(__x86_64__)
+	#define RTE_ACCUMULATOR_REGISTER_NAME "rax"
+#endif
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with unaligned src/dst, and n <= 15.
+ */
+static __rte_always_inline void *
+rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n)
+{
+	void *ret = dst;
+	if (n & 8) {
+		asm (
+#if defined(__i386__)
+		"movl (%[src]), %%eax\n"
+		"movl %%eax, (%[dst])\n"
+		"add $4, %[src]\n"
+		"add $4, %[dst]\n"
+		"movl (%[src]), %%eax\n"
+		"movl %%eax, (%[dst])\n"
+		"add $4, %[src]\n"
+		"add $4, %[dst]\n"
+#elif defined(__x86_64__)
+		"movq (%[src]), %%rax\n"
+		"movq %%rax, (%[dst])\n"
+		"add $8, %[src]\n"
+		"add $8, %[dst]\n"
+#else
+		#error Unsupported architecture
+#endif
+		: [dst] "+r" (dst), [src] "+r" (src)
+		:
+		: RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+	}
+	if (n & 4) {
+		asm (
+		"movl (%[src]), %%eax\n"
+		"movl %%eax, (%[dst])\n"
+		"add $4, %[src]\n"
+		"add $4, %[dst]\n"
+		: [dst] "+r" (dst), [src] "+r" (src)
+		:
+		: RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+	}
+	if (n & 2) {
+		asm (
+		"movw (%[src]), %%ax\n"
+		"movw %%ax, (%[dst])\n"
+		"add $2, %[src]\n"
+		"add $2, %[dst]\n"
+		: [dst] "+r" (dst), [src] "+r" (src)
+		:
+		: RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+	}
+	if (n & 1) {
+		asm (
+		"movb (%[src]), %%al\n"
+		"movb %%al, (%[dst])\n"
+		: [dst] "+r" (dst), [src] "+r" (src)
+		:
+		: RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+	}
+	return ret;
+}
+
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 
 #define ALIGNMENT_MASK 0x3F
@@ -171,8 +240,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -181,24 +248,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08)
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
@@ -379,8 +429,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -389,25 +437,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
@@ -672,8 +702,6 @@ static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
@@ -682,25 +710,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
-- 
2.25.1


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v3] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-01-16 14:13 ` [PATCH v3] " Luc Pelletier
@ 2022-01-16 14:33   ` Luc Pelletier
  2022-01-16 16:34     ` Stephen Hemminger
  0 siblings, 1 reply; 33+ messages in thread
From: Luc Pelletier @ 2022-01-16 14:33 UTC (permalink / raw)
  To: bruce.richardson, konstantin.ananyev; +Cc: dev, Xiaoyun Li, dpdk stable

As a side note, and to follow up on Stephen's indication that this is
'performance critical code', I think it might be worthwhile to
revisit/revalidate the current implementation of rte_memcpy. There's a
good thread here that mentions rte_memcpy, and its performance on at
least one platform/architecture combination is far from being the
best:

https://github.com/microsoft/mimalloc/issues/201

It seems like enhanced rep movsb could be faster on more recent CPUs,
but that's not used in the current implementation of rte_memcpy.
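
For illustration only, here is a minimal sketch of what an ERMS-style
copy looks like (not something proposed in this patch, and the function
name is made up; size_t needs <stddef.h>):

static inline void copy_ermsb(void *dst, const void *src, size_t n)
{
	/* 'rep movsb' copies (r)cx bytes from (r)si to (r)di; on CPUs
	 * with ERMS it is competitive with vector copies for larger sizes */
	asm volatile("rep movsb"
		     : "+D" (dst), "+S" (src), "+c" (n)
		     :
		     : "memory");
}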

I understand some of this may not be directly related to this patch,
but whoever looks at this patch might want to provide their thoughts
on whether updating rte_memcpy would be worthwhile? I suspect looking
at all current public implementations of memcpy (libc, microsoft,
compilers builtin implementations, etc.) might help in making
improvements.


On Sun, 16 Jan 2022 at 09:15, Luc Pelletier <lucp.at.work@gmail.com> wrote:
>
> Calls to rte_memcpy_generic could result in unaligned loads/stores for
> 1 < n < 16. This is undefined behavior according to the C standard,
> and it gets flagged by the clang undefined behavior sanitizer.
>
> rte_memcpy_generic is called with unaligned src and dst addresses.
> When 1 < n < 16, the code would cast both src and dst to a qword,
> dword or word pointer, without verifying the alignment of src/dst. The
> code was changed to use inline assembly for the load/store operations.
> Unaligned load/store operations are permitted in x86/x64 assembly.
>
> Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
> Cc: Xiaoyun Li <xiaoyun.li@intel.com>
> Cc: stable@dpdk.org
>
> Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com>
> ---
>
> Please note that I didn't write the entire function in inline assembly.
> The reason why I kept the bitwise ands as C code is so the optimizer can
> remove the branches when n is known at compile-time.
>
>  lib/eal/x86/include/rte_memcpy.h | 134 +++++++++++++++++--------------
>  1 file changed, 72 insertions(+), 62 deletions(-)
>
> diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
> index 1b6c6e585f..b99c1b2ca5 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h
> @@ -45,6 +45,75 @@ extern "C" {
>  static __rte_always_inline void *
>  rte_memcpy(void *dst, const void *src, size_t n);
>
> +#if defined(__i386__)
> +       #define RTE_ACCUMULATOR_REGISTER_NAME "eax"
> +#elif defined(__x86_64__)
> +       #define RTE_ACCUMULATOR_REGISTER_NAME "rax"
> +#endif
> +
> +/**
> + * Copy bytes from one location to another,
> + * locations should not overlap.
> + * Use with unaligned src/dst, and n <= 15.
> + */
> +static __rte_always_inline void *
> +rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n)
> +{
> +       void *ret = dst;
> +       if (n & 8) {
> +               asm (
> +#if defined(__i386__)
> +               "movl (%[src]), %%eax\n"
> +               "movl %%eax, (%[dst])\n"
> +               "add $4, %[src]\n"
> +               "add $4, %[dst]\n"
> +               "movl (%[src]), %%eax\n"
> +               "movl %%eax, (%[dst])\n"
> +               "add $4, %[src]\n"
> +               "add $4, %[dst]\n"
> +#elif defined(__x86_64__)
> +               "movq (%[src]), %%rax\n"
> +               "movq %%rax, (%[dst])\n"
> +               "add $8, %[src]\n"
> +               "add $8, %[dst]\n"
> +#else
> +               #error Unsupported architecture
> +#endif
> +               : [dst] "+r" (dst), [src] "+r" (src)
> +               :
> +               : RTE_ACCUMULATOR_REGISTER_NAME, "memory");
> +       }
> +       if (n & 4) {
> +               asm (
> +               "movl (%[src]), %%eax\n"
> +               "movl %%eax, (%[dst])\n"
> +               "add $4, %[src]\n"
> +               "add $4, %[dst]\n"
> +               : [dst] "+r" (dst), [src] "+r" (src)
> +               :
> +               : RTE_ACCUMULATOR_REGISTER_NAME, "memory");
> +       }
> +       if (n & 2) {
> +               asm (
> +               "movw (%[src]), %%ax\n"
> +               "movw %%ax, (%[dst])\n"
> +               "add $2, %[src]\n"
> +               "add $2, %[dst]\n"
> +               : [dst] "+r" (dst), [src] "+r" (src)
> +               :
> +               : RTE_ACCUMULATOR_REGISTER_NAME, "memory");
> +       }
> +       if (n & 1) {
> +               asm (
> +               "movb (%[src]), %%al\n"
> +               "movb %%al, (%[dst])\n"
> +               : [dst] "+r" (dst), [src] "+r" (src)
> +               :
> +               : RTE_ACCUMULATOR_REGISTER_NAME, "memory");
> +       }
> +       return ret;
> +}
> +
>  #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
>
>  #define ALIGNMENT_MASK 0x3F
> @@ -171,8 +240,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
>  static __rte_always_inline void *
>  rte_memcpy_generic(void *dst, const void *src, size_t n)
>  {
> -       uintptr_t dstu = (uintptr_t)dst;
> -       uintptr_t srcu = (uintptr_t)src;
>         void *ret = dst;
>         size_t dstofss;
>         size_t bits;
> @@ -181,24 +248,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>          * Copy less than 16 bytes
>          */
>         if (n < 16) {
> -               if (n & 0x01) {
> -                       *(uint8_t *)dstu = *(const uint8_t *)srcu;
> -                       srcu = (uintptr_t)((const uint8_t *)srcu + 1);
> -                       dstu = (uintptr_t)((uint8_t *)dstu + 1);
> -               }
> -               if (n & 0x02) {
> -                       *(uint16_t *)dstu = *(const uint16_t *)srcu;
> -                       srcu = (uintptr_t)((const uint16_t *)srcu + 1);
> -                       dstu = (uintptr_t)((uint16_t *)dstu + 1);
> -               }
> -               if (n & 0x04) {
> -                       *(uint32_t *)dstu = *(const uint32_t *)srcu;
> -                       srcu = (uintptr_t)((const uint32_t *)srcu + 1);
> -                       dstu = (uintptr_t)((uint32_t *)dstu + 1);
> -               }
> -               if (n & 0x08)
> -                       *(uint64_t *)dstu = *(const uint64_t *)srcu;
> -               return ret;
> +               return rte_mov15_or_less_unaligned(dst, src, n);
>         }
>
>         /**
> @@ -379,8 +429,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
>  static __rte_always_inline void *
>  rte_memcpy_generic(void *dst, const void *src, size_t n)
>  {
> -       uintptr_t dstu = (uintptr_t)dst;
> -       uintptr_t srcu = (uintptr_t)src;
>         void *ret = dst;
>         size_t dstofss;
>         size_t bits;
> @@ -389,25 +437,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>          * Copy less than 16 bytes
>          */
>         if (n < 16) {
> -               if (n & 0x01) {
> -                       *(uint8_t *)dstu = *(const uint8_t *)srcu;
> -                       srcu = (uintptr_t)((const uint8_t *)srcu + 1);
> -                       dstu = (uintptr_t)((uint8_t *)dstu + 1);
> -               }
> -               if (n & 0x02) {
> -                       *(uint16_t *)dstu = *(const uint16_t *)srcu;
> -                       srcu = (uintptr_t)((const uint16_t *)srcu + 1);
> -                       dstu = (uintptr_t)((uint16_t *)dstu + 1);
> -               }
> -               if (n & 0x04) {
> -                       *(uint32_t *)dstu = *(const uint32_t *)srcu;
> -                       srcu = (uintptr_t)((const uint32_t *)srcu + 1);
> -                       dstu = (uintptr_t)((uint32_t *)dstu + 1);
> -               }
> -               if (n & 0x08) {
> -                       *(uint64_t *)dstu = *(const uint64_t *)srcu;
> -               }
> -               return ret;
> +               return rte_mov15_or_less_unaligned(dst, src, n);
>         }
>
>         /**
> @@ -672,8 +702,6 @@ static __rte_always_inline void *
>  rte_memcpy_generic(void *dst, const void *src, size_t n)
>  {
>         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
> -       uintptr_t dstu = (uintptr_t)dst;
> -       uintptr_t srcu = (uintptr_t)src;
>         void *ret = dst;
>         size_t dstofss;
>         size_t srcofs;
> @@ -682,25 +710,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>          * Copy less than 16 bytes
>          */
>         if (n < 16) {
> -               if (n & 0x01) {
> -                       *(uint8_t *)dstu = *(const uint8_t *)srcu;
> -                       srcu = (uintptr_t)((const uint8_t *)srcu + 1);
> -                       dstu = (uintptr_t)((uint8_t *)dstu + 1);
> -               }
> -               if (n & 0x02) {
> -                       *(uint16_t *)dstu = *(const uint16_t *)srcu;
> -                       srcu = (uintptr_t)((const uint16_t *)srcu + 1);
> -                       dstu = (uintptr_t)((uint16_t *)dstu + 1);
> -               }
> -               if (n & 0x04) {
> -                       *(uint32_t *)dstu = *(const uint32_t *)srcu;
> -                       srcu = (uintptr_t)((const uint32_t *)srcu + 1);
> -                       dstu = (uintptr_t)((uint32_t *)dstu + 1);
> -               }
> -               if (n & 0x08) {
> -                       *(uint64_t *)dstu = *(const uint64_t *)srcu;
> -               }
> -               return ret;
> +               return rte_mov15_or_less_unaligned(dst, src, n);
>         }
>
>         /**
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-01-16 14:09     ` Luc Pelletier
@ 2022-01-16 16:32       ` Stephen Hemminger
  2022-01-24 23:21         ` Georg Sauthoff
  0 siblings, 1 reply; 33+ messages in thread
From: Stephen Hemminger @ 2022-01-16 16:32 UTC (permalink / raw)
  To: Luc Pelletier
  Cc: bruce.richardson, konstantin.ananyev, dev, Xiaoyun Li, dpdk stable

On Sun, 16 Jan 2022 09:09:49 -0500
Luc Pelletier <lucp.at.work@gmail.com> wrote:

> > X86 always allows unaligned access. Irregardless of what tools say.
> > Why impose additional overhead in performance critical code.  
> 
> Let me preface my response by saying that I'm not a C compiler developer.
> Hopefully someone who is will read this and chime in.
> 
> I agree that X86 allows unaligned store/load. However, the C standard doesn't,
> and says that it's undefined behavior. This means that the code relies on
> undefined behavior. It may do the right thing all the time, almost all the time,
> some of the time... it's undefined. It may work now but it may stop
> working in the future.
> Here's a good discussion on SO about unaligned accesses in C on x86:
> 
> https://stackoverflow.com/questions/46790550/c-undefined-behavior-strict-aliasing-rule-or-incorrect-alignment/46790815#46790815
> 
> There's no way to do the unaligned store/load in C (that I know of)
> without invoking
> undefined behavior. I can see 2 options, either write the code in
> assembly, or use
> some other C construct that doesn't rely on undefined behavior.
> 
> While the for loop may seem slower than the other options, it
> surprisingly results in
> fewer load/store operations in certain scenarios. For example, if n ==
> 15 and it's
> known at compile-time, the compiler will generate 2 overlapping qword load/store
> operations (rather than the 4 that are currently being done with the
> current code).
> 
> All that being said, I can go back to something similar to my first
> patch. Using inline
> assembly, and making sure this time that it works for 32-bit too. I
> will post a patch in
> a few minutes that does exactly that. Maintainers can then chime in
> with their preferred
> option.

I would propose that DPDK have the same kind of define as the kernel
for SAFE_UNALIGNED_ACCESS.  The C standard has to apply to all architectures
but DPDK will make the choice to be fast rather than standards conformant.
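
Roughly along these lines (RTE_SAFE_UNALIGNED_ACCESS is a made-up name
here; RTE_ARCH_X86 is the existing arch define):

#ifdef RTE_ARCH_X86
#define RTE_SAFE_UNALIGNED_ACCESS 1
#endif

#ifdef RTE_SAFE_UNALIGNED_ACCESS
/* keep the word/dword/qword loads and stores */
#else
/* fall back to byte-wise or memcpy-based copies */
#endif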

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v3] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-01-16 14:33   ` Luc Pelletier
@ 2022-01-16 16:34     ` Stephen Hemminger
  2022-01-16 17:59       ` Morten Brørup
  0 siblings, 1 reply; 33+ messages in thread
From: Stephen Hemminger @ 2022-01-16 16:34 UTC (permalink / raw)
  To: Luc Pelletier
  Cc: bruce.richardson, konstantin.ananyev, dev, Xiaoyun Li, dpdk stable

On Sun, 16 Jan 2022 09:33:19 -0500
Luc Pelletier <lucp.at.work@gmail.com> wrote:

> As a side note, and to follow up on Stephen's indication that this is
> 'performance critical code', I think it might be worthwhile to
> revisit/revalidate the current implementation of rte_memcpy. There's a
> good thread here that mentions rte_memcpy, and its performance on at
> least one platform/architecture combination is far from being the
> best:
> 
> https://github.com/microsoft/mimalloc/issues/201
> 
> It seems like enhanced rep movsb could be faster on more recent CPUs,
> but that's not used in the current implementation of rte_memcpy.
> 
> I understand some of this may not be directly related to this patch,
> but whoever looks at this patch might want to provide their thoughts
> on whether updating rte_memcpy would be worthwhile? I suspect looking
> at all current public implementations of memcpy (libc, microsoft,
> compilers builtin implementations, etc.) might help in making
> improvements.

I would prefer that rte_memcpy did not exist at all.
Instead the system library should always be used.

It only exists because some architectures have slower code
in glibc.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [PATCH v3] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-01-16 16:34     ` Stephen Hemminger
@ 2022-01-16 17:59       ` Morten Brørup
  0 siblings, 0 replies; 33+ messages in thread
From: Morten Brørup @ 2022-01-16 17:59 UTC (permalink / raw)
  To: Stephen Hemminger, Luc Pelletier
  Cc: bruce.richardson, konstantin.ananyev, dev, Xiaoyun Li,
	dpdk stable, Georg Sauthoff

> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Sunday, 16 January 2022 17.34
> 
> On Sun, 16 Jan 2022 09:33:19 -0500
> Luc Pelletier <lucp.at.work@gmail.com> wrote:
> 
> > As a side note, and to follow up on Stephen's indication that this is
> > 'performance critical code', I think it might be worthwhile to
> > revisit/revalidate the current implementation of rte_memcpy. There's a
> > good thread here that mentions rte_memcpy, and its performance on at
> > least one platform/architecture combination is far from being the
> > best:
> >
> > https://github.com/microsoft/mimalloc/issues/201
> >
> > It seems like enhanced rep movsb could be faster on more recent CPUs,
> > but that's not used in the current implementation of rte_memcpy.
> >
> > I understand some of this may not be directly related to this patch,
> > but whoever looks at this patch might want to provide their thoughts
> > on whether updating rte_memcpy would be worthwhile? I suspect looking
> > at all current public implementations of memcpy (libc, microsoft,
> > compilers builtin implementations, etc.) might help in making
> > improvements.
> 
> I would prefer that rte_memcpy did not exist at all.
> Instead the system library should always be used.
> 
> It only exists because some architectures have slower code
> in glibc.

I wonder if that is still the case?

Otherwise, DPDK is probably full of obsolete optimizations, which should be eliminated like this:

http://inbox.dpdk.org/dev/20210918114930.245387-1-mail@gms.tf/



^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH v4] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-01-15 19:41 [PATCH] eal: fix unaligned loads/stores in rte_memcpy_generic Luc Pelletier
  2022-01-15 21:39 ` [PATCH v2] " Luc Pelletier
  2022-01-16 14:13 ` [PATCH v3] " Luc Pelletier
@ 2022-01-16 20:33 ` Luc Pelletier
  2022-01-17 15:37 ` [PATCH v5] " Luc Pelletier
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 33+ messages in thread
From: Luc Pelletier @ 2022-01-16 20:33 UTC (permalink / raw)
  To: bruce.richardson, konstantin.ananyev
  Cc: dev, Luc Pelletier, Xiaoyun Li, stable

Calls to rte_memcpy_generic could result in unaligned loads/stores for
1 < n < 16. This is undefined behavior according to the C standard,
and it gets flagged by the clang undefined behavior sanitizer.

rte_memcpy_generic is called with unaligned src and dst addresses.
When 1 < n < 16, the code would cast both src and dst to a qword,
dword or word pointer, without verifying the alignment of src/dst. The
code was changed to use inline assembly for the load/store operations.
Unaligned load/store operations are permitted in x86/x64 assembly.

Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
Cc: Xiaoyun Li <xiaoyun.li@intel.com>
Cc: stable@dpdk.org

Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com>
---

Added volatile to the asm statements. I had been testing with clang and
it was working, but it seems gcc requires this; otherwise the asm is
completely discarded by the optimizer. The following note in the gcc
docs seems to explain why:

"If the C code that follows the asm makes no use of any of the output
operands, use volatile for the asm statement to prevent the optimizers
from discarding the asm statement as unneeded (see Volatile)."

 lib/eal/x86/include/rte_memcpy.h | 134 +++++++++++++++++--------------
 1 file changed, 72 insertions(+), 62 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 1b6c6e585f..5af5c2ed61 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -45,6 +45,75 @@ extern "C" {
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
+#if defined(__i386__)
+	#define RTE_ACCUMULATOR_REGISTER_NAME "eax"
+#elif defined(__x86_64__)
+	#define RTE_ACCUMULATOR_REGISTER_NAME "rax"
+#endif
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with unaligned src/dst, and n <= 15.
+ */
+static __rte_always_inline void *
+rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n)
+{
+	void *ret = dst;
+	if (n & 8) {
+		asm volatile(
+#if defined(__i386__)
+		"movl (%[src]), %%eax\n"
+		"movl %%eax, (%[dst])\n"
+		"add $4, %[src]\n"
+		"add $4, %[dst]\n"
+		"movl (%[src]), %%eax\n"
+		"movl %%eax, (%[dst])\n"
+		"add $4, %[src]\n"
+		"add $4, %[dst]\n"
+#elif defined(__x86_64__)
+		"movq (%[src]), %%rax\n"
+		"movq %%rax, (%[dst])\n"
+		"add $8, %[src]\n"
+		"add $8, %[dst]\n"
+#else
+		#error Unsupported architecture
+#endif
+		: [dst] "+r" (dst), [src] "+r" (src)
+		:
+		: RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+	}
+	if (n & 4) {
+		asm volatile(
+		"movl (%[src]), %%eax\n"
+		"movl %%eax, (%[dst])\n"
+		"add $4, %[src]\n"
+		"add $4, %[dst]\n"
+		: [dst] "+r" (dst), [src] "+r" (src)
+		:
+		: RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+	}
+	if (n & 2) {
+		asm volatile(
+		"movw (%[src]), %%ax\n"
+		"movw %%ax, (%[dst])\n"
+		"add $2, %[src]\n"
+		"add $2, %[dst]\n"
+		: [dst] "+r" (dst), [src] "+r" (src)
+		:
+		: RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+	}
+	if (n & 1) {
+		asm volatile(
+		"movb (%[src]), %%al\n"
+		"movb %%al, (%[dst])\n"
+		: [dst] "+r" (dst), [src] "+r" (src)
+		:
+		: RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+	}
+	return ret;
+}
+
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 
 #define ALIGNMENT_MASK 0x3F
@@ -171,8 +240,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -181,24 +248,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08)
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
@@ -379,8 +429,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -389,25 +437,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
@@ -672,8 +702,6 @@ static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
@@ -682,25 +710,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
-- 
2.25.1


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH v5] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-01-15 19:41 [PATCH] eal: fix unaligned loads/stores in rte_memcpy_generic Luc Pelletier
                   ` (2 preceding siblings ...)
  2022-01-16 20:33 ` [PATCH v4] " Luc Pelletier
@ 2022-01-17 15:37 ` Luc Pelletier
  2022-02-04 16:42   ` Luc Pelletier
  2022-02-04 17:16   ` Ananyev, Konstantin
  2022-02-25 15:51 ` [PATCH v6] eal: fix rte_memcpy strict aliasing/alignment bugs Luc Pelletier
  2022-02-25 16:38 ` [PATCH v7] " Luc Pelletier
  5 siblings, 2 replies; 33+ messages in thread
From: Luc Pelletier @ 2022-01-17 15:37 UTC (permalink / raw)
  To: bruce.richardson, konstantin.ananyev
  Cc: dev, Luc Pelletier, Xiaoyun Li, stable

Calls to rte_memcpy_generic could result in unaligned loads/stores for
1 < n < 16. This is undefined behavior according to the C standard,
and it gets flagged by the clang undefined behavior sanitizer.

rte_memcpy_generic is called with unaligned src and dst addresses.
When 1 < n < 16, the code would cast both src and dst to a qword,
dword or word pointer, without verifying the alignment of src/dst. The
code was changed to use a packed structure to perform the unaligned
load/store operations. This makes the unaligned load/store operations
C standards-compliant.

Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
Cc: Xiaoyun Li <xiaoyun.li@intel.com>
Cc: stable@dpdk.org

Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com>
---

Thanks to Stephen's pointer to look at the Linux kernel, I was able to
find a way to perform the unaligned load/store using pure C code. The
new functions added to perform the load/store could likely be moved to a
different file and the code duplication could likely be eliminated by
using a macro. However, I will hold off on making these changes until I
get confirmation from maintainers that this technique is acceptable and
this is what we want to move forward with.
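
For what it's worth, a rough sketch of such a macro (purely illustrative,
not part of this patch):

#define RTE_DEFINE_UNALIGNED_ACCESS(bits) \
static __rte_always_inline uint##bits##_t \
rte_load_unaligned_uint##bits(const void *ptr) \
{ \
	struct unaligned_uint##bits { uint##bits##_t val; } __rte_packed; \
	return ((const struct unaligned_uint##bits *)ptr)->val; \
} \
static __rte_always_inline void \
rte_store_unaligned_uint##bits(void *ptr, uint##bits##_t val) \
{ \
	struct unaligned_uint##bits { uint##bits##_t val; } __rte_packed; \
	((struct unaligned_uint##bits *)ptr)->val = val; \
}

RTE_DEFINE_UNALIGNED_ACCESS(16)
RTE_DEFINE_UNALIGNED_ACCESS(32)
RTE_DEFINE_UNALIGNED_ACCESS(64)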

 lib/eal/x86/include/rte_memcpy.h | 142 +++++++++++++++++--------------
 1 file changed, 80 insertions(+), 62 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 1b6c6e585f..4e876d39eb 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -45,6 +45,83 @@ extern "C" {
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
+static __rte_always_inline uint64_t
+rte_load_unaligned_uint64(const void *ptr)
+{
+	struct unaligned_uint64 { uint64_t val; } __rte_packed;
+	return ((const struct unaligned_uint64 *)ptr)->val;
+}
+
+static __rte_always_inline uint32_t
+rte_load_unaligned_uint32(const void *ptr)
+{
+	struct unaligned_uint32 { uint32_t val; } __rte_packed;
+	return ((const struct unaligned_uint32 *)ptr)->val;
+}
+
+static __rte_always_inline uint16_t
+rte_load_unaligned_uint16(const void *ptr)
+{
+	struct unaligned_uint16 { uint16_t val; } __rte_packed;
+	return ((const struct unaligned_uint16 *)ptr)->val;
+}
+
+static __rte_always_inline void
+rte_store_unaligned_uint64(void *ptr, uint64_t val)
+{
+	struct unaligned_uint64 { uint64_t val; } __rte_packed;
+	((struct unaligned_uint64 *)ptr)->val = val;
+}
+
+static __rte_always_inline void
+rte_store_unaligned_uint32(void *ptr, uint32_t val)
+{
+	struct unaligned_uint32 { uint32_t val; } __rte_packed;
+	((struct unaligned_uint32 *)ptr)->val = val;
+}
+
+static __rte_always_inline void
+rte_store_unaligned_uint16(void *ptr, uint16_t val)
+{
+	struct unaligned_uint16 { uint16_t val; } __rte_packed;
+	((struct unaligned_uint16 *)ptr)->val = val;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with unaligned src/dst, and n <= 15.
+ */
+static __rte_always_inline void *
+rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n)
+{
+	void *ret = dst;
+	if (n & 8) {
+		rte_store_unaligned_uint64(
+			dst,
+			rte_load_unaligned_uint64(src));
+		src = ((const uint64_t *)src + 1);
+		dst = ((uint64_t *)dst + 1);
+	}
+	if (n & 4) {
+		rte_store_unaligned_uint32(
+			dst,
+			rte_load_unaligned_uint32(src));
+		src = ((const uint32_t *)src + 1);
+		dst = ((uint32_t *)dst + 1);
+	}
+	if (n & 2) {
+		rte_store_unaligned_uint16(
+			dst,
+			rte_load_unaligned_uint16(src));
+		src = ((const uint16_t *)src + 1);
+		dst = ((uint16_t *)dst + 1);
+	}
+	if (n & 1)
+		*(uint8_t *)dst = *(const uint8_t *)src;
+	return ret;
+}
+
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 
 #define ALIGNMENT_MASK 0x3F
@@ -171,8 +248,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -181,24 +256,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08)
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
@@ -379,8 +437,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -389,25 +445,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
@@ -672,8 +710,6 @@ static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
@@ -682,25 +718,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
-- 
2.25.1


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-01-16 16:32       ` Stephen Hemminger
@ 2022-01-24 23:21         ` Georg Sauthoff
  2022-01-25  7:59           ` Morten Brørup
  2022-01-25 19:57           ` Luc Pelletier
  0 siblings, 2 replies; 33+ messages in thread
From: Georg Sauthoff @ 2022-01-24 23:21 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Luc Pelletier, bruce.richardson, konstantin.ananyev, dev,
	Xiaoyun Li, Morten Brørup

Hello,

Stephen Hemminger wrote (Sun, 16 Jan 2022 08:32:20 -0800):

> I would propose that DPDK have the same kind of define as the kernel
> for SAFE_UNALIGNED_ACCESS.  The C standard has to apply to all architectures
> but DPDK will make the choice to be fast rather than standards conformant.

perhaps it's worth mentioning that the Linux Kernel is compiled with
-fno-strict-aliasing, because it contains code which violates the C
strict aliasing rules. Such code yields undefined behavior and is thus
unsafe when compiling with optimizing compilers such as GCC and Clang, by
default. But since the Linux Kernel supplies -fno-strict-aliasing its code is
safe, in the Linux Kernel context.

In contrast, DPDK isn't compiled with -fno-strict-aliasing, in general.
Only a few drivers are built with -Wno-strict-aliasing.

Thus, one has to be careful when importing (or being inspired by) the
above mentioned kernel defines.

In particular, it looks like version 5 of this patch yields undefined
behavior because it violates strict-aliasing rules.
It's possible that GCC makes some extra guarantees for the used
constructs, even when compiling without -fno-strict-aliasing. But I'm
not aware of any.

Please note that there is a reason GCC enables -fstrict-aliasing when
compiling with optimizations: performance.
IOW, -fno-strict-aliasing has performance implications, thus one is
advised to not use it, if possible.
IOW, when violating strict-aliasing rules one can easily end up with
low-performing and/or unsafe (buggy) code.

I haven't inspected the DPDK drivers which supply -Wno-strict-aliasing -
but I wouldn't be surprised if there are better alternatives. Meaning
writing the code in a way that doesn't require -Wno-strict-aliasing.
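
One such alternative, as a minimal sketch (not taken from any of the
patches; needs <stdint.h> and <string.h>), is the usual memcpy-based
idiom, which GCC and Clang typically compile down to a single unaligned
move on x86:

static inline uint64_t load_unaligned_u64(const void *p)
{
	uint64_t v;

	/* no alignment or aliasing assumptions; the call is inlined away */
	memcpy(&v, p, sizeof(v));
	return v;
}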

BTW, are there still systems that DPDK supports but have a memcpy() which
performs worse than rte_memcpy()?


Best regards,
Georg

^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [PATCH v2] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-01-24 23:21         ` Georg Sauthoff
@ 2022-01-25  7:59           ` Morten Brørup
  2022-01-25 19:57           ` Luc Pelletier
  1 sibling, 0 replies; 33+ messages in thread
From: Morten Brørup @ 2022-01-25  7:59 UTC (permalink / raw)
  To: Georg Sauthoff, Stephen Hemminger
  Cc: Luc Pelletier, bruce.richardson, konstantin.ananyev, dev, Xiaoyun Li

> From: Georg Sauthoff [mailto:mail@gms.tf]
> Sent: Tuesday, 25 January 2022 00.22
> 
> Hello,
> 
> Stephen Hemminger wrote (Sun, 16 Jan 2022 08:32:20 -0800):
> 
> > I would propose that DPDK have same kind of define as the kernel
> > for SAFE_UNALIGNED_ACCESS.  The C standard has to apply to all
> architectures
> > but DPDK will make the choice to be fast rather than standards
> conformant.
> 
> perhaps it's worth mentioning that the Linux Kernel is compiled with
> -fno-strict-aliasing, because it contains code which violates the C
> strict aliasing rules. Such code yields undefined behavior and is thus
> unsafe when compiling with optmizing compilers such as GCC and Clang,
> by
> default. But since the Linux supplies -fno-strict-aliasing its code is
> safe, in the Linux Kernel context.
> 
> In contrast, DPDK isn't compiled with -fno-strict-aliasing, in general.
> Only a few drivers are built with -Wno-strict-aliasing.
> 
> Thus, one has to be careful when importing (or being inspired) by the
> above mentioned kernel defines.
> 
> Especially, it looks like version 5 of this patch yields undefined
> behavior because it violates strict-aliasing rules.
> It's possible that GCC makes some extra guarantees for the used
> constructs, even when compiling without -fno-strict-aliasing. But I'm
> not aware of any.
> 
> Please note that there is a reason GCC enables -fstrict-aliasing when
> compiling with optimizations: Performance
> IOW, -fno-strict-aliasing has performance implications, thus one is
> advised to not use it, if possible.
> IOW, when violating strict-aliasng rules one can easily end up with
> low-performing and/or unsafe (buggy) code.

Generally, I have an extremely strong preference for being able to compile code with all warnings enabled in the compiler (-Wall and then some). This gives the developer a great bug-detector tool (through the compiler warnings).

In my experience, compiling a Linux loadable module with all warnings enabled spews out too many warnings about the kernel itself to be helpful. This stems from bad practice, which should not be adopted by DPDK.

> 
> I haven't inspected the DPDK drivers which supply -Wno-strict-aliasing
> -
> but I wouldn't be surprised if there aren't better alternatives.
> Meaning
> writing the code in a way that doesn't require -Wno-strict-aliasing.
> 
> BTW, are there still systems that DPDK supports but have a memcpy()
> which
> performs worse than rte_memcpy()?

Excellent question! I have also seen memmove() being replaced, with a comment about some systems using temporary memory for their implementation of memmove(). And most DPDK core libraries implement their own copy functions - following the general DPDK advice not to use standard C library functions. It's a shame there's no documentation of why all this was re-implemented in DPDK, so it could be checked whether it is still relevant or could be removed.

> 
> 
> Best regards,
> Georg


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-01-24 23:21         ` Georg Sauthoff
  2022-01-25  7:59           ` Morten Brørup
@ 2022-01-25 19:57           ` Luc Pelletier
  1 sibling, 0 replies; 33+ messages in thread
From: Luc Pelletier @ 2022-01-25 19:57 UTC (permalink / raw)
  To: Georg Sauthoff
  Cc: Stephen Hemminger, bruce.richardson, konstantin.ananyev, dev,
	Xiaoyun Li, Morten Brørup

Hi Georg,

> perhaps it's worth mentioning that the Linux Kernel is compiled with
> -fno-strict-aliasing, because it contains code which violates the C
> strict aliasing rules. Such code yields undefined behavior and is thus
> unsafe when compiling with optmizing compilers such as GCC and Clang, by
> default. But since the Linux supplies -fno-strict-aliasing its code is
> safe, in the Linux Kernel context.
>
> In contrast, DPDK isn't compiled with -fno-strict-aliasing, in general.
> Only a few drivers are built with -Wno-strict-aliasing.
>
> Thus, one has to be careful when importing (or being inspired) by the
> above mentioned kernel defines.
>
> Especially, it looks like version 5 of this patch yields undefined
> behavior because it violates strict-aliasing rules.
> It's possible that GCC makes some extra guarantees for the used
> constructs, even when compiling without -fno-strict-aliasing. But I'm
> not aware of any.

Thank you for pointing this out. That's an excellent comment and
something I had missed. Would adding the __may_alias__ attribute to
the unaligned_uint structs fix the issue?

I think it would, but I have to admit that documentation is a little
sparse and I don't have much experience with this attribute. If it
doesn't fix the problem, please let me know if you can think of
another solution that wouldn't involve going back to assembly (v4 of
this patch).

Also, I think the __may_alias__ attribute is supported by all
compilers that DPDK supports when targeting x86, but please let me
know if you don't think that's the case.

> BTW, are there still systems that DPDK supports but have a memcpy() which
> performs worse than rte_memcpy()?

I concur with Morten, that's an excellent question. Analyzing the
memcpy source code on every platform/architecture that DPDK supports
would be one way to answer the question. However, that seems like a
significant undertaking and I'm hoping someone who knows the answer
will chime in.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v5] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-01-17 15:37 ` [PATCH v5] " Luc Pelletier
@ 2022-02-04 16:42   ` Luc Pelletier
  2022-02-04 17:16   ` Ananyev, Konstantin
  1 sibling, 0 replies; 33+ messages in thread
From: Luc Pelletier @ 2022-02-04 16:42 UTC (permalink / raw)
  To: bruce.richardson, konstantin.ananyev; +Cc: dev, Xiaoyun Li, dpdk stable

Hi,

It's been several days and I haven't received any additional feedback
on this patch (and the other patch related to this one:
http://mails.dpdk.org/archives/dev/2022-January/232491.html). I would
welcome any kind of feedback. Ideally, I'm hoping an authoritative
voice would indicate if there's any interest in updating rte_memcpy
and if so, comment on the different possible solutions presented thus
far in this thread.

Thanks

^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [PATCH v5] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-01-17 15:37 ` [PATCH v5] " Luc Pelletier
  2022-02-04 16:42   ` Luc Pelletier
@ 2022-02-04 17:16   ` Ananyev, Konstantin
  2022-02-08 16:53     ` Thomas Monjalon
  2022-02-09 15:05     ` Luc Pelletier
  1 sibling, 2 replies; 33+ messages in thread
From: Ananyev, Konstantin @ 2022-02-04 17:16 UTC (permalink / raw)
  To: Luc Pelletier, Richardson, Bruce; +Cc: dev, Li, Xiaoyun, stable


> Calls to rte_memcpy_generic could result in unaligned loads/stores for
> 1 < n < 16. This is undefined behavior according to the C standard,
> and it gets flagged by the clang undefined behavior sanitizer.
> 
> rte_memcpy_generic is called with unaligned src and dst addresses.
> When 1 < n < 16, the code would cast both src and dst to a qword,
> dword or word pointer, without verifying the alignment of src/dst. The
> code was changed to use a packed structure to perform the unaligned
> load/store operations. This results in unaligned load/store operations
> to be C standards-compliant.

Still not sure we need to fix that:
this is an x86-specific code path, and as I remember, on x86 there are
no penalties for unaligned access to 2/4/8 byte values.
Though I like the introduction of the rte_mov15_or_less() function -
it helps with code dedup.

> 
> Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
> Cc: Xiaoyun Li <xiaoyun.li@intel.com>
> Cc: stable@dpdk.org
> 
> Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com>
> ---
> 
> Thanks to Stephen's pointer to look at the linux kernel, I was able to
> find a way to perform the unaligned load/store using pure C code. The
> new functions added to perform the load/store could likely be moved to a
> different file and the code duplication could likely be eliminated by
> using a macro. However, I will hold off on making these changes until I
> get confirmation from maintainers that this technique is acceptable and
> this is what we want to move forward with.
> 
>  lib/eal/x86/include/rte_memcpy.h | 142 +++++++++++++++++--------------
>  1 file changed, 80 insertions(+), 62 deletions(-)
> 
> diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
> index 1b6c6e585f..4e876d39eb 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h
> @@ -45,6 +45,83 @@ extern "C" {
>  static __rte_always_inline void *
>  rte_memcpy(void *dst, const void *src, size_t n);
> 
> +static __rte_always_inline uint64_t
> +rte_load_unaligned_uint64(const void *ptr)
> +{
> +	struct unaligned_uint64 { uint64_t val; } __rte_packed;
> +	return ((const struct unaligned_uint64 *)ptr)->val;
> +}
> +
> +static __rte_always_inline uint32_t
> +rte_load_unaligned_uint32(const void *ptr)
> +{
> +	struct unaligned_uint32 { uint32_t val; } __rte_packed;
> +	return ((const struct unaligned_uint32 *)ptr)->val;
> +}
> +
> +static __rte_always_inline uint16_t
> +rte_load_unaligned_uint16(const void *ptr)
> +{
> +	struct unaligned_uint16 { uint16_t val; } __rte_packed;
> +	return ((const struct unaligned_uint16 *)ptr)->val;
> +}
> +
> +static __rte_always_inline void
> +rte_store_unaligned_uint64(void *ptr, uint64_t val)
> +{
> +	struct unaligned_uint64 { uint64_t val; } __rte_packed;
> +	((struct unaligned_uint64 *)ptr)->val = val;
> +}
> +
> +static __rte_always_inline void
> +rte_store_unaligned_uint32(void *ptr, uint32_t val)
> +{
> +	struct unaligned_uint32 { uint32_t val; } __rte_packed;
> +	((struct unaligned_uint32 *)ptr)->val = val;
> +}
> +
> +static __rte_always_inline void
> +rte_store_unaligned_uint16(void *ptr, uint16_t val)
> +{
> +	struct unaligned_uint16 { uint16_t val; } __rte_packed;
> +	((struct unaligned_uint16 *)ptr)->val = val;
> +}
> +
> +/**
> + * Copy bytes from one location to another,
> + * locations should not overlap.
> + * Use with unaligned src/dst, and n <= 15.
> + */
> +static __rte_always_inline void *
> +rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n)
> +{
> +	void *ret = dst;
> +	if (n & 8) {
> +		rte_store_unaligned_uint64(
> +			dst,
> +			rte_load_unaligned_uint64(src));
> +		src = ((const uint64_t *)src + 1);
> +		dst = ((uint64_t *)dst + 1);
> +	}
> +	if (n & 4) {
> +		rte_store_unaligned_uint32(
> +			dst,
> +			rte_load_unaligned_uint32(src));
> +		src = ((const uint32_t *)src + 1);
> +		dst = ((uint32_t *)dst + 1);
> +	}
> +	if (n & 2) {
> +		rte_store_unaligned_uint16(
> +			dst,
> +			rte_load_unaligned_uint16(src));
> +		src = ((const uint16_t *)src + 1);
> +		dst = ((uint16_t *)dst + 1);
> +	}
> +	if (n & 1)
> +		*(uint8_t *)dst = *(const uint8_t *)src;
> +	return ret;
> +}
> +
>  #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> 
>  #define ALIGNMENT_MASK 0x3F
> @@ -171,8 +248,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
>  static __rte_always_inline void *
>  rte_memcpy_generic(void *dst, const void *src, size_t n)
>  {
> -	uintptr_t dstu = (uintptr_t)dst;
> -	uintptr_t srcu = (uintptr_t)src;
>  	void *ret = dst;
>  	size_t dstofss;
>  	size_t bits;
> @@ -181,24 +256,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 * Copy less than 16 bytes
>  	 */
>  	if (n < 16) {
> -		if (n & 0x01) {
> -			*(uint8_t *)dstu = *(const uint8_t *)srcu;
> -			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
> -			dstu = (uintptr_t)((uint8_t *)dstu + 1);
> -		}
> -		if (n & 0x02) {
> -			*(uint16_t *)dstu = *(const uint16_t *)srcu;
> -			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
> -			dstu = (uintptr_t)((uint16_t *)dstu + 1);
> -		}
> -		if (n & 0x04) {
> -			*(uint32_t *)dstu = *(const uint32_t *)srcu;
> -			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
> -			dstu = (uintptr_t)((uint32_t *)dstu + 1);
> -		}
> -		if (n & 0x08)
> -			*(uint64_t *)dstu = *(const uint64_t *)srcu;
> -		return ret;
> +		return rte_mov15_or_less_unaligned(dst, src, n);
>  	}
> 
>  	/**
> @@ -379,8 +437,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
>  static __rte_always_inline void *
>  rte_memcpy_generic(void *dst, const void *src, size_t n)
>  {
> -	uintptr_t dstu = (uintptr_t)dst;
> -	uintptr_t srcu = (uintptr_t)src;
>  	void *ret = dst;
>  	size_t dstofss;
>  	size_t bits;
> @@ -389,25 +445,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 * Copy less than 16 bytes
>  	 */
>  	if (n < 16) {
> -		if (n & 0x01) {
> -			*(uint8_t *)dstu = *(const uint8_t *)srcu;
> -			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
> -			dstu = (uintptr_t)((uint8_t *)dstu + 1);
> -		}
> -		if (n & 0x02) {
> -			*(uint16_t *)dstu = *(const uint16_t *)srcu;
> -			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
> -			dstu = (uintptr_t)((uint16_t *)dstu + 1);
> -		}
> -		if (n & 0x04) {
> -			*(uint32_t *)dstu = *(const uint32_t *)srcu;
> -			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
> -			dstu = (uintptr_t)((uint32_t *)dstu + 1);
> -		}
> -		if (n & 0x08) {
> -			*(uint64_t *)dstu = *(const uint64_t *)srcu;
> -		}
> -		return ret;
> +		return rte_mov15_or_less_unaligned(dst, src, n);
>  	}
> 
>  	/**
> @@ -672,8 +710,6 @@ static __rte_always_inline void *
>  rte_memcpy_generic(void *dst, const void *src, size_t n)
>  {
>  	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
> -	uintptr_t dstu = (uintptr_t)dst;
> -	uintptr_t srcu = (uintptr_t)src;
>  	void *ret = dst;
>  	size_t dstofss;
>  	size_t srcofs;
> @@ -682,25 +718,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 * Copy less than 16 bytes
>  	 */
>  	if (n < 16) {
> -		if (n & 0x01) {
> -			*(uint8_t *)dstu = *(const uint8_t *)srcu;
> -			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
> -			dstu = (uintptr_t)((uint8_t *)dstu + 1);
> -		}
> -		if (n & 0x02) {
> -			*(uint16_t *)dstu = *(const uint16_t *)srcu;
> -			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
> -			dstu = (uintptr_t)((uint16_t *)dstu + 1);
> -		}
> -		if (n & 0x04) {
> -			*(uint32_t *)dstu = *(const uint32_t *)srcu;
> -			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
> -			dstu = (uintptr_t)((uint32_t *)dstu + 1);
> -		}
> -		if (n & 0x08) {
> -			*(uint64_t *)dstu = *(const uint64_t *)srcu;
> -		}
> -		return ret;
> +		return rte_mov15_or_less_unaligned(dst, src, n);
>  	}
> 
>  	/**
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v5] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-02-04 17:16   ` Ananyev, Konstantin
@ 2022-02-08 16:53     ` Thomas Monjalon
  2022-02-09 15:05     ` Luc Pelletier
  1 sibling, 0 replies; 33+ messages in thread
From: Thomas Monjalon @ 2022-02-08 16:53 UTC (permalink / raw)
  To: Richardson, Bruce, Li, Xiaoyun, Ananyev, Konstantin
  Cc: Luc Pelletier, dev, stable, ferruh.yigit, olivier.matz, stephen

04/02/2022 18:16, Ananyev, Konstantin:
> 
> > Calls to rte_memcpy_generic could result in unaligned loads/stores for
> > 1 < n < 16. This is undefined behavior according to the C standard,
> > and it gets flagged by the clang undefined behavior sanitizer.
> > 
> > rte_memcpy_generic is called with unaligned src and dst addresses.
> > When 1 < n < 16, the code would cast both src and dst to a qword,
> > dword or word pointer, without verifying the alignment of src/dst. The
> > code was changed to use a packed structure to perform the unaligned
> > load/store operations. This results in unaligned load/store operations
> > to be C standards-compliant.
> 
> Still not sure we need to fix that:
> This is x86 specific code-path, and as I remember on x86 there are no
> penalties for unaligned access to 2/4/8 byte values. 
> Though I like introduction of rte_mov15_or_less() function -t helps
> with code dedup. 

Any more opinions?




^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v5] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-02-04 17:16   ` Ananyev, Konstantin
  2022-02-08 16:53     ` Thomas Monjalon
@ 2022-02-09 15:05     ` Luc Pelletier
  2022-02-10 14:04       ` Ananyev, Konstantin
  1 sibling, 1 reply; 33+ messages in thread
From: Luc Pelletier @ 2022-02-09 15:05 UTC (permalink / raw)
  To: Ananyev, Konstantin; +Cc: Richardson, Bruce, dev, Li, Xiaoyun, stable

> > Calls to rte_memcpy_generic could result in unaligned loads/stores for
> > 1 < n < 16. This is undefined behavior according to the C standard,
> > and it gets flagged by the clang undefined behavior sanitizer.
> >
> > rte_memcpy_generic is called with unaligned src and dst addresses.
> > When 1 < n < 16, the code would cast both src and dst to a qword,
> > dword or word pointer, without verifying the alignment of src/dst. The
> > code was changed to use a packed structure to perform the unaligned
> > load/store operations. This results in unaligned load/store operations
> > to be C standards-compliant.
>
> Still not sure we need to fix that:
> This is x86 specific code-path, and as I remember on x86 there are no
> penalties for unaligned access to 2/4/8 byte values.
> Though I like introduction of rte_mov15_or_less() function -t helps
> with code dedup.

Thanks for your input Konstantin. Much appreciated. Just to make sure
I understand, can you please confirm that we do not want to fix the
fact that unaligned access in C is undefined behaviour? I understand
that X86 allows unaligned accesses but since this isn't assembly code,
we have to go by what the C standard allows.

Also, do you have any opinion on the strict aliasing violation (as was
pointed out by Georg)? I suggested adding __may_alias__ thinking it
would likely fix the issue, but I'd really like to get confirmation
from someone who has better knowledge of how the attribute works
exactly.

Thanks

^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [PATCH v5] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-02-09 15:05     ` Luc Pelletier
@ 2022-02-10 14:04       ` Ananyev, Konstantin
  2022-02-10 16:56         ` Luc Pelletier
  0 siblings, 1 reply; 33+ messages in thread
From: Ananyev, Konstantin @ 2022-02-10 14:04 UTC (permalink / raw)
  To: Luc Pelletier; +Cc: Richardson, Bruce, dev, Li, Xiaoyun, stable

Hi Luc,

> > > Calls to rte_memcpy_generic could result in unaligned loads/stores for
> > > 1 < n < 16. This is undefined behavior according to the C standard,
> > > and it gets flagged by the clang undefined behavior sanitizer.
> > >
> > > rte_memcpy_generic is called with unaligned src and dst addresses.
> > > When 1 < n < 16, the code would cast both src and dst to a qword,
> > > dword or word pointer, without verifying the alignment of src/dst. The
> > > code was changed to use a packed structure to perform the unaligned
> > > load/store operations. This results in unaligned load/store operations
> > > to be C standards-compliant.
> >
> > Still not sure we need to fix that:
> > This is x86 specific code-path, and as I remember on x86 there are no
> > penalties for unaligned access to 2/4/8 byte values.
> > Though I like introduction of rte_mov15_or_less() function -t helps
> > with code dedup.
> 
> Thanks for your input Konstantin. Much appreciated. Just to make sure
> I understand, can you please confirm that we do not want to fix the
> fact that unaligned access in C is undefined behaviour?

Yes, I don't think it is a real problem in that particular case.

> I understand
> that X86 allows unaligned accesses but since this isn't assembly code,
> we have to go by what the C standard allows.

> Also, do you have any opinion on the strict aliasing violation (as was
> pointed out by Georg)? I suggested adding __may_alias__ thinking it
> would likely fix the issue, but I'd really like to get confirmation
> from someone who has better knowledge of how the attribute works
> exactly.

Not sure I understand the problem you are referring to.
Are you saying that the original rte_memcpy() code breaks strict aliasing?
If so, could you point out where exactly?


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v5] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-02-10 14:04       ` Ananyev, Konstantin
@ 2022-02-10 16:56         ` Luc Pelletier
  2022-02-11 15:51           ` Ananyev, Konstantin
  0 siblings, 1 reply; 33+ messages in thread
From: Luc Pelletier @ 2022-02-10 16:56 UTC (permalink / raw)
  To: Ananyev, Konstantin; +Cc: Richardson, Bruce, dev, Li, Xiaoyun, stable

Hi Konstantin,

> > Thanks for your input Konstantin. Much appreciated. Just to make sure
> > I understand, can you please confirm that we do not want to fix the
> > fact that unaligned access in C is undefined behaviour?
>
> Yes, I don't think it is a real problem in that particular case.

Perfect. Thank you.

> > Also, do you have any opinion on the strict aliasing violation (as was
> > pointed out by Georg)? I suggested adding __may_alias__ thinking it
> > would likely fix the issue, but I'd really like to get confirmation
> > from someone who has better knowledge of how the attribute works
> > exactly.
>
> Not sure I understand the problem you are referring to.
> Are you saying that original rte_memcpy() code breaks strict aliasing?
> If so, could you point where exactly?

As far as I understand, yes, it does break strict aliasing. For
example, in the following line:

*(uint64_t *)dstu = *(const uint64_t *)srcu;

IIUC, both casts break strict aliasing rules. While the src/dst
parameters are void* and can therefore be cast to something else
without breaking strict aliasing rules, the type of src/dst in the
calling code might be something other than uint64_t*. This can result
in src/dst pointers being cast to different unrelated types. AFAICT,
the fact that rte_memcpy is "always inline" increases the risk of the
compiler making an optimization that results in broken code.

I was able to come up with an example where the latest version of GCC
produces broken code when strict aliasing is enabled:

https://godbolt.org/z/3Yzvjr97c

With -fstrict-aliasing, it reorders a write and results in broken
code. If you change the compiler flags to -fno-strict-aliasing, it
produces the expected result.
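
Roughly, the example has the following shape (a simplified sketch here,
not the exact code from the godbolt link):

#include <stdint.h>

/* Simplified stand-in for the small-size path of rte_memcpy(). */
static inline void copy8(void *dst, const void *src)
{
	*(uint64_t *)dst = *(const uint64_t *)src;
}

uint32_t buf[4];

uint32_t update(const void *src)
{
	buf[0] = 0;
	copy8(buf, src);	/* after inlining: a store through uint64_t * */
	/* Under -fstrict-aliasing the compiler may assume the uint64_t
	 * store cannot modify the uint32_t objects in buf, and return
	 * the stale 0 here. */
	return buf[0];
}

Whether a given compiler version miscompiles this exact reduction may
vary; the godbolt link above shows a case where GCC does reorder the
accesses.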

^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [PATCH v5] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-02-10 16:56         ` Luc Pelletier
@ 2022-02-11 15:51           ` Ananyev, Konstantin
  2022-02-13 22:31             ` Luc Pelletier
  0 siblings, 1 reply; 33+ messages in thread
From: Ananyev, Konstantin @ 2022-02-11 15:51 UTC (permalink / raw)
  To: Luc Pelletier; +Cc: Richardson, Bruce, dev, Li, Xiaoyun, stable

> > Not sure I understand the problem you are referring to.
> > Are you saying that original rte_memcpy() code breaks strict aliasing?
> > If so, could you point where exactly?
> 
> As far as I understand, yes, it does break strict aliasing. For
> example, in the following line:
> 
> *(uint64_t *)dstu = *(const uint64_t *)srcu;
> 
> IIUC, both casts break strict aliasing rules. While the src/dst
> parameters are void* and can therefore be cast to something else
> without breaking strict aliasing rules, the type of src/dst in the
> calling code might be something other than uint64_t*. This can result
> in src/dst pointers being cast to different unrelated types. AFAICT,
> the fact that rte_memcpy is "always inline" increases the risk of the
> compiler making an optimization that results in broken code.
> 
> I was able to come up with an example where the latest version of GCC
> produces broken code when strict aliasing is enabled:
> 
> https://godbolt.org/z/3Yzvjr97c
> 
> With -fstrict-aliasing, it reorders a write and results in broken
> code. If you change the compiler flags to -fno-strict-aliasing, it
> produces the expected result.

Indeed it looks like a problem.
Thanks for pointing it out.
I was able to reproduce it with gcc 11 (clang 13 seems fine).
Actually, adding '__attribute__ ((__may_alias__))' for both dst and src
didn't cure the problem.
To overcome it, I had to either:
add the '-fno-strict-aliasing' CC flag (as you mentioned above),
or add:
if (__builtin_constant_p(n))
            return memcpy(dst, src, n);

on top of rte_memcpy() code.

Though I suppose the problem might be much wider than just rte_memcpy().
We do have similar inline copying code in other places too.
As I understand it, some of those cases might also be affected.
Let's say: '_rte_ring_(enqueue|dequeue_elems_*'.
Not sure what would be the best approach in general for such cases:
- always compile DPDK code with '-fno-strict-aliasing'
  But that wouldn't prevent people from using our inline functions without that flag.
  I also wonder what performance impact it would have.
- try to fix all such occurrences manually (but it would be hard to catch all of them upfront)
- something else ...?

I wonder what people think about it?
Konstantin     



^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v5] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-02-11 15:51           ` Ananyev, Konstantin
@ 2022-02-13 22:31             ` Luc Pelletier
  2022-02-14 13:41               ` Ananyev, Konstantin
  0 siblings, 1 reply; 33+ messages in thread
From: Luc Pelletier @ 2022-02-13 22:31 UTC (permalink / raw)
  To: Ananyev, Konstantin; +Cc: Richardson, Bruce, dev, Li, Xiaoyun, stable

Hi Konstantin,

> Indeed it looks like a problem.
> Thanks for pointing it out.
> Was able to reproduce it with gcc 11 (clang 13 seems fine).
> Actually, adding ' __attribute__ ((__may_alias__))' for both dst and src
> didn't quire the problem.

__may_alias__ works if it's applied to a typedef; see the following
for a modified version of my original example that works and uses
__may_alias__:

https://godbolt.org/z/W83zzoePq

The documentation I found for __may_alias__ is quite sparse, so I'm a
little wary of assuming it'll always work. I'm hoping someone with
more experience with the attribute would be able to add more
confidence to the assumption that it'll work in all cases.
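
In essence, the approach looks like this (a sketch here, not the exact
godbolt code): the typedef carries the attribute, so accesses through
it are allowed to alias objects of any type, much like accesses through
char:

#include <stdint.h>

typedef uint64_t __attribute__((__may_alias__)) u64_alias; /* illustrative name */

/* Same shape as the small-size copy, but the access type may alias. */
static inline void copy8(void *dst, const void *src)
{
	*(u64_alias *)dst = *(const u64_alias *)src;
}

Note this only addresses the aliasing side; if the pointers may also be
unaligned, the typedef would still need to be combined with something
like a packed struct to avoid the alignment issue.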

^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [PATCH v5] eal: fix unaligned loads/stores in rte_memcpy_generic
  2022-02-13 22:31             ` Luc Pelletier
@ 2022-02-14 13:41               ` Ananyev, Konstantin
  0 siblings, 0 replies; 33+ messages in thread
From: Ananyev, Konstantin @ 2022-02-14 13:41 UTC (permalink / raw)
  To: Luc Pelletier
  Cc: Richardson, Bruce, dev, Li, Xiaoyun, stable, stephen, Yigit,
	Ferruh, Morten Brørup, Georg Sauthoff


Hi Luc,
 
> > Indeed it looks like a problem.
> > Thanks for pointing it out.
> > Was able to reproduce it with gcc 11 (clang 13 seems fine).
> > Actually, adding ' __attribute__ ((__may_alias__))' for both dst and src
> > didn't quire the problem.
> 
> __may_alias__ works if it's applied to a typedef, see the following
> for a modified version of my original example that works and uses
> __may_alias__:
> 
> https://godbolt.org/z/W83zzoePq

Interesting, I played with the '__may_alias__' attribute a bit more,
and yes, as you said, it seems to work properly only with typedefs.
If so, then we can probably add new typedefs (in rte_common.h or so)
and use them in rte_memcpy and probably a few other places where we
think they will be necessary.
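
Something along these lines, perhaps (a sketch only; the names are
placeholders, not an agreed-upon API):

#include <stdint.h>

/* hypothetical additions to rte_common.h */
typedef uint16_t __attribute__((__may_alias__)) rte_may_alias_uint16_t;
typedef uint32_t __attribute__((__may_alias__)) rte_may_alias_uint32_t;
typedef uint64_t __attribute__((__may_alias__)) rte_may_alias_uint64_t;

These alone do not relax the alignment requirements, so the small-copy
helper would still need a packed struct (or equivalent) for the
unaligned case.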
 
> The documentation I found for __may_alias__ is quite sparse, so I'm a
> little wary of assuming it'll always work.
> I'm hoping someone with
> more experience with the attribute would be able to add more
> confidence to the assumption that it'll work in all cases.

Agreed, I would like to see more input too.
CC-ing others who seemed to be interested in that discussion before.

Thanks
Konstantin



^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH v6] eal: fix rte_memcpy strict aliasing/alignment bugs
  2022-01-15 19:41 [PATCH] eal: fix unaligned loads/stores in rte_memcpy_generic Luc Pelletier
                   ` (3 preceding siblings ...)
  2022-01-17 15:37 ` [PATCH v5] " Luc Pelletier
@ 2022-02-25 15:51 ` Luc Pelletier
  2022-02-25 16:38 ` [PATCH v7] " Luc Pelletier
  5 siblings, 0 replies; 33+ messages in thread
From: Luc Pelletier @ 2022-02-25 15:51 UTC (permalink / raw)
  To: bruce.richardson, konstantin.ananyev
  Cc: dev, Luc Pelletier, Xiaoyun Li, stable

Calls to rte_memcpy for 1 < n < 16 could result in unaligned
loads/stores, which is undefined behaviour according to the C
standard, and strict aliasing violations.

The code was changed to use a packed structure that allows aliasing
(using the __may_alias__ attribute) to perform the load/store
operations. This results in code that has the same performance as the
original code and that is also C standard-compliant.

Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
Cc: Xiaoyun Li <xiaoyun.li@intel.com>
Cc: stable@dpdk.org

Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com>
---
v6:
* Refocus to fix strict aliasing problems discovered following
discussions in this thread.
* Modified the code to use __may_alias__ and packed structure. This fixes
both the undefined behaviour of unaligned access (which is not the main
concern), and also fixes the strict aliasing violations (which can cause
major bugs, as demonstrated in a previous message in this thread).
* Renamed new function from rte_mov15_or_less_unaligned to
rte_mov15_or_less.
* Modified code that copies <= 15 bytes to call rte_mov15_or_less.

v5:
* Replaced assembly with pure C code that uses a packed structure to make
unaligned loads conform to C standard.

v4:
* Added volatile to asm statements, which is required under gcc.

v3:
* Removed for loop and went back to using assembly.

v2:
* Replaced assembly with a regular for loop that copies bytes one by
one.

v1:
* Fix undefined behaviour of unaligned stores/loads by using assembly
to perform stores/loads.

 lib/eal/x86/include/rte_memcpy.h | 133 ++++++++++++-------------------
 1 file changed, 51 insertions(+), 82 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 1b6c6e585f..360b7e069e 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -45,6 +45,52 @@ extern "C" {
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n <= 15.
+ */
+static __rte_always_inline void *
+rte_mov15_or_less(void *dst, const void *src, size_t n)
+{
+	/**
+	 * Use the following structs to avoid violating C standard
+	 * alignment requirements and to avoid strict aliasing bugs
+	 */
+	struct rte_uint64_alias {
+		uint64_t val;
+	} __attribute__((__may_alias__, __packed__));
+	struct rte_uint32_alias {
+		uint32_t val;
+	} __attribute__((__may_alias__, __packed__));
+	struct rte_uint16_alias {
+		uint16_t val;
+	} __attribute__((__may_alias__, __packed__));
+
+	void *ret = dst;
+	if (n & 8) {
+		((struct rte_uint64_alias *)dst)->val =
+			((const struct rte_uint64_alias *)src)->val;
+		src = (const uint64_t *)src + 1;
+		dst = (uint64_t *)dst + 1;
+	}
+	if (n & 4) {
+		((struct rte_uint32_alias *)dst)->val =
+			((const struct rte_uint32_alias *)src)->val;
+		src = (const uint32_t *)src + 1;
+		dst = (uint32_t *)dst + 1;
+	}
+	if (n & 2) {
+		((struct rte_uint16_alias *)dst)->val =
+			((const struct rte_uint16_alias *)src)->val;
+		src = (const uint16_t *)src + 1;
+		dst = (uint16_t *)dst + 1;
+	}
+	if (n & 1)
+		*(uint8_t *)dst = *(const uint8_t *)src;
+	return ret;
+}
+
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 
 #define ALIGNMENT_MASK 0x3F
@@ -171,8 +217,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -181,24 +225,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08)
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		return ret;
+		return rte_mov15_or_less(dst, src, n);
 	}
 
 	/**
@@ -379,8 +406,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -389,25 +414,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less(dst, src, n);
 	}
 
 	/**
@@ -672,8 +679,6 @@ static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
@@ -682,25 +687,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less(dst, src, n);
 	}
 
 	/**
@@ -818,27 +805,9 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
 
-	/* Copy size <= 16 bytes */
+	/* Copy size < 16 bytes */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dst = *(const uint8_t *)src;
-			src = (const uint8_t *)src + 1;
-			dst = (uint8_t *)dst + 1;
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dst = *(const uint16_t *)src;
-			src = (const uint16_t *)src + 1;
-			dst = (uint16_t *)dst + 1;
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dst = *(const uint32_t *)src;
-			src = (const uint32_t *)src + 1;
-			dst = (uint32_t *)dst + 1;
-		}
-		if (n & 0x08)
-			*(uint64_t *)dst = *(const uint64_t *)src;
-
-		return ret;
+		return rte_mov15_or_less(dst, src, n);
 	}
 
 	/* Copy 16 <= size <= 32 bytes */
-- 
2.35.1


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH v7] eal: fix rte_memcpy strict aliasing/alignment bugs
  2022-01-15 19:41 [PATCH] eal: fix unaligned loads/stores in rte_memcpy_generic Luc Pelletier
                   ` (4 preceding siblings ...)
  2022-02-25 15:51 ` [PATCH v6] eal: fix rte_memcpy strict aliasing/alignment bugs Luc Pelletier
@ 2022-02-25 16:38 ` Luc Pelletier
  2022-03-10 14:55   ` Ananyev, Konstantin
  5 siblings, 1 reply; 33+ messages in thread
From: Luc Pelletier @ 2022-02-25 16:38 UTC (permalink / raw)
  To: bruce.richardson, konstantin.ananyev
  Cc: dev, Luc Pelletier, Xiaoyun Li, stable

Calls to rte_memcpy for 1 < n < 16 could result in unaligned
loads/stores, which is undefined behaviour according to the C
standard, and strict aliasing violations.

The code was changed to use a packed structure that allows aliasing
(using the __may_alias__ attribute) to perform the load/store
operations. This results in code that has the same performance as the
original code and that is also C standards-compliant.

Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
Cc: Xiaoyun Li <xiaoyun.li@intel.com>
Cc: stable@dpdk.org

Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com>
---
v7:
* Fix coding style issue by adding new __rte_may_alias macro rather
than directly use __attribute__

v6:
* Refocus to fix strict aliasing problems discovered following
discussions in this thread.
* Modified the code to use __may_alias__ and packed structure. This fixes
both the undefined behaviour of unaligned access (which is not the main
concern), and also fixes the strict aliasing violations (which can cause
major bugs, as demonstrated in a previous message in this thread).
* Renamed new function from rte_mov15_or_less_unaligned to
rte_mov15_or_less.
* Modified code that copies <= 15 bytes to call rte_mov15_or_less.

v5:
* Replaced assembly with pure C code that uses a packed structure to make
unaligned loads conform to C standard.

v4:
* Added volatile to asm statements, which is required under gcc.

v3:
* Removed for loop and went back to using assembly.

v2:
* Replaced assembly with a regular for loop that copies bytes one by
one.

v1:
* Fix undefined behaviour of unaligned stores/loads by using assembly
to perform stores/loads.

 lib/eal/include/rte_common.h     |   5 ++
 lib/eal/x86/include/rte_memcpy.h | 133 ++++++++++++-------------------
 2 files changed, 56 insertions(+), 82 deletions(-)

diff --git a/lib/eal/include/rte_common.h b/lib/eal/include/rte_common.h
index 4a399cc7c8..2f1ec69f3d 100644
--- a/lib/eal/include/rte_common.h
+++ b/lib/eal/include/rte_common.h
@@ -85,6 +85,11 @@ typedef uint16_t unaligned_uint16_t;
  */
 #define __rte_packed __attribute__((__packed__))
 
+/**
+ * Macro to mark a type that is not subject to type-based aliasing rules
+ */
+#define __rte_may_alias __attribute__((__may_alias__))
+
 /******* Macro to mark functions and fields scheduled for removal *****/
 #define __rte_deprecated	__attribute__((__deprecated__))
 #define __rte_deprecated_msg(msg)	__attribute__((__deprecated__(msg)))
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 1b6c6e585f..18aa4e43a7 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -45,6 +45,52 @@ extern "C" {
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n <= 15.
+ */
+static __rte_always_inline void *
+rte_mov15_or_less(void *dst, const void *src, size_t n)
+{
+	/**
+	 * Use the following structs to avoid violating C standard
+	 * alignment requirements and to avoid strict aliasing bugs
+	 */
+	struct rte_uint64_alias {
+		uint64_t val;
+	} __rte_packed __rte_may_alias;
+	struct rte_uint32_alias {
+		uint32_t val;
+	} __rte_packed __rte_may_alias;
+	struct rte_uint16_alias {
+		uint16_t val;
+	} __rte_packed __rte_may_alias;
+
+	void *ret = dst;
+	if (n & 8) {
+		((struct rte_uint64_alias *)dst)->val =
+			((const struct rte_uint64_alias *)src)->val;
+		src = (const uint64_t *)src + 1;
+		dst = (uint64_t *)dst + 1;
+	}
+	if (n & 4) {
+		((struct rte_uint32_alias *)dst)->val =
+			((const struct rte_uint32_alias *)src)->val;
+		src = (const uint32_t *)src + 1;
+		dst = (uint32_t *)dst + 1;
+	}
+	if (n & 2) {
+		((struct rte_uint16_alias *)dst)->val =
+			((const struct rte_uint16_alias *)src)->val;
+		src = (const uint16_t *)src + 1;
+		dst = (uint16_t *)dst + 1;
+	}
+	if (n & 1)
+		*(uint8_t *)dst = *(const uint8_t *)src;
+	return ret;
+}
+
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 
 #define ALIGNMENT_MASK 0x3F
@@ -171,8 +217,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -181,24 +225,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08)
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		return ret;
+		return rte_mov15_or_less(dst, src, n);
 	}
 
 	/**
@@ -379,8 +406,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -389,25 +414,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less(dst, src, n);
 	}
 
 	/**
@@ -672,8 +679,6 @@ static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
@@ -682,25 +687,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less(dst, src, n);
 	}
 
 	/**
@@ -818,27 +805,9 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
 
-	/* Copy size <= 16 bytes */
+	/* Copy size < 16 bytes */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dst = *(const uint8_t *)src;
-			src = (const uint8_t *)src + 1;
-			dst = (uint8_t *)dst + 1;
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dst = *(const uint16_t *)src;
-			src = (const uint16_t *)src + 1;
-			dst = (uint16_t *)dst + 1;
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dst = *(const uint32_t *)src;
-			src = (const uint32_t *)src + 1;
-			dst = (uint32_t *)dst + 1;
-		}
-		if (n & 0x08)
-			*(uint64_t *)dst = *(const uint64_t *)src;
-
-		return ret;
+		return rte_mov15_or_less(dst, src, n);
 	}
 
 	/* Copy 16 <= size <= 32 bytes */
-- 
2.35.1


^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [PATCH v7] eal: fix rte_memcpy strict aliasing/alignment bugs
  2022-02-25 16:38 ` [PATCH v7] " Luc Pelletier
@ 2022-03-10 14:55   ` Ananyev, Konstantin
  2022-04-07 15:24     ` David Marchand
  0 siblings, 1 reply; 33+ messages in thread
From: Ananyev, Konstantin @ 2022-03-10 14:55 UTC (permalink / raw)
  To: Luc Pelletier, Richardson, Bruce; +Cc: dev, Li, Xiaoyun, stable


> Calls to rte_memcpy for 1 < n < 16 could result in unaligned
> loads/stores, which is undefined behaviour according to the C
> standard, and strict aliasing violations.
> 
> The code was changed to use a packed structure that allows aliasing
> (using the __may_alias__ attribute) to perform the load/store
> operations. This results in code that has the same performance as the
> original code and that is also C standards-compliant.
> 
> Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
> Cc: Xiaoyun Li <xiaoyun.li@intel.com>
> Cc: stable@dpdk.org
> 
> Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com>
> ---
> v7:
> * Fix coding style issue by adding new __rte_may_alias macro rather
> than directly use __attribute__
> 
> v6:
> * Refocus to fix strict aliasing problems discovered following
> discussions in this thread.
> * Modified the code to use __may_alias__ and packed structure. This fixes
> both the undefined behaviour of unaligned access (which is not the main
> concern), and also fixes the strict aliasing violations (which can cause
> major bugs, as demonstrated in a previous message in this thread).
> * Renamed new function from rte_mov15_or_less_unaligned to
> rte_mov15_or_less.
> * Modified code that copies <= 15 bytes to call rte_mov15_or_less.
> 
> v5:
> * Replaced assembly with pure C code that uses a packed structure to make
> unaligned loads conform to C standard.
> 
> v4:
> * Added volatile to asm statements, which is required under gcc.
> 
> v3:
> * Removed for loop and went back to using assembly.
> 
> v2:
> * Replaced assembly with a regular for loop that copies bytes one by
> one.
> 
> v1:
> * Fix undefined behaviour of unaligned stores/loads by using assembly
> to perform stores/loads.

LGTM, thanks for highlighting and fixing the problem.
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Tested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>

As a side note, we probably need to check other similar places in DPDK code. 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v7] eal: fix rte_memcpy strict aliasing/alignment bugs
  2022-03-10 14:55   ` Ananyev, Konstantin
@ 2022-04-07 15:24     ` David Marchand
  2022-04-07 15:32       ` David Marchand
                         ` (2 more replies)
  0 siblings, 3 replies; 33+ messages in thread
From: David Marchand @ 2022-04-07 15:24 UTC (permalink / raw)
  To: Ananyev, Konstantin, Luc Pelletier
  Cc: Richardson, Bruce, dev, Li, Xiaoyun, stable, Aaron Conole, Owen Hilyard

On Thu, Mar 10, 2022 at 3:55 PM Ananyev, Konstantin
<konstantin.ananyev@intel.com> wrote:
> > Calls to rte_memcpy for 1 < n < 16 could result in unaligned
> > loads/stores, which is undefined behaviour according to the C
> > standard, and strict aliasing violations.
> >
> > The code was changed to use a packed structure that allows aliasing
> > (using the __may_alias__ attribute) to perform the load/store
> > operations. This results in code that has the same performance as the
> > original code and that is also C standards-compliant.
> >
> > Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
> > Cc: stable@dpdk.org
> >
> > Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com>
> Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Tested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>

Thanks, applied.


> As a side note, we probably need to check other similar places in DPDK code.

What would be the best way to detect those problematic places?

I tried UBsan, and it did report some of the issues fixed with this patch.


-- 
David Marchand


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v7] eal: fix rte_memcpy strict aliasing/alignment bugs
  2022-04-07 15:24     ` David Marchand
@ 2022-04-07 15:32       ` David Marchand
  2022-04-07 15:40         ` David Marchand
  2022-04-08 13:47       ` Luc Pelletier
  2022-05-19 16:47       ` David Marchand
  2 siblings, 1 reply; 33+ messages in thread
From: David Marchand @ 2022-04-07 15:32 UTC (permalink / raw)
  To: Ananyev, Konstantin, Luc Pelletier
  Cc: Richardson, Bruce, dev, Li, Xiaoyun, stable, Aaron Conole, Owen Hilyard

On Thu, Apr 7, 2022 at 5:24 PM David Marchand <david.marchand@redhat.com> wrote:
>
> On Thu, Mar 10, 2022 at 3:55 PM Ananyev, Konstantin
> <konstantin.ananyev@intel.com> wrote:
> > > Calls to rte_memcpy for 1 < n < 16 could result in unaligned
> > > loads/stores, which is undefined behaviour according to the C
> > > standard, and strict aliasing violations.
> > >
> > > The code was changed to use a packed structure that allows aliasing
> > > (using the __may_alias__ attribute) to perform the load/store
> > > operations. This results in code that has the same performance as the
> > > original code and that is also C standards-compliant.
> > >
> > > Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")

Actually, looking again at the history, it fixes:
Fixes: f5472703c0bd ("eal: optimize aligned memcpy on x86")

I'll change before pushing.


-- 
David Marchand


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v7] eal: fix rte_memcpy strict aliasing/alignment bugs
  2022-04-07 15:32       ` David Marchand
@ 2022-04-07 15:40         ` David Marchand
  2022-05-13 19:15           ` Luc Pelletier
  0 siblings, 1 reply; 33+ messages in thread
From: David Marchand @ 2022-04-07 15:40 UTC (permalink / raw)
  To: Ananyev, Konstantin, Luc Pelletier
  Cc: Richardson, Bruce, dev, Li, Xiaoyun, stable, Aaron Conole, Owen Hilyard

On Thu, Apr 7, 2022 at 5:32 PM David Marchand <david.marchand@redhat.com> wrote:
>
> On Thu, Apr 7, 2022 at 5:24 PM David Marchand <david.marchand@redhat.com> wrote:
> >
> > On Thu, Mar 10, 2022 at 3:55 PM Ananyev, Konstantin
> > <konstantin.ananyev@intel.com> wrote:
> > > > Calls to rte_memcpy for 1 < n < 16 could result in unaligned
> > > > loads/stores, which is undefined behaviour according to the C
> > > > standard, and strict aliasing violations.
> > > >
> > > > The code was changed to use a packed structure that allows aliasing
> > > > (using the __may_alias__ attribute) to perform the load/store
> > > > operations. This results in code that has the same performance as the
> > > > original code and that is also C standards-compliant.
> > > >
> > > > Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
>
> Actually, looking again at the history, it fixes:
> Fixes: f5472703c0bd ("eal: optimize aligned memcpy on x86")

Nope, that's probably even older, could you double check?
I'll hold off on pushing this fix.


-- 
David Marchand


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v7] eal: fix rte_memcpy strict aliasing/alignment bugs
  2022-04-07 15:24     ` David Marchand
  2022-04-07 15:32       ` David Marchand
@ 2022-04-08 13:47       ` Luc Pelletier
  2022-05-19 16:47       ` David Marchand
  2 siblings, 0 replies; 33+ messages in thread
From: Luc Pelletier @ 2022-04-08 13:47 UTC (permalink / raw)
  To: David Marchand
  Cc: Ananyev, Konstantin, Richardson, Bruce, dev, Li, Xiaoyun, stable,
	Aaron Conole, Owen Hilyard

Hi David,

Le jeu. 7 avr. 2022 à 11:24, David Marchand
<david.marchand@redhat.com> a écrit :

> > As a side note, we probably need to check other similar places in DPDK code.
>
> What would be the best way to detect those problematic places?

As far as I'm aware, there is no silver bullet to detect all strict
aliasing violations. A good summary of the different options is
described here:

https://gist.github.com/shafik/848ae25ee209f698763cffee272a58f8#catching-strict-aliasing-violations

However, these options are not 100% reliable and might still miss some
strict aliasing violations.

The bottom line is that anywhere there's a cast to something other
than void* or char*, it could be a strict aliasing violation. So,
doing an exhaustive search throughout the code base for casts seems
like the only (tedious, time-consuming) solution.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v7] eal: fix rte_memcpy strict aliasing/alignment bugs
  2022-04-07 15:40         ` David Marchand
@ 2022-05-13 19:15           ` Luc Pelletier
  2022-05-19 16:41             ` David Marchand
  0 siblings, 1 reply; 33+ messages in thread
From: Luc Pelletier @ 2022-05-13 19:15 UTC (permalink / raw)
  To: David Marchand
  Cc: Ananyev, Konstantin, Richardson, Bruce, dev, Li, Xiaoyun, stable,
	Aaron Conole, Owen Hilyard

Hi David,

> > Actually, looking again at the history, it fixes:
> > Fixes: f5472703c0bd ("eal: optimize aligned memcpy on x86")
>
> Nop, that's probably even older, could you double check?
> I'll hold on pushing this fix.

It seems you still haven't received a response. I'll take a stab at this.

I think it fixes:

Fixes: 76746eb13f08 ("eal/x86: fix strict aliasing rules")

However, the ordering of the ifs (from 1 to 8, rather than 8 to 1)
seems to date back to the first commit (af75078fece3). So, I'm not
sure how you want to handle this.

Thanks.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v7] eal: fix rte_memcpy strict aliasing/alignment bugs
  2022-05-13 19:15           ` Luc Pelletier
@ 2022-05-19 16:41             ` David Marchand
  0 siblings, 0 replies; 33+ messages in thread
From: David Marchand @ 2022-05-19 16:41 UTC (permalink / raw)
  To: Luc Pelletier
  Cc: Ananyev, Konstantin, Richardson, Bruce, dev, Li, Xiaoyun, stable,
	Aaron Conole, Owen Hilyard

Hello Luc,

On Fri, May 13, 2022 at 9:16 PM Luc Pelletier <lucp.at.work@gmail.com> wrote:
> > > Actually, looking again at the history, it fixes:
> > > Fixes: f5472703c0bd ("eal: optimize aligned memcpy on x86")
> >
> > Nop, that's probably even older, could you double check?
> > I'll hold on pushing this fix.
>
> It seems you still haven't received a response. I'll take a stab at this.
>
> I think it fixes:
>
> Fixes: 76746eb13f08 ("eal/x86: fix strict aliasing rules")
>
> However, the ordering of the ifs (from 1 to 8, rather than 8 to 1)
> seems to date back to the first commit (af75078fece3). So, I'm not
> sure how you want to handle this.

My understanding was that the issue was there since "day 1".
I'll go with the latter then.

Backporting will be for all LTS branches in any case.

Thanks!


-- 
David Marchand


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v7] eal: fix rte_memcpy strict aliasing/alignment bugs
  2022-04-07 15:24     ` David Marchand
  2022-04-07 15:32       ` David Marchand
  2022-04-08 13:47       ` Luc Pelletier
@ 2022-05-19 16:47       ` David Marchand
  2 siblings, 0 replies; 33+ messages in thread
From: David Marchand @ 2022-05-19 16:47 UTC (permalink / raw)
  To: Luc Pelletier
  Cc: Richardson, Bruce, dev, Li, Xiaoyun, stable, Aaron Conole,
	Owen Hilyard, Ananyev, Konstantin

On Thu, Apr 7, 2022 at 5:24 PM David Marchand <david.marchand@redhat.com> wrote:
> On Thu, Mar 10, 2022 at 3:55 PM Ananyev, Konstantin
> <konstantin.ananyev@intel.com> wrote:
> > > Calls to rte_memcpy for 1 < n < 16 could result in unaligned
> > > loads/stores, which is undefined behaviour according to the C
> > > standard, and strict aliasing violations.
> > >
> > > The code was changed to use a packed structure that allows aliasing
> > > (using the __may_alias__ attribute) to perform the load/store
> > > operations. This results in code that has the same performance as the
> > > original code and that is also C standards-compliant.
> > >

Fixes: af75078fece3 ("first public release")
> > > Cc: stable@dpdk.org
> > >
> > > Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com>
> > Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > Tested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
>

Applied, thanks Luc.

-- 
David Marchand


^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2022-05-19 16:47 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-15 19:41 [PATCH] eal: fix unaligned loads/stores in rte_memcpy_generic Luc Pelletier
2022-01-15 21:39 ` [PATCH v2] " Luc Pelletier
2022-01-15 22:13   ` Stephen Hemminger
2022-01-16 14:09     ` Luc Pelletier
2022-01-16 16:32       ` Stephen Hemminger
2022-01-24 23:21         ` Georg Sauthoff
2022-01-25  7:59           ` Morten Brørup
2022-01-25 19:57           ` Luc Pelletier
2022-01-16 14:13 ` [PATCH v3] " Luc Pelletier
2022-01-16 14:33   ` Luc Pelletier
2022-01-16 16:34     ` Stephen Hemminger
2022-01-16 17:59       ` Morten Brørup
2022-01-16 20:33 ` [PATCH v4] " Luc Pelletier
2022-01-17 15:37 ` [PATCH v5] " Luc Pelletier
2022-02-04 16:42   ` Luc Pelletier
2022-02-04 17:16   ` Ananyev, Konstantin
2022-02-08 16:53     ` Thomas Monjalon
2022-02-09 15:05     ` Luc Pelletier
2022-02-10 14:04       ` Ananyev, Konstantin
2022-02-10 16:56         ` Luc Pelletier
2022-02-11 15:51           ` Ananyev, Konstantin
2022-02-13 22:31             ` Luc Pelletier
2022-02-14 13:41               ` Ananyev, Konstantin
2022-02-25 15:51 ` [PATCH v6] eal: fix rte_memcpy strict aliasing/alignment bugs Luc Pelletier
2022-02-25 16:38 ` [PATCH v7] " Luc Pelletier
2022-03-10 14:55   ` Ananyev, Konstantin
2022-04-07 15:24     ` David Marchand
2022-04-07 15:32       ` David Marchand
2022-04-07 15:40         ` David Marchand
2022-05-13 19:15           ` Luc Pelletier
2022-05-19 16:41             ` David Marchand
2022-04-08 13:47       ` Luc Pelletier
2022-05-19 16:47       ` David Marchand
