DPDK patches and discussions
 help / color / mirror / Atom feed
From: Luc Pelletier <lucp.at.work@gmail.com>
To: bruce.richardson@intel.com, konstantin.ananyev@intel.com
Cc: dev@dpdk.org, Luc Pelletier <lucp.at.work@gmail.com>,
	Xiaoyun Li <xiaoyun.li@intel.com>,
	stable@dpdk.org
Subject: [PATCH v7] eal: fix rte_memcpy strict aliasing/alignment bugs
Date: Fri, 25 Feb 2022 11:38:05 -0500	[thread overview]
Message-ID: <20220225163804.506142-1-lucp.at.work@gmail.com> (raw)
In-Reply-To: <20220115194102.444140-1-lucp.at.work@gmail.com>

Calls to rte_memcpy for 1 < n < 16 could result in unaligned
loads/stores, which are undefined behaviour according to the C
standard, and in strict aliasing violations.

The code was changed to use a packed structure that allows aliasing
(using the __may_alias__ attribute) to perform the load/store
operations. This results in code that has the same performance as the
original code and that is also C standards-compliant.

Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
Cc: Xiaoyun Li <xiaoyun.li@intel.com>
Cc: stable@dpdk.org

Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com>
---
v7:
* Fix coding style issue by adding a new __rte_may_alias macro rather
than using __attribute__ directly

v6:
* Refocus to fix strict aliasing problems discovered following
discussions in this thread.
* Modified the code to use __may_alias__ and packed structure. This fixes
both the undefined behaviour of unaligned access (which is not the main
concern), and also fixes the strict aliasing violations (which can cause
major bugs, as demonstrated in a previous message in this thread).
* Renamed new function from rte_mov15_or_less_unaligned to
rte_mov15_or_less.
* Modified code that copies <= 15 bytes to call rte_mov15_or_less.

v5:
* Replaced assembly with pure C code that uses a packed structure to make
unaligned loads conform to the C standard.

v4:
* Added volatile to asm statements, which is required under gcc.

v3:
* Removed for loop and went back to using assembly.

v2:
* Replaced assembly with a regular for loop that copies bytes one by
one.

v1:
* Fix undefined behaviour of unaligned stores/loads by using assembly
to perform stores/loads.

 lib/eal/include/rte_common.h     |   5 ++
 lib/eal/x86/include/rte_memcpy.h | 133 ++++++++++++-------------------
 2 files changed, 56 insertions(+), 82 deletions(-)

diff --git a/lib/eal/include/rte_common.h b/lib/eal/include/rte_common.h
index 4a399cc7c8..2f1ec69f3d 100644
--- a/lib/eal/include/rte_common.h
+++ b/lib/eal/include/rte_common.h
@@ -85,6 +85,11 @@ typedef uint16_t unaligned_uint16_t;
  */
 #define __rte_packed __attribute__((__packed__))
 
+/**
+ * Macro to mark a type that is not subject to type-based aliasing rules
+ */
+#define __rte_may_alias __attribute__((__may_alias__))
+
 /******* Macro to mark functions and fields scheduled for removal *****/
 #define __rte_deprecated	__attribute__((__deprecated__))
 #define __rte_deprecated_msg(msg)	__attribute__((__deprecated__(msg)))
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 1b6c6e585f..18aa4e43a7 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -45,6 +45,52 @@ extern "C" {
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n <= 15.
+ */
+static __rte_always_inline void *
+rte_mov15_or_less(void *dst, const void *src, size_t n)
+{
+	/**
+	 * Use the following structs to avoid violating C standard
+	 * alignment requirements and to avoid strict aliasing bugs
+	 */
+	struct rte_uint64_alias {
+		uint64_t val;
+	} __rte_packed __rte_may_alias;
+	struct rte_uint32_alias {
+		uint32_t val;
+	} __rte_packed __rte_may_alias;
+	struct rte_uint16_alias {
+		uint16_t val;
+	} __rte_packed __rte_may_alias;
+
+	void *ret = dst;
+	if (n & 8) {
+		((struct rte_uint64_alias *)dst)->val =
+			((const struct rte_uint64_alias *)src)->val;
+		src = (const uint64_t *)src + 1;
+		dst = (uint64_t *)dst + 1;
+	}
+	if (n & 4) {
+		((struct rte_uint32_alias *)dst)->val =
+			((const struct rte_uint32_alias *)src)->val;
+		src = (const uint32_t *)src + 1;
+		dst = (uint32_t *)dst + 1;
+	}
+	if (n & 2) {
+		((struct rte_uint16_alias *)dst)->val =
+			((const struct rte_uint16_alias *)src)->val;
+		src = (const uint16_t *)src + 1;
+		dst = (uint16_t *)dst + 1;
+	}
+	if (n & 1)
+		*(uint8_t *)dst = *(const uint8_t *)src;
+	return ret;
+}
+
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 
 #define ALIGNMENT_MASK 0x3F
@@ -171,8 +217,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -181,24 +225,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08)
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		return ret;
+		return rte_mov15_or_less(dst, src, n);
 	}
 
 	/**
@@ -379,8 +406,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -389,25 +414,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less(dst, src, n);
 	}
 
 	/**
@@ -672,8 +679,6 @@ static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
@@ -682,25 +687,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less(dst, src, n);
 	}
 
 	/**
@@ -818,27 +805,9 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
 
-	/* Copy size <= 16 bytes */
+	/* Copy size < 16 bytes */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dst = *(const uint8_t *)src;
-			src = (const uint8_t *)src + 1;
-			dst = (uint8_t *)dst + 1;
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dst = *(const uint16_t *)src;
-			src = (const uint16_t *)src + 1;
-			dst = (uint16_t *)dst + 1;
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dst = *(const uint32_t *)src;
-			src = (const uint32_t *)src + 1;
-			dst = (uint32_t *)dst + 1;
-		}
-		if (n & 0x08)
-			*(uint64_t *)dst = *(const uint64_t *)src;
-
-		return ret;
+		return rte_mov15_or_less(dst, src, n);
 	}
 
 	/* Copy 16 <= size <= 32 bytes */
-- 
2.35.1


  parent reply	other threads:[~2022-02-25 16:40 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-01-15 19:41 [PATCH] eal: fix unaligned loads/stores in rte_memcpy_generic Luc Pelletier
2022-01-15 21:39 ` [PATCH v2] " Luc Pelletier
2022-01-15 22:13   ` Stephen Hemminger
2022-01-16 14:09     ` Luc Pelletier
2022-01-16 16:32       ` Stephen Hemminger
2022-01-24 23:21         ` Georg Sauthoff
2022-01-25  7:59           ` Morten Brørup
2022-01-25 19:57           ` Luc Pelletier
2022-01-16 14:13 ` [PATCH v3] " Luc Pelletier
2022-01-16 14:33   ` Luc Pelletier
2022-01-16 16:34     ` Stephen Hemminger
2022-01-16 17:59       ` Morten Brørup
2022-01-16 20:33 ` [PATCH v4] " Luc Pelletier
2022-01-17 15:37 ` [PATCH v5] " Luc Pelletier
2022-02-04 16:42   ` Luc Pelletier
2022-02-04 17:16   ` Ananyev, Konstantin
2022-02-08 16:53     ` Thomas Monjalon
2022-02-09 15:05     ` Luc Pelletier
2022-02-10 14:04       ` Ananyev, Konstantin
2022-02-10 16:56         ` Luc Pelletier
2022-02-11 15:51           ` Ananyev, Konstantin
2022-02-13 22:31             ` Luc Pelletier
2022-02-14 13:41               ` Ananyev, Konstantin
2022-02-25 15:51 ` [PATCH v6] eal: fix rte_memcpy strict aliasing/alignment bugs Luc Pelletier
2022-02-25 16:38 ` Luc Pelletier [this message]
2022-03-10 14:55   ` [PATCH v7] " Ananyev, Konstantin
2022-04-07 15:24     ` David Marchand
2022-04-07 15:32       ` David Marchand
2022-04-07 15:40         ` David Marchand
2022-05-13 19:15           ` Luc Pelletier
2022-05-19 16:41             ` David Marchand
2022-04-08 13:47       ` Luc Pelletier
2022-05-19 16:47       ` David Marchand

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220225163804.506142-1-lucp.at.work@gmail.com \
    --to=lucp.at.work@gmail.com \
    --cc=bruce.richardson@intel.com \
    --cc=dev@dpdk.org \
    --cc=konstantin.ananyev@intel.com \
    --cc=stable@dpdk.org \
    --cc=xiaoyun.li@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).