From: Luc Pelletier <lucp.at.work@gmail.com>
To: bruce.richardson@intel.com, konstantin.ananyev@intel.com
Cc: dev@dpdk.org, Luc Pelletier <lucp.at.work@gmail.com>,
Xiaoyun Li <xiaoyun.li@intel.com>,
stable@dpdk.org
Subject: [PATCH v2] eal: fix unaligned loads/stores in rte_memcpy_generic
Date: Sat, 15 Jan 2022 16:39:50 -0500 [thread overview]
Message-ID: <20220115213949.449313-1-lucp.at.work@gmail.com> (raw)
In-Reply-To: <20220115194102.444140-1-lucp.at.work@gmail.com>
Calls to rte_memcpy_generic could result in unaligned loads/stores for
1 < n < 16. This is undefined behavior according to the C standard,
and it gets flagged by the clang undefined behavior sanitizer.
rte_memcpy_generic is called with unaligned src and dst addresses.
When 1 < n < 16, the code would cast both src and dst to a qword,
dword or word pointer, without verifying the alignment of src/dst. The
code was changed to use a for loop to copy the bytes one by one.
Experimentation on compiler explorer indicates that gcc 7+
(released in 2017) and clang 7+ (released in 2018) both optimize out the
for loop with the least number of memory loads and stores, if n is known
at compile-time. If n is only known at run-time, gcc and clang have
different behaviour but they both seem to recognize that a memcpy is
being done. More recent versions of both gcc/clang seem to also produce
even more optimized results.
Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
Cc: Xiaoyun Li <xiaoyun.li@intel.com>
Cc: stable@dpdk.org
Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com>
---
I forgot that code under x86 also needs to compile for 32-bit
(obviously). So, I did some more experimentation and replaced the
assembly code with a regular for loop. Explanations are in the updated
commit message. Experimentation was done on compiler explorer here:
https://godbolt.org/z/zK54rzPEn
lib/eal/x86/include/rte_memcpy.h | 82 ++++++++------------------------
1 file changed, 20 insertions(+), 62 deletions(-)
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 1b6c6e585f..e422397e49 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -45,6 +45,23 @@ extern "C" {
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n);
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with unaligned src/dst, and n <= 15.
+ */
+static __rte_always_inline void *
+rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n)
+{
+ void *ret = dst;
+ for (; n; n--) {
+ *((char *)dst) = *((const char *) src);
+ dst = ((char *)dst) + 1;
+ src = ((const char *)src) + 1;
+ }
+ return ret;
+}
+
#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
#define ALIGNMENT_MASK 0x3F
@@ -171,8 +188,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
- uintptr_t dstu = (uintptr_t)dst;
- uintptr_t srcu = (uintptr_t)src;
void *ret = dst;
size_t dstofss;
size_t bits;
@@ -181,24 +196,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
* Copy less than 16 bytes
*/
if (n < 16) {
- if (n & 0x01) {
- *(uint8_t *)dstu = *(const uint8_t *)srcu;
- srcu = (uintptr_t)((const uint8_t *)srcu + 1);
- dstu = (uintptr_t)((uint8_t *)dstu + 1);
- }
- if (n & 0x02) {
- *(uint16_t *)dstu = *(const uint16_t *)srcu;
- srcu = (uintptr_t)((const uint16_t *)srcu + 1);
- dstu = (uintptr_t)((uint16_t *)dstu + 1);
- }
- if (n & 0x04) {
- *(uint32_t *)dstu = *(const uint32_t *)srcu;
- srcu = (uintptr_t)((const uint32_t *)srcu + 1);
- dstu = (uintptr_t)((uint32_t *)dstu + 1);
- }
- if (n & 0x08)
- *(uint64_t *)dstu = *(const uint64_t *)srcu;
- return ret;
+ return rte_mov15_or_less_unaligned(dst, src, n);
}
/**
@@ -379,8 +377,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
- uintptr_t dstu = (uintptr_t)dst;
- uintptr_t srcu = (uintptr_t)src;
void *ret = dst;
size_t dstofss;
size_t bits;
@@ -389,25 +385,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
* Copy less than 16 bytes
*/
if (n < 16) {
- if (n & 0x01) {
- *(uint8_t *)dstu = *(const uint8_t *)srcu;
- srcu = (uintptr_t)((const uint8_t *)srcu + 1);
- dstu = (uintptr_t)((uint8_t *)dstu + 1);
- }
- if (n & 0x02) {
- *(uint16_t *)dstu = *(const uint16_t *)srcu;
- srcu = (uintptr_t)((const uint16_t *)srcu + 1);
- dstu = (uintptr_t)((uint16_t *)dstu + 1);
- }
- if (n & 0x04) {
- *(uint32_t *)dstu = *(const uint32_t *)srcu;
- srcu = (uintptr_t)((const uint32_t *)srcu + 1);
- dstu = (uintptr_t)((uint32_t *)dstu + 1);
- }
- if (n & 0x08) {
- *(uint64_t *)dstu = *(const uint64_t *)srcu;
- }
- return ret;
+ return rte_mov15_or_less_unaligned(dst, src, n);
}
/**
@@ -672,8 +650,6 @@ static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
- uintptr_t dstu = (uintptr_t)dst;
- uintptr_t srcu = (uintptr_t)src;
void *ret = dst;
size_t dstofss;
size_t srcofs;
@@ -682,25 +658,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
* Copy less than 16 bytes
*/
if (n < 16) {
- if (n & 0x01) {
- *(uint8_t *)dstu = *(const uint8_t *)srcu;
- srcu = (uintptr_t)((const uint8_t *)srcu + 1);
- dstu = (uintptr_t)((uint8_t *)dstu + 1);
- }
- if (n & 0x02) {
- *(uint16_t *)dstu = *(const uint16_t *)srcu;
- srcu = (uintptr_t)((const uint16_t *)srcu + 1);
- dstu = (uintptr_t)((uint16_t *)dstu + 1);
- }
- if (n & 0x04) {
- *(uint32_t *)dstu = *(const uint32_t *)srcu;
- srcu = (uintptr_t)((const uint32_t *)srcu + 1);
- dstu = (uintptr_t)((uint32_t *)dstu + 1);
- }
- if (n & 0x08) {
- *(uint64_t *)dstu = *(const uint64_t *)srcu;
- }
- return ret;
+ return rte_mov15_or_less_unaligned(dst, src, n);
}
/**
--
2.25.1
next prev parent reply other threads:[~2022-01-15 21:40 UTC|newest]
Thread overview: 30+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-01-15 19:41 [PATCH] " Luc Pelletier
2022-01-15 21:39 ` Luc Pelletier [this message]
2022-01-15 22:13 ` [PATCH v2] " Stephen Hemminger
2022-01-16 14:09 ` Luc Pelletier
2022-01-16 16:32 ` Stephen Hemminger
2022-01-16 14:13 ` [PATCH v3] " Luc Pelletier
2022-01-16 14:33 ` Luc Pelletier
2022-01-16 16:34 ` Stephen Hemminger
2022-01-16 17:59 ` Morten Brørup
2022-01-16 20:33 ` [PATCH v4] " Luc Pelletier
2022-01-17 15:37 ` [PATCH v5] " Luc Pelletier
2022-02-04 16:42 ` Luc Pelletier
2022-02-04 17:16 ` Ananyev, Konstantin
2022-02-08 16:53 ` Thomas Monjalon
2022-02-09 15:05 ` Luc Pelletier
2022-02-10 14:04 ` Ananyev, Konstantin
2022-02-10 16:56 ` Luc Pelletier
2022-02-11 15:51 ` Ananyev, Konstantin
2022-02-13 22:31 ` Luc Pelletier
2022-02-14 13:41 ` Ananyev, Konstantin
2022-02-25 15:51 ` [PATCH v6] eal: fix rte_memcpy strict aliasing/alignment bugs Luc Pelletier
2022-02-25 16:38 ` [PATCH v7] " Luc Pelletier
2022-03-10 14:55 ` Ananyev, Konstantin
2022-04-07 15:24 ` David Marchand
2022-04-07 15:32 ` David Marchand
2022-04-07 15:40 ` David Marchand
2022-05-13 19:15 ` Luc Pelletier
2022-05-19 16:41 ` David Marchand
2022-04-08 13:47 ` Luc Pelletier
2022-05-19 16:47 ` David Marchand
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220115213949.449313-1-lucp.at.work@gmail.com \
--to=lucp.at.work@gmail.com \
--cc=bruce.richardson@intel.com \
--cc=dev@dpdk.org \
--cc=konstantin.ananyev@intel.com \
--cc=stable@dpdk.org \
--cc=xiaoyun.li@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).