From mboxrd@z Thu Jan 1 00:00:00 1970
From: Luc Pelletier
To: bruce.richardson@intel.com, konstantin.ananyev@intel.com
Cc: dev@dpdk.org, Luc Pelletier, Xiaoyun Li, stable@dpdk.org
Subject: [PATCH v4] eal: fix unaligned loads/stores in rte_memcpy_generic
Date: Sun, 16 Jan 2022 15:33:13 -0500
Message-Id: <20220116203312.6405-1-lucp.at.work@gmail.com>
In-Reply-To: <20220115194102.444140-1-lucp.at.work@gmail.com>
References: <20220115194102.444140-1-lucp.at.work@gmail.com>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit

Calls to rte_memcpy_generic could result in unaligned loads/stores for
1 < n < 16. This is undefined behavior according to the C standard,
and it gets flagged by the clang undefined behavior sanitizer.

rte_memcpy_generic is called with unaligned src and dst addresses.
When 1 < n < 16, the code would cast both src and dst to a qword,
dword or word pointer, without verifying the alignment of src/dst.
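For illustration only (a minimal sketch, not the exact DPDK code;
copy4 is a hypothetical name), the flagged pattern has this shape:

	#include <stdint.h>

	/* dst and src may hold arbitrary, possibly unaligned, addresses. */
	static void copy4(void *dst, const void *src)
	{
		/* The casts promise 4-byte alignment; if either pointer is
		 * not 4-byte aligned, this load/store is undefined behavior
		 * and is reported by clang's -fsanitize=alignment. */
		*(uint32_t *)dst = *(const uint32_t *)src;
	}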
The code was changed to use inline assembly for the load/store
operations. Unaligned load/store operations are permitted in x86/x64
assembly.

Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
Cc: Xiaoyun Li
Cc: stable@dpdk.org

Signed-off-by: Luc Pelletier
---
Added volatile for asm statements. I had been testing with clang and it
was working, but it seems gcc requires this; otherwise the asm is
completely discarded by the optimizer. The following note in the gcc
docs seems to explain why:

"If the C code that follows the asm makes no use of any of the output
operands, use volatile for the asm statement to prevent the optimizers
from discarding the asm statement as unneeded (see Volatile)."

(A minimal standalone illustration of this is sketched after the patch.)

 lib/eal/x86/include/rte_memcpy.h | 134 +++++++++++++++++--------------
 1 file changed, 72 insertions(+), 62 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 1b6c6e585f..5af5c2ed61 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -45,6 +45,75 @@ extern "C" {
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
+#if defined(__i386__)
+	#define RTE_ACCUMULATOR_REGISTER_NAME "eax"
+#elif defined(__x86_64__)
+	#define RTE_ACCUMULATOR_REGISTER_NAME "rax"
+#endif
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with unaligned src/dst, and n <= 15.
+ */
+static __rte_always_inline void *
+rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n)
+{
+	void *ret = dst;
+	if (n & 8) {
+		asm volatile(
+#if defined(__i386__)
+		"movl (%[src]), %%eax\n"
+		"movl %%eax, (%[dst])\n"
+		"add $4, %[src]\n"
+		"add $4, %[dst]\n"
+		"movl (%[src]), %%eax\n"
+		"movl %%eax, (%[dst])\n"
+		"add $4, %[src]\n"
+		"add $4, %[dst]\n"
+#elif defined(__x86_64__)
+		"movq (%[src]), %%rax\n"
+		"movq %%rax, (%[dst])\n"
+		"add $8, %[src]\n"
+		"add $8, %[dst]\n"
+#else
+	#error Unsupported architecture
+#endif
+		: [dst] "+r" (dst), [src] "+r" (src)
+		:
+		: RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+	}
+	if (n & 4) {
+		asm volatile(
+		"movl (%[src]), %%eax\n"
+		"movl %%eax, (%[dst])\n"
+		"add $4, %[src]\n"
+		"add $4, %[dst]\n"
+		: [dst] "+r" (dst), [src] "+r" (src)
+		:
+		: RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+	}
+	if (n & 2) {
+		asm volatile(
+		"movw (%[src]), %%ax\n"
+		"movw %%ax, (%[dst])\n"
+		"add $2, %[src]\n"
+		"add $2, %[dst]\n"
+		: [dst] "+r" (dst), [src] "+r" (src)
+		:
+		: RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+	}
+	if (n & 1) {
+		asm volatile(
+		"movb (%[src]), %%al\n"
+		"movb %%al, (%[dst])\n"
+		: [dst] "+r" (dst), [src] "+r" (src)
+		:
+		: RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+	}
+	return ret;
+}
+
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 
 #define ALIGNMENT_MASK 0x3F
@@ -171,8 +240,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -181,24 +248,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08)
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
@@ -379,8 +429,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -389,25 +437,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
@@ -672,8 +702,6 @@ static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
@@ -682,25 +710,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
-- 
2.25.1
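Minimal standalone sketch of the gcc note quoted above (illustrative
only, x86-64 only; store8 is a hypothetical helper, not part of the
patch):

	/* The asm has output operands ("+r"(dst), "+r"(src)), but callers
	 * normally ignore the updated pointer values. Without "volatile",
	 * the optimizer may treat the asm as unneeded and discard it, as
	 * described in the gcc docs; "asm volatile" keeps the store. */
	static inline void store8(void *dst, const void *src)
	{
		asm volatile(
			"movq (%[src]), %%rax\n"
			"movq %%rax, (%[dst])\n"
			: [dst] "+r" (dst), [src] "+r" (src)
			:
			: "rax", "memory");
	}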