DPDK patches and discussions
From: Xiaoyun Li <xiaoyun.li@intel.com>
To: bruce.richardson@intel.com
Cc: dev@dpdk.org, zhihong.wang@intel.com, qi.z.zhang@intel.com,
	wenzhuo.lu@intel.com, Xiaoyun Li <xiaoyun.li@intel.com>
Subject: [dpdk-dev] [PATCH v2 1/3] eal/x86: run-time dispatch over memcpy
Date: Fri,  1 Sep 2017 16:57:00 +0800	[thread overview]
Message-ID: <1504256222-32969-2-git-send-email-xiaoyun.li@intel.com> (raw)
In-Reply-To: <1504256222-32969-1-git-send-email-xiaoyun.li@intel.com>

This patch selects the memcpy implementation dynamically at run time,
based on the CPU flags that the running machine supports. It uses
function pointers that are bound to the matching implementations at
constructor time. In addition, the AVX512 instruction set is compiled
in only if the user enables it in the configuration and the compiler
supports it.

Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
---
v2
* Use gcc function multi-versioning to avoid the compilation issue.
* Add macros for AVX512 and AVX2. The AVX512 code is compiled only if the
user enables AVX512 and the compiler supports it; the AVX2 code is
compiled only if the compiler supports AVX2.
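
For illustration only (not part of the patch), the following is a minimal
standalone sketch of the dispatch pattern described above: one function
pointer, bound once in a constructor to the best implementation the running
CPU supports. It uses GCC's __builtin_cpu_init()/__builtin_cpu_supports()
in place of DPDK's rte_cpu_get_flag_enabled(), and copy_avx2/copy_default
are hypothetical stand-ins for the rte_memcpy_generic_* variants in this
patch.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

typedef void *(*copy_fn_t)(void *dst, const void *src, size_t n);

/* Baseline path, safe on any x86 CPU. */
static void *
copy_default(void *dst, const void *src, size_t n)
{
	return memcpy(dst, src, n);
}

/*
 * AVX2 path. The target attribute enables AVX2 code generation for this
 * one function only, as the patch does for its _AVX2/_AVX512F variants.
 * A real implementation would use 256-bit loads/stores; plain memcpy()
 * keeps the sketch short.
 */
__attribute__((target("avx2")))
static void *
copy_avx2(void *dst, const void *src, size_t n)
{
	return memcpy(dst, src, n);
}

static copy_fn_t copy_fn = copy_default;

/* Runs before main(), mirroring rte_memcpy_init() in the patch. */
static void __attribute__((constructor))
copy_init(void)
{
	__builtin_cpu_init();	/* needed before __builtin_cpu_supports()
				 * when used inside a constructor */
	if (__builtin_cpu_supports("avx2"))
		copy_fn = copy_avx2;
}

int
main(void)
{
	char src[32] = "run-time dispatch example";
	char dst[32];

	(*copy_fn)(dst, src, sizeof(src));
	printf("%s\n", dst);
	return 0;
}

Every subsequent call then goes through (*copy_fn)(), so an indirect call
is the only per-call cost of the run-time selection.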

 .../common/include/arch/x86/rte_memcpy.h           | 343 +++++++++++++--------
 mk/rte.cpuflags.mk                                 |  14 +
 2 files changed, 231 insertions(+), 126 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
index 74c280c..abba6ad 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -45,11 +45,45 @@
 #include <string.h>
 #include <rte_vect.h>
 #include <rte_common.h>
+#include <rte_cpuflags.h>
+#include <rte_log.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+/*
+ * Select SSE/AVX memory copy method as default one.
+ */
+
+static uint16_t alignment_mask = 0x0F;
+
+typedef void (*rte_mov16_t)(uint8_t *dst, const uint8_t *src);
+typedef void (*rte_mov32_t)(uint8_t *dst, const uint8_t *src);
+typedef void (*rte_mov64_t)(uint8_t *dst, const uint8_t *src);
+typedef void (*rte_mov128_t)(uint8_t *dst, const uint8_t *src);
+typedef void (*rte_mov256_t)(uint8_t *dst, const uint8_t *src);
+#ifdef CC_SUPPORT_AVX2
+typedef void (*rte_mov128blocks_t)(uint8_t *dst, const uint8_t *src, size_t n);
+#endif
+#ifdef CC_SUPPORT_AVX512
+typedef void (*rte_mov512blocks_t)(uint8_t *dst, const uint8_t *src, size_t n);
+#endif
+typedef void * (*rte_memcpy_generic_t)(void *dst, const void *src, size_t n);
+
+static rte_mov16_t rte_mov16;
+static rte_mov32_t rte_mov32;
+static rte_mov64_t rte_mov64;
+static rte_mov128_t rte_mov128;
+static rte_mov256_t rte_mov256;
+#ifdef CC_SUPPORT_AVX2
+static rte_mov128blocks_t rte_mov128blocks;
+#endif
+#ifdef CC_SUPPORT_AVX512
+static rte_mov512blocks_t rte_mov512blocks;
+#endif
+static rte_memcpy_generic_t rte_memcpy_generic;
+
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
@@ -68,10 +102,6 @@ extern "C" {
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
-#ifdef RTE_MACHINE_CPUFLAG_AVX512F
-
-#define ALIGNMENT_MASK 0x3F
-
 /**
  * AVX512 implementation below
  */
@@ -80,8 +110,10 @@ rte_memcpy(void *dst, const void *src, size_t n);
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
  */
+#ifdef CC_SUPPORT_AVX512
+__attribute__((target("avx512f")))
 static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
+rte_mov16_AVX512F(uint8_t *dst, const uint8_t *src)
 {
 	__m128i xmm0;
 
@@ -93,8 +125,9 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
  * Copy 32 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("avx512f")))
 static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
+rte_mov32_AVX512F(uint8_t *dst, const uint8_t *src)
 {
 	__m256i ymm0;
 
@@ -106,8 +139,9 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
  * Copy 64 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("avx512f")))
 static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
+rte_mov64_AVX512F(uint8_t *dst, const uint8_t *src)
 {
 	__m512i zmm0;
 
@@ -119,32 +153,35 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
  * Copy 128 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("avx512f")))
 static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
+rte_mov128_AVX512F(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov64(dst + 0 * 64, src + 0 * 64);
-	rte_mov64(dst + 1 * 64, src + 1 * 64);
+	(*rte_mov64)(dst + 0 * 64, src + 0 * 64);
+	(*rte_mov64)(dst + 1 * 64, src + 1 * 64);
 }
 
 /**
  * Copy 256 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("avx512f")))
 static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
+rte_mov256_AVX512F(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov64(dst + 0 * 64, src + 0 * 64);
-	rte_mov64(dst + 1 * 64, src + 1 * 64);
-	rte_mov64(dst + 2 * 64, src + 2 * 64);
-	rte_mov64(dst + 3 * 64, src + 3 * 64);
+	(*rte_mov64)(dst + 0 * 64, src + 0 * 64);
+	(*rte_mov64)(dst + 1 * 64, src + 1 * 64);
+	(*rte_mov64)(dst + 2 * 64, src + 2 * 64);
+	(*rte_mov64)(dst + 3 * 64, src + 3 * 64);
 }
 
 /**
  * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("avx512f")))
 static inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks_AVX512F(uint8_t *dst, const uint8_t *src, size_t n)
 {
 	__m512i zmm0, zmm1;
 
@@ -163,8 +200,9 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
  * Copy 512-byte blocks from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("avx512f")))
 static inline void
-rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov512blocks_AVX512F(uint8_t *dst, const uint8_t *src, size_t n)
 {
 	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
 
@@ -191,8 +229,9 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+__attribute__((target("avx512f")))
 static inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_AVX512F(void *dst, const void *src, size_t n)
 {
 	uintptr_t dstu = (uintptr_t)dst;
 	uintptr_t srcu = (uintptr_t)src;
@@ -228,39 +267,39 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
 	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
+		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov16)((uint8_t *)dst - 16 + n,
 				  (const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
+		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov32)((uint8_t *)dst - 32 + n,
 				  (const uint8_t *)src - 32 + n);
 		return ret;
 	}
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
-			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov256)((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 256;
 			dst = (uint8_t *)dst + 256;
 		}
 		if (n >= 128) {
 			n -= 128;
-			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 128;
 			dst = (uint8_t *)dst + 128;
 		}
 COPY_BLOCK_128_BACK63:
 		if (n > 64) {
-			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-			rte_mov64((uint8_t *)dst - 64 + n,
+			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov64)((uint8_t *)dst - 64 + n,
 					  (const uint8_t *)src - 64 + n);
 			return ret;
 		}
 		if (n > 0)
-			rte_mov64((uint8_t *)dst - 64 + n,
+			(*rte_mov64)((uint8_t *)dst - 64 + n,
 					  (const uint8_t *)src - 64 + n);
 		return ret;
 	}
@@ -272,7 +311,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	if (dstofss > 0) {
 		dstofss = 64 - dstofss;
 		n -= dstofss;
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
 		src = (const uint8_t *)src + dstofss;
 		dst = (uint8_t *)dst + dstofss;
 	}
@@ -282,7 +321,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Use copy block function for better instruction order control,
 	 * which is important when load is unaligned.
 	 */
-	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
+	(*rte_mov512blocks)((uint8_t *)dst, (const uint8_t *)src, n);
 	bits = n;
 	n = n & 511;
 	bits -= n;
@@ -295,7 +334,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * which is important when load is unaligned.
 	 */
 	if (n >= 128) {
-		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
+		(*rte_mov128blocks)((uint8_t *)dst, (const uint8_t *)src, n);
 		bits = n;
 		n = n & 127;
 		bits -= n;
@@ -308,10 +347,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 */
 	goto COPY_BLOCK_128_BACK63;
 }
-
-#elif defined RTE_MACHINE_CPUFLAG_AVX2
-
-#define ALIGNMENT_MASK 0x1F
+#endif
 
 /**
  * AVX2 implementation below
@@ -321,8 +357,10 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
  */
+#ifdef CC_SUPPORT_AVX2
+__attribute__((target("avx2")))
 static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
+rte_mov16_AVX2(uint8_t *dst, const uint8_t *src)
 {
 	__m128i xmm0;
 
@@ -334,8 +372,9 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
  * Copy 32 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("avx2")))
 static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
+rte_mov32_AVX2(uint8_t *dst, const uint8_t *src)
 {
 	__m256i ymm0;
 
@@ -347,32 +386,35 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
  * Copy 64 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("avx2")))
 static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
+rte_mov64_AVX2(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+	(*rte_mov32)((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	(*rte_mov32)((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 }
 
 /**
  * Copy 128 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("avx2")))
 static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
+rte_mov128_AVX2(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
+	(*rte_mov32)((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	(*rte_mov32)((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+	(*rte_mov32)((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
+	(*rte_mov32)((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
 }
 
 /**
  * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("avx2")))
 static inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks_AVX2(uint8_t *dst, const uint8_t *src, size_t n)
 {
 	__m256i ymm0, ymm1, ymm2, ymm3;
 
@@ -391,8 +433,9 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+__attribute__((target("avx2")))
 static inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_AVX2(void *dst, const void *src, size_t n)
 {
 	uintptr_t dstu = (uintptr_t)dst;
 	uintptr_t srcu = (uintptr_t)src;
@@ -429,46 +472,46 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
 	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
+		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov16)((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 48) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
-		rte_mov16((uint8_t *)dst - 16 + n,
+		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov16)((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+		(*rte_mov16)((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
+		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov32)((uint8_t *)dst - 32 + n,
 				(const uint8_t *)src - 32 + n);
 		return ret;
 	}
 	if (n <= 256) {
 		if (n >= 128) {
 			n -= 128;
-			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 128;
 			dst = (uint8_t *)dst + 128;
 		}
 COPY_BLOCK_128_BACK31:
 		if (n >= 64) {
 			n -= 64;
-			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 64;
 			dst = (uint8_t *)dst + 64;
 		}
 		if (n > 32) {
-			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-			rte_mov32((uint8_t *)dst - 32 + n,
+			(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov32)((uint8_t *)dst - 32 + n,
 					(const uint8_t *)src - 32 + n);
 			return ret;
 		}
 		if (n > 0) {
-			rte_mov32((uint8_t *)dst - 32 + n,
+			(*rte_mov32)((uint8_t *)dst - 32 + n,
 					(const uint8_t *)src - 32 + n);
 		}
 		return ret;
@@ -481,7 +524,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	if (dstofss > 0) {
 		dstofss = 32 - dstofss;
 		n -= dstofss;
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
 		src = (const uint8_t *)src + dstofss;
 		dst = (uint8_t *)dst + dstofss;
 	}
@@ -489,7 +532,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Copy 128-byte blocks
 	 */
-	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
+	(*rte_mov128blocks)((uint8_t *)dst, (const uint8_t *)src, n);
 	bits = n;
 	n = n & 127;
 	bits -= n;
@@ -501,10 +544,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 */
 	goto COPY_BLOCK_128_BACK31;
 }
-
-#else /* RTE_MACHINE_CPUFLAG */
-
-#define ALIGNMENT_MASK 0x0F
+#endif
 
 /**
  * SSE & AVX implementation below
@@ -514,8 +554,9 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("default")))
 static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
+rte_mov16_DEFAULT(uint8_t *dst, const uint8_t *src)
 {
 	__m128i xmm0;
 
@@ -527,66 +568,70 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
  * Copy 32 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("default")))
 static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
+rte_mov32_DEFAULT(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 }
 
 /**
  * Copy 64 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("default")))
 static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
+rte_mov64_DEFAULT(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 }
 
 /**
  * Copy 128 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("default")))
 static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
+rte_mov128_DEFAULT(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
+	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+	(*rte_mov16)((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
+	(*rte_mov16)((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
+	(*rte_mov16)((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
+	(*rte_mov16)((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
 }
 
 /**
  * Copy 256 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("default")))
 static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
+rte_mov256_DEFAULT(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
-	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
-	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
-	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
-	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
-	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
-	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
-	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
+	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+	(*rte_mov16)((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
+	(*rte_mov16)((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
+	(*rte_mov16)((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
+	(*rte_mov16)((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
+	(*rte_mov16)((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
+	(*rte_mov16)((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
+	(*rte_mov16)((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
+	(*rte_mov16)((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
+	(*rte_mov16)((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
+	(*rte_mov16)((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
+	(*rte_mov16)((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
+	(*rte_mov16)((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
 }
 
 /**
@@ -683,8 +728,9 @@ __extension__ ({                                                      \
     }                                                                 \
 })
 
+__attribute__((target("default")))
 static inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_DEFAULT(void *dst, const void *src, size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 	uintptr_t dstu = (uintptr_t)dst;
@@ -722,19 +768,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
 	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov16)((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 48) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov16)((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov16)((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+		(*rte_mov16)((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 128) {
@@ -743,39 +792,42 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
-			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
-			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
+			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov128)((uint8_t *)dst + 128,
+					(const uint8_t *)src + 128);
 			src = (const uint8_t *)src + 256;
 			dst = (uint8_t *)dst + 256;
 		}
 COPY_BLOCK_255_BACK15:
 		if (n >= 128) {
 			n -= 128;
-			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 128;
 			dst = (uint8_t *)dst + 128;
 		}
 COPY_BLOCK_128_BACK15:
 		if (n >= 64) {
 			n -= 64;
-			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 64;
 			dst = (uint8_t *)dst + 64;
 		}
 COPY_BLOCK_64_BACK15:
 		if (n >= 32) {
 			n -= 32;
-			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 32;
 			dst = (uint8_t *)dst + 32;
 		}
 		if (n > 16) {
-			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+			(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov16)((uint8_t *)dst - 16 + n,
+					(const uint8_t *)src - 16 + n);
 			return ret;
 		}
 		if (n > 0) {
-			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+			(*rte_mov16)((uint8_t *)dst - 16 + n,
+					(const uint8_t *)src - 16 + n);
 		}
 		return ret;
 	}
@@ -790,7 +842,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	if (dstofss > 0) {
 		dstofss = 16 - dstofss + 16;
 		n -= dstofss;
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
 		src = (const uint8_t *)src + dstofss;
 		dst = (uint8_t *)dst + dstofss;
 	}
@@ -804,7 +856,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 		 * Copy 256-byte blocks
 		 */
 		for (; n >= 256; n -= 256) {
-			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov256)((uint8_t *)dst, (const uint8_t *)src);
 			dst = (uint8_t *)dst + 256;
 			src = (const uint8_t *)src + 256;
 		}
@@ -826,7 +878,46 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	goto COPY_BLOCK_64_BACK15;
 }
 
-#endif /* RTE_MACHINE_CPUFLAG */
+static void __attribute__((constructor))
+rte_memcpy_init(void)
+{
+#ifdef CC_SUPPORT_AVX512
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F)) {
+		alignment_mask = 0x3F;
+		rte_mov16 = rte_mov16_AVX512F;
+		rte_mov32 = rte_mov32_AVX512F;
+		rte_mov64 = rte_mov64_AVX512F;
+		rte_mov128 = rte_mov128_AVX512F;
+		rte_mov256 = rte_mov256_AVX512F;
+		rte_mov128blocks = rte_mov128blocks_AVX512F;
+		rte_mov512blocks = rte_mov512blocks_AVX512F;
+		rte_memcpy_generic = rte_memcpy_generic_AVX512F;
+		RTE_LOG(INFO, EAL, "AVX512 implementation of memcpy() is being used.\n");
+	} else
+#endif
+#ifdef CC_SUPPORT_AVX2
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) {
+		alignment_mask = 0x1F;
+		rte_mov16 = rte_mov16_AVX2;
+		rte_mov32 = rte_mov32_AVX2;
+		rte_mov64 = rte_mov64_AVX2;
+		rte_mov128 = rte_mov128_AVX2;
+		rte_mov128blocks = rte_mov128blocks_AVX2;
+		rte_memcpy_generic = rte_memcpy_generic_AVX2;
+		RTE_LOG(INFO, EAL, "AVX2 implementation of memcpy() is being used.\n");
+	} else
+#endif
+	{
+		alignment_mask = 0x0F;
+		rte_mov16 = rte_mov16_DEFAULT;
+		rte_mov32 = rte_mov32_DEFAULT;
+		rte_mov64 = rte_mov64_DEFAULT;
+		rte_mov128 = rte_mov128_DEFAULT;
+		rte_mov256 = rte_mov256_DEFAULT;
+		rte_memcpy_generic = rte_memcpy_generic_DEFAULT;
+		RTE_LOG(INFO, EAL, "Default SSE/AVX implementation of memcpy() is being used.\n");
+	}
+}
 
 static inline void *
 rte_memcpy_aligned(void *dst, const void *src, size_t n)
@@ -858,8 +949,8 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 
 	/* Copy 16 <= size <= 32 bytes */
 	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
+		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov16)((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 
 		return ret;
@@ -867,8 +958,8 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 
 	/* Copy 32 < size <= 64 bytes */
 	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
+		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov32)((uint8_t *)dst - 32 + n,
 				(const uint8_t *)src - 32 + n);
 
 		return ret;
@@ -876,13 +967,13 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 
 	/* Copy 64 bytes blocks */
 	for (; n >= 64; n -= 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
 		dst = (uint8_t *)dst + 64;
 		src = (const uint8_t *)src + 64;
 	}
 
 	/* Copy whatever left */
-	rte_mov64((uint8_t *)dst - 64 + n,
+	(*rte_mov64)((uint8_t *)dst - 64 + n,
 			(const uint8_t *)src - 64 + n);
 
 	return ret;
@@ -891,10 +982,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 static inline void *
 rte_memcpy(void *dst, const void *src, size_t n)
 {
-	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
+	if (!(((uintptr_t)dst | (uintptr_t)src) & alignment_mask))
 		return rte_memcpy_aligned(dst, src, n);
 	else
-		return rte_memcpy_generic(dst, src, n);
+		return (*rte_memcpy_generic)(dst, src, n);
 }
 
 #ifdef __cplusplus
diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index a813c91..92399ec 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -141,3 +141,17 @@ space:= $(empty) $(empty)
 CPUFLAGSTMP1 := $(addprefix RTE_CPUFLAG_,$(CPUFLAGS))
 CPUFLAGSTMP2 := $(subst $(space),$(comma),$(CPUFLAGSTMP1))
 CPUFLAGS_LIST := -DRTE_COMPILE_TIME_CPUFLAGS=$(CPUFLAGSTMP2)
+
+# Check if the compiler supports AVX512.
+CC_SUPPORT_AVX512 := $(shell $(CC) -march=skylake-avx512 -dM -E - < /dev/null 2>&1 | grep -q AVX512 && echo 1)
+ifeq ($(CC_SUPPORT_AVX512),1)
+ifeq ($(CONFIG_RTE_ENABLE_AVX512),y)
+MACHINE_CFLAGS += -DCC_SUPPORT_AVX512
+endif
+endif
+
+# Check if the compiler supports AVX2.
+CC_SUPPORT_AVX2 := $(shell $(CC) -march=core-avx2 -dM -E - < /dev/null 2>&1 | grep -q AVX2 && echo 1)
+ifeq ($(CC_SUPPORT_AVX2),1)
+MACHINE_CFLAGS += -DCC_SUPPORT_AVX2
+endif
-- 
2.7.4

Thread overview: 22+ messages
2017-08-25  2:06 [dpdk-dev] [PATCH 0/3] dynamic linking support Xiaoyun Li
2017-08-25  2:06 ` [dpdk-dev] [PATCH 1/3] eal/x86: run-time dispatch over memcpy Xiaoyun Li
2017-08-30 14:56   ` Ananyev, Konstantin
2017-08-30 17:51     ` Bruce Richardson
2017-08-31  1:21       ` Lu, Wenzhuo
2017-08-30 18:00   ` Stephen Hemminger
2017-08-31  1:23     ` Lu, Wenzhuo
2017-08-31  5:05       ` Stephen Hemminger
2017-08-31  5:24         ` Li, Xiaoyun
2017-08-25  2:06 ` [dpdk-dev] [PATCH 2/3] app/test: run-time dispatch over memcpy perf test Xiaoyun Li
2017-08-25  2:06 ` [dpdk-dev] [PATCH 3/3] efd: run-time dispatch over x86 EFD functions Xiaoyun Li
2017-09-01  8:56 ` [dpdk-dev] [PATCH v2 0/3] dynamic linking support Xiaoyun Li
2017-09-01  8:57   ` Xiaoyun Li [this message]
2017-09-01  9:16     ` [dpdk-dev] [PATCH v2 1/3] eal/x86: run-time dispatch over memcpy Ananyev, Konstantin
2017-09-01  9:28       ` Li, Xiaoyun
2017-09-01 10:38         ` Ananyev, Konstantin
2017-09-04  1:41           ` Li, Xiaoyun
     [not found]             ` <B9E724F4CB7543449049E7AE7669D82F44216E@SHSMSX101.ccr.corp.intel.com>
     [not found]               ` <B9E724F4CB7543449049E7AE7669D82F442FE6@SHSMSX101.ccr.corp.intel.com>
2017-09-12  2:27                 ` Li, Xiaoyun
2017-09-20  6:57                   ` Li, Xiaoyun
2017-09-01 15:34     ` Stephen Hemminger
2017-09-01  8:57   ` [dpdk-dev] [PATCH v2 2/3] app/test: run-time dispatch over memcpy perf test Xiaoyun Li
2017-09-01  8:57   ` [dpdk-dev] [PATCH v2 3/3] efd: run-time dispatch over x86 EFD functions Xiaoyun Li
