DPDK patches and discussions
 help / color / mirror / Atom feed
From: zhihong.wang@intel.com
To: dev@dpdk.org
Subject: [dpdk-dev] [PATCH 4/4] lib/librte_eal: Optimized memcpy in arch/x86/rte_memcpy.h for both SSE and AVX platforms
Date: Mon, 19 Jan 2015 09:53:34 +0800	[thread overview]
Message-ID: <1421632414-10027-5-git-send-email-zhihong.wang@intel.com> (raw)
In-Reply-To: <1421632414-10027-1-git-send-email-zhihong.wang@intel.com>

Main code changes:

1. Differentiate architectural features based on CPU flags

    a. Implement separated move functions for SSE/AVX/AVX2 to make full utilization of cache bandwidth

    b. Implement separated copy flow specifically optimized for target architecture

2. Rewrite the memcpy function "rte_memcpy"

    a. Add store aligning

    b. Add load aligning based on architectural features

    c. Put block copy loop into inline move functions for better control of instruction order

    d. Eliminate unnecessary MOVs

3. Rewrite the inline move functions

    a. Add move functions for unaligned load cases

    b. Change instruction order in copy loops for better pipeline utilization

    c. Use intrinsics instead of assembly code

4. Remove slow glibc call for constant copies

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 .../common/include/arch/x86/rte_memcpy.h           | 664 +++++++++++++++------
 1 file changed, 493 insertions(+), 171 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
index fb9eba8..69a5c6f 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -34,166 +34,189 @@
 #ifndef _RTE_MEMCPY_X86_64_H_
 #define _RTE_MEMCPY_X86_64_H_
 
+/**
+ * @file
+ *
+ * Functions for SSE/AVX/AVX2 implementation of memcpy().
+ */
+
+#include <stdio.h>
 #include <stdint.h>
 #include <string.h>
-#include <emmintrin.h>
+#include <x86intrin.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#include "generic/rte_memcpy.h"
+/**
+ * Copy bytes from one location to another. The locations must not overlap.
+ *
+ * @note This is implemented as a macro, so it's address should not be taken
+ * and care is needed as parameter expressions may be evaluated multiple times.
+ *
+ * @param dst
+ *   Pointer to the destination of the data.
+ * @param src
+ *   Pointer to the source data.
+ * @param n
+ *   Number of bytes to copy.
+ * @return
+ *   Pointer to the destination data.
+ */
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n) __attribute__((always_inline));
 
-#ifdef __INTEL_COMPILER
-#pragma warning(disable:593) /* Stop unused variable warning (reg_a etc). */
-#endif
+#ifdef RTE_MACHINE_CPUFLAG_AVX2
 
+/**
+ * AVX2 implementation below
+ */
+
+/**
+ * Copy 16 bytes from one location to another,
+ * locations should not overlap.
+ */
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
-	__m128i reg_a;
-	asm volatile (
-		"movdqu (%[src]), %[reg_a]\n\t"
-		"movdqu %[reg_a], (%[dst])\n\t"
-		: [reg_a] "=x" (reg_a)
-		: [src] "r" (src),
-		  [dst] "r"(dst)
-		: "memory"
-	);
+	__m128i xmm0;
+
+	xmm0 = _mm_loadu_si128((const __m128i *)src);
+	_mm_storeu_si128((__m128i *)dst, xmm0);
 }
 
+/**
+ * Copy 32 bytes from one location to another,
+ * locations should not overlap.
+ */
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
-	__m128i reg_a, reg_b;
-	asm volatile (
-		"movdqu (%[src]), %[reg_a]\n\t"
-		"movdqu 16(%[src]), %[reg_b]\n\t"
-		"movdqu %[reg_a], (%[dst])\n\t"
-		"movdqu %[reg_b], 16(%[dst])\n\t"
-		: [reg_a] "=x" (reg_a),
-		  [reg_b] "=x" (reg_b)
-		: [src] "r" (src),
-		  [dst] "r"(dst)
-		: "memory"
-	);
-}
+	__m256i ymm0;
 
-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-	__m128i reg_a, reg_b, reg_c;
-	asm volatile (
-		"movdqu (%[src]), %[reg_a]\n\t"
-		"movdqu 16(%[src]), %[reg_b]\n\t"
-		"movdqu 32(%[src]), %[reg_c]\n\t"
-		"movdqu %[reg_a], (%[dst])\n\t"
-		"movdqu %[reg_b], 16(%[dst])\n\t"
-		"movdqu %[reg_c], 32(%[dst])\n\t"
-		: [reg_a] "=x" (reg_a),
-		  [reg_b] "=x" (reg_b),
-		  [reg_c] "=x" (reg_c)
-		: [src] "r" (src),
-		  [dst] "r"(dst)
-		: "memory"
-	);
+	ymm0 = _mm256_loadu_si256((const __m256i *)src);
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
 }
 
+/**
+ * Copy 64 bytes from one location to another,
+ * locations should not overlap.
+ */
 static inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
-	__m128i reg_a, reg_b, reg_c, reg_d;
-	asm volatile (
-		"movdqu (%[src]), %[reg_a]\n\t"
-		"movdqu 16(%[src]), %[reg_b]\n\t"
-		"movdqu 32(%[src]), %[reg_c]\n\t"
-		"movdqu 48(%[src]), %[reg_d]\n\t"
-		"movdqu %[reg_a], (%[dst])\n\t"
-		"movdqu %[reg_b], 16(%[dst])\n\t"
-		"movdqu %[reg_c], 32(%[dst])\n\t"
-		"movdqu %[reg_d], 48(%[dst])\n\t"
-		: [reg_a] "=x" (reg_a),
-		  [reg_b] "=x" (reg_b),
-		  [reg_c] "=x" (reg_c),
-		  [reg_d] "=x" (reg_d)
-		: [src] "r" (src),
-		  [dst] "r"(dst)
-		: "memory"
-	);
+	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 }
 
+/**
+ * Copy 128 bytes from one location to another,
+ * locations should not overlap.
+ */
 static inline void
 rte_mov128(uint8_t *dst, const uint8_t *src)
 {
-	__m128i reg_a, reg_b, reg_c, reg_d, reg_e, reg_f, reg_g, reg_h;
-	asm volatile (
-		"movdqu (%[src]), %[reg_a]\n\t"
-		"movdqu 16(%[src]), %[reg_b]\n\t"
-		"movdqu 32(%[src]), %[reg_c]\n\t"
-		"movdqu 48(%[src]), %[reg_d]\n\t"
-		"movdqu 64(%[src]), %[reg_e]\n\t"
-		"movdqu 80(%[src]), %[reg_f]\n\t"
-		"movdqu 96(%[src]), %[reg_g]\n\t"
-		"movdqu 112(%[src]), %[reg_h]\n\t"
-		"movdqu %[reg_a], (%[dst])\n\t"
-		"movdqu %[reg_b], 16(%[dst])\n\t"
-		"movdqu %[reg_c], 32(%[dst])\n\t"
-		"movdqu %[reg_d], 48(%[dst])\n\t"
-		"movdqu %[reg_e], 64(%[dst])\n\t"
-		"movdqu %[reg_f], 80(%[dst])\n\t"
-		"movdqu %[reg_g], 96(%[dst])\n\t"
-		"movdqu %[reg_h], 112(%[dst])\n\t"
-		: [reg_a] "=x" (reg_a),
-		  [reg_b] "=x" (reg_b),
-		  [reg_c] "=x" (reg_c),
-		  [reg_d] "=x" (reg_d),
-		  [reg_e] "=x" (reg_e),
-		  [reg_f] "=x" (reg_f),
-		  [reg_g] "=x" (reg_g),
-		  [reg_h] "=x" (reg_h)
-		: [src] "r" (src),
-		  [dst] "r"(dst)
-		: "memory"
-	);
+	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
+	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
 }
 
-#ifdef __INTEL_COMPILER
-#pragma warning(enable:593)
-#endif
-
+/**
+ * Copy 256 bytes from one location to another,
+ * locations should not overlap.
+ */
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov128(dst, src);
-	rte_mov128(dst + 128, src + 128);
+	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
+	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
+	rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
+	rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
+	rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
+	rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
 }
 
-#define rte_memcpy(dst, src, n)              \
-	({ (__builtin_constant_p(n)) ?       \
-	memcpy((dst), (src), (n)) :          \
-	rte_memcpy_func((dst), (src), (n)); })
+/**
+ * Copy 64-byte blocks from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov64blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	__m256i ymm0, ymm1;
+
+	while (n >= 64) {
+		ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
+		n -= 64;
+		ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
+		src = (const uint8_t *)src + 64;
+		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
+		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
+		dst = (uint8_t *)dst + 64;
+	}
+}
+
+/**
+ * Copy 256-byte blocks from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov256blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	__m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
+
+	while (n >= 256) {
+		ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
+		n -= 256;
+		ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
+		ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
+		ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
+		ymm4 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 4 * 32));
+		ymm5 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 5 * 32));
+		ymm6 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 6 * 32));
+		ymm7 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 7 * 32));
+		src = (const uint8_t *)src + 256;
+		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
+		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
+		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
+		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
+		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 4 * 32), ymm4);
+		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 5 * 32), ymm5);
+		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 6 * 32), ymm6);
+		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 7 * 32), ymm7);
+		dst = (uint8_t *)dst + 256;
+	}
+}
 
 static inline void *
-rte_memcpy_func(void *dst, const void *src, size_t n)
+rte_memcpy(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
+	int dstofss;
+	int bits;
 
-	/* We can't copy < 16 bytes using XMM registers so do it manually. */
+	/**
+	 * Copy less than 16 bytes
+	 */
 	if (n < 16) {
 		if (n & 0x01) {
 			*(uint8_t *)dst = *(const uint8_t *)src;
-			dst = (uint8_t *)dst + 1;
 			src = (const uint8_t *)src + 1;
+			dst = (uint8_t *)dst + 1;
 		}
 		if (n & 0x02) {
 			*(uint16_t *)dst = *(const uint16_t *)src;
-			dst = (uint16_t *)dst + 1;
 			src = (const uint16_t *)src + 1;
+			dst = (uint16_t *)dst + 1;
 		}
 		if (n & 0x04) {
 			*(uint32_t *)dst = *(const uint32_t *)src;
-			dst = (uint32_t *)dst + 1;
 			src = (const uint32_t *)src + 1;
+			dst = (uint32_t *)dst + 1;
 		}
 		if (n & 0x08) {
 			*(uint64_t *)dst = *(const uint64_t *)src;
@@ -201,95 +224,394 @@ rte_memcpy_func(void *dst, const void *src, size_t n)
 		return ret;
 	}
 
-	/* Special fast cases for <= 128 bytes */
+	/**
+	 * Fast way when copy size doesn't exceed 512 bytes
+	 */
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
-
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
 		return ret;
 	}
-
-	if (n <= 128) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov64((uint8_t *)dst - 64 + n, (const uint8_t *)src - 64 + n);
+	if (n <= 512) {
+		if (n >= 256) {
+			n -= 256;
+			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
+			src = (const uint8_t *)src + 256;
+			dst = (uint8_t *)dst + 256;
+		}
+		if (n >= 128) {
+			n -= 128;
+			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
+			src = (const uint8_t *)src + 128;
+			dst = (uint8_t *)dst + 128;
+		}
+		if (n >= 64) {
+			n -= 64;
+			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			src = (const uint8_t *)src + 64;
+			dst = (uint8_t *)dst + 64;
+		}
+COPY_BLOCK_64_BACK31:
+		if (n > 32) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+			return ret;
+		}
+		if (n > 0) {
+			rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+		}
 		return ret;
 	}
 
-	/*
-	 * For large copies > 128 bytes. This combination of 256, 64 and 16 byte
-	 * copies was found to be faster than doing 128 and 32 byte copies as
-	 * well.
+	/**
+	 * Make store aligned when copy size exceeds 512 bytes
 	 */
-	for ( ; n >= 256; n -= 256) {
-		rte_mov256((uint8_t *)dst, (const uint8_t *)src);
-		dst = (uint8_t *)dst + 256;
-		src = (const uint8_t *)src + 256;
+	dstofss = 32 - (int)((long long)(void *)dst & 0x1F);
+	n -= dstofss;
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	src = (const uint8_t *)src + dstofss;
+	dst = (uint8_t *)dst + dstofss;
+
+	/**
+	 * Copy 256-byte blocks.
+	 * Use copy block function for better instruction order control,
+	 * which is important when load is unaligned.
+	 */
+	rte_mov256blocks((uint8_t *)dst, (const uint8_t *)src, n);
+	bits = n;
+	n = n & 255;
+	bits -= n;
+	src = (const uint8_t *)src + bits;
+	dst = (uint8_t *)dst + bits;
+
+	/**
+	 * Copy 64-byte blocks.
+	 * Use copy block function for better instruction order control,
+	 * which is important when load is unaligned.
+	 */
+	if (n >= 64) {
+		rte_mov64blocks((uint8_t *)dst, (const uint8_t *)src, n);
+		bits = n;
+		n = n & 63;
+		bits -= n;
+		src = (const uint8_t *)src + bits;
+		dst = (uint8_t *)dst + bits;
 	}
 
-	/*
-	 * We split the remaining bytes (which will be less than 256) into
-	 * 64byte (2^6) chunks.
-	 * Using incrementing integers in the case labels of a switch statement
-	 * enourages the compiler to use a jump table. To get incrementing
-	 * integers, we shift the 2 relevant bits to the LSB position to first
-	 * get decrementing integers, and then subtract.
+	/**
+	 * Copy whatever left
 	 */
-	switch (3 - (n >> 6)) {
-	case 0x00:
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		n -= 64;
-		dst = (uint8_t *)dst + 64;
-		src = (const uint8_t *)src + 64;      /* fallthrough */
-	case 0x01:
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		n -= 64;
-		dst = (uint8_t *)dst + 64;
-		src = (const uint8_t *)src + 64;      /* fallthrough */
-	case 0x02:
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		n -= 64;
-		dst = (uint8_t *)dst + 64;
-		src = (const uint8_t *)src + 64;      /* fallthrough */
-	default:
-		;
+	goto COPY_BLOCK_64_BACK31;
+}
+
+#else /* RTE_MACHINE_CPUFLAG_AVX2 */
+
+/**
+ * SSE & AVX implementation below
+ */
+
+/**
+ * Copy 16 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	__m128i xmm0;
+
+	xmm0 = _mm_loadu_si128((const __m128i *)(const __m128i *)src);
+	_mm_storeu_si128((__m128i *)dst, xmm0);
+}
+
+/**
+ * Copy 32 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+}
+
+/**
+ * Copy 64 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+}
+
+/**
+ * Copy 128 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
+	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
+	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
+	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
+}
+
+/**
+ * Copy 256 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
+	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
+	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
+	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
+	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
+	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
+	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
+	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
+	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
+	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
+	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
+	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
+}
+
+/**
+ * Macro for copying unaligned block from one location to another,
+ * 47 bytes leftover maximum,
+ * locations should not overlap.
+ * Requirements:
+ * - Store is aligned
+ * - Load offset is <offset>, which must be immediate value within [1, 15]
+ * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
+ * - <dst>, <src>, <len> must be variables
+ * - __m128i <xmm0> ~ <xmm8> must be pre-defined
+ */
+#define MOVEUNALIGNED_LEFT47(dst, src, len, offset)                                                         \
+{                                                                                                           \
+	int tmp;                                                                                                \
+	while (len >= 128 + 16 - offset) {                                                                      \
+		xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16));                  \
+		len -= 128;                                                                                         \
+		xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16));                  \
+		xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16));                  \
+		xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16));                  \
+		xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16));                  \
+		xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16));                  \
+		xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16));                  \
+		xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16));                  \
+		xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16));                  \
+		src = (const uint8_t *)src + 128;                                                                   \
+		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
+		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
+		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
+		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
+		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
+		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
+		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
+		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
+		dst = (uint8_t *)dst + 128;                                                                         \
+	}                                                                                                       \
+	tmp = len;                                                                                              \
+	len = ((len - 16 + offset) & 127) + 16 - offset;                                                        \
+	tmp -= len;                                                                                             \
+	src = (const uint8_t *)src + tmp;                                                                       \
+	dst = (uint8_t *)dst + tmp;                                                                             \
+	if (len >= 32 + 16 - offset) {                                                                          \
+		while (len >= 32 + 16 - offset) {                                                                   \
+			xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16));              \
+			len -= 32;                                                                                      \
+			xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16));              \
+			xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16));              \
+			src = (const uint8_t *)src + 32;                                                                \
+			_mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
+			_mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
+			dst = (uint8_t *)dst + 32;                                                                      \
+		}                                                                                                   \
+		tmp = len;                                                                                          \
+		len = ((len - 16 + offset) & 31) + 16 - offset;                                                     \
+		tmp -= len;                                                                                         \
+		src = (const uint8_t *)src + tmp;                                                                   \
+		dst = (uint8_t *)dst + tmp;                                                                         \
+	}                                                                                                       \
+}
+
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
+	void *ret = dst;
+	int dstofss;
+	int srcofs;
+
+	/**
+	 * Copy less than 16 bytes
+	 */
+	if (n < 16) {
+		if (n & 0x01) {
+			*(uint8_t *)dst = *(const uint8_t *)src;
+			src = (const uint8_t *)src + 1;
+			dst = (uint8_t *)dst + 1;
+		}
+		if (n & 0x02) {
+			*(uint16_t *)dst = *(const uint16_t *)src;
+			src = (const uint16_t *)src + 1;
+			dst = (uint16_t *)dst + 1;
+		}
+		if (n & 0x04) {
+			*(uint32_t *)dst = *(const uint32_t *)src;
+			src = (const uint32_t *)src + 1;
+			dst = (uint32_t *)dst + 1;
+		}
+		if (n & 0x08) {
+			*(uint64_t *)dst = *(const uint64_t *)src;
+		}
+		return ret;
 	}
 
-	/*
-	 * We split the remaining bytes (which will be less than 64) into
-	 * 16byte (2^4) chunks, using the same switch structure as above.
+	/**
+	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	switch (3 - (n >> 4)) {
-	case 0x00:
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		n -= 16;
-		dst = (uint8_t *)dst + 16;
-		src = (const uint8_t *)src + 16;      /* fallthrough */
-	case 0x01:
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		n -= 16;
-		dst = (uint8_t *)dst + 16;
-		src = (const uint8_t *)src + 16;      /* fallthrough */
-	case 0x02:
+	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		n -= 16;
-		dst = (uint8_t *)dst + 16;
-		src = (const uint8_t *)src + 16;      /* fallthrough */
-	default:
-		;
+		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+		return ret;
 	}
-
-	/* Copy any remaining bytes, without going beyond end of buffers */
-	if (n != 0) {
+	if (n <= 48) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+		return ret;
+	}
+	if (n <= 64) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+		return ret;
 	}
-	return ret;
+	if (n <= 128) {
+		goto COPY_BLOCK_128_BACK15;
+	}
+	if (n <= 512) {
+		if (n >= 256) {
+			n -= 256;
+			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
+			src = (const uint8_t *)src + 256;
+			dst = (uint8_t *)dst + 256;
+		}
+COPY_BLOCK_255_BACK15:
+		if (n >= 128) {
+			n -= 128;
+			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
+			src = (const uint8_t *)src + 128;
+			dst = (uint8_t *)dst + 128;
+		}
+COPY_BLOCK_128_BACK15:
+		if (n >= 64) {
+			n -= 64;
+			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			src = (const uint8_t *)src + 64;
+			dst = (uint8_t *)dst + 64;
+		}
+COPY_BLOCK_64_BACK15:
+		if (n >= 32) {
+			n -= 32;
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			src = (const uint8_t *)src + 32;
+			dst = (uint8_t *)dst + 32;
+		}
+		if (n > 16) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+			return ret;
+		}
+		if (n > 0) {
+			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+		}
+		return ret;
+	}
+
+	/**
+	 * Make store aligned when copy size exceeds 512 bytes,
+	 * and make sure the first 15 bytes are copied, because
+	 * unaligned copy functions require up to 15 bytes
+	 * backwards access.
+	 */
+	dstofss = 16 - (int)((long long)(void *)dst & 0x0F) + 16;
+	n -= dstofss;
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	src = (const uint8_t *)src + dstofss;
+	dst = (uint8_t *)dst + dstofss;
+	srcofs = (int)((long long)(const void *)src & 0x0F);
+
+	/**
+	 * For aligned copy
+	 */
+	if (srcofs == 0) {
+		/**
+		 * Copy 256-byte blocks
+		 */
+		for (; n >= 256; n -= 256) {
+			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
+			dst = (uint8_t *)dst + 256;
+			src = (const uint8_t *)src + 256;
+		}
+
+		/**
+		 * Copy whatever left
+		 */
+		goto COPY_BLOCK_255_BACK15;
+	}
+
+	/**
+	 * For copy with unaligned load, use PALIGNR to force load alignment.
+	 * Use switch here because PALIGNR requires immediate value for shift count.
+	 */
+	switch (srcofs) {
+	case 0x01: MOVEUNALIGNED_LEFT47(dst, src, n, 0x01); break;
+	case 0x02: MOVEUNALIGNED_LEFT47(dst, src, n, 0x02); break;
+	case 0x03: MOVEUNALIGNED_LEFT47(dst, src, n, 0x03); break;
+	case 0x04: MOVEUNALIGNED_LEFT47(dst, src, n, 0x04); break;
+	case 0x05: MOVEUNALIGNED_LEFT47(dst, src, n, 0x05); break;
+	case 0x06: MOVEUNALIGNED_LEFT47(dst, src, n, 0x06); break;
+	case 0x07: MOVEUNALIGNED_LEFT47(dst, src, n, 0x07); break;
+	case 0x08: MOVEUNALIGNED_LEFT47(dst, src, n, 0x08); break;
+	case 0x09: MOVEUNALIGNED_LEFT47(dst, src, n, 0x09); break;
+	case 0x0A: MOVEUNALIGNED_LEFT47(dst, src, n, 0x0A); break;
+	case 0x0B: MOVEUNALIGNED_LEFT47(dst, src, n, 0x0B); break;
+	case 0x0C: MOVEUNALIGNED_LEFT47(dst, src, n, 0x0C); break;
+	case 0x0D: MOVEUNALIGNED_LEFT47(dst, src, n, 0x0D); break;
+	case 0x0E: MOVEUNALIGNED_LEFT47(dst, src, n, 0x0E); break;
+	case 0x0F: MOVEUNALIGNED_LEFT47(dst, src, n, 0x0F); break;
+	default:;
+	}
+
+	/**
+	 * Copy whatever left
+	 */
+	goto COPY_BLOCK_64_BACK15;
 }
 
+#endif /* RTE_MACHINE_CPUFLAG_AVX2 */
+
 #ifdef __cplusplus
 }
 #endif
-- 
1.9.3

  parent reply	other threads:[~2015-01-19  1:54 UTC|newest]

Thread overview: 48+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-01-19  1:53 [dpdk-dev] [PATCH 0/4] DPDK memcpy optimization zhihong.wang
2015-01-19  1:53 ` [dpdk-dev] [PATCH 1/4] app/test: Disabled VTA for memcpy test in app/test/Makefile zhihong.wang
2015-01-19  1:53 ` [dpdk-dev] [PATCH 2/4] app/test: Removed unnecessary test cases in test_memcpy.c zhihong.wang
2015-01-19  1:53 ` [dpdk-dev] [PATCH 3/4] app/test: Extended test coverage in test_memcpy_perf.c zhihong.wang
2015-01-19  1:53 ` zhihong.wang [this message]
2015-01-20 17:15   ` [dpdk-dev] [PATCH 4/4] lib/librte_eal: Optimized memcpy in arch/x86/rte_memcpy.h for both SSE and AVX platforms Stephen Hemminger
2015-01-20 19:16     ` Neil Horman
2015-01-21  3:18       ` Wang, Zhihong
2015-01-25 20:02     ` Jim Thompson
2015-01-26 14:43   ` Wodkowski, PawelX
2015-01-27  5:12     ` Wang, Zhihong
2015-01-19 13:02 ` [dpdk-dev] [PATCH 0/4] DPDK memcpy optimization Neil Horman
2015-01-20  3:01   ` Wang, Zhihong
2015-01-20 15:11     ` Neil Horman
2015-01-20 16:14       ` Bruce Richardson
2015-01-21  3:44         ` Wang, Zhihong
2015-01-21 11:40           ` Bruce Richardson
2015-01-21 12:02           ` Ananyev, Konstantin
2015-01-21 12:38             ` Neil Horman
2015-01-23  3:26               ` Wang, Zhihong
2015-01-21 12:36           ` Marc Sune
2015-01-21 13:02             ` Bruce Richardson
2015-01-21 13:21               ` Marc Sune
2015-01-21 13:26                 ` Bruce Richardson
2015-01-21 19:49                   ` Stephen Hemminger
2015-01-21 20:54                     ` Neil Horman
2015-01-21 21:25                       ` Jim Thompson
2015-01-22  0:53                         ` Stephen Hemminger
2015-01-22  9:06                         ` Luke Gorrie
2015-01-22 13:29                           ` Jay Rolette
2015-01-22 18:27                             ` Luke Gorrie
2015-01-22 19:36                               ` Jay Rolette
2015-01-22 18:21                       ` EDMISON, Kelvin (Kelvin)
2015-01-27  8:22                         ` Wang, Zhihong
2015-01-28 21:48                           ` EDMISON, Kelvin (Kelvin)
2015-01-29  1:53                             ` Wang, Zhihong
2015-01-23  6:52                   ` Wang, Zhihong
2015-01-26 18:29                     ` Ananyev, Konstantin
2015-01-27  1:42                       ` Wang, Zhihong
2015-01-27 11:30                         ` Ananyev, Konstantin
2015-01-27 12:19                           ` Ananyev, Konstantin
2015-01-28  2:06                             ` Wang, Zhihong
2015-01-25 14:50 ` Luke Gorrie
2015-01-26  1:30   ` Wang, Zhihong
2015-01-26  8:03     ` Luke Gorrie
2015-01-27  7:19       ` Wang, Zhihong
2015-01-27 13:57         ` [dpdk-dev] [snabb-devel] " Luke Gorrie
2015-01-29  3:42 ` [dpdk-dev] " Fu, JingguoX

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1421632414-10027-5-git-send-email-zhihong.wang@intel.com \
    --to=zhihong.wang@intel.com \
    --cc=dev@dpdk.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).