From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <zhihong.wang@intel.com>
Received: from mga14.intel.com (mga14.intel.com [192.55.52.115])
 by dpdk.org (Postfix) with ESMTP id 120AD2BDC
 for <dev@dpdk.org>; Wed, 25 May 2016 10:28:41 +0200 (CEST)
Received: from fmsmga001.fm.intel.com ([10.253.24.23])
 by fmsmga103.fm.intel.com with ESMTP; 25 May 2016 01:28:28 -0700
X-ExtLoop1: 1
X-IronPort-AV: E=Sophos;i="5.26,363,1459839600"; d="scan'208";a="974102523"
Received: from unknown (HELO dpdk5.sh.intel.com) ([10.239.129.244])
 by fmsmga001.fm.intel.com with ESMTP; 25 May 2016 01:28:27 -0700
From: Zhihong Wang <zhihong.wang@intel.com>
To: dev@dpdk.org
Cc: Zhihong Wang <zhihong.wang@intel.com>
Date: Tue, 24 May 2016 21:23:03 -0400
Message-Id: <1464139383-132732-1-git-send-email-zhihong.wang@intel.com>
X-Mailer: git-send-email 2.5.0
Subject: [dpdk-dev] [PATCH] eal: fix rte_memcpy perf in hsw/bdw
X-BeenThere: dev@dpdk.org
X-Mailman-Version: 2.1.15
Precedence: list
List-Id: patches and discussions about DPDK <dev.dpdk.org>
List-Unsubscribe: <http://dpdk.org/ml/options/dev>,
 <mailto:dev-request@dpdk.org?subject=unsubscribe>
List-Archive: <http://dpdk.org/ml/archives/dev/>
List-Post: <mailto:dev@dpdk.org>
List-Help: <mailto:dev-request@dpdk.org?subject=help>
List-Subscribe: <http://dpdk.org/ml/listinfo/dev>,
 <mailto:dev-request@dpdk.org?subject=subscribe>
X-List-Received-Date: Wed, 25 May 2016 08:28:42 -0000

This patch fixes rte_memcpy performance on Haswell and Broadwell for
vhost when the copy size is larger than 256 bytes.

It is observed that for large copies, such as 1024- or 1518-byte ones,
rte_memcpy suffers from a high rate of store-buffer-full events, which
cause the pipeline to stall in scenarios like vhost enqueue. This can be
alleviated by adjusting the instruction layout. Note that this issue may
not be visible in micro-benchmarks.

How to reproduce?

Run PHY-VM-PHY using vhost/virtio, or vhost/virtio loopback, with large
packets such as 1024- or 1518-byte ones. Make sure the packet generation
rate is not the bottleneck if PHY-VM-PHY is used.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 .../common/include/arch/x86/rte_memcpy.h           | 116 ++++++---------------
 1 file changed, 30 insertions(+), 86 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
index f463ab3..413035e 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -363,71 +363,26 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 }
 
 /**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-	rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
-	rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
-	rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
-	rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
-}
-
-/**
- * Copy 64-byte blocks from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov64blocks(uint8_t *dst, const uint8_t *src, size_t n)
-{
-	__m256i ymm0, ymm1;
-
-	while (n >= 64) {
-		ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
-		n -= 64;
-		ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
-		src = (const uint8_t *)src + 64;
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
-		dst = (uint8_t *)dst + 64;
-	}
-}
-
-/**
- * Copy 256-byte blocks from one location to another,
+ * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
  */
 static inline void
-rte_mov256blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 {
-	__m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
+	__m256i ymm0, ymm1, ymm2, ymm3;
 
-	while (n >= 256) {
+	while (n >= 128) {
 		ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
-		n -= 256;
+		n -= 128;
 		ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
 		ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
 		ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
-		ymm4 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 4 * 32));
-		ymm5 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 5 * 32));
-		ymm6 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 6 * 32));
-		ymm7 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 7 * 32));
-		src = (const uint8_t *)src + 256;
+		src = (const uint8_t *)src + 128;
 		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
 		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
 		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
 		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 4 * 32), ymm4);
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 5 * 32), ymm5);
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 6 * 32), ymm6);
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 7 * 32), ymm7);
-		dst = (uint8_t *)dst + 256;
+		dst = (uint8_t *)dst + 128;
 	}
 }
 
@@ -466,51 +421,56 @@ rte_memcpy(void *dst, const void *src, size_t n)
 	}
 
 	/**
-	 * Fast way when copy size doesn't exceed 512 bytes
+	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+		rte_mov16((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
+		return ret;
+	}
+	if (n <= 48) {
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+		rte_mov16((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+		rte_mov32((uint8_t *)dst - 32 + n,
+				(const uint8_t *)src - 32 + n);
 		return ret;
 	}
-	if (n <= 512) {
-		if (n >= 256) {
-			n -= 256;
-			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
-			src = (const uint8_t *)src + 256;
-			dst = (uint8_t *)dst + 256;
-		}
+	if (n <= 256) {
 		if (n >= 128) {
 			n -= 128;
 			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 128;
 			dst = (uint8_t *)dst + 128;
 		}
+COPY_BLOCK_128_BACK31:
 		if (n >= 64) {
 			n -= 64;
 			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 64;
 			dst = (uint8_t *)dst + 64;
 		}
-COPY_BLOCK_64_BACK31:
 		if (n > 32) {
 			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-			rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+			rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
 			return ret;
 		}
 		if (n > 0) {
-			rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+			rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
 		}
 		return ret;
 	}
 
 	/**
-	 * Make store aligned when copy size exceeds 512 bytes
+	 * Make store aligned when copy size exceeds 256 bytes
 	 */
 	dstofss = (uintptr_t)dst & 0x1F;
 	if (dstofss > 0) {
@@ -522,35 +482,19 @@ COPY_BLOCK_64_BACK31:
 	}
 
 	/**
-	 * Copy 256-byte blocks.
-	 * Use copy block function for better instruction order control,
-	 * which is important when load is unaligned.
+	 * Copy 128-byte blocks
 	 */
-	rte_mov256blocks((uint8_t *)dst, (const uint8_t *)src, n);
+	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
 	bits = n;
-	n = n & 255;
+	n = n & 127;
 	bits -= n;
 	src = (const uint8_t *)src + bits;
 	dst = (uint8_t *)dst + bits;
 
 	/**
-	 * Copy 64-byte blocks.
-	 * Use copy block function for better instruction order control,
-	 * which is important when load is unaligned.
-	 */
-	if (n >= 64) {
-		rte_mov64blocks((uint8_t *)dst, (const uint8_t *)src, n);
-		bits = n;
-		n = n & 63;
-		bits -= n;
-		src = (const uint8_t *)src + bits;
-		dst = (uint8_t *)dst + bits;
-	}
-
-	/**
 	 * Copy whatever left
 	 */
-	goto COPY_BLOCK_64_BACK31;
+	goto COPY_BLOCK_128_BACK31;
 }
 
 #else /* RTE_MACHINE_CPUFLAG */
-- 
2.5.0