From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mga14.intel.com (mga14.intel.com [192.55.52.115]) by dpdk.org (Postfix) with ESMTP id 120AD2BDC for ; Wed, 25 May 2016 10:28:41 +0200 (CEST) Received: from fmsmga001.fm.intel.com ([10.253.24.23]) by fmsmga103.fm.intel.com with ESMTP; 25 May 2016 01:28:28 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.26,363,1459839600"; d="scan'208";a="974102523" Received: from unknown (HELO dpdk5.sh.intel.com) ([10.239.129.244]) by fmsmga001.fm.intel.com with ESMTP; 25 May 2016 01:28:27 -0700 From: Zhihong Wang To: dev@dpdk.org Cc: Zhihong Wang Date: Tue, 24 May 2016 21:23:03 -0400 Message-Id: <1464139383-132732-1-git-send-email-zhihong.wang@intel.com> X-Mailer: git-send-email 2.5.0 Subject: [dpdk-dev] [PATCH] eal: fix rte_memcpy perf in hsw/bdw X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: patches and discussions about DPDK List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 25 May 2016 08:28:42 -0000 This patch fixes rte_memcpy performance in Haswell and Broadwell for vhost when copy size larger than 256 bytes. It is observed that for large copies like 1024/1518 ones, rte_memcpy suffers high ratio of store buffer full issue which causes pipeline to stall in scenarios like vhost enqueue. This can be alleviated by adjusting instruction layout. Note that this issue may not be visible in micro test. How to reproduce? PHY-VM-PHY using vhost/virtio or vhost/virtio loop back, with large packets like 1024/1518 bytes ones. Make sure packet generation rate is not the bottleneck if PHY-VM-PHY is used. Signed-off-by: Zhihong Wang --- .../common/include/arch/x86/rte_memcpy.h | 116 ++++++--------------- 1 file changed, 30 insertions(+), 86 deletions(-) diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h index f463ab3..413035e 100644 --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h @@ -363,71 +363,26 @@ rte_mov128(uint8_t *dst, const uint8_t *src) } /** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); - rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32); - rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32); - rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32); - rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32); -} - -/** - * Copy 64-byte blocks from one location to another, - * locations should not overlap. - */ -static inline void -rte_mov64blocks(uint8_t *dst, const uint8_t *src, size_t n) -{ - __m256i ymm0, ymm1; - - while (n >= 64) { - ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32)); - n -= 64; - ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32)); - src = (const uint8_t *)src + 64; - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0); - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1); - dst = (uint8_t *)dst + 64; - } -} - -/** - * Copy 256-byte blocks from one location to another, + * Copy 128-byte blocks from one location to another, * locations should not overlap. */ static inline void -rte_mov256blocks(uint8_t *dst, const uint8_t *src, size_t n) +rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) { - __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7; + __m256i ymm0, ymm1, ymm2, ymm3; - while (n >= 256) { + while (n >= 128) { ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32)); - n -= 256; + n -= 128; ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32)); ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32)); ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32)); - ymm4 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 4 * 32)); - ymm5 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 5 * 32)); - ymm6 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 6 * 32)); - ymm7 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 7 * 32)); - src = (const uint8_t *)src + 256; + src = (const uint8_t *)src + 128; _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0); _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1); _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2); _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3); - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 4 * 32), ymm4); - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 5 * 32), ymm5); - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 6 * 32), ymm6); - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 7 * 32), ymm7); - dst = (uint8_t *)dst + 256; + dst = (uint8_t *)dst + 128; } } @@ -466,51 +421,56 @@ rte_memcpy(void *dst, const void *src, size_t n) } /** - * Fast way when copy size doesn't exceed 512 bytes + * Fast way when copy size doesn't exceed 256 bytes */ if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + return ret; + } + if (n <= 48) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16); + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); return ret; } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); - rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); return ret; } - if (n <= 512) { - if (n >= 256) { - n -= 256; - rte_mov256((uint8_t *)dst, (const uint8_t *)src); - src = (const uint8_t *)src + 256; - dst = (uint8_t *)dst + 256; - } + if (n <= 256) { if (n >= 128) { n -= 128; rte_mov128((uint8_t *)dst, (const uint8_t *)src); src = (const uint8_t *)src + 128; dst = (uint8_t *)dst + 128; } +COPY_BLOCK_128_BACK31: if (n >= 64) { n -= 64; rte_mov64((uint8_t *)dst, (const uint8_t *)src); src = (const uint8_t *)src + 64; dst = (uint8_t *)dst + 64; } -COPY_BLOCK_64_BACK31: if (n > 32) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); - rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); return ret; } if (n > 0) { - rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); } return ret; } /** - * Make store aligned when copy size exceeds 512 bytes + * Make store aligned when copy size exceeds 256 bytes */ dstofss = (uintptr_t)dst & 0x1F; if (dstofss > 0) { @@ -522,35 +482,19 @@ COPY_BLOCK_64_BACK31: } /** - * Copy 256-byte blocks. - * Use copy block function for better instruction order control, - * which is important when load is unaligned. + * Copy 128-byte blocks */ - rte_mov256blocks((uint8_t *)dst, (const uint8_t *)src, n); + rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n); bits = n; - n = n & 255; + n = n & 127; bits -= n; src = (const uint8_t *)src + bits; dst = (uint8_t *)dst + bits; /** - * Copy 64-byte blocks. - * Use copy block function for better instruction order control, - * which is important when load is unaligned. - */ - if (n >= 64) { - rte_mov64blocks((uint8_t *)dst, (const uint8_t *)src, n); - bits = n; - n = n & 63; - bits -= n; - src = (const uint8_t *)src + bits; - dst = (uint8_t *)dst + bits; - } - - /** * Copy whatever left */ - goto COPY_BLOCK_64_BACK31; + goto COPY_BLOCK_128_BACK31; } #else /* RTE_MACHINE_CPUFLAG */ -- 2.5.0