DPDK patches and discussions
* [dpdk-dev] [PATCH] eal: fix rte_memcpy perf in hsw/bdw
@ 2016-05-25  1:23 Zhihong Wang
  2016-05-26  5:19 ` Xu, Qian Q
  2016-06-15 14:21 ` Thomas Monjalon
  0 siblings, 2 replies; 4+ messages in thread
From: Zhihong Wang @ 2016-05-25  1:23 UTC (permalink / raw)
  To: dev; +Cc: Zhihong Wang

This patch fixes rte_memcpy performance in Haswell and Broadwell for
vhost when the copy size is larger than 256 bytes.

It is observed that for large copies such as 1024/1518-byte ones,
rte_memcpy suffers a high rate of store buffer full events, which
stall the pipeline in scenarios like vhost enqueue. This can be
alleviated by adjusting the instruction layout. Note that this issue
may not be visible in micro benchmarks.
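
For illustration, here is a minimal standalone sketch of the reworked
inner loop; it closely follows the rte_mov128blocks() introduced in the
diff below (the name copy_blocks_128 is used here for illustration
only). Four 32-byte loads are issued back to back, then four 32-byte
stores, so at most four stores per iteration compete for store buffer
entries instead of eight:

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Copy n bytes in 128-byte blocks: 4 AVX loads, then 4 AVX stores. */
static inline void
copy_blocks_128(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m256i y0, y1, y2, y3;

	while (n >= 128) {
		y0 = _mm256_loadu_si256((const __m256i *)(src + 0 * 32));
		y1 = _mm256_loadu_si256((const __m256i *)(src + 1 * 32));
		y2 = _mm256_loadu_si256((const __m256i *)(src + 2 * 32));
		y3 = _mm256_loadu_si256((const __m256i *)(src + 3 * 32));
		_mm256_storeu_si256((__m256i *)(dst + 0 * 32), y0);
		_mm256_storeu_si256((__m256i *)(dst + 1 * 32), y1);
		_mm256_storeu_si256((__m256i *)(dst + 2 * 32), y2);
		_mm256_storeu_si256((__m256i *)(dst + 3 * 32), y3);
		src += 128;
		dst += 128;
		n -= 128;
	}
}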

How to reproduce?

PHY-VM-PHY using vhost/virtio, or a vhost/virtio loopback, with large
packets such as 1024/1518-byte ones. Make sure the packet generation
rate is not the bottleneck if PHY-VM-PHY is used.
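
To confirm the store buffer pressure directly rather than only through
the throughput delta, the stall cycles can be sampled on the vhost core
with perf. The event name below is the one listed for Haswell/Broadwell
(check perf list on your system), and <vhost-core> is a placeholder:

perf stat -C <vhost-core> -e cycles,resource_stalls.sb sleep 10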

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 .../common/include/arch/x86/rte_memcpy.h           | 116 ++++++---------------
 1 file changed, 30 insertions(+), 86 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
index f463ab3..413035e 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -363,71 +363,26 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 }
 
 /**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-	rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
-	rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
-	rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
-	rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
-}
-
-/**
- * Copy 64-byte blocks from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov64blocks(uint8_t *dst, const uint8_t *src, size_t n)
-{
-	__m256i ymm0, ymm1;
-
-	while (n >= 64) {
-		ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
-		n -= 64;
-		ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
-		src = (const uint8_t *)src + 64;
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
-		dst = (uint8_t *)dst + 64;
-	}
-}
-
-/**
- * Copy 256-byte blocks from one location to another,
+ * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
  */
 static inline void
-rte_mov256blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 {
-	__m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
+	__m256i ymm0, ymm1, ymm2, ymm3;
 
-	while (n >= 256) {
+	while (n >= 128) {
 		ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
-		n -= 256;
+		n -= 128;
 		ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
 		ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
 		ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
-		ymm4 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 4 * 32));
-		ymm5 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 5 * 32));
-		ymm6 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 6 * 32));
-		ymm7 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 7 * 32));
-		src = (const uint8_t *)src + 256;
+		src = (const uint8_t *)src + 128;
 		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
 		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
 		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
 		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 4 * 32), ymm4);
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 5 * 32), ymm5);
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 6 * 32), ymm6);
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 7 * 32), ymm7);
-		dst = (uint8_t *)dst + 256;
+		dst = (uint8_t *)dst + 128;
 	}
 }
 
@@ -466,51 +421,56 @@ rte_memcpy(void *dst, const void *src, size_t n)
 	}
 
 	/**
-	 * Fast way when copy size doesn't exceed 512 bytes
+	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+		rte_mov16((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
+		return ret;
+	}
+	if (n <= 48) {
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+		rte_mov16((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+		rte_mov32((uint8_t *)dst - 32 + n,
+				(const uint8_t *)src - 32 + n);
 		return ret;
 	}
-	if (n <= 512) {
-		if (n >= 256) {
-			n -= 256;
-			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
-			src = (const uint8_t *)src + 256;
-			dst = (uint8_t *)dst + 256;
-		}
+	if (n <= 256) {
 		if (n >= 128) {
 			n -= 128;
 			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 128;
 			dst = (uint8_t *)dst + 128;
 		}
+COPY_BLOCK_128_BACK31:
 		if (n >= 64) {
 			n -= 64;
 			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 64;
 			dst = (uint8_t *)dst + 64;
 		}
-COPY_BLOCK_64_BACK31:
 		if (n > 32) {
 			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-			rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+			rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
 			return ret;
 		}
 		if (n > 0) {
-			rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+			rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
 		}
 		return ret;
 	}
 
 	/**
-	 * Make store aligned when copy size exceeds 512 bytes
+	 * Make store aligned when copy size exceeds 256 bytes
 	 */
 	dstofss = (uintptr_t)dst & 0x1F;
 	if (dstofss > 0) {
@@ -522,35 +482,19 @@ COPY_BLOCK_64_BACK31:
 	}
 
 	/**
-	 * Copy 256-byte blocks.
-	 * Use copy block function for better instruction order control,
-	 * which is important when load is unaligned.
+	 * Copy 128-byte blocks
 	 */
-	rte_mov256blocks((uint8_t *)dst, (const uint8_t *)src, n);
+	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
 	bits = n;
-	n = n & 255;
+	n = n & 127;
 	bits -= n;
 	src = (const uint8_t *)src + bits;
 	dst = (uint8_t *)dst + bits;
 
 	/**
-	 * Copy 64-byte blocks.
-	 * Use copy block function for better instruction order control,
-	 * which is important when load is unaligned.
-	 */
-	if (n >= 64) {
-		rte_mov64blocks((uint8_t *)dst, (const uint8_t *)src, n);
-		bits = n;
-		n = n & 63;
-		bits -= n;
-		src = (const uint8_t *)src + bits;
-		dst = (uint8_t *)dst + bits;
-	}
-
-	/**
 	 * Copy whatever left
 	 */
-	goto COPY_BLOCK_64_BACK31;
+	goto COPY_BLOCK_128_BACK31;
 }
 
 #else /* RTE_MACHINE_CPUFLAG */
-- 
2.5.0

* Re: [dpdk-dev] [PATCH] eal: fix rte_memcpy perf in hsw/bdw
  2016-05-25  1:23 [dpdk-dev] [PATCH] eal: fix rte_memcpy perf in hsw/bdw Zhihong Wang
@ 2016-05-26  5:19 ` Xu, Qian Q
  2016-05-26  9:36   ` Wang, Zhihong
  2016-06-15 14:21 ` Thomas Monjalon
  1 sibling, 1 reply; 4+ messages in thread
From: Xu, Qian Q @ 2016-05-26  5:19 UTC (permalink / raw)
  To: Wang, Zhihong, dev; +Cc: Wang, Zhihong

Tested-by: Qian Xu <qian.q.xu@intel.com>

- Test Commit: 8f6f24342281f59de0df7bd976a32f714d39b9a9
- OS/Kernel: Fedora 21/4.1.13
- GCC: gcc (GCC) 4.9.2 20141101 (Red Hat 4.9.2-1)
- CPU: Intel(R) Xeon(R) CPU E5-2695 v4 @ 2.10GHz
- Total 1 case, 1 passed, 0 failed.

1. The test scenario is Vhost-Virtio loopback without a NIC.
2. Update the packet size in the testpmd config file from 64 to 1518.
3. Run the vhost PMD in testpmd:
rm -rf vhost-net*
./x86_64-native-linuxapp-gcc/app/testpmd -c 0xc0000 -n 4 --socket-mem 1024,1024 --vdev 'eth_vhost0,iface=vhost-net,queues=1'  -- -i --nb-cores=1
>start

4. Launch VM1 with 1 virtio device:
taskset -c 20-21 \
/root/qemu-versions/qemu-2.5.0/x86_64-softmmu/qemu-system-x86_64 -name vm1 \
-cpu host -enable-kvm -m 2048 -object memory-backend-file,id=mem,size=2048M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc \
-smp cores=2,sockets=1 -drive file=/home/img/vm1.img  \
-chardev socket,id=char0,path=./vhost-net \
-netdev type=vhost-user,id=mynet1,chardev=char0,vhostforce,queues=1 \
-device virtio-net-pci,mac=52:54:00:00:00:01,netdev=mynet1,mrg_rxbuf=off,mq=on \
-netdev tap,id=ipvm1,ifname=tap3,script=/etc/qemu-ifup -device rtl8139,netdev=ipvm1,id=net0,mac=00:00:00:00:10:01 \
-vnc :2 -daemonize

5. In the VM, run testpmd with the virtio device:
./testpmd -c 0x3 -n 4 -- -i
>start 
>show port stats all

6. Check the best vhost RX/TX rate and record it as #1.

7. Apply this patch, rerun the case, and record the new number as #2.

8. Compare #1 with #2: there is a ~5% performance increase on a BDW-EP
server.
Thanks
Qian

-----Original Message-----
From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Zhihong Wang
Sent: Wednesday, May 25, 2016 9:23 AM
To: dev@dpdk.org
Cc: Wang, Zhihong
Subject: [dpdk-dev] [PATCH] eal: fix rte_memcpy perf in hsw/bdw

This patch fixes rte_memcpy performance in Haswell and Broadwell for vhost when the copy size is larger than 256 bytes.

It is observed that for large copies such as 1024/1518-byte ones, rte_memcpy suffers a high rate of store buffer full events, which stall the pipeline in scenarios like vhost enqueue. This can be alleviated by adjusting the instruction layout. Note that this issue may not be visible in micro benchmarks.

How to reproduce?

PHY-VM-PHY using vhost/virtio, or a vhost/virtio loopback, with large packets such as 1024/1518-byte ones. Make sure the packet generation rate is not the bottleneck if PHY-VM-PHY is used.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
[...]

* Re: [dpdk-dev] [PATCH] eal: fix rte_memcpy perf in hsw/bdw
  2016-05-26  5:19 ` Xu, Qian Q
@ 2016-05-26  9:36   ` Wang, Zhihong
  0 siblings, 0 replies; 4+ messages in thread
From: Wang, Zhihong @ 2016-05-26  9:36 UTC (permalink / raw)
  To: Xu, Qian Q, dev


> -----Original Message-----
> From: Xu, Qian Q
> Sent: Thursday, May 26, 2016 1:19 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org
> Cc: Wang, Zhihong <zhihong.wang@intel.com>
> Subject: RE: [dpdk-dev] [PATCH] eal: fix rte_memcpy perf in hsw/bdw
> 
> Tested-by: Qian Xu <qian.q.xu@intel.com>
> 
> - Test Commit: 8f6f24342281f59de0df7bd976a32f714d39b9a9
> - OS/Kernel: Fedora 21/4.1.13
> - GCC: gcc (GCC) 4.9.2 20141101 (Red Hat 4.9.2-1)
> - CPU: Intel(R) Xeon(R) CPU E5-2695 v4 @ 2.10GHz
> - Total 1 case, 1 passed, 0 failed.
[...]
> 
> 8. Compare #1 with #2: there is a ~5% performance increase on a BDW-EP
> server.

Thanks, Qian! HSW should suffer even more from this issue.

* Re: [dpdk-dev] [PATCH] eal: fix rte_memcpy perf in hsw/bdw
  2016-05-25  1:23 [dpdk-dev] [PATCH] eal: fix rte_memcpy perf in hsw/bdw Zhihong Wang
  2016-05-26  5:19 ` Xu, Qian Q
@ 2016-06-15 14:21 ` Thomas Monjalon
  1 sibling, 0 replies; 4+ messages in thread
From: Thomas Monjalon @ 2016-06-15 14:21 UTC (permalink / raw)
  To: Zhihong Wang; +Cc: dev

2016-05-24 21:23, Zhihong Wang:
> This patch fixes rte_memcpy performance in Haswell and Broadwell for
> vhost when the copy size is larger than 256 bytes.
> 
> It is observed that for large copies such as 1024/1518-byte ones,
> rte_memcpy suffers a high rate of store buffer full events, which
> stall the pipeline in scenarios like vhost enqueue. This can be
> alleviated by adjusting the instruction layout. Note that this issue
> may not be visible in micro benchmarks.
> 
> How to reproduce?
> 
> PHY-VM-PHY using vhost/virtio, or a vhost/virtio loopback, with large
> packets such as 1024/1518-byte ones. Make sure the packet generation
> rate is not the bottleneck if PHY-VM-PHY is used.
> 
> Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>

Test report: http://dpdk.org/ml/archives/dev/2016-May/039716.html

Applied, thanks
