DPDK patches and discussions
* [PATCH] vhost: support CPU copy for small packets
@ 2022-08-12  6:45 Wenwu Ma
  2022-08-12  7:34 ` [PATCH v2] " Wenwu Ma
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Wenwu Ma @ 2022-08-12  6:45 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia, dev
  Cc: jiayu.hu, yinan.wang, xingguang.he, xuan.ding, cheng1.jiang,
	yuanx.wang, Wenwu Ma

Offloading small packets to DMA degrades throughput by 10%~20%,
because DMA offloading is not free and DMA engines are not
efficient at copying small packets. In addition, control plane
packets are usually small, and assigning them to DMA can
significantly increase latency, which may cause timeouts for
packets such as TCP handshakes. Therefore, this patch uses the
CPU to perform small copies in vhost.

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 lib/vhost/vhost.h      |  6 ++-
 lib/vhost/virtio_net.c | 87 +++++++++++++++++++++++++++---------------
 2 files changed, 61 insertions(+), 32 deletions(-)

diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 40fac3b7c6..b4523175a8 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -142,8 +142,10 @@ struct virtqueue_stats {
  * iovec
  */
 struct vhost_iovec {
-	void *src_addr;
-	void *dst_addr;
+	void *src_iov_addr;
+	void *dst_iov_addr;
+	void *src_virt_addr;
+	void *dst_virt_addr;
 	size_t len;
 };
 
diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 35fa4670fd..b3bed93de7 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -26,6 +26,8 @@
 
 #define MAX_BATCH_LEN 256
 
+#define CPU_COPY_THRESHOLD_LEN 256
+
 static __rte_always_inline uint16_t
 async_poll_dequeue_completed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		struct rte_mbuf **pkts, uint16_t count, int16_t dma_id,
@@ -114,29 +116,36 @@ vhost_async_dma_transfer_one(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	int copy_idx = 0;
 	uint32_t nr_segs = pkt->nr_segs;
 	uint16_t i;
+	bool cpu_copy = true;
 
 	if (rte_dma_burst_capacity(dma_id, vchan_id) < nr_segs)
 		return -1;
 
 	for (i = 0; i < nr_segs; i++) {
-		copy_idx = rte_dma_copy(dma_id, vchan_id, (rte_iova_t)iov[i].src_addr,
-				(rte_iova_t)iov[i].dst_addr, iov[i].len, RTE_DMA_OP_FLAG_LLC);
-		/**
-		 * Since all memory is pinned and DMA vChannel
-		 * ring has enough space, failure should be a
-		 * rare case. If failure happens, it means DMA
-		 * device encounters serious errors; in this
-		 * case, please stop async data-path and check
-		 * what has happened to DMA device.
-		 */
-		if (unlikely(copy_idx < 0)) {
-			if (!vhost_async_dma_copy_log) {
-				VHOST_LOG_DATA(dev->ifname, ERR,
-					"DMA copy failed for channel %d:%u\n",
-					dma_id, vchan_id);
-				vhost_async_dma_copy_log = true;
+		if (iov[i].len > CPU_COPY_THRESHOLD_LEN) {
+			copy_idx = rte_dma_copy(dma_id, vchan_id, (rte_iova_t)iov[i].src_iov_addr,
+					(rte_iova_t)iov[i].dst_iov_addr,
+					iov[i].len, RTE_DMA_OP_FLAG_LLC);
+			/**
+			 * Since all memory is pinned and DMA vChannel
+			 * ring has enough space, failure should be a
+			 * rare case. If failure happens, it means DMA
+			 * device encounters serious errors; in this
+			 * case, please stop async data-path and check
+			 * what has happened to DMA device.
+			 */
+			if (unlikely(copy_idx < 0)) {
+				if (!vhost_async_dma_copy_log) {
+					VHOST_LOG_DATA(dev->ifname, ERR,
+						"DMA copy failed for channel %d:%u\n",
+						dma_id, vchan_id);
+					vhost_async_dma_copy_log = true;
+				}
+				return -1;
 			}
-			return -1;
+			cpu_copy = false;
+		} else {
+			rte_memcpy(iov[i].dst_virt_addr, iov[i].src_virt_addr, iov[i].len);
 		}
 	}
 
@@ -144,7 +153,13 @@ vhost_async_dma_transfer_one(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	 * Only store packet completion flag address in the last copy's
 	 * slot, and other slots are set to NULL.
 	 */
-	dma_info->pkts_cmpl_flag_addr[copy_idx & ring_mask] = &vq->async->pkts_cmpl_flag[flag_idx];
+	if (cpu_copy == false) {
+		dma_info->pkts_cmpl_flag_addr[copy_idx & ring_mask] =
+			&vq->async->pkts_cmpl_flag[flag_idx];
+	} else {
+		vq->async->pkts_cmpl_flag[flag_idx] = true;
+		nr_segs = 0;
+	}
 
 	return nr_segs;
 }
@@ -1008,7 +1023,7 @@ async_iter_initialize(struct virtio_net *dev, struct vhost_async *async)
 
 static __rte_always_inline int
 async_iter_add_iovec(struct virtio_net *dev, struct vhost_async *async,
-		void *src, void *dst, size_t len)
+		void *src_iova, void *dst_iova, void *src_addr, void *dst_addr, size_t len)
 {
 	struct vhost_iov_iter *iter;
 	struct vhost_iovec *iovec;
@@ -1027,8 +1042,10 @@ async_iter_add_iovec(struct virtio_net *dev, struct vhost_async *async,
 	iter = async->iov_iter + async->iter_idx;
 	iovec = async->iovec + async->iovec_idx;
 
-	iovec->src_addr = src;
-	iovec->dst_addr = dst;
+	iovec->src_iov_addr = src_iova;
+	iovec->dst_iov_addr = dst_iova;
+	iovec->src_virt_addr = src_addr;
+	iovec->dst_virt_addr = dst_addr;
 	iovec->len = len;
 
 	iter->nr_segs++;
@@ -1064,12 +1081,13 @@ async_iter_reset(struct vhost_async *async)
 static __rte_always_inline int
 async_fill_seg(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		struct rte_mbuf *m, uint32_t mbuf_offset,
-		uint64_t buf_iova, uint32_t cpy_len, bool to_desc)
+		uint64_t buf_iova, uint64_t buf_addr, uint32_t cpy_len, bool to_desc)
 {
 	struct vhost_async *async = vq->async;
 	uint64_t mapped_len;
 	uint32_t buf_offset = 0;
-	void *src, *dst;
+	void *src_iova, *dst_iova;
+	void *src_addr, *dst_addr;
 	void *host_iova;
 
 	while (cpy_len) {
@@ -1083,14 +1101,21 @@ async_fill_seg(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		}
 
 		if (to_desc) {
-			src = (void *)(uintptr_t)rte_pktmbuf_iova_offset(m, mbuf_offset);
-			dst = host_iova;
+			src_iova = (void *)(uintptr_t)rte_pktmbuf_iova_offset(m, mbuf_offset);
+			dst_iova = host_iova;
+
+			src_addr = rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
+			dst_addr = (void *)(buf_addr + buf_offset);
 		} else {
-			src = host_iova;
-			dst = (void *)(uintptr_t)rte_pktmbuf_iova_offset(m, mbuf_offset);
+			src_iova = host_iova;
+			dst_iova = (void *)(uintptr_t)rte_pktmbuf_iova_offset(m, mbuf_offset);
+
+			src_addr = (void *)(buf_addr + buf_offset);
+			dst_addr = rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
 		}
 
-		if (unlikely(async_iter_add_iovec(dev, async, src, dst, (size_t)mapped_len)))
+		if (unlikely(async_iter_add_iovec(dev, async, src_iova, dst_iova,
+						src_addr, dst_addr, (size_t)mapped_len)))
 			return -1;
 
 		cpy_len -= (uint32_t)mapped_len;
@@ -1239,7 +1264,8 @@ mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 		if (is_async) {
 			if (async_fill_seg(dev, vq, m, mbuf_offset,
-					   buf_iova + buf_offset, cpy_len, true) < 0)
+					   buf_iova + buf_offset, buf_addr + buf_offset,
+					   cpy_len, true) < 0)
 				goto error;
 		} else {
 			sync_fill_seg(dev, vq, m, mbuf_offset,
@@ -2737,7 +2763,8 @@ desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 		if (is_async) {
 			if (async_fill_seg(dev, vq, cur, mbuf_offset,
-					   buf_iova + buf_offset, cpy_len, false) < 0)
+					   buf_iova + buf_offset, buf_addr + buf_offset,
+					   cpy_len, false) < 0)
 				goto error;
 		} else if (likely(hdr && cur == m)) {
 			rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset),
-- 
2.25.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH v2] vhost: support CPU copy for small packets
  2022-08-12  6:45 [PATCH] vhost: support CPU copy for small packets Wenwu Ma
@ 2022-08-12  7:34 ` Wenwu Ma
  2022-08-26  5:31 ` [PATCH v3] " Wenwu Ma
  2022-08-29  0:56 ` [PATCH v4] " Wenwu Ma
  2 siblings, 0 replies; 8+ messages in thread
From: Wenwu Ma @ 2022-08-12  7:34 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia, dev
  Cc: jiayu.hu, yinan.wang, xingguang.he, xuan.ding, cheng1.jiang,
	yuanx.wang, Wenwu Ma

Offloading small packets to DMA degrades throughput by 10%~20%,
because DMA offloading is not free and DMA engines are not
efficient at copying small packets. In addition, control plane
packets are usually small, and assigning them to DMA can
significantly increase latency, which may cause timeouts for
packets such as TCP handshakes. Therefore, this patch uses the
CPU to perform small copies in vhost.

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
v2:
* fix CI build error
---
 lib/vhost/vhost.h      |  6 ++-
 lib/vhost/virtio_net.c | 87 +++++++++++++++++++++++++++---------------
 2 files changed, 61 insertions(+), 32 deletions(-)

diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 40fac3b7c6..b4523175a8 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -142,8 +142,10 @@ struct virtqueue_stats {
  * iovec
  */
 struct vhost_iovec {
-	void *src_addr;
-	void *dst_addr;
+	void *src_iov_addr;
+	void *dst_iov_addr;
+	void *src_virt_addr;
+	void *dst_virt_addr;
 	size_t len;
 };
 
diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 35fa4670fd..5ab30facc1 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -26,6 +26,8 @@
 
 #define MAX_BATCH_LEN 256
 
+#define CPU_COPY_THRESHOLD_LEN 256
+
 static __rte_always_inline uint16_t
 async_poll_dequeue_completed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		struct rte_mbuf **pkts, uint16_t count, int16_t dma_id,
@@ -114,29 +116,36 @@ vhost_async_dma_transfer_one(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	int copy_idx = 0;
 	uint32_t nr_segs = pkt->nr_segs;
 	uint16_t i;
+	bool cpu_copy = true;
 
 	if (rte_dma_burst_capacity(dma_id, vchan_id) < nr_segs)
 		return -1;
 
 	for (i = 0; i < nr_segs; i++) {
-		copy_idx = rte_dma_copy(dma_id, vchan_id, (rte_iova_t)iov[i].src_addr,
-				(rte_iova_t)iov[i].dst_addr, iov[i].len, RTE_DMA_OP_FLAG_LLC);
-		/**
-		 * Since all memory is pinned and DMA vChannel
-		 * ring has enough space, failure should be a
-		 * rare case. If failure happens, it means DMA
-		 * device encounters serious errors; in this
-		 * case, please stop async data-path and check
-		 * what has happened to DMA device.
-		 */
-		if (unlikely(copy_idx < 0)) {
-			if (!vhost_async_dma_copy_log) {
-				VHOST_LOG_DATA(dev->ifname, ERR,
-					"DMA copy failed for channel %d:%u\n",
-					dma_id, vchan_id);
-				vhost_async_dma_copy_log = true;
+		if (iov[i].len > CPU_COPY_THRESHOLD_LEN) {
+			copy_idx = rte_dma_copy(dma_id, vchan_id, (rte_iova_t)iov[i].src_iov_addr,
+					(rte_iova_t)iov[i].dst_iov_addr,
+					iov[i].len, RTE_DMA_OP_FLAG_LLC);
+			/**
+			 * Since all memory is pinned and DMA vChannel
+			 * ring has enough space, failure should be a
+			 * rare case. If failure happens, it means DMA
+			 * device encounters serious errors; in this
+			 * case, please stop async data-path and check
+			 * what has happened to DMA device.
+			 */
+			if (unlikely(copy_idx < 0)) {
+				if (!vhost_async_dma_copy_log) {
+					VHOST_LOG_DATA(dev->ifname, ERR,
+						"DMA copy failed for channel %d:%u\n",
+						dma_id, vchan_id);
+					vhost_async_dma_copy_log = true;
+				}
+				return -1;
 			}
-			return -1;
+			cpu_copy = false;
+		} else {
+			rte_memcpy(iov[i].dst_virt_addr, iov[i].src_virt_addr, iov[i].len);
 		}
 	}
 
@@ -144,7 +153,13 @@ vhost_async_dma_transfer_one(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	 * Only store packet completion flag address in the last copy's
 	 * slot, and other slots are set to NULL.
 	 */
-	dma_info->pkts_cmpl_flag_addr[copy_idx & ring_mask] = &vq->async->pkts_cmpl_flag[flag_idx];
+	if (cpu_copy == false) {
+		dma_info->pkts_cmpl_flag_addr[copy_idx & ring_mask] =
+			&vq->async->pkts_cmpl_flag[flag_idx];
+	} else {
+		vq->async->pkts_cmpl_flag[flag_idx] = true;
+		nr_segs = 0;
+	}
 
 	return nr_segs;
 }
@@ -1008,7 +1023,7 @@ async_iter_initialize(struct virtio_net *dev, struct vhost_async *async)
 
 static __rte_always_inline int
 async_iter_add_iovec(struct virtio_net *dev, struct vhost_async *async,
-		void *src, void *dst, size_t len)
+		void *src_iova, void *dst_iova, void *src_addr, void *dst_addr, size_t len)
 {
 	struct vhost_iov_iter *iter;
 	struct vhost_iovec *iovec;
@@ -1027,8 +1042,10 @@ async_iter_add_iovec(struct virtio_net *dev, struct vhost_async *async,
 	iter = async->iov_iter + async->iter_idx;
 	iovec = async->iovec + async->iovec_idx;
 
-	iovec->src_addr = src;
-	iovec->dst_addr = dst;
+	iovec->src_iov_addr = src_iova;
+	iovec->dst_iov_addr = dst_iova;
+	iovec->src_virt_addr = src_addr;
+	iovec->dst_virt_addr = dst_addr;
 	iovec->len = len;
 
 	iter->nr_segs++;
@@ -1064,12 +1081,13 @@ async_iter_reset(struct vhost_async *async)
 static __rte_always_inline int
 async_fill_seg(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		struct rte_mbuf *m, uint32_t mbuf_offset,
-		uint64_t buf_iova, uint32_t cpy_len, bool to_desc)
+		uint64_t buf_iova, uint64_t buf_addr, uint32_t cpy_len, bool to_desc)
 {
 	struct vhost_async *async = vq->async;
 	uint64_t mapped_len;
 	uint32_t buf_offset = 0;
-	void *src, *dst;
+	void *src_iova, *dst_iova;
+	void *src_addr, *dst_addr;
 	void *host_iova;
 
 	while (cpy_len) {
@@ -1083,14 +1101,21 @@ async_fill_seg(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		}
 
 		if (to_desc) {
-			src = (void *)(uintptr_t)rte_pktmbuf_iova_offset(m, mbuf_offset);
-			dst = host_iova;
+			src_iova = (void *)(uintptr_t)rte_pktmbuf_iova_offset(m, mbuf_offset);
+			dst_iova = host_iova;
+
+			src_addr = rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
+			dst_addr = (void *)(uintptr_t)(buf_addr + buf_offset);
 		} else {
-			src = host_iova;
-			dst = (void *)(uintptr_t)rte_pktmbuf_iova_offset(m, mbuf_offset);
+			src_iova = host_iova;
+			dst_iova = (void *)(uintptr_t)rte_pktmbuf_iova_offset(m, mbuf_offset);
+
+			src_addr = (void *)(uintptr_t)(buf_addr + buf_offset);
+			dst_addr = rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
 		}
 
-		if (unlikely(async_iter_add_iovec(dev, async, src, dst, (size_t)mapped_len)))
+		if (unlikely(async_iter_add_iovec(dev, async, src_iova, dst_iova,
+						src_addr, dst_addr, (size_t)mapped_len)))
 			return -1;
 
 		cpy_len -= (uint32_t)mapped_len;
@@ -1239,7 +1264,8 @@ mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 		if (is_async) {
 			if (async_fill_seg(dev, vq, m, mbuf_offset,
-					   buf_iova + buf_offset, cpy_len, true) < 0)
+					   buf_iova + buf_offset, buf_addr + buf_offset,
+					   cpy_len, true) < 0)
 				goto error;
 		} else {
 			sync_fill_seg(dev, vq, m, mbuf_offset,
@@ -2737,7 +2763,8 @@ desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 		if (is_async) {
 			if (async_fill_seg(dev, vq, cur, mbuf_offset,
-					   buf_iova + buf_offset, cpy_len, false) < 0)
+					   buf_iova + buf_offset, buf_addr + buf_offset,
+					   cpy_len, false) < 0)
 				goto error;
 		} else if (likely(hdr && cur == m)) {
 			rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset),
-- 
2.25.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH v3] vhost: support CPU copy for small packets
  2022-08-12  6:45 [PATCH] vhost: support CPU copy for small packets Wenwu Ma
  2022-08-12  7:34 ` [PATCH v2] " Wenwu Ma
@ 2022-08-26  5:31 ` Wenwu Ma
  2022-08-29  0:56 ` [PATCH v4] " Wenwu Ma
  2 siblings, 0 replies; 8+ messages in thread
From: Wenwu Ma @ 2022-08-26  5:31 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia, dev
  Cc: sunil.pai.g, jiayu.hu, yinan.wang, xingguang.he, xuan.ding,
	cheng1.jiang, yuanx.wang, Wenwu Ma

Offloading small packets to DMA degrades throughput by 10%~20%,
because DMA offloading is not free and DMA engines are not
efficient at copying small packets. In addition, control plane
packets are usually small, and assigning them to DMA can
significantly increase latency, which may cause timeouts for
packets such as TCP handshakes. Therefore, this patch uses the
CPU to perform small copies in vhost.

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
v3:
* compare threshold with entire packet length
v2:
* fix CI build error
---
 lib/vhost/vhost.h      |  7 ++--
 lib/vhost/virtio_net.c | 72 ++++++++++++++++++++++++++++++++----------
 2 files changed, 61 insertions(+), 18 deletions(-)

diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 40fac3b7c6..8a7d90f737 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -142,8 +142,10 @@ struct virtqueue_stats {
  * iovec
  */
 struct vhost_iovec {
-	void *src_addr;
-	void *dst_addr;
+	void *src_iov_addr;
+	void *dst_iov_addr;
+	void *src_virt_addr;
+	void *dst_virt_addr;
 	size_t len;
 };
 
@@ -155,6 +157,7 @@ struct vhost_iov_iter {
 	struct vhost_iovec *iov;
 	/** number of iovec in this iterator */
 	unsigned long nr_segs;
+	unsigned long nr_len;
 };
 
 struct async_dma_vchan_info {
diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 35fa4670fd..2b18c908fd 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -26,6 +26,8 @@
 
 #define MAX_BATCH_LEN 256
 
+#define CPU_COPY_THRESHOLD_LEN 256
+
 static __rte_always_inline uint16_t
 async_poll_dequeue_completed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		struct rte_mbuf **pkts, uint16_t count, int16_t dma_id,
@@ -119,8 +121,8 @@ vhost_async_dma_transfer_one(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		return -1;
 
 	for (i = 0; i < nr_segs; i++) {
-		copy_idx = rte_dma_copy(dma_id, vchan_id, (rte_iova_t)iov[i].src_addr,
-				(rte_iova_t)iov[i].dst_addr, iov[i].len, RTE_DMA_OP_FLAG_LLC);
+		copy_idx = rte_dma_copy(dma_id, vchan_id, (rte_iova_t)iov[i].src_iov_addr,
+				(rte_iova_t)iov[i].dst_iov_addr, iov[i].len, RTE_DMA_OP_FLAG_LLC);
 		/**
 		 * Since all memory is pinned and DMA vChannel
 		 * ring has enough space, failure should be a
@@ -149,6 +151,21 @@ vhost_async_dma_transfer_one(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	return nr_segs;
 }
 
+static __rte_always_inline int64_t
+vhost_async_cpu_transfer_one(struct vhost_virtqueue *vq, uint16_t flag_idx,
+		struct vhost_iov_iter *pkt)
+{
+	struct vhost_iovec *iov = pkt->iov;
+	uint32_t nr_segs = pkt->nr_segs;
+
+	for (uint16_t i = 0; i < nr_segs; i++)
+		rte_memcpy(iov[i].dst_virt_addr, iov[i].src_virt_addr, iov[i].len);
+
+	vq->async->pkts_cmpl_flag[flag_idx] = true;
+
+	return 0;
+}
+
 static __rte_always_inline uint16_t
 vhost_async_dma_transfer(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		int16_t dma_id, uint16_t vchan_id, uint16_t head_idx,
@@ -161,8 +178,13 @@ vhost_async_dma_transfer(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	rte_spinlock_lock(&dma_info->dma_lock);
 
 	for (pkt_idx = 0; pkt_idx < nr_pkts; pkt_idx++) {
-		ret = vhost_async_dma_transfer_one(dev, vq, dma_id, vchan_id, head_idx,
-				&pkts[pkt_idx]);
+		if (pkts[pkt_idx].nr_len > CPU_COPY_THRESHOLD_LEN) {
+			ret = vhost_async_dma_transfer_one(dev, vq, dma_id, vchan_id, head_idx,
+					&pkts[pkt_idx]);
+		} else {
+			ret = vhost_async_cpu_transfer_one(vq, head_idx, &pkts[pkt_idx]);
+		}
+
 		if (unlikely(ret < 0))
 			break;
 
@@ -1002,13 +1024,14 @@ async_iter_initialize(struct virtio_net *dev, struct vhost_async *async)
 	iter = async->iov_iter + async->iter_idx;
 	iter->iov = async->iovec + async->iovec_idx;
 	iter->nr_segs = 0;
+	iter->nr_len = 0;
 
 	return 0;
 }
 
 static __rte_always_inline int
 async_iter_add_iovec(struct virtio_net *dev, struct vhost_async *async,
-		void *src, void *dst, size_t len)
+		void *src_iova, void *dst_iova, void *src_addr, void *dst_addr, size_t len)
 {
 	struct vhost_iov_iter *iter;
 	struct vhost_iovec *iovec;
@@ -1027,8 +1050,10 @@ async_iter_add_iovec(struct virtio_net *dev, struct vhost_async *async,
 	iter = async->iov_iter + async->iter_idx;
 	iovec = async->iovec + async->iovec_idx;
 
-	iovec->src_addr = src;
-	iovec->dst_addr = dst;
+	iovec->src_iov_addr = src_iova;
+	iovec->dst_iov_addr = dst_iova;
+	iovec->src_virt_addr = src_addr;
+	iovec->dst_virt_addr = dst_addr;
 	iovec->len = len;
 
 	iter->nr_segs++;
@@ -1051,6 +1076,7 @@ async_iter_cancel(struct vhost_async *async)
 	iter = async->iov_iter + async->iter_idx;
 	async->iovec_idx -= iter->nr_segs;
 	iter->nr_segs = 0;
+	iter->nr_len = 0;
 	iter->iov = NULL;
 }
 
@@ -1064,13 +1090,18 @@ async_iter_reset(struct vhost_async *async)
 static __rte_always_inline int
 async_fill_seg(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		struct rte_mbuf *m, uint32_t mbuf_offset,
-		uint64_t buf_iova, uint32_t cpy_len, bool to_desc)
+		uint64_t buf_iova, uint64_t buf_addr, uint32_t cpy_len, bool to_desc)
 {
 	struct vhost_async *async = vq->async;
 	uint64_t mapped_len;
 	uint32_t buf_offset = 0;
-	void *src, *dst;
+	void *src_iova, *dst_iova;
+	void *src_addr, *dst_addr;
 	void *host_iova;
+	struct vhost_iov_iter *iter;
+
+	iter = async->iov_iter + async->iter_idx;
+	iter->nr_len += cpy_len;
 
 	while (cpy_len) {
 		host_iova = (void *)(uintptr_t)gpa_to_first_hpa(dev,
@@ -1083,14 +1114,21 @@ async_fill_seg(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		}
 
 		if (to_desc) {
-			src = (void *)(uintptr_t)rte_pktmbuf_iova_offset(m, mbuf_offset);
-			dst = host_iova;
+			src_iova = (void *)(uintptr_t)rte_pktmbuf_iova_offset(m, mbuf_offset);
+			dst_iova = host_iova;
+
+			src_addr = rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
+			dst_addr = (void *)(uintptr_t)(buf_addr + buf_offset);
 		} else {
-			src = host_iova;
-			dst = (void *)(uintptr_t)rte_pktmbuf_iova_offset(m, mbuf_offset);
+			src_iova = host_iova;
+			dst_iova = (void *)(uintptr_t)rte_pktmbuf_iova_offset(m, mbuf_offset);
+
+			src_addr = (void *)(uintptr_t)(buf_addr + buf_offset);
+			dst_addr = rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
 		}
 
-		if (unlikely(async_iter_add_iovec(dev, async, src, dst, (size_t)mapped_len)))
+		if (unlikely(async_iter_add_iovec(dev, async, src_iova, dst_iova,
+						src_addr, dst_addr, (size_t)mapped_len)))
 			return -1;
 
 		cpy_len -= (uint32_t)mapped_len;
@@ -1239,7 +1277,8 @@ mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 		if (is_async) {
 			if (async_fill_seg(dev, vq, m, mbuf_offset,
-					   buf_iova + buf_offset, cpy_len, true) < 0)
+					   buf_iova + buf_offset, buf_addr + buf_offset,
+					   cpy_len, true) < 0)
 				goto error;
 		} else {
 			sync_fill_seg(dev, vq, m, mbuf_offset,
@@ -2737,7 +2776,8 @@ desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 		if (is_async) {
 			if (async_fill_seg(dev, vq, cur, mbuf_offset,
-					   buf_iova + buf_offset, cpy_len, false) < 0)
+					   buf_iova + buf_offset, buf_addr + buf_offset,
+					   cpy_len, false) < 0)
 				goto error;
 		} else if (likely(hdr && cur == m)) {
 			rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset),
-- 
2.25.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH v4] vhost: support CPU copy for small packets
  2022-08-12  6:45 [PATCH] vhost: support CPU copy for small packets Wenwu Ma
  2022-08-12  7:34 ` [PATCH v2] " Wenwu Ma
  2022-08-26  5:31 ` [PATCH v3] " Wenwu Ma
@ 2022-08-29  0:56 ` Wenwu Ma
  2022-09-07 14:47   ` Morten Brørup
  2024-10-04  2:22   ` Stephen Hemminger
  2 siblings, 2 replies; 8+ messages in thread
From: Wenwu Ma @ 2022-08-29  0:56 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia, dev
  Cc: sunil.pai.g, jiayu.hu, yinan.wang, xingguang.he, xuan.ding,
	cheng1.jiang, yuanx.wang, Wenwu Ma

Offloading small packets to DMA degrades throughput by 10%~20%,
because DMA offloading is not free and DMA engines are not
efficient at copying small packets. In addition, control plane
packets are usually small, and assigning them to DMA can
significantly increase latency, which may cause timeouts for
packets such as TCP handshakes. Therefore, this patch uses the
CPU to perform small copies in vhost.

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
v4:
* fix coding style issues
v3:
* compare threshold with entire packet length
v2:
* fix CI build error
---
 lib/vhost/vhost.h      |  7 ++--
 lib/vhost/virtio_net.c | 73 +++++++++++++++++++++++++++++++++---------
 2 files changed, 62 insertions(+), 18 deletions(-)

diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 40fac3b7c6..8a7d90f737 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -142,8 +142,10 @@ struct virtqueue_stats {
  * iovec
  */
 struct vhost_iovec {
-	void *src_addr;
-	void *dst_addr;
+	void *src_iov_addr;
+	void *dst_iov_addr;
+	void *src_virt_addr;
+	void *dst_virt_addr;
 	size_t len;
 };
 
@@ -155,6 +157,7 @@ struct vhost_iov_iter {
 	struct vhost_iovec *iov;
 	/** number of iovec in this iterator */
 	unsigned long nr_segs;
+	unsigned long nr_len;
 };
 
 struct async_dma_vchan_info {
diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 35fa4670fd..cf796183a0 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -26,6 +26,8 @@
 
 #define MAX_BATCH_LEN 256
 
+#define CPU_COPY_THRESHOLD_LEN 256
+
 static __rte_always_inline uint16_t
 async_poll_dequeue_completed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		struct rte_mbuf **pkts, uint16_t count, int16_t dma_id,
@@ -119,8 +121,8 @@ vhost_async_dma_transfer_one(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		return -1;
 
 	for (i = 0; i < nr_segs; i++) {
-		copy_idx = rte_dma_copy(dma_id, vchan_id, (rte_iova_t)iov[i].src_addr,
-				(rte_iova_t)iov[i].dst_addr, iov[i].len, RTE_DMA_OP_FLAG_LLC);
+		copy_idx = rte_dma_copy(dma_id, vchan_id, (rte_iova_t)iov[i].src_iov_addr,
+				(rte_iova_t)iov[i].dst_iov_addr, iov[i].len, RTE_DMA_OP_FLAG_LLC);
 		/**
 		 * Since all memory is pinned and DMA vChannel
 		 * ring has enough space, failure should be a
@@ -149,6 +151,22 @@ vhost_async_dma_transfer_one(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	return nr_segs;
 }
 
+static __rte_always_inline int64_t
+vhost_async_cpu_transfer_one(struct vhost_virtqueue *vq, uint16_t flag_idx,
+		struct vhost_iov_iter *pkt)
+{
+	uint16_t i;
+	struct vhost_iovec *iov = pkt->iov;
+	uint32_t nr_segs = pkt->nr_segs;
+
+	for (i = 0; i < nr_segs; i++)
+		rte_memcpy(iov[i].dst_virt_addr, iov[i].src_virt_addr, iov[i].len);
+
+	vq->async->pkts_cmpl_flag[flag_idx] = true;
+
+	return 0;
+}
+
 static __rte_always_inline uint16_t
 vhost_async_dma_transfer(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		int16_t dma_id, uint16_t vchan_id, uint16_t head_idx,
@@ -161,8 +179,13 @@ vhost_async_dma_transfer(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	rte_spinlock_lock(&dma_info->dma_lock);
 
 	for (pkt_idx = 0; pkt_idx < nr_pkts; pkt_idx++) {
-		ret = vhost_async_dma_transfer_one(dev, vq, dma_id, vchan_id, head_idx,
-				&pkts[pkt_idx]);
+		if (pkts[pkt_idx].nr_len > CPU_COPY_THRESHOLD_LEN) {
+			ret = vhost_async_dma_transfer_one(dev, vq, dma_id, vchan_id, head_idx,
+					&pkts[pkt_idx]);
+		} else {
+			ret = vhost_async_cpu_transfer_one(vq, head_idx, &pkts[pkt_idx]);
+		}
+
 		if (unlikely(ret < 0))
 			break;
 
@@ -1002,13 +1025,14 @@ async_iter_initialize(struct virtio_net *dev, struct vhost_async *async)
 	iter = async->iov_iter + async->iter_idx;
 	iter->iov = async->iovec + async->iovec_idx;
 	iter->nr_segs = 0;
+	iter->nr_len = 0;
 
 	return 0;
 }
 
 static __rte_always_inline int
 async_iter_add_iovec(struct virtio_net *dev, struct vhost_async *async,
-		void *src, void *dst, size_t len)
+		void *src_iova, void *dst_iova, void *src_addr, void *dst_addr, size_t len)
 {
 	struct vhost_iov_iter *iter;
 	struct vhost_iovec *iovec;
@@ -1027,8 +1051,10 @@ async_iter_add_iovec(struct virtio_net *dev, struct vhost_async *async,
 	iter = async->iov_iter + async->iter_idx;
 	iovec = async->iovec + async->iovec_idx;
 
-	iovec->src_addr = src;
-	iovec->dst_addr = dst;
+	iovec->src_iov_addr = src_iova;
+	iovec->dst_iov_addr = dst_iova;
+	iovec->src_virt_addr = src_addr;
+	iovec->dst_virt_addr = dst_addr;
 	iovec->len = len;
 
 	iter->nr_segs++;
@@ -1051,6 +1077,7 @@ async_iter_cancel(struct vhost_async *async)
 	iter = async->iov_iter + async->iter_idx;
 	async->iovec_idx -= iter->nr_segs;
 	iter->nr_segs = 0;
+	iter->nr_len = 0;
 	iter->iov = NULL;
 }
 
@@ -1064,13 +1091,18 @@ async_iter_reset(struct vhost_async *async)
 static __rte_always_inline int
 async_fill_seg(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		struct rte_mbuf *m, uint32_t mbuf_offset,
-		uint64_t buf_iova, uint32_t cpy_len, bool to_desc)
+		uint64_t buf_iova, uint64_t buf_addr, uint32_t cpy_len, bool to_desc)
 {
 	struct vhost_async *async = vq->async;
 	uint64_t mapped_len;
 	uint32_t buf_offset = 0;
-	void *src, *dst;
+	void *src_iova, *dst_iova;
+	void *src_addr, *dst_addr;
 	void *host_iova;
+	struct vhost_iov_iter *iter;
+
+	iter = async->iov_iter + async->iter_idx;
+	iter->nr_len += cpy_len;
 
 	while (cpy_len) {
 		host_iova = (void *)(uintptr_t)gpa_to_first_hpa(dev,
@@ -1083,14 +1115,21 @@ async_fill_seg(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		}
 
 		if (to_desc) {
-			src = (void *)(uintptr_t)rte_pktmbuf_iova_offset(m, mbuf_offset);
-			dst = host_iova;
+			src_iova = (void *)(uintptr_t)rte_pktmbuf_iova_offset(m, mbuf_offset);
+			dst_iova = host_iova;
+
+			src_addr = rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
+			dst_addr = (void *)(uintptr_t)(buf_addr + buf_offset);
 		} else {
-			src = host_iova;
-			dst = (void *)(uintptr_t)rte_pktmbuf_iova_offset(m, mbuf_offset);
+			src_iova = host_iova;
+			dst_iova = (void *)(uintptr_t)rte_pktmbuf_iova_offset(m, mbuf_offset);
+
+			src_addr = (void *)(uintptr_t)(buf_addr + buf_offset);
+			dst_addr = rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
 		}
 
-		if (unlikely(async_iter_add_iovec(dev, async, src, dst, (size_t)mapped_len)))
+		if (unlikely(async_iter_add_iovec(dev, async, src_iova, dst_iova,
+						src_addr, dst_addr, (size_t)mapped_len)))
 			return -1;
 
 		cpy_len -= (uint32_t)mapped_len;
@@ -1239,7 +1278,8 @@ mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 		if (is_async) {
 			if (async_fill_seg(dev, vq, m, mbuf_offset,
-					   buf_iova + buf_offset, cpy_len, true) < 0)
+					   buf_iova + buf_offset, buf_addr + buf_offset,
+					   cpy_len, true) < 0)
 				goto error;
 		} else {
 			sync_fill_seg(dev, vq, m, mbuf_offset,
@@ -2737,7 +2777,8 @@ desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 		if (is_async) {
 			if (async_fill_seg(dev, vq, cur, mbuf_offset,
-					   buf_iova + buf_offset, cpy_len, false) < 0)
+					   buf_iova + buf_offset, buf_addr + buf_offset,
+					   cpy_len, false) < 0)
 				goto error;
 		} else if (likely(hdr && cur == m)) {
 			rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset),
-- 
2.25.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH v4] vhost: support CPU copy for small packets
  2022-08-29  0:56 ` [PATCH v4] " Wenwu Ma
@ 2022-09-07 14:47   ` Morten Brørup
  2022-09-27  7:32     ` Ma, WenwuX
  2024-10-04  2:22   ` Stephen Hemminger
  1 sibling, 1 reply; 8+ messages in thread
From: Morten Brørup @ 2022-09-07 14:47 UTC (permalink / raw)
  To: Wenwu Ma, maxime.coquelin, chenbo.xia, dev, Bruce Richardson
  Cc: sunil.pai.g, jiayu.hu, yinan.wang, xingguang.he, xuan.ding,
	cheng1.jiang, yuanx.wang

> From: Wenwu Ma [mailto:wenwux.ma@intel.com]
> Sent: Monday, 29 August 2022 02.57
> 
> Offloading small packets to DMA degrades throughput by 10%~20%,
> because DMA offloading is not free and DMA engines are not
> efficient at copying small packets. In addition, control plane
> packets are usually small, and assigning them to DMA can
> significantly increase latency, which may cause timeouts for
> packets such as TCP handshakes. Therefore, this patch uses the
> CPU to perform small copies in vhost.
> 
> Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
> ---

[...]

> diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
> index 35fa4670fd..cf796183a0 100644
> --- a/lib/vhost/virtio_net.c
> +++ b/lib/vhost/virtio_net.c
> @@ -26,6 +26,8 @@
> 
>  #define MAX_BATCH_LEN 256
> 
> +#define CPU_COPY_THRESHOLD_LEN 256

This threshold may not be optimal for all CPU architectures and/or DMA engines.

Could you please provide a test application to compare the performance of DMA copy with CPU rte_memcpy?

The performance metric should be simple: How many cycles does the CPU spend copying various packet sizes using each of the two methods.
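
A minimal sketch of the CPU side of such a measurement could look like the
snippet below. This is only an illustration, assuming an EAL-initialized
context (e.g. inside app/test); the packet sizes and iteration count are
arbitrary placeholders:

#include <stdio.h>
#include <string.h>
#include <rte_common.h>
#include <rte_cycles.h>
#include <rte_malloc.h>
#include <rte_memcpy.h>

static void
measure_cpu_copy_cycles(void)
{
	/* Typical packet sizes to sweep; adjust as needed. */
	static const size_t sizes[] = { 64, 128, 256, 512, 1024, 1518 };
	const unsigned int iters = 1000000;
	char *src = rte_malloc(NULL, 2048, RTE_CACHE_LINE_SIZE);
	char *dst = rte_malloc(NULL, 2048, RTE_CACHE_LINE_SIZE);
	unsigned int i, s;

	if (src == NULL || dst == NULL)
		return;

	memset(src, 0xa5, 2048);

	for (s = 0; s < RTE_DIM(sizes); s++) {
		uint64_t start = rte_rdtsc_precise();

		for (i = 0; i < iters; i++)
			rte_memcpy(dst, src, sizes[s]);

		/* Report average cycles per copy for this size. */
		printf("size %zu: %.1f cycles/copy\n", sizes[s],
				(double)(rte_rdtsc_precise() - start) / iters);
	}

	rte_free(src);
	rte_free(dst);
}

The DMA side would additionally need to account for the cycles spent in
rte_dma_copy()/rte_dma_submit() and in polling completions, which is why a
dedicated DMA perf test seems like the cleaner place for it.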

You could provide test_dmadev_perf.c in addition to the existing test_dmadev.c.
You can probably copy some of the concepts and code from test_memcpy_perf.c.
Alternatively, you might be able to add DMA copy to test_memcpy_perf.c.

I'm sorry to push this on you - it should have been done as part of DMAdev development already.

-Morten


^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH v4] vhost: support CPU copy for small packets
  2022-09-07 14:47   ` Morten Brørup
@ 2022-09-27  7:32     ` Ma, WenwuX
  2022-09-27 10:45       ` Morten Brørup
  0 siblings, 1 reply; 8+ messages in thread
From: Ma, WenwuX @ 2022-09-27  7:32 UTC (permalink / raw)
  To: Morten Brørup, maxime.coquelin, Xia, Chenbo, dev, Richardson, Bruce
  Cc: Pai G, Sunil, Hu, Jiayu, Wang, Yinan, He, Xingguang, Ding, Xuan,
	Jiang, Cheng1, Wang, YuanX



> -----Original Message-----
> From: Morten Brørup <mb@smartsharesystems.com>
> Sent: 2022年9月7日 22:47
> To: Ma, WenwuX <wenwux.ma@intel.com>; maxime.coquelin@redhat.com;
> Xia, Chenbo <chenbo.xia@intel.com>; dev@dpdk.org; Richardson, Bruce
> <bruce.richardson@intel.com>
> Cc: Pai G, Sunil <sunil.pai.g@intel.com>; Hu, Jiayu <jiayu.hu@intel.com>;
> Wang, Yinan <yinan.wang@intel.com>; He, Xingguang
> <xingguang.he@intel.com>; Ding, Xuan <xuan.ding@intel.com>; Jiang,
> Cheng1 <cheng1.jiang@intel.com>; Wang, YuanX <yuanx.wang@intel.com>
> Subject: RE: [PATCH v4] vhost: support CPU copy for small packets
> 
> > From: Wenwu Ma [mailto:wenwux.ma@intel.com]
> > Sent: Monday, 29 August 2022 02.57
> >
> > Offloading small packets to DMA degrades throughput by 10%~20%,
> > because DMA offloading is not free and DMA engines are not
> > efficient at copying small packets. In addition, control plane
> > packets are usually small, and assigning them to DMA can
> > significantly increase latency, which may cause timeouts for
> > packets such as TCP handshakes. Therefore, this patch uses the
> > CPU to perform small copies in vhost.
> >
> > Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
> > ---
> 
> [...]
> 
> > diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c index
> > 35fa4670fd..cf796183a0 100644
> > --- a/lib/vhost/virtio_net.c
> > +++ b/lib/vhost/virtio_net.c
> > @@ -26,6 +26,8 @@
> >
> >  #define MAX_BATCH_LEN 256
> >
> > +#define CPU_COPY_THRESHOLD_LEN 256
> 
> This threshold may not be optimal for all CPU architectures and/or DMA
> engines.
> 
> Could you please provide a test application to compare the performance of
> DMA copy with CPU rte_memcpy?
> 
> The performance metric should be simple: How many cycles does the CPU
> spend copying various packet sizes using each of the two methods.
> 
> You could provide test_dmadev_perf.c in addition to the existing
> test_dmadev.c.
> You can probably copy some of the concepts and code from
> test_memcpy_perf.c.
> Alternatively, you might be able to add DMA copy to test_memcpy_perf.c.
> 
> I'm sorry to push this on you - it should have been done as part of DMAdev
> development already.
> 
> -Morten

The test application is provided by the following patch:

http://patchwork.dpdk.org/project/dpdk/patch/20220919113957.52127-1-cheng1.jiang@intel.com/



^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH v4] vhost: support CPU copy for small packets
  2022-09-27  7:32     ` Ma, WenwuX
@ 2022-09-27 10:45       ` Morten Brørup
  0 siblings, 0 replies; 8+ messages in thread
From: Morten Brørup @ 2022-09-27 10:45 UTC (permalink / raw)
  To: Ma, WenwuX, maxime.coquelin, Xia, Chenbo, dev, Richardson, Bruce
  Cc: Pai G, Sunil, Hu, Jiayu, Wang, Yinan, He, Xingguang, Ding, Xuan,
	Jiang, Cheng1, Wang, YuanX

> From: Ma, WenwuX [mailto:wenwux.ma@intel.com]
> Sent: Tuesday, 27 September 2022 09.32
> 
> > From: Morten Brørup <mb@smartsharesystems.com>
> > Sent: 2022年9月7日 22:47
> >
> > > From: Wenwu Ma [mailto:wenwux.ma@intel.com]
> > > Sent: Monday, 29 August 2022 02.57
> > >
> > > Offloading small packets to DMA degrades throughput by 10%~20%,
> > > because DMA offloading is not free and DMA engines are not
> > > efficient at copying small packets. In addition, control plane
> > > packets are usually small, and assigning them to DMA can
> > > significantly increase latency, which may cause timeouts for
> > > packets such as TCP handshakes. Therefore, this patch uses the
> > > CPU to perform small copies in vhost.
> > >
> > > Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
> > > ---
> >
> > [...]
> >
> > > diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c index
> > > 35fa4670fd..cf796183a0 100644
> > > --- a/lib/vhost/virtio_net.c
> > > +++ b/lib/vhost/virtio_net.c
> > > @@ -26,6 +26,8 @@
> > >
> > >  #define MAX_BATCH_LEN 256
> > >
> > > +#define CPU_COPY_THRESHOLD_LEN 256
> >
> > This threshold may not be optimal for all CPU architectures and/or DMA
> > engines.
> >
> > Could you please provide a test application to compare the performance
> > of DMA copy with CPU rte_memcpy?
> >
> > The performance metric should be simple: How many cycles does the CPU
> > spend copying various packet sizes using each of the two methods.
> >
> > You could provide test_dmadev_perf.c in addition to the existing
> > test_dmadev.c.
> > You can probably copy some of the concepts and code from
> > test_memcpy_perf.c.
> > Alternatively, you might be able to add DMA copy to test_memcpy_perf.c.
> >
> > I'm sorry to push this on you - it should have been done as part of
> > DMAdev development already.
> >
> > -Morten
> 
> The test application has been supported by the following patch.
> 
> http://patchwork.dpdk.org/project/dpdk/patch/20220919113957.52127-1-cheng1.jiang@intel.com/
> 

Yes, thank you for reminding me.

Acked-by: Morten Brørup <mb@smartsharesystems.com>


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v4] vhost: support CPU copy for small packets
  2022-08-29  0:56 ` [PATCH v4] " Wenwu Ma
  2022-09-07 14:47   ` Morten Brørup
@ 2024-10-04  2:22   ` Stephen Hemminger
  1 sibling, 0 replies; 8+ messages in thread
From: Stephen Hemminger @ 2024-10-04  2:22 UTC (permalink / raw)
  To: Wenwu Ma
  Cc: maxime.coquelin, chenbo.xia, dev, sunil.pai.g, jiayu.hu,
	yinan.wang, xingguang.he, xuan.ding, cheng1.jiang, yuanx.wang

On Mon, 29 Aug 2022 08:56:58 +0800
Wenwu Ma <wenwux.ma@intel.com> wrote:

> Offloading small packets to DMA degrades throughput by 10%~20%,
> because DMA offloading is not free and DMA engines are not
> efficient at copying small packets. In addition, control plane
> packets are usually small, and assigning them to DMA can
> significantly increase latency, which may cause timeouts for
> packets such as TCP handshakes. Therefore, this patch uses the
> CPU to perform small copies in vhost.
> 
> Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
> ---
> v4:
> * fix coding style issues
> v3:
> * compare threshold with entire packet length
> v2:
> * fix CI build error
> ---
>  lib/vhost/vhost.h      |  7 ++--
>  lib/vhost/virtio_net.c | 73 +++++++++++++++++++++++++++++++++---------
>  2 files changed, 62 insertions(+), 18 deletions(-)
> 
> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
> index 40fac3b7c6..8a7d90f737 100644
> --- a/lib/vhost/vhost.h
> +++ b/lib/vhost/vhost.h
> @@ -142,8 +142,10 @@ struct virtqueue_stats {
>   * iovec
>   */
>  struct vhost_iovec {
> -	void *src_addr;
> -	void *dst_addr;
> +	void *src_iov_addr;
> +	void *dst_iov_addr;
> +	void *src_virt_addr;
> +	void *dst_virt_addr;
>  	size_t len;
>  };
>  
> @@ -155,6 +157,7 @@ struct vhost_iov_iter {
>  	struct vhost_iovec *iov;
>  	/** number of iovec in this iterator */
>  	unsigned long nr_segs;
> +	unsigned long nr_len;
>  };
>  
>  struct async_dma_vchan_info {
> diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
> index 35fa4670fd..cf796183a0 100644
> --- a/lib/vhost/virtio_net.c
> +++ b/lib/vhost/virtio_net.c
> @@ -26,6 +26,8 @@
>  
>  #define MAX_BATCH_LEN 256
>  

> +#define CPU_COPY_THRESHOLD_LEN 256

Good idea.
This heuristic matches the copybreak approach used by Linux network
drivers to avoid tying up large buffers for small packets.

Patch no longer applies cleanly to main branch (after 2 years).

Also, the copy threshold should be configurable via devargs to allow
for testing and to account for CPU differences.
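
As a rough illustration, a hypothetical "cpu-copy-threshold" devarg (the key
name and its wiring into vhost are assumptions here, not an existing option)
could be parsed with rte_kvargs along these lines:

#include <stdint.h>
#include <stdlib.h>
#include <rte_common.h>
#include <rte_kvargs.h>

#define CPU_COPY_THRESHOLD_ARG "cpu-copy-threshold"

static const char * const valid_args[] = { CPU_COPY_THRESHOLD_ARG, NULL };

/* Store the parsed value into the uint32_t passed as opaque. */
static int
parse_threshold(const char *key __rte_unused, const char *value, void *opaque)
{
	uint32_t *threshold = opaque;

	*threshold = (uint32_t)strtoul(value, NULL, 0);
	return 0;
}

/* Fill *threshold from a devargs string such as "cpu-copy-threshold=256". */
static int
get_cpu_copy_threshold(const char *devargs, uint32_t *threshold)
{
	struct rte_kvargs *kvlist = rte_kvargs_parse(devargs, valid_args);

	if (kvlist == NULL)
		return -1;

	if (rte_kvargs_process(kvlist, CPU_COPY_THRESHOLD_ARG,
			parse_threshold, threshold) < 0) {
		rte_kvargs_free(kvlist);
		return -1;
	}

	rte_kvargs_free(kvlist);
	return 0;
}

If the argument is absent, the default could simply stay at 256 bytes so
existing behaviour is unchanged.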

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2024-10-04  2:22 UTC | newest]

Thread overview: 8+ messages
2022-08-12  6:45 [PATCH] vhost: support CPU copy for small packets Wenwu Ma
2022-08-12  7:34 ` [PATCH v2] " Wenwu Ma
2022-08-26  5:31 ` [PATCH v3] " Wenwu Ma
2022-08-29  0:56 ` [PATCH v4] " Wenwu Ma
2022-09-07 14:47   ` Morten Brørup
2022-09-27  7:32     ` Ma, WenwuX
2022-09-27 10:45       ` Morten Brørup
2024-10-04  2:22   ` Stephen Hemminger
