DPDK patches and discussions
* [PATCH 0/3] Async vhost packed ring optimization
@ 2022-12-20  0:44 Cheng Jiang
  2022-12-20  0:44 ` [PATCH 1/3] vhost: remove redundant copy for packed shadow used ring Cheng Jiang
                   ` (3 more replies)
  0 siblings, 4 replies; 12+ messages in thread
From: Cheng Jiang @ 2022-12-20  0:44 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, yvonnex.yang,
	xingguang.he, Cheng Jiang

This series improves the performance of the async vhost packed ring. It
removes an unnecessary data copy in the async vhost packed ring and adds
a batch data path to both the enqueue and dequeue paths.
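
As a rough illustration of how the batch data path slots into the existing
per-packet submit loops (each iteration first tries to handle a full
PACKED_BATCH_SIZE group and falls back to the single-packet path otherwise,
as patches 2/3 and 3/3 do in virtio_dev_rx_async_submit_packed() and
virtio_dev_tx_async_packed()), here is a minimal sketch. try_batch() and
do_single() are illustrative stand-ins, not DPDK functions, and the
PACKED_BATCH_SIZE value is only an example:

#include <stdint.h>

#define PACKED_BATCH_SIZE 4	/* illustrative; the real value depends on the descriptor size */

struct rte_mbuf;		/* opaque here, only pointers are passed around */

/* Stand-ins for the batch and single-packet handlers added by this series. */
int try_batch(struct rte_mbuf **pkts);	/* 0 on success, -1 to fall back */
int do_single(struct rte_mbuf *pkt);	/* 0 on success, -1 on ring full or error */

static uint32_t
submit_loop(struct rte_mbuf **pkts, uint32_t count)
{
	uint32_t pkt_idx = 0;

	if (count == 0)
		return 0;

	do {
		/* Try the batch path first when enough packets remain. */
		if (count - pkt_idx >= PACKED_BATCH_SIZE &&
		    try_batch(&pkts[pkt_idx]) == 0) {
			pkt_idx += PACKED_BATCH_SIZE;
			continue;
		}
		/* Otherwise fall back to the single-packet path. */
		if (do_single(pkts[pkt_idx]) < 0)
			break;
		pkt_idx++;
	} while (pkt_idx < count);

	return pkt_idx;
}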

Cheng Jiang (3):
  vhost: remove redundant copy for packed shadow used ring
  vhost: add batch enqueue in async vhost packed ring
  vhost: add batch dequeue in async vhost packed ring

 lib/vhost/virtio_net.c | 393 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 355 insertions(+), 38 deletions(-)

--
2.35.1



* [PATCH 1/3] vhost: remove redundant copy for packed shadow used ring
  2022-12-20  0:44 [PATCH 0/3] Async vhost packed ring optimization Cheng Jiang
@ 2022-12-20  0:44 ` Cheng Jiang
  2022-12-20  0:44 ` [PATCH 2/3] vhost: add batch enqueue in async vhost packed ring Cheng Jiang
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 12+ messages in thread
From: Cheng Jiang @ 2022-12-20  0:44 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, yvonnex.yang,
	xingguang.he, Cheng Jiang

In the packed ring enqueue data path of the current asynchronous
vhost design, the used-ring information is first written to the sync
shadow used ring and then, for historical reasons, moved to the async
shadow used ring. This extra copy is unnecessary. This patch removes
the redundant copy and updates the async shadow used ring directly.
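
The pattern the diff below adds in vhost_async_shadow_enqueue_packed(),
writing each used-ring entry straight into the async shadow ring and wrapping
the index without a modulo, can be shown in isolation. This is only a sketch
with simplified stand-in types, not the DPDK-internal structures:

#include <stdint.h>

/* Simplified stand-ins for the internal used-element and async state. */
struct used_elem { uint16_t id; uint32_t len; uint16_t count; };

struct async_shadow {
	struct used_elem *buffers;	/* ring of ring_size entries */
	uint16_t idx;			/* next free slot */
	uint16_t ring_size;
};

/* Record num_buffers used-ring entries directly in the async shadow ring. */
static void
shadow_enqueue(struct async_shadow *s, const uint32_t *len,
	       const uint16_t *id, const uint16_t *count, uint16_t num_buffers)
{
	for (uint16_t i = 0; i < num_buffers; i++) {
		s->buffers[s->idx].id = id[i];
		s->buffers[s->idx].len = len[i];
		s->buffers[s->idx].count = count[i];
		if (++s->idx >= s->ring_size)
			s->idx -= s->ring_size;	/* wrap without a division */
	}
}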

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
---
 lib/vhost/virtio_net.c | 66 ++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 35 deletions(-)

diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 9abf752f30..7c3ec128a0 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -572,6 +572,26 @@ vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
 	}
 }
 
+static __rte_always_inline void
+vhost_async_shadow_enqueue_packed(struct vhost_virtqueue *vq,
+				   uint32_t *len,
+				   uint16_t *id,
+				   uint16_t *count,
+				   uint16_t num_buffers)
+{
+	uint16_t i;
+	struct vhost_async *async = vq->async;
+
+	for (i = 0; i < num_buffers; i++) {
+		async->buffers_packed[async->buffer_idx_packed].id  = id[i];
+		async->buffers_packed[async->buffer_idx_packed].len = len[i];
+		async->buffers_packed[async->buffer_idx_packed].count = count[i];
+		async->buffer_idx_packed++;
+		if (async->buffer_idx_packed >= vq->size)
+			async->buffer_idx_packed -= vq->size;
+	}
+}
+
 static __rte_always_inline void
 vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
 				   struct vhost_virtqueue *vq,
@@ -1647,23 +1667,6 @@ store_dma_desc_info_split(struct vring_used_elem *s_ring, struct vring_used_elem
 	}
 }
 
-static __rte_always_inline void
-store_dma_desc_info_packed(struct vring_used_elem_packed *s_ring,
-		struct vring_used_elem_packed *d_ring,
-		uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
-{
-	size_t elem_size = sizeof(struct vring_used_elem_packed);
-
-	if (d_idx + count <= ring_size) {
-		rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
-	} else {
-		uint16_t size = ring_size - d_idx;
-
-		rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
-		rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
-	}
-}
-
 static __rte_noinline uint32_t
 virtio_dev_rx_async_submit_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	struct rte_mbuf **pkts, uint32_t count, int16_t dma_id, uint16_t vchan_id)
@@ -1822,7 +1825,8 @@ vhost_enqueue_async_packed(struct virtio_net *dev,
 	if (unlikely(mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers, true) < 0))
 		return -1;
 
-	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id, buffer_desc_count, *nr_buffers);
+	vhost_async_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id,
+					buffer_desc_count, *nr_buffers);
 
 	return 0;
 }
@@ -1852,6 +1856,7 @@ dma_error_handler_packed(struct vhost_virtqueue *vq, uint16_t slot_idx,
 {
 	uint16_t descs_err = 0;
 	uint16_t buffers_err = 0;
+	struct vhost_async *async = vq->async;
 	struct async_inflight_info *pkts_info = vq->async->pkts_info;
 
 	*pkt_idx -= nr_err;
@@ -1869,7 +1874,10 @@ dma_error_handler_packed(struct vhost_virtqueue *vq, uint16_t slot_idx,
 		vq->avail_wrap_counter ^= 1;
 	}
 
-	vq->shadow_used_idx -= buffers_err;
+	if (async->buffer_idx_packed >= buffers_err)
+		async->buffer_idx_packed -= buffers_err;
+	else
+		async->buffer_idx_packed = async->buffer_idx_packed + vq->size - buffers_err;
 }
 
 static __rte_noinline uint32_t
@@ -1921,23 +1929,11 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev, struct vhost_virtqueue
 		dma_error_handler_packed(vq, slot_idx, pkt_err, &pkt_idx);
 	}
 
-	if (likely(vq->shadow_used_idx)) {
-		/* keep used descriptors. */
-		store_dma_desc_info_packed(vq->shadow_used_packed, async->buffers_packed,
-					vq->size, 0, async->buffer_idx_packed,
-					vq->shadow_used_idx);
-
-		async->buffer_idx_packed += vq->shadow_used_idx;
-		if (async->buffer_idx_packed >= vq->size)
-			async->buffer_idx_packed -= vq->size;
-
-		async->pkts_idx += pkt_idx;
-		if (async->pkts_idx >= vq->size)
-			async->pkts_idx -= vq->size;
+	async->pkts_idx += pkt_idx;
+	if (async->pkts_idx >= vq->size)
+		async->pkts_idx -= vq->size;
 
-		vq->shadow_used_idx = 0;
-		async->pkts_inflight_n += pkt_idx;
-	}
+	async->pkts_inflight_n += pkt_idx;
 
 	return pkt_idx;
 }
-- 
2.35.1



* [PATCH 2/3] vhost: add batch enqueue in async vhost packed ring
  2022-12-20  0:44 [PATCH 0/3] Async vhost packed ring optimization Cheng Jiang
  2022-12-20  0:44 ` [PATCH 1/3] vhost: remove redundant copy for packed shadow used ring Cheng Jiang
@ 2022-12-20  0:44 ` Cheng Jiang
  2022-12-20  0:44 ` [PATCH 3/3] vhost: add batch dequeue " Cheng Jiang
  2023-01-13  2:56 ` [PATCH v2 0/3] Async vhost packed ring optimization Cheng Jiang
  3 siblings, 0 replies; 12+ messages in thread
From: Cheng Jiang @ 2022-12-20  0:44 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, yvonnex.yang,
	xingguang.he, Cheng Jiang

Add a batch enqueue function to the asynchronous vhost packed ring to
improve performance. Chained mbufs are not supported; they are still
handled by the single-enqueue function.
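
Whether a group of packets may take the batch path is decided up front: the
new virtio_dev_rx_async_batch_check() in the diff below rejects the batch if
the avail index is not batch-aligned, if the batch would run past the ring
end, if any mbuf is chained, if any descriptor is unavailable, null or too
small for the packet plus the virtio-net header, or if the DMA channel lacks
burst capacity. A condensed sketch of those conditions, using simplified
stand-in types rather than the real descriptor and mbuf structures:

#include <stdbool.h>
#include <stdint.h>

#define PACKED_BATCH_SIZE 4			/* illustrative value */
#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)

/* Simplified stand-ins for the descriptor and mbuf fields the check reads. */
struct desc_view { uint64_t addr; uint32_t len; bool avail; };
struct pkt_view  { uint32_t pkt_len; bool chained; };

/* Return true when a PACKED_BATCH_SIZE group may take the batch path. */
static bool
batch_ok(const struct desc_view *descs, const struct pkt_view *pkts,
	 uint16_t avail_idx, uint16_t ring_size,
	 uint32_t hdr_len, uint16_t dma_burst_capacity)
{
	if (avail_idx & PACKED_BATCH_MASK)		/* not batch-aligned */
		return false;
	if (avail_idx + PACKED_BATCH_SIZE > ring_size)	/* would run past the ring end */
		return false;

	for (uint16_t i = 0; i < PACKED_BATCH_SIZE; i++) {
		if (pkts[i].chained || !descs[avail_idx + i].avail)
			return false;
		if (descs[avail_idx + i].addr == 0)
			return false;
		if (pkts[i].pkt_len + hdr_len > descs[avail_idx + i].len)
			return false;			/* packet plus header must fit */
	}

	/* The whole batch must fit in one DMA burst. */
	return dma_burst_capacity >= PACKED_BATCH_SIZE;
}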

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
---
 lib/vhost/virtio_net.c | 157 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)

diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 7c3ec128a0..ac8c404327 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -432,6 +432,24 @@ vhost_flush_enqueue_batch_packed(struct virtio_net *dev,
 	vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
 }
 
+static __rte_always_inline void
+vhost_async_shadow_enqueue_packed_batch(struct vhost_virtqueue *vq,
+				 uint64_t *lens,
+				 uint16_t *ids)
+{
+	uint16_t i;
+	struct vhost_async *async = vq->async;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		async->buffers_packed[async->buffer_idx_packed].id  = ids[i];
+		async->buffers_packed[async->buffer_idx_packed].len = lens[i];
+		async->buffers_packed[async->buffer_idx_packed].count = 1;
+		async->buffer_idx_packed++;
+		if (async->buffer_idx_packed >= vq->size)
+			async->buffer_idx_packed -= vq->size;
+	}
+}
+
 static __rte_always_inline void
 vhost_shadow_dequeue_batch_packed_inorder(struct vhost_virtqueue *vq,
 					  uint16_t id)
@@ -1451,6 +1469,58 @@ virtio_dev_rx_sync_batch_check(struct virtio_net *dev,
 	return 0;
 }
 
+static __rte_always_inline int
+virtio_dev_rx_async_batch_check(struct vhost_virtqueue *vq,
+			   struct rte_mbuf **pkts,
+			   uint64_t *desc_addrs,
+			   uint64_t *lens,
+			   int16_t dma_id,
+			   uint16_t vchan_id)
+{
+	bool wrap_counter = vq->avail_wrap_counter;
+	struct vring_packed_desc *descs = vq->desc_packed;
+	uint16_t avail_idx = vq->last_avail_idx;
+	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	uint16_t i;
+
+	if (unlikely(avail_idx & PACKED_BATCH_MASK))
+		return -1;
+
+	if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
+		return -1;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (unlikely(pkts[i]->next != NULL))
+			return -1;
+		if (unlikely(!desc_is_avail(&descs[avail_idx + i],
+					    wrap_counter)))
+			return -1;
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+		lens[i] = descs[avail_idx + i].len;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset)))
+			return -1;
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+		desc_addrs[i] =  descs[avail_idx + i].addr;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (unlikely(!desc_addrs[i]))
+			return -1;
+		if (unlikely(lens[i] != descs[avail_idx + i].len))
+			return -1;
+	}
+
+	if (rte_dma_burst_capacity(dma_id, vchan_id) < PACKED_BATCH_SIZE)
+		return -1;
+
+	return 0;
+}
+
 static __rte_always_inline void
 virtio_dev_rx_batch_packed_copy(struct virtio_net *dev,
 			   struct vhost_virtqueue *vq,
@@ -1850,6 +1920,78 @@ virtio_dev_rx_async_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	return 0;
 }
 
+static __rte_always_inline void
+virtio_dev_rx_async_packed_batch_enqueue(struct virtio_net *dev,
+			   struct vhost_virtqueue *vq,
+			   struct rte_mbuf **pkts,
+			   uint64_t *desc_addrs,
+			   uint64_t *lens)
+{
+	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE];
+	struct vring_packed_desc *descs = vq->desc_packed;
+	struct vhost_async *async = vq->async;
+	uint16_t avail_idx = vq->last_avail_idx;
+	uint32_t mbuf_offset = 0;
+	uint16_t ids[PACKED_BATCH_SIZE];
+	uint64_t mapped_len[PACKED_BATCH_SIZE];
+	void *host_iova[PACKED_BATCH_SIZE];
+	uintptr_t desc;
+	uint16_t i;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
+		desc = vhost_iova_to_vva(dev, vq, desc_addrs[i], &lens[i], VHOST_ACCESS_RW);
+		hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc;
+		lens[i] = pkts[i]->pkt_len +
+			sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+		virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr);
+
+	vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		host_iova[i] = (void *)(uintptr_t)gpa_to_first_hpa(dev,
+			desc_addrs[i] + buf_offset, lens[i], &mapped_len[i]);
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		async_iter_initialize(dev, async);
+		async_iter_add_iovec(dev, async,
+				(void *)(uintptr_t)rte_pktmbuf_iova_offset(pkts[i], mbuf_offset),
+				host_iova[i],
+				mapped_len[i]);
+		async->iter_idx++;
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+		vhost_log_cache_write_iova(dev, vq, descs[avail_idx + i].addr, lens[i]);
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+		ids[i] = descs[avail_idx + i].id;
+
+	vhost_async_shadow_enqueue_packed_batch(vq, lens, ids);
+}
+
+static __rte_always_inline int
+virtio_dev_rx_async_packed_batch(struct virtio_net *dev,
+			   struct vhost_virtqueue *vq,
+			   struct rte_mbuf **pkts,
+			   int16_t dma_id, uint16_t vchan_id)
+{
+	uint64_t desc_addrs[PACKED_BATCH_SIZE];
+	uint64_t lens[PACKED_BATCH_SIZE];
+
+	if (virtio_dev_rx_async_batch_check(vq, pkts, desc_addrs, lens, dma_id, vchan_id) == -1)
+		return -1;
+
+	virtio_dev_rx_async_packed_batch_enqueue(dev, vq, pkts, desc_addrs, lens);
+
+	return 0;
+}
+
 static __rte_always_inline void
 dma_error_handler_packed(struct vhost_virtqueue *vq, uint16_t slot_idx,
 			uint32_t nr_err, uint32_t *pkt_idx)
@@ -1893,10 +2035,25 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev, struct vhost_virtqueue
 	struct async_inflight_info *pkts_info = async->pkts_info;
 	uint32_t pkt_err = 0;
 	uint16_t slot_idx = 0;
+	uint16_t i;
 
 	do {
 		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
 
+		if (count - pkt_idx >= PACKED_BATCH_SIZE) {
+			if (!virtio_dev_rx_async_packed_batch(dev, vq, &pkts[pkt_idx],
+					dma_id, vchan_id)) {
+				for (i = 0; i < PACKED_BATCH_SIZE; i++) {
+					slot_idx = (async->pkts_idx + pkt_idx) % vq->size;
+					pkts_info[slot_idx].descs = 1;
+					pkts_info[slot_idx].nr_buffers = 1;
+					pkts_info[slot_idx].mbuf = pkts[pkt_idx];
+					pkt_idx++;
+				}
+				continue;
+			}
+		}
+
 		num_buffers = 0;
 		num_descs = 0;
 		if (unlikely(virtio_dev_rx_async_packed(dev, vq, pkts[pkt_idx],
-- 
2.35.1



* [PATCH 3/3] vhost: add batch dequeue in async vhost packed ring
  2022-12-20  0:44 [PATCH 0/3] Async vhost packed ring optimization Cheng Jiang
  2022-12-20  0:44 ` [PATCH 1/3] vhost: remove redundant copy for packed shadow used ring Cheng Jiang
  2022-12-20  0:44 ` [PATCH 2/3] vhost: add batch enqueue in async vhost packed ring Cheng Jiang
@ 2022-12-20  0:44 ` Cheng Jiang
  2023-01-13  2:56 ` [PATCH v2 0/3] Async vhost packed ring optimization Cheng Jiang
  3 siblings, 0 replies; 12+ messages in thread
From: Cheng Jiang @ 2022-12-20  0:44 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, yvonnex.yang,
	xingguang.he, Cheng Jiang

Add a batch dequeue function to the asynchronous vhost packed ring to
improve performance. Chained mbufs are not supported; they are still
handled by the single-dequeue function.
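
After a batch is accepted on the dequeue side, the caller still records one
in-flight entry per packet so that completion handling stays uniform with the
single-packet path: each slot gets descs = 1, nr_buffers = 1 and the mbuf
pointer, with the slot index taken modulo the ring size, as the change to
virtio_dev_tx_async_packed() in the diff below shows. A minimal sketch of
that bookkeeping, with simplified stand-in types:

#include <stdint.h>

#define PACKED_BATCH_SIZE 4	/* illustrative value */

struct rte_mbuf;		/* opaque here */

/* Simplified view of the per-slot in-flight bookkeeping. */
struct inflight_info {
	struct rte_mbuf *mbuf;
	uint16_t descs;
	uint16_t nr_buffers;
};

/* Record PACKED_BATCH_SIZE single-descriptor packets starting at *pkt_idx. */
static void
record_batch(struct inflight_info *pkts_info, struct rte_mbuf **pkts,
	     uint16_t pkts_idx, uint32_t *pkt_idx, uint16_t ring_size)
{
	for (uint16_t i = 0; i < PACKED_BATCH_SIZE; i++) {
		uint16_t slot = (pkts_idx + *pkt_idx) % ring_size;

		pkts_info[slot].descs = 1;	/* each packet used one descriptor */
		pkts_info[slot].nr_buffers = 1;	/* and one buffer */
		pkts_info[slot].mbuf = pkts[*pkt_idx];
		(*pkt_idx)++;
	}
}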

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
---
 lib/vhost/virtio_net.c | 170 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 167 insertions(+), 3 deletions(-)

diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index ac8c404327..9cd69fc7bf 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -450,6 +450,23 @@ vhost_async_shadow_enqueue_packed_batch(struct vhost_virtqueue *vq,
 	}
 }
 
+static __rte_always_inline void
+vhost_async_shadow_dequeue_packed_batch(struct vhost_virtqueue *vq, uint16_t *ids)
+{
+	uint16_t i;
+	struct vhost_async *async = vq->async;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		async->buffers_packed[async->buffer_idx_packed].id  = ids[i];
+		async->buffers_packed[async->buffer_idx_packed].len = 0;
+		async->buffers_packed[async->buffer_idx_packed].count = 1;
+
+		async->buffer_idx_packed++;
+		if (async->buffer_idx_packed >= vq->size)
+			async->buffer_idx_packed -= vq->size;
+	}
+}
+
 static __rte_always_inline void
 vhost_shadow_dequeue_batch_packed_inorder(struct vhost_virtqueue *vq,
 					  uint16_t id)
@@ -3193,6 +3210,80 @@ vhost_reserve_avail_batch_packed(struct virtio_net *dev,
 	return -1;
 }
 
+static __rte_always_inline int
+vhost_async_tx_batch_packed_check(struct virtio_net *dev,
+				 struct vhost_virtqueue *vq,
+				 struct rte_mbuf **pkts,
+				 uint16_t avail_idx,
+				 uintptr_t *desc_addrs,
+				 uint64_t *lens,
+				 uint16_t *ids,
+				 int16_t dma_id,
+				 uint16_t vchan_id)
+{
+	bool wrap = vq->avail_wrap_counter;
+	struct vring_packed_desc *descs = vq->desc_packed;
+	uint64_t buf_lens[PACKED_BATCH_SIZE];
+	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	uint16_t flags, i;
+
+	if (unlikely(avail_idx & PACKED_BATCH_MASK))
+		return -1;
+	if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
+		return -1;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		flags = descs[avail_idx + i].flags;
+		if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) ||
+			     (wrap == !!(flags & VRING_DESC_F_USED))  ||
+			     (flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG)))
+			return -1;
+	}
+
+	rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+		lens[i] = descs[avail_idx + i].len;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		desc_addrs[i] = descs[avail_idx + i].addr;
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (unlikely(!desc_addrs[i]))
+			return -1;
+		if (unlikely((lens[i] != descs[avail_idx + i].len)))
+			return -1;
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (virtio_dev_pktmbuf_prep(dev, pkts[i], lens[i]))
+			goto err;
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+		buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (unlikely(buf_lens[i] < (lens[i] - buf_offset)))
+			goto err;
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		pkts[i]->pkt_len = lens[i] - buf_offset;
+		pkts[i]->data_len = pkts[i]->pkt_len;
+		ids[i] = descs[avail_idx + i].id;
+	}
+
+	if (rte_dma_burst_capacity(dma_id, vchan_id) < PACKED_BATCH_SIZE)
+		return -1;
+
+	return 0;
+
+err:
+	return -1;
+}
+
 static __rte_always_inline int
 virtio_dev_tx_batch_packed(struct virtio_net *dev,
 			   struct vhost_virtqueue *vq,
@@ -3769,16 +3860,74 @@ virtio_dev_tx_async_single_packed(struct virtio_net *dev,
 	return err;
 }
 
+static __rte_always_inline int
+virtio_dev_tx_async_packed_batch(struct virtio_net *dev,
+			   struct vhost_virtqueue *vq,
+			   struct rte_mbuf **pkts, uint16_t slot_idx,
+			   uint16_t dma_id, uint16_t vchan_id)
+{
+	uint16_t avail_idx = vq->last_avail_idx;
+	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	struct vhost_async *async = vq->async;
+	struct async_inflight_info *pkts_info = async->pkts_info;
+	struct virtio_net_hdr *hdr;
+	uint32_t mbuf_offset = 0;
+	uintptr_t desc_addrs[PACKED_BATCH_SIZE];
+	uint64_t desc_vva;
+	uint64_t lens[PACKED_BATCH_SIZE];
+	void *host_iova[PACKED_BATCH_SIZE];
+	uint64_t mapped_len[PACKED_BATCH_SIZE];
+	uint16_t ids[PACKED_BATCH_SIZE];
+	uint16_t i;
+
+	if (vhost_async_tx_batch_packed_check(dev, vq, pkts, avail_idx,
+					     desc_addrs, lens, ids, dma_id, vchan_id))
+		return -1;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+		rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		host_iova[i] = (void *)(uintptr_t)gpa_to_first_hpa(dev,
+			desc_addrs[i] + buf_offset, pkts[i]->pkt_len, &mapped_len[i]);
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		async_iter_initialize(dev, async);
+		async_iter_add_iovec(dev, async,
+		host_iova[i],
+		(void *)(uintptr_t)rte_pktmbuf_iova_offset(pkts[i], mbuf_offset),
+		mapped_len[i]);
+		async->iter_idx++;
+	}
+
+	if (virtio_net_with_host_offload(dev)) {
+		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			desc_vva = vhost_iova_to_vva(dev, vq, desc_addrs[i],
+						&lens[i], VHOST_ACCESS_RO);
+			hdr = (struct virtio_net_hdr *)(uintptr_t)desc_vva;
+			pkts_info[slot_idx + i].nethdr = *hdr;
+		}
+	}
+
+	vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
+
+	vhost_async_shadow_dequeue_packed_batch(vq, ids);
+
+	return 0;
+}
+
 static __rte_always_inline uint16_t
 virtio_dev_tx_async_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
 		uint16_t count, uint16_t dma_id, uint16_t vchan_id, bool legacy_ol_flags)
 {
-	uint16_t pkt_idx;
+	uint32_t pkt_idx = 0;
 	uint16_t slot_idx = 0;
 	uint16_t nr_done_pkts = 0;
 	uint16_t pkt_err = 0;
 	uint32_t n_xfer;
+	uint16_t i;
 	struct vhost_async *async = vq->async;
 	struct async_inflight_info *pkts_info = async->pkts_info;
 	struct rte_mbuf *pkts_prealloc[MAX_PKT_BURST];
@@ -3790,12 +3939,26 @@ virtio_dev_tx_async_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts_prealloc, count))
 		goto out;
 
-	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+	do {
 		struct rte_mbuf *pkt = pkts_prealloc[pkt_idx];
 
 		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
 
 		slot_idx = (async->pkts_idx + pkt_idx) % vq->size;
+		if (count - pkt_idx >= PACKED_BATCH_SIZE) {
+			if (!virtio_dev_tx_async_packed_batch(dev, vq, &pkts_prealloc[pkt_idx],
+						slot_idx, dma_id, vchan_id)) {
+				for (i = 0; i < PACKED_BATCH_SIZE; i++) {
+					slot_idx = (async->pkts_idx + pkt_idx) % vq->size;
+					pkts_info[slot_idx].descs = 1;
+					pkts_info[slot_idx].nr_buffers = 1;
+					pkts_info[slot_idx].mbuf = pkts_prealloc[pkt_idx];
+					pkt_idx++;
+				}
+				continue;
+			}
+		}
+
 		if (unlikely(virtio_dev_tx_async_single_packed(dev, vq, mbuf_pool, pkt,
 				slot_idx, legacy_ol_flags))) {
 			rte_pktmbuf_free_bulk(&pkts_prealloc[pkt_idx], count - pkt_idx);
@@ -3809,7 +3972,8 @@ virtio_dev_tx_async_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		}
 
 		pkts_info[slot_idx].mbuf = pkt;
-	}
+		pkt_idx++;
+	} while (pkt_idx < count);
 
 	n_xfer = vhost_async_dma_transfer(dev, vq, dma_id, vchan_id, async->pkts_idx,
 					async->iov_iter, pkt_idx);
-- 
2.35.1



* [PATCH v2 0/3] Async vhost packed ring optimization
  2022-12-20  0:44 [PATCH 0/3] Async vhost packed ring optimization Cheng Jiang
                   ` (2 preceding siblings ...)
  2022-12-20  0:44 ` [PATCH 3/3] vhost: add batch dequeue " Cheng Jiang
@ 2023-01-13  2:56 ` Cheng Jiang
  2023-01-13  2:56   ` [PATCH v2 1/3] vhost: remove redundant copy for packed shadow used ring Cheng Jiang
                     ` (3 more replies)
  3 siblings, 4 replies; 12+ messages in thread
From: Cheng Jiang @ 2023-01-13  2:56 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he,
	Cheng Jiang

This series improves the performance of the async vhost packed ring. It
removes an unnecessary data copy in the async vhost packed ring and adds
a batch data path to both the enqueue and dequeue paths.

v2: fixed net header settings.

Cheng Jiang (3):
  vhost: remove redundant copy for packed shadow used ring
  vhost: add batch enqueue in async vhost packed ring
  vhost: add batch dequeue in async vhost packed ring

 lib/vhost/virtio_net.c | 399 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 361 insertions(+), 38 deletions(-)

--
2.35.1



* [PATCH v2 1/3] vhost: remove redundant copy for packed shadow used ring
  2023-01-13  2:56 ` [PATCH v2 0/3] Async vhost packed ring optimization Cheng Jiang
@ 2023-01-13  2:56   ` Cheng Jiang
  2023-02-02  9:13     ` Maxime Coquelin
  2023-01-13  2:56   ` [PATCH v2 2/3] vhost: add batch enqueue in async vhost packed ring Cheng Jiang
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 12+ messages in thread
From: Cheng Jiang @ 2023-01-13  2:56 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he,
	Cheng Jiang

In the packed ring enqueue data path of the current asynchronous
vhost design, the used-ring information is first written to the sync
shadow used ring and then, for historical reasons, moved to the async
shadow used ring. This extra copy is unnecessary. This patch removes
the redundant copy and updates the async shadow used ring directly.

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
---
 lib/vhost/virtio_net.c | 66 ++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 35 deletions(-)

diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 9abf752f30..7c3ec128a0 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -572,6 +572,26 @@ vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
 	}
 }
 
+static __rte_always_inline void
+vhost_async_shadow_enqueue_packed(struct vhost_virtqueue *vq,
+				   uint32_t *len,
+				   uint16_t *id,
+				   uint16_t *count,
+				   uint16_t num_buffers)
+{
+	uint16_t i;
+	struct vhost_async *async = vq->async;
+
+	for (i = 0; i < num_buffers; i++) {
+		async->buffers_packed[async->buffer_idx_packed].id  = id[i];
+		async->buffers_packed[async->buffer_idx_packed].len = len[i];
+		async->buffers_packed[async->buffer_idx_packed].count = count[i];
+		async->buffer_idx_packed++;
+		if (async->buffer_idx_packed >= vq->size)
+			async->buffer_idx_packed -= vq->size;
+	}
+}
+
 static __rte_always_inline void
 vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
 				   struct vhost_virtqueue *vq,
@@ -1647,23 +1667,6 @@ store_dma_desc_info_split(struct vring_used_elem *s_ring, struct vring_used_elem
 	}
 }
 
-static __rte_always_inline void
-store_dma_desc_info_packed(struct vring_used_elem_packed *s_ring,
-		struct vring_used_elem_packed *d_ring,
-		uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
-{
-	size_t elem_size = sizeof(struct vring_used_elem_packed);
-
-	if (d_idx + count <= ring_size) {
-		rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
-	} else {
-		uint16_t size = ring_size - d_idx;
-
-		rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
-		rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
-	}
-}
-
 static __rte_noinline uint32_t
 virtio_dev_rx_async_submit_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	struct rte_mbuf **pkts, uint32_t count, int16_t dma_id, uint16_t vchan_id)
@@ -1822,7 +1825,8 @@ vhost_enqueue_async_packed(struct virtio_net *dev,
 	if (unlikely(mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers, true) < 0))
 		return -1;
 
-	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id, buffer_desc_count, *nr_buffers);
+	vhost_async_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id,
+					buffer_desc_count, *nr_buffers);
 
 	return 0;
 }
@@ -1852,6 +1856,7 @@ dma_error_handler_packed(struct vhost_virtqueue *vq, uint16_t slot_idx,
 {
 	uint16_t descs_err = 0;
 	uint16_t buffers_err = 0;
+	struct vhost_async *async = vq->async;
 	struct async_inflight_info *pkts_info = vq->async->pkts_info;
 
 	*pkt_idx -= nr_err;
@@ -1869,7 +1874,10 @@ dma_error_handler_packed(struct vhost_virtqueue *vq, uint16_t slot_idx,
 		vq->avail_wrap_counter ^= 1;
 	}
 
-	vq->shadow_used_idx -= buffers_err;
+	if (async->buffer_idx_packed >= buffers_err)
+		async->buffer_idx_packed -= buffers_err;
+	else
+		async->buffer_idx_packed = async->buffer_idx_packed + vq->size - buffers_err;
 }
 
 static __rte_noinline uint32_t
@@ -1921,23 +1929,11 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev, struct vhost_virtqueue
 		dma_error_handler_packed(vq, slot_idx, pkt_err, &pkt_idx);
 	}
 
-	if (likely(vq->shadow_used_idx)) {
-		/* keep used descriptors. */
-		store_dma_desc_info_packed(vq->shadow_used_packed, async->buffers_packed,
-					vq->size, 0, async->buffer_idx_packed,
-					vq->shadow_used_idx);
-
-		async->buffer_idx_packed += vq->shadow_used_idx;
-		if (async->buffer_idx_packed >= vq->size)
-			async->buffer_idx_packed -= vq->size;
-
-		async->pkts_idx += pkt_idx;
-		if (async->pkts_idx >= vq->size)
-			async->pkts_idx -= vq->size;
+	async->pkts_idx += pkt_idx;
+	if (async->pkts_idx >= vq->size)
+		async->pkts_idx -= vq->size;
 
-		vq->shadow_used_idx = 0;
-		async->pkts_inflight_n += pkt_idx;
-	}
+	async->pkts_inflight_n += pkt_idx;
 
 	return pkt_idx;
 }
-- 
2.35.1



* [PATCH v2 2/3] vhost: add batch enqueue in async vhost packed ring
  2023-01-13  2:56 ` [PATCH v2 0/3] Async vhost packed ring optimization Cheng Jiang
  2023-01-13  2:56   ` [PATCH v2 1/3] vhost: remove redundant copy for packed shadow used ring Cheng Jiang
@ 2023-01-13  2:56   ` Cheng Jiang
  2023-02-02  9:31     ` Maxime Coquelin
  2023-01-13  2:56   ` [PATCH v2 3/3] vhost: add batch dequeue " Cheng Jiang
  2023-02-03 14:59   ` [PATCH v2 0/3] Async vhost packed ring optimization Maxime Coquelin
  3 siblings, 1 reply; 12+ messages in thread
From: Cheng Jiang @ 2023-01-13  2:56 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he,
	Cheng Jiang

Add a batch enqueue function to the asynchronous vhost packed ring to
improve performance. Chained mbufs are not supported; they are still
handled by the single-enqueue function.

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
---
 lib/vhost/virtio_net.c | 163 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 163 insertions(+)

diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 7c3ec128a0..aea33ef127 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -432,6 +432,24 @@ vhost_flush_enqueue_batch_packed(struct virtio_net *dev,
 	vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
 }
 
+static __rte_always_inline void
+vhost_async_shadow_enqueue_packed_batch(struct vhost_virtqueue *vq,
+				 uint64_t *lens,
+				 uint16_t *ids)
+{
+	uint16_t i;
+	struct vhost_async *async = vq->async;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		async->buffers_packed[async->buffer_idx_packed].id  = ids[i];
+		async->buffers_packed[async->buffer_idx_packed].len = lens[i];
+		async->buffers_packed[async->buffer_idx_packed].count = 1;
+		async->buffer_idx_packed++;
+		if (async->buffer_idx_packed >= vq->size)
+			async->buffer_idx_packed -= vq->size;
+	}
+}
+
 static __rte_always_inline void
 vhost_shadow_dequeue_batch_packed_inorder(struct vhost_virtqueue *vq,
 					  uint16_t id)
@@ -1451,6 +1469,58 @@ virtio_dev_rx_sync_batch_check(struct virtio_net *dev,
 	return 0;
 }
 
+static __rte_always_inline int
+virtio_dev_rx_async_batch_check(struct vhost_virtqueue *vq,
+			   struct rte_mbuf **pkts,
+			   uint64_t *desc_addrs,
+			   uint64_t *lens,
+			   int16_t dma_id,
+			   uint16_t vchan_id)
+{
+	bool wrap_counter = vq->avail_wrap_counter;
+	struct vring_packed_desc *descs = vq->desc_packed;
+	uint16_t avail_idx = vq->last_avail_idx;
+	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	uint16_t i;
+
+	if (unlikely(avail_idx & PACKED_BATCH_MASK))
+		return -1;
+
+	if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
+		return -1;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (unlikely(pkts[i]->next != NULL))
+			return -1;
+		if (unlikely(!desc_is_avail(&descs[avail_idx + i],
+					    wrap_counter)))
+			return -1;
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+		lens[i] = descs[avail_idx + i].len;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset)))
+			return -1;
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+		desc_addrs[i] =  descs[avail_idx + i].addr;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (unlikely(!desc_addrs[i]))
+			return -1;
+		if (unlikely(lens[i] != descs[avail_idx + i].len))
+			return -1;
+	}
+
+	if (rte_dma_burst_capacity(dma_id, vchan_id) < PACKED_BATCH_SIZE)
+		return -1;
+
+	return 0;
+}
+
 static __rte_always_inline void
 virtio_dev_rx_batch_packed_copy(struct virtio_net *dev,
 			   struct vhost_virtqueue *vq,
@@ -1850,6 +1920,84 @@ virtio_dev_rx_async_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	return 0;
 }
 
+static __rte_always_inline void
+virtio_dev_rx_async_packed_batch_enqueue(struct virtio_net *dev,
+			   struct vhost_virtqueue *vq,
+			   struct rte_mbuf **pkts,
+			   uint64_t *desc_addrs,
+			   uint64_t *lens)
+{
+	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE];
+	struct vring_packed_desc *descs = vq->desc_packed;
+	struct vhost_async *async = vq->async;
+	uint16_t avail_idx = vq->last_avail_idx;
+	uint32_t mbuf_offset = 0;
+	uint16_t ids[PACKED_BATCH_SIZE];
+	uint64_t mapped_len[PACKED_BATCH_SIZE];
+	void *host_iova[PACKED_BATCH_SIZE];
+	uintptr_t desc;
+	uint16_t i;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
+		desc = vhost_iova_to_vva(dev, vq, desc_addrs[i], &lens[i], VHOST_ACCESS_RW);
+		hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc;
+		lens[i] = pkts[i]->pkt_len +
+			sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	}
+
+	if (rxvq_is_mergeable(dev)) {
+		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			ASSIGN_UNLESS_EQUAL(hdrs[i]->num_buffers, 1);
+		}
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+		virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr);
+
+	vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		host_iova[i] = (void *)(uintptr_t)gpa_to_first_hpa(dev,
+			desc_addrs[i] + buf_offset, lens[i], &mapped_len[i]);
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		async_iter_initialize(dev, async);
+		async_iter_add_iovec(dev, async,
+				(void *)(uintptr_t)rte_pktmbuf_iova_offset(pkts[i], mbuf_offset),
+				host_iova[i],
+				mapped_len[i]);
+		async->iter_idx++;
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+		vhost_log_cache_write_iova(dev, vq, descs[avail_idx + i].addr, lens[i]);
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+		ids[i] = descs[avail_idx + i].id;
+
+	vhost_async_shadow_enqueue_packed_batch(vq, lens, ids);
+}
+
+static __rte_always_inline int
+virtio_dev_rx_async_packed_batch(struct virtio_net *dev,
+			   struct vhost_virtqueue *vq,
+			   struct rte_mbuf **pkts,
+			   int16_t dma_id, uint16_t vchan_id)
+{
+	uint64_t desc_addrs[PACKED_BATCH_SIZE];
+	uint64_t lens[PACKED_BATCH_SIZE];
+
+	if (virtio_dev_rx_async_batch_check(vq, pkts, desc_addrs, lens, dma_id, vchan_id) == -1)
+		return -1;
+
+	virtio_dev_rx_async_packed_batch_enqueue(dev, vq, pkts, desc_addrs, lens);
+
+	return 0;
+}
+
 static __rte_always_inline void
 dma_error_handler_packed(struct vhost_virtqueue *vq, uint16_t slot_idx,
 			uint32_t nr_err, uint32_t *pkt_idx)
@@ -1893,10 +2041,25 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev, struct vhost_virtqueue
 	struct async_inflight_info *pkts_info = async->pkts_info;
 	uint32_t pkt_err = 0;
 	uint16_t slot_idx = 0;
+	uint16_t i;
 
 	do {
 		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
 
+		if (count - pkt_idx >= PACKED_BATCH_SIZE) {
+			if (!virtio_dev_rx_async_packed_batch(dev, vq, &pkts[pkt_idx],
+					dma_id, vchan_id)) {
+				for (i = 0; i < PACKED_BATCH_SIZE; i++) {
+					slot_idx = (async->pkts_idx + pkt_idx) % vq->size;
+					pkts_info[slot_idx].descs = 1;
+					pkts_info[slot_idx].nr_buffers = 1;
+					pkts_info[slot_idx].mbuf = pkts[pkt_idx];
+					pkt_idx++;
+				}
+				continue;
+			}
+		}
+
 		num_buffers = 0;
 		num_descs = 0;
 		if (unlikely(virtio_dev_rx_async_packed(dev, vq, pkts[pkt_idx],
-- 
2.35.1



* [PATCH v2 3/3] vhost: add batch dequeue in async vhost packed ring
  2023-01-13  2:56 ` [PATCH v2 0/3] Async vhost packed ring optimization Cheng Jiang
  2023-01-13  2:56   ` [PATCH v2 1/3] vhost: remove redundant copy for packed shadow used ring Cheng Jiang
  2023-01-13  2:56   ` [PATCH v2 2/3] vhost: add batch enqueue in async vhost packed ring Cheng Jiang
@ 2023-01-13  2:56   ` Cheng Jiang
  2023-02-02 10:07     ` Maxime Coquelin
  2023-02-03 14:59   ` [PATCH v2 0/3] Async vhost packed ring optimization Maxime Coquelin
  3 siblings, 1 reply; 12+ messages in thread
From: Cheng Jiang @ 2023-01-13  2:56 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he,
	Cheng Jiang

Add a batch dequeue function to the asynchronous vhost packed ring to
improve performance. Chained mbufs are not supported; they are still
handled by the single-dequeue function.

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
---
 lib/vhost/virtio_net.c | 170 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 167 insertions(+), 3 deletions(-)

diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index aea33ef127..8caf05319e 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -450,6 +450,23 @@ vhost_async_shadow_enqueue_packed_batch(struct vhost_virtqueue *vq,
 	}
 }
 
+static __rte_always_inline void
+vhost_async_shadow_dequeue_packed_batch(struct vhost_virtqueue *vq, uint16_t *ids)
+{
+	uint16_t i;
+	struct vhost_async *async = vq->async;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		async->buffers_packed[async->buffer_idx_packed].id  = ids[i];
+		async->buffers_packed[async->buffer_idx_packed].len = 0;
+		async->buffers_packed[async->buffer_idx_packed].count = 1;
+
+		async->buffer_idx_packed++;
+		if (async->buffer_idx_packed >= vq->size)
+			async->buffer_idx_packed -= vq->size;
+	}
+}
+
 static __rte_always_inline void
 vhost_shadow_dequeue_batch_packed_inorder(struct vhost_virtqueue *vq,
 					  uint16_t id)
@@ -3199,6 +3216,80 @@ vhost_reserve_avail_batch_packed(struct virtio_net *dev,
 	return -1;
 }
 
+static __rte_always_inline int
+vhost_async_tx_batch_packed_check(struct virtio_net *dev,
+				 struct vhost_virtqueue *vq,
+				 struct rte_mbuf **pkts,
+				 uint16_t avail_idx,
+				 uintptr_t *desc_addrs,
+				 uint64_t *lens,
+				 uint16_t *ids,
+				 int16_t dma_id,
+				 uint16_t vchan_id)
+{
+	bool wrap = vq->avail_wrap_counter;
+	struct vring_packed_desc *descs = vq->desc_packed;
+	uint64_t buf_lens[PACKED_BATCH_SIZE];
+	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	uint16_t flags, i;
+
+	if (unlikely(avail_idx & PACKED_BATCH_MASK))
+		return -1;
+	if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
+		return -1;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		flags = descs[avail_idx + i].flags;
+		if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) ||
+			     (wrap == !!(flags & VRING_DESC_F_USED))  ||
+			     (flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG)))
+			return -1;
+	}
+
+	rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+		lens[i] = descs[avail_idx + i].len;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		desc_addrs[i] = descs[avail_idx + i].addr;
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (unlikely(!desc_addrs[i]))
+			return -1;
+		if (unlikely((lens[i] != descs[avail_idx + i].len)))
+			return -1;
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (virtio_dev_pktmbuf_prep(dev, pkts[i], lens[i]))
+			goto err;
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+		buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (unlikely(buf_lens[i] < (lens[i] - buf_offset)))
+			goto err;
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		pkts[i]->pkt_len = lens[i] - buf_offset;
+		pkts[i]->data_len = pkts[i]->pkt_len;
+		ids[i] = descs[avail_idx + i].id;
+	}
+
+	if (rte_dma_burst_capacity(dma_id, vchan_id) < PACKED_BATCH_SIZE)
+		return -1;
+
+	return 0;
+
+err:
+	return -1;
+}
+
 static __rte_always_inline int
 virtio_dev_tx_batch_packed(struct virtio_net *dev,
 			   struct vhost_virtqueue *vq,
@@ -3775,16 +3866,74 @@ virtio_dev_tx_async_single_packed(struct virtio_net *dev,
 	return err;
 }
 
+static __rte_always_inline int
+virtio_dev_tx_async_packed_batch(struct virtio_net *dev,
+			   struct vhost_virtqueue *vq,
+			   struct rte_mbuf **pkts, uint16_t slot_idx,
+			   uint16_t dma_id, uint16_t vchan_id)
+{
+	uint16_t avail_idx = vq->last_avail_idx;
+	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	struct vhost_async *async = vq->async;
+	struct async_inflight_info *pkts_info = async->pkts_info;
+	struct virtio_net_hdr *hdr;
+	uint32_t mbuf_offset = 0;
+	uintptr_t desc_addrs[PACKED_BATCH_SIZE];
+	uint64_t desc_vva;
+	uint64_t lens[PACKED_BATCH_SIZE];
+	void *host_iova[PACKED_BATCH_SIZE];
+	uint64_t mapped_len[PACKED_BATCH_SIZE];
+	uint16_t ids[PACKED_BATCH_SIZE];
+	uint16_t i;
+
+	if (vhost_async_tx_batch_packed_check(dev, vq, pkts, avail_idx,
+					     desc_addrs, lens, ids, dma_id, vchan_id))
+		return -1;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+		rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		host_iova[i] = (void *)(uintptr_t)gpa_to_first_hpa(dev,
+			desc_addrs[i] + buf_offset, pkts[i]->pkt_len, &mapped_len[i]);
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		async_iter_initialize(dev, async);
+		async_iter_add_iovec(dev, async,
+		host_iova[i],
+		(void *)(uintptr_t)rte_pktmbuf_iova_offset(pkts[i], mbuf_offset),
+		mapped_len[i]);
+		async->iter_idx++;
+	}
+
+	if (virtio_net_with_host_offload(dev)) {
+		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			desc_vva = vhost_iova_to_vva(dev, vq, desc_addrs[i],
+						&lens[i], VHOST_ACCESS_RO);
+			hdr = (struct virtio_net_hdr *)(uintptr_t)desc_vva;
+			pkts_info[slot_idx + i].nethdr = *hdr;
+		}
+	}
+
+	vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
+
+	vhost_async_shadow_dequeue_packed_batch(vq, ids);
+
+	return 0;
+}
+
 static __rte_always_inline uint16_t
 virtio_dev_tx_async_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
 		uint16_t count, uint16_t dma_id, uint16_t vchan_id, bool legacy_ol_flags)
 {
-	uint16_t pkt_idx;
+	uint32_t pkt_idx = 0;
 	uint16_t slot_idx = 0;
 	uint16_t nr_done_pkts = 0;
 	uint16_t pkt_err = 0;
 	uint32_t n_xfer;
+	uint16_t i;
 	struct vhost_async *async = vq->async;
 	struct async_inflight_info *pkts_info = async->pkts_info;
 	struct rte_mbuf *pkts_prealloc[MAX_PKT_BURST];
@@ -3796,12 +3945,26 @@ virtio_dev_tx_async_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts_prealloc, count))
 		goto out;
 
-	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+	do {
 		struct rte_mbuf *pkt = pkts_prealloc[pkt_idx];
 
 		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
 
 		slot_idx = (async->pkts_idx + pkt_idx) % vq->size;
+		if (count - pkt_idx >= PACKED_BATCH_SIZE) {
+			if (!virtio_dev_tx_async_packed_batch(dev, vq, &pkts_prealloc[pkt_idx],
+						slot_idx, dma_id, vchan_id)) {
+				for (i = 0; i < PACKED_BATCH_SIZE; i++) {
+					slot_idx = (async->pkts_idx + pkt_idx) % vq->size;
+					pkts_info[slot_idx].descs = 1;
+					pkts_info[slot_idx].nr_buffers = 1;
+					pkts_info[slot_idx].mbuf = pkts_prealloc[pkt_idx];
+					pkt_idx++;
+				}
+				continue;
+			}
+		}
+
 		if (unlikely(virtio_dev_tx_async_single_packed(dev, vq, mbuf_pool, pkt,
 				slot_idx, legacy_ol_flags))) {
 			rte_pktmbuf_free_bulk(&pkts_prealloc[pkt_idx], count - pkt_idx);
@@ -3815,7 +3978,8 @@ virtio_dev_tx_async_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		}
 
 		pkts_info[slot_idx].mbuf = pkt;
-	}
+		pkt_idx++;
+	} while (pkt_idx < count);
 
 	n_xfer = vhost_async_dma_transfer(dev, vq, dma_id, vchan_id, async->pkts_idx,
 					async->iov_iter, pkt_idx);
-- 
2.35.1



* Re: [PATCH v2 1/3] vhost: remove redundant copy for packed shadow used ring
  2023-01-13  2:56   ` [PATCH v2 1/3] vhost: remove redundant copy for packed shadow used ring Cheng Jiang
@ 2023-02-02  9:13     ` Maxime Coquelin
  0 siblings, 0 replies; 12+ messages in thread
From: Maxime Coquelin @ 2023-02-02  9:13 UTC (permalink / raw)
  To: Cheng Jiang, chenbo.xia
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he



On 1/13/23 03:56, Cheng Jiang wrote:
> In the packed ring enqueue data path of the current asynchronous
> vhost design, the used-ring information is first written to the sync
> shadow used ring and then, for historical reasons, moved to the async
> shadow used ring. This extra copy is unnecessary. This patch removes
> the redundant copy and updates the async shadow used ring directly.
> 
> Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> ---
>   lib/vhost/virtio_net.c | 66 ++++++++++++++++++++----------------------
>   1 file changed, 31 insertions(+), 35 deletions(-)
> 

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime



* Re: [PATCH v2 2/3] vhost: add batch enqueue in async vhost packed ring
  2023-01-13  2:56   ` [PATCH v2 2/3] vhost: add batch enqueue in async vhost packed ring Cheng Jiang
@ 2023-02-02  9:31     ` Maxime Coquelin
  0 siblings, 0 replies; 12+ messages in thread
From: Maxime Coquelin @ 2023-02-02  9:31 UTC (permalink / raw)
  To: Cheng Jiang, chenbo.xia
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he



On 1/13/23 03:56, Cheng Jiang wrote:
> Add a batch enqueue function to the asynchronous vhost packed ring to
> improve performance. Chained mbufs are not supported; they are still
> handled by the single-enqueue function.
> 
> Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> ---
>   lib/vhost/virtio_net.c | 163 +++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 163 insertions(+)
> 

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime



* Re: [PATCH v2 3/3] vhost: add batch dequeue in async vhost packed ring
  2023-01-13  2:56   ` [PATCH v2 3/3] vhost: add batch dequeue " Cheng Jiang
@ 2023-02-02 10:07     ` Maxime Coquelin
  0 siblings, 0 replies; 12+ messages in thread
From: Maxime Coquelin @ 2023-02-02 10:07 UTC (permalink / raw)
  To: Cheng Jiang, chenbo.xia
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he



On 1/13/23 03:56, Cheng Jiang wrote:
> Add a batch dequeue function to the asynchronous vhost packed ring to
> improve performance. Chained mbufs are not supported; they are still
> handled by the single-dequeue function.
> 
> Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> ---
>   lib/vhost/virtio_net.c | 170 ++++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 167 insertions(+), 3 deletions(-)
> 

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime



* Re: [PATCH v2 0/3] Async vhost packed ring optimization
  2023-01-13  2:56 ` [PATCH v2 0/3] Async vhost packed ring optimization Cheng Jiang
                     ` (2 preceding siblings ...)
  2023-01-13  2:56   ` [PATCH v2 3/3] vhost: add batch dequeue " Cheng Jiang
@ 2023-02-03 14:59   ` Maxime Coquelin
  3 siblings, 0 replies; 12+ messages in thread
From: Maxime Coquelin @ 2023-02-03 14:59 UTC (permalink / raw)
  To: Cheng Jiang, chenbo.xia
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he



On 1/13/23 03:56, Cheng Jiang wrote:
> This series improves the performance of the async vhost packed ring. It
> removes an unnecessary data copy in the async vhost packed ring and adds
> a batch data path to both the enqueue and dequeue paths.
> 
> v2: fixed net header settings.
> 
> Cheng Jiang (3):
>    vhost: remove redundant copy for packed shadow used ring
>    vhost: add batch enqueue in async vhost packed ring
>    vhost: add batch dequeue in async vhost packed ring
> 
>   lib/vhost/virtio_net.c | 399 +++++++++++++++++++++++++++++++++++++----
>   1 file changed, 361 insertions(+), 38 deletions(-)
> 
> --
> 2.35.1
> 

Applied to dpdk-next-virtio/main.

Thanks,
Maxime


