DPDK patches and discussions
* [dpdk-dev] [PATCH] vhost: add support for packed ring in async vhost
@ 2021-03-17  8:54 Cheng Jiang
  2021-03-22  6:15 ` [dpdk-dev] [PATCH v2] " Cheng Jiang
                   ` (7 more replies)
  0 siblings, 8 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-03-17  8:54 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, Cheng Jiang

For now, the async vhost data path only supports the split ring structure. In
order to make async vhost compatible with the virtio 1.1 spec, this patch
enables packed ring support in the async vhost data path.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
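Usage sketch (illustration only, not part of the patch): an application drives the
packed-ring async path the same way as the split-ring one, through
rte_vhost_submit_enqueue_burst() and rte_vhost_poll_enqueue_completed(). The helper
name, burst size and mbuf handling below are assumptions made for the example, and
nb_pkts is assumed to be no larger than BURST_SZ.

#include <rte_mbuf.h>
#include <rte_vhost_async.h>

#define BURST_SZ 32	/* illustrative burst size */

/* one enqueue/poll cycle on an async-registered virtqueue; identical
 * for split and (with this patch) packed rings */
static void
async_enqueue_cycle(int vid, uint16_t queue_id,
		struct rte_mbuf **pkts, uint16_t nb_pkts)
{
	struct rte_mbuf *comp_pkts[BURST_SZ];
	struct rte_mbuf *done_pkts[BURST_SZ];
	uint32_t comp_count = 0;
	uint16_t n_done, i;

	/* submit the burst; packets copied synchronously by the CPU are
	 * completed at once and returned in comp_pkts */
	rte_vhost_submit_enqueue_burst(vid, queue_id, pkts, nb_pkts,
			comp_pkts, &comp_count);
	for (i = 0; i < comp_count; i++)
		rte_pktmbuf_free(comp_pkts[i]);

	/* later (e.g. next polling iteration): reclaim packets whose DMA
	 * transfers have finished */
	n_done = rte_vhost_poll_enqueue_completed(vid, queue_id,
			done_pkts, BURST_SZ);
	for (i = 0; i < n_done; i++)
		rte_pktmbuf_free(done_pkts[i]);
}
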
 lib/librte_vhost/rte_vhost_async.h |   1 +
 lib/librte_vhost/vhost.c           |  15 +-
 lib/librte_vhost/vhost.h           |   7 +-
 lib/librte_vhost/virtio_net.c      | 449 +++++++++++++++++++++++++++--
 4 files changed, 436 insertions(+), 36 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost_async.h b/lib/librte_vhost/rte_vhost_async.h
index c855ff875..29de5df8c 100644
--- a/lib/librte_vhost/rte_vhost_async.h
+++ b/lib/librte_vhost/rte_vhost_async.h
@@ -89,6 +89,7 @@ struct rte_vhost_async_channel_ops {
 struct async_inflight_info {
 	struct rte_mbuf *mbuf;
 	uint16_t descs; /* num of descs inflight */
+	uint16_t nr_buffers; /* num of buffers inflight for packed ring*/
 };
 
 /**
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 52ab93d1e..445a9f327 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -1603,9 +1603,9 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 		return -1;
 
 	/* packed queue is not supported */
-	if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
+	if (unlikely(!f.async_inorder)) {
 		VHOST_LOG_CONFIG(ERR,
-			"async copy is not supported on packed queue or non-inorder mode "
+			"async copy is not supported on non-inorder mode "
 			"(vid %d, qid: %d)\n", vid, queue_id);
 		return -1;
 	}
@@ -1643,10 +1643,17 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	vq->vec_pool = rte_malloc_socket(NULL,
 			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
 			RTE_CACHE_LINE_SIZE, node);
-	vq->async_descs_split = rte_malloc_socket(NULL,
+	if (vq_is_packed(dev)) {
+		vq->async_buffers_packed = rte_malloc_socket(NULL,
+			vq->size * sizeof(struct vring_used_elem_packed),
+			RTE_CACHE_LINE_SIZE, node);
+	} else {
+		vq->async_descs_split = rte_malloc_socket(NULL,
 			vq->size * sizeof(struct vring_used_elem),
 			RTE_CACHE_LINE_SIZE, node);
-	if (!vq->async_descs_split || !vq->async_pkts_info ||
+	}
+
+	if (!vq->async_pkts_info ||
 		!vq->it_pool || !vq->vec_pool) {
 		vhost_free_async_mem(vq);
 		VHOST_LOG_CONFIG(ERR,
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 658f6fc28..d6324fbf8 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -206,9 +206,14 @@ struct vhost_virtqueue {
 	uint16_t	async_pkts_idx;
 	uint16_t	async_pkts_inflight_n;
 	uint16_t	async_last_pkts_n;
-	struct vring_used_elem  *async_descs_split;
+	union {
+		struct vring_used_elem  *async_descs_split;
+		struct vring_used_elem_packed *async_buffers_packed;
+	};
 	uint16_t async_desc_idx;
+	uint16_t async_packed_buffer_idx;
 	uint16_t last_async_desc_idx;
+	uint16_t last_async_buffer_idx;
 
 	/* vq async features */
 	bool		async_inorder;
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 583bf379c..9e798226b 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -363,8 +363,7 @@ vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
 }
 
 static __rte_always_inline void
-vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
-				   struct vhost_virtqueue *vq,
+vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
 				   uint32_t len[],
 				   uint16_t id[],
 				   uint16_t count[],
@@ -382,6 +381,17 @@ vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
 		vq->shadow_aligned_idx += count[i];
 		vq->shadow_used_idx++;
 	}
+}
+
+static __rte_always_inline void
+vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
+				   struct vhost_virtqueue *vq,
+				   uint32_t len[],
+				   uint16_t id[],
+				   uint16_t count[],
+				   uint16_t num_buffers)
+{
+	vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
 
 	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
 		do_data_copy_enqueue(dev, vq);
@@ -1633,12 +1643,343 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	return pkt_idx;
 }
 
+static __rte_always_inline int
+vhost_enqueue_async_single_packed(struct virtio_net *dev,
+			    struct vhost_virtqueue *vq,
+			    struct rte_mbuf *pkt,
+			    struct buf_vector *buf_vec,
+			    uint16_t *nr_descs,
+			    uint16_t *nr_buffers,
+			    struct iovec *src_iovec, struct iovec *dst_iovec,
+			    struct rte_vhost_iov_iter *src_it,
+			    struct rte_vhost_iov_iter *dst_it)
+{
+	uint16_t nr_vec = 0;
+	uint16_t avail_idx = vq->last_avail_idx;
+	uint16_t max_tries, tries = 0;
+	uint16_t buf_id = 0;
+	uint32_t len = 0;
+	uint16_t desc_count;
+	uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	uint32_t buffer_len[vq->size];
+	uint16_t buffer_buf_id[vq->size];
+	uint16_t buffer_desc_count[vq->size];
+	*nr_buffers = 0;
+
+	if (rxvq_is_mergeable(dev))
+		max_tries = vq->size - 1;
+	else
+		max_tries = 1;
+
+	while (size > 0) {
+		/*
+		 * if we tried all available ring items, and still
+		 * can't get enough buf, it means something abnormal
+		 * happened.
+		 */
+		if (unlikely(++tries > max_tries))
+			return -1;
+
+		if (unlikely(fill_vec_buf_packed(dev, vq,
+						avail_idx, &desc_count,
+						buf_vec, &nr_vec,
+						&buf_id, &len,
+						VHOST_ACCESS_RW) < 0))
+			return -1;
+
+		len = RTE_MIN(len, size);
+		size -= len;
+
+		buffer_len[*nr_buffers] = len;
+		buffer_buf_id[*nr_buffers] = buf_id;
+		buffer_desc_count[*nr_buffers] = desc_count;
+		*nr_buffers += 1;
+
+		*nr_descs += desc_count;
+		avail_idx += desc_count;
+		if (avail_idx >= vq->size)
+			avail_idx -= vq->size;
+	}
+
+	if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers,
+		src_iovec, dst_iovec, src_it, dst_it) < 0)
+		return -1;
+
+	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id,
+					   buffer_desc_count, *nr_buffers);
+
+	return 0;
+}
+
+static __rte_always_inline int16_t
+virtio_dev_rx_async_single_packed(struct virtio_net *dev,
+			    struct vhost_virtqueue *vq,
+			    struct rte_mbuf *pkt,
+			    uint16_t *nr_descs, uint16_t *nr_buffers,
+			    struct iovec *src_iovec, struct iovec *dst_iovec,
+			    struct rte_vhost_iov_iter *src_it,
+			    struct rte_vhost_iov_iter *dst_it)
+{
+	struct buf_vector buf_vec[BUF_VECTOR_MAX];
+	*nr_descs = 0;
+	*nr_buffers = 0;
+
+	if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt, buf_vec,
+						 nr_descs,
+						 nr_buffers,
+						 src_iovec, dst_iovec,
+						 src_it, dst_it) < 0)) {
+		VHOST_LOG_DATA(DEBUG,
+				"(%d) failed to get enough desc from vring\n",
+				dev->vid);
+		return -1;
+	}
+
+	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
+			dev->vid, vq->last_avail_idx,
+			vq->last_avail_idx + *nr_descs);
+
+	return 0;
+}
+
+static __rte_noinline uint32_t
+virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
+	struct vhost_virtqueue *vq, uint16_t queue_id,
+	struct rte_mbuf **pkts, uint32_t count,
+	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
+{
+	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
+	uint16_t num_buffers;
+	uint16_t num_desc;
+
+	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
+	struct iovec *vec_pool = vq->vec_pool;
+	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
+	struct iovec *src_iovec = vec_pool;
+	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+	struct rte_vhost_iov_iter *src_it = it_pool;
+	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
+	uint16_t slot_idx = 0;
+	uint16_t segs_await = 0;
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
+	uint32_t n_pkts = 0, pkt_err = 0;
+	uint32_t num_async_pkts = 0, num_done_pkts = 0;
+	struct {
+		uint16_t pkt_idx;
+		uint16_t last_avail_idx;
+	} async_pkts_log[MAX_PKT_BURST];
+
+	rte_prefetch0(&vq->desc[vq->last_avail_idx & (vq->size - 1)]);
+
+	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq,
+						pkts[pkt_idx],
+						&num_desc, &num_buffers,
+						src_iovec, dst_iovec,
+						src_it, dst_it) < 0)) {
+			break;
+		}
+
+		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
+			dev->vid, vq->last_avail_idx,
+			vq->last_avail_idx + num_desc);
+
+		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
+			(vq->size - 1);
+		if (src_it->count) {
+			uint16_t from, to;
+
+			async_fill_desc(&tdes[pkt_burst_idx++], src_it, dst_it);
+			pkts_info[slot_idx].descs = num_desc;
+			pkts_info[slot_idx].nr_buffers = num_buffers;
+			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
+			async_pkts_log[num_async_pkts].pkt_idx = pkt_idx;
+			async_pkts_log[num_async_pkts++].last_avail_idx =
+				vq->last_avail_idx;
+			src_iovec += src_it->nr_segs;
+			dst_iovec += dst_it->nr_segs;
+			src_it += 2;
+			dst_it += 2;
+			segs_await += src_it->nr_segs;
+
+			/**
+			 * recover shadow used ring and keep DMA-occupied
+			 * descriptors.
+			 */
+			from = vq->shadow_used_idx - num_buffers;
+			to = vq->async_packed_buffer_idx & (vq->size - 1);
+			if (num_buffers + to <= vq->size) {
+				rte_memcpy(&vq->async_buffers_packed[to],
+					&vq->shadow_used_packed[from],
+					num_buffers *
+					sizeof(struct vring_used_elem_packed));
+			} else {
+				int size = vq->size - to;
+
+				rte_memcpy(&vq->async_buffers_packed[to],
+					&vq->shadow_used_packed[from],
+					size *
+					sizeof(struct vring_used_elem_packed));
+				rte_memcpy(vq->async_buffers_packed,
+					&vq->shadow_used_packed[from +
+					size], (num_buffers - size) *
+					sizeof(struct vring_used_elem_packed));
+			}
+			vq->async_packed_buffer_idx += num_buffers;
+			vq->shadow_used_idx -= num_buffers;
+		} else
+			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
+
+		vq_inc_last_avail_packed(vq, num_desc);
+
+		/*
+		 * conditions to trigger async device transfer:
+		 * - buffered packet number reaches transfer threshold
+		 * - unused async iov number is less than max vhost vector
+		 */
+		if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
+			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await <
+			BUF_VECTOR_MAX))) {
+			n_pkts = vq->async_ops.transfer_data(dev->vid,
+					queue_id, tdes, 0, pkt_burst_idx);
+			src_iovec = vec_pool;
+			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+			src_it = it_pool;
+			dst_it = it_pool + 1;
+			segs_await = 0;
+			vq->async_pkts_inflight_n += n_pkts;
+
+			if (unlikely(n_pkts < pkt_burst_idx)) {
+				/*
+				 * log error packets number here and do actual
+				 * error processing when applications poll
+				 * completion
+				 */
+				pkt_err = pkt_burst_idx - n_pkts;
+				pkt_burst_idx = 0;
+				break;
+			}
+
+			pkt_burst_idx = 0;
+		}
+	}
+
+	if (pkt_burst_idx) {
+		n_pkts = vq->async_ops.transfer_data(dev->vid,
+				queue_id, tdes, 0, pkt_burst_idx);
+		vq->async_pkts_inflight_n += n_pkts;
+
+		if (unlikely(n_pkts < pkt_burst_idx))
+			pkt_err = pkt_burst_idx - n_pkts;
+	}
+
+	do_data_copy_enqueue(dev, vq);
+
+	if (unlikely(pkt_err)) {
+		uint16_t num_buffers = 0;
+
+		num_async_pkts -= pkt_err;
+		/* calculate the sum of descriptors of DMA-error packets. */
+		while (pkt_err-- > 0) {
+			num_buffers +=
+				pkts_info[slot_idx & (vq->size - 1)].nr_buffers;
+			slot_idx--;
+		}
+		vq->async_packed_buffer_idx -= num_buffers;
+		/* recover shadow used ring and available ring */
+		vq->shadow_used_idx -= (vq->last_avail_idx -
+				async_pkts_log[num_async_pkts].last_avail_idx -
+				num_buffers);
+		vq->last_avail_idx =
+			async_pkts_log[num_async_pkts].last_avail_idx;
+		pkt_idx = async_pkts_log[num_async_pkts].pkt_idx;
+		num_done_pkts = pkt_idx - num_async_pkts;
+	}
+
+	vq->async_pkts_idx += num_async_pkts;
+	*comp_count = num_done_pkts;
+
+	if (likely(vq->shadow_used_idx)) {
+		vhost_flush_enqueue_shadow_packed(dev, vq);
+		vhost_vring_call_packed(dev, vq);
+	}
+
+	return pkt_idx;
+}
+
+static __rte_always_inline void
+vhost_update_used_packed(struct virtio_net *dev,
+				  struct vhost_virtqueue *vq,
+				  struct vring_used_elem_packed *shadow_ring,
+				  uint16_t count)
+{
+	if (count == 0)
+		return;
+	int i;
+	uint16_t used_idx = vq->last_used_idx;
+	uint16_t head_idx = vq->last_used_idx;
+	uint16_t head_flags = 0;
+
+	/* Split loop in two to save memory barriers */
+	for (i = 0; i < count; i++) {
+		vq->desc_packed[used_idx].id = shadow_ring[i].id;
+		vq->desc_packed[used_idx].len = shadow_ring[i].len;
+
+		used_idx += shadow_ring[i].count;
+		if (used_idx >= vq->size)
+			used_idx -= vq->size;
+	}
+
+	/* The ordering for storing desc flags needs to be enforced. */
+	rte_atomic_thread_fence(__ATOMIC_RELEASE);
+
+	for (i = 0; i < count; i++) {
+		uint16_t flags;
+
+		if (vq->shadow_used_packed[i].len)
+			flags = VRING_DESC_F_WRITE;
+		else
+			flags = 0;
+
+		if (vq->used_wrap_counter) {
+			flags |= VRING_DESC_F_USED;
+			flags |= VRING_DESC_F_AVAIL;
+		} else {
+			flags &= ~VRING_DESC_F_USED;
+			flags &= ~VRING_DESC_F_AVAIL;
+		}
+
+		if (i > 0) {
+			vq->desc_packed[vq->last_used_idx].flags = flags;
+
+			vhost_log_cache_used_vring(dev, vq,
+					vq->last_used_idx *
+					sizeof(struct vring_packed_desc),
+					sizeof(struct vring_packed_desc));
+		} else {
+			head_idx = vq->last_used_idx;
+			head_flags = flags;
+		}
+
+		vq_inc_last_used_packed(vq, shadow_ring[i].count);
+	}
+
+	vq->desc_packed[head_idx].flags = head_flags;
+
+	vhost_log_cache_used_vring(dev, vq,
+				head_idx *
+				sizeof(struct vring_packed_desc),
+				sizeof(struct vring_packed_desc));
+
+	vhost_log_cache_sync(dev, vq);
+}
+
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count)
 {
 	struct virtio_net *dev = get_device(vid);
 	struct vhost_virtqueue *vq;
-	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
+	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
 	uint16_t start_idx, pkts_idx, vq_size;
 	struct async_inflight_info *pkts_info;
 	uint16_t from, i;
@@ -1680,53 +2021,98 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		goto done;
 	}
 
-	for (i = 0; i < n_pkts_put; i++) {
-		from = (start_idx + i) & (vq_size - 1);
-		n_descs += pkts_info[from].descs;
-		pkts[i] = pkts_info[from].mbuf;
+	if (vq_is_packed(dev)) {
+		for (i = 0; i < n_pkts_put; i++) {
+			from = (start_idx + i) & (vq_size - 1);
+			n_buffers += pkts_info[from].nr_buffers;
+			pkts[i] = pkts_info[from].mbuf;
+		}
+	} else {
+		for (i = 0; i < n_pkts_put; i++) {
+			from = (start_idx + i) & (vq_size - 1);
+			n_descs += pkts_info[from].descs;
+			pkts[i] = pkts_info[from].mbuf;
+		}
 	}
+
 	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
 	vq->async_pkts_inflight_n -= n_pkts_put;
 
 	if (likely(vq->enabled && vq->access_ok)) {
-		uint16_t nr_left = n_descs;
 		uint16_t nr_copy;
 		uint16_t to;
 
 		/* write back completed descriptors to used ring */
-		do {
-			from = vq->last_async_desc_idx & (vq->size - 1);
-			nr_copy = nr_left + from <= vq->size ? nr_left :
-				vq->size - from;
-			to = vq->last_used_idx & (vq->size - 1);
-
-			if (to + nr_copy <= vq->size) {
-				rte_memcpy(&vq->used->ring[to],
+		if (vq_is_packed(dev)) {
+			uint16_t nr_left = n_buffers;
+			uint16_t to;
+			do {
+				to = vq->async_packed_buffer_idx &
+								(vq->size - 1);
+				from = vq->last_async_buffer_idx &
+								(vq->size - 1);
+				if (to == from)
+					break;
+				if (to > from) {
+					vhost_update_used_packed(dev, vq,
+						vq->async_buffers_packed + from,
+						to - from);
+					vq->last_async_buffer_idx += to - from;
+					nr_left -= to - from;
+				} else {
+					vhost_update_used_packed(dev, vq,
+						vq->async_buffers_packed + from,
+						vq->size - from);
+					vq->last_async_buffer_idx +=
+								vq->size - from;
+					nr_left -= vq->size - from;
+				}
+			} while (nr_left > 0);
+			vhost_vring_call_packed(dev, vq);
+		} else {
+			uint16_t nr_left = n_descs;
+			do {
+				from = vq->last_async_desc_idx & (vq->size - 1);
+				nr_copy = nr_left + from <= vq->size ? nr_left :
+					vq->size - from;
+				to = vq->last_used_idx & (vq->size - 1);
+
+				if (to + nr_copy <= vq->size) {
+					rte_memcpy(&vq->used->ring[to],
 						&vq->async_descs_split[from],
 						nr_copy *
 						sizeof(struct vring_used_elem));
-			} else {
-				uint16_t size = vq->size - to;
+				} else {
+					uint16_t size = vq->size - to;
 
-				rte_memcpy(&vq->used->ring[to],
+					rte_memcpy(&vq->used->ring[to],
 						&vq->async_descs_split[from],
 						size *
 						sizeof(struct vring_used_elem));
-				rte_memcpy(vq->used->ring,
+					rte_memcpy(vq->used->ring,
 						&vq->async_descs_split[from +
 						size], (nr_copy - size) *
 						sizeof(struct vring_used_elem));
-			}
+				}
+
+				vq->last_async_desc_idx += nr_copy;
+				vq->last_used_idx += nr_copy;
+				nr_left -= nr_copy;
+			} while (nr_left > 0);
+
+			__atomic_add_fetch(&vq->used->idx, n_descs,
+					__ATOMIC_RELEASE);
+			vhost_vring_call_split(dev, vq);
+		}
 
-			vq->last_async_desc_idx += nr_copy;
-			vq->last_used_idx += nr_copy;
-			nr_left -= nr_copy;
-		} while (nr_left > 0);
 
-		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
-		vhost_vring_call_split(dev, vq);
-	} else
-		vq->last_async_desc_idx += n_descs;
+
+	} else {
+		if (vq_is_packed(dev))
+			vq->last_async_buffer_idx += n_buffers;
+		else
+			vq->last_async_desc_idx += n_descs;
+	}
 
 done:
 	rte_spinlock_unlock(&vq->access_lock);
@@ -1767,9 +2153,10 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
 	if (count == 0)
 		goto out;
 
-	/* TODO: packed queue not implemented */
 	if (vq_is_packed(dev))
-		nb_tx = 0;
+		nb_tx = virtio_dev_rx_async_submit_packed(dev,
+				vq, queue_id, pkts, count, comp_pkts,
+				comp_count);
 	else
 		nb_tx = virtio_dev_rx_async_submit_split(dev,
 				vq, queue_id, pkts, count, comp_pkts,
-- 
2.29.2



* [dpdk-dev] [PATCH v2] vhost: add support for packed ring in async vhost
  2021-03-17  8:54 [dpdk-dev] [PATCH] vhost: add support for packed ring in async vhost Cheng Jiang
@ 2021-03-22  6:15 ` Cheng Jiang
  2021-03-24  9:19   ` Liu, Yong
  2021-03-31 14:06 ` [dpdk-dev] [PATCH v3] " Cheng Jiang
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 60+ messages in thread
From: Cheng Jiang @ 2021-03-22  6:15 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, Cheng Jiang

For now, the async vhost data path only supports the split ring structure. In
order to make async vhost compatible with the virtio 1.1 spec, this patch
enables packed ring support in the async vhost data path.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
v2:
  * fix wrong buffer index in rte_vhost_poll_enqueue_completed()
  * add async_buffers_packed memory free in vhost_free_async_mem()

 lib/librte_vhost/rte_vhost_async.h |   1 +
 lib/librte_vhost/vhost.c           |  24 +-
 lib/librte_vhost/vhost.h           |   7 +-
 lib/librte_vhost/virtio_net.c      | 447 +++++++++++++++++++++++++++--
 4 files changed, 441 insertions(+), 38 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost_async.h b/lib/librte_vhost/rte_vhost_async.h
index c855ff875..6faa31f5a 100644
--- a/lib/librte_vhost/rte_vhost_async.h
+++ b/lib/librte_vhost/rte_vhost_async.h
@@ -89,6 +89,7 @@ struct rte_vhost_async_channel_ops {
 struct async_inflight_info {
 	struct rte_mbuf *mbuf;
 	uint16_t descs; /* num of descs inflight */
+	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
 };

 /**
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 52ab93d1e..51b44d6f2 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -330,15 +330,20 @@ vhost_free_async_mem(struct vhost_virtqueue *vq)
 {
 	if (vq->async_pkts_info)
 		rte_free(vq->async_pkts_info);
-	if (vq->async_descs_split)
+	if (vq->async_buffers_packed) {
+		rte_free(vq->async_buffers_packed);
+		vq->async_buffers_packed = NULL;
+	} else {
 		rte_free(vq->async_descs_split);
+		vq->async_descs_split = NULL;
+	}
+
 	if (vq->it_pool)
 		rte_free(vq->it_pool);
 	if (vq->vec_pool)
 		rte_free(vq->vec_pool);

 	vq->async_pkts_info = NULL;
-	vq->async_descs_split = NULL;
 	vq->it_pool = NULL;
 	vq->vec_pool = NULL;
 }
@@ -1603,9 +1608,9 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 		return -1;

 	/* packed queue is not supported */
-	if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
+	if (unlikely(!f.async_inorder)) {
 		VHOST_LOG_CONFIG(ERR,
-			"async copy is not supported on packed queue or non-inorder mode "
+			"async copy is not supported on non-inorder mode "
 			"(vid %d, qid: %d)\n", vid, queue_id);
 		return -1;
 	}
@@ -1643,10 +1648,17 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	vq->vec_pool = rte_malloc_socket(NULL,
 			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
 			RTE_CACHE_LINE_SIZE, node);
-	vq->async_descs_split = rte_malloc_socket(NULL,
+	if (vq_is_packed(dev)) {
+		vq->async_buffers_packed = rte_malloc_socket(NULL,
+			vq->size * sizeof(struct vring_used_elem_packed),
+			RTE_CACHE_LINE_SIZE, node);
+	} else {
+		vq->async_descs_split = rte_malloc_socket(NULL,
 			vq->size * sizeof(struct vring_used_elem),
 			RTE_CACHE_LINE_SIZE, node);
-	if (!vq->async_descs_split || !vq->async_pkts_info ||
+	}
+
+	if (!vq->async_pkts_info ||
 		!vq->it_pool || !vq->vec_pool) {
 		vhost_free_async_mem(vq);
 		VHOST_LOG_CONFIG(ERR,
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 658f6fc28..d6324fbf8 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -206,9 +206,14 @@ struct vhost_virtqueue {
 	uint16_t	async_pkts_idx;
 	uint16_t	async_pkts_inflight_n;
 	uint16_t	async_last_pkts_n;
-	struct vring_used_elem  *async_descs_split;
+	union {
+		struct vring_used_elem  *async_descs_split;
+		struct vring_used_elem_packed *async_buffers_packed;
+	};
 	uint16_t async_desc_idx;
+	uint16_t async_packed_buffer_idx;
 	uint16_t last_async_desc_idx;
+	uint16_t last_async_buffer_idx;

 	/* vq async features */
 	bool		async_inorder;
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 583bf379c..fa8c4f4fe 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -363,8 +363,7 @@ vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
 }

 static __rte_always_inline void
-vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
-				   struct vhost_virtqueue *vq,
+vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
 				   uint32_t len[],
 				   uint16_t id[],
 				   uint16_t count[],
@@ -382,6 +381,17 @@ vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
 		vq->shadow_aligned_idx += count[i];
 		vq->shadow_used_idx++;
 	}
+}
+
+static __rte_always_inline void
+vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
+				   struct vhost_virtqueue *vq,
+				   uint32_t len[],
+				   uint16_t id[],
+				   uint16_t count[],
+				   uint16_t num_buffers)
+{
+	vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);

 	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
 		do_data_copy_enqueue(dev, vq);
@@ -1633,12 +1643,343 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	return pkt_idx;
 }

+static __rte_always_inline int
+vhost_enqueue_async_single_packed(struct virtio_net *dev,
+			    struct vhost_virtqueue *vq,
+			    struct rte_mbuf *pkt,
+			    struct buf_vector *buf_vec,
+			    uint16_t *nr_descs,
+			    uint16_t *nr_buffers,
+			    struct iovec *src_iovec, struct iovec *dst_iovec,
+			    struct rte_vhost_iov_iter *src_it,
+			    struct rte_vhost_iov_iter *dst_it)
+{
+	uint16_t nr_vec = 0;
+	uint16_t avail_idx = vq->last_avail_idx;
+	uint16_t max_tries, tries = 0;
+	uint16_t buf_id = 0;
+	uint32_t len = 0;
+	uint16_t desc_count;
+	uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	uint32_t buffer_len[vq->size];
+	uint16_t buffer_buf_id[vq->size];
+	uint16_t buffer_desc_count[vq->size];
+	*nr_buffers = 0;
+
+	if (rxvq_is_mergeable(dev))
+		max_tries = vq->size - 1;
+	else
+		max_tries = 1;
+
+	while (size > 0) {
+		/*
+		 * if we tried all available ring items, and still
+		 * can't get enough buf, it means something abnormal
+		 * happened.
+		 */
+		if (unlikely(++tries > max_tries))
+			return -1;
+
+		if (unlikely(fill_vec_buf_packed(dev, vq,
+						avail_idx, &desc_count,
+						buf_vec, &nr_vec,
+						&buf_id, &len,
+						VHOST_ACCESS_RW) < 0))
+			return -1;
+
+		len = RTE_MIN(len, size);
+		size -= len;
+
+		buffer_len[*nr_buffers] = len;
+		buffer_buf_id[*nr_buffers] = buf_id;
+		buffer_desc_count[*nr_buffers] = desc_count;
+		*nr_buffers += 1;
+
+		*nr_descs += desc_count;
+		avail_idx += desc_count;
+		if (avail_idx >= vq->size)
+			avail_idx -= vq->size;
+	}
+
+	if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers,
+		src_iovec, dst_iovec, src_it, dst_it) < 0)
+		return -1;
+
+	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id,
+					   buffer_desc_count, *nr_buffers);
+
+	return 0;
+}
+
+static __rte_always_inline int16_t
+virtio_dev_rx_async_single_packed(struct virtio_net *dev,
+			    struct vhost_virtqueue *vq,
+			    struct rte_mbuf *pkt,
+			    uint16_t *nr_descs, uint16_t *nr_buffers,
+			    struct iovec *src_iovec, struct iovec *dst_iovec,
+			    struct rte_vhost_iov_iter *src_it,
+			    struct rte_vhost_iov_iter *dst_it)
+{
+	struct buf_vector buf_vec[BUF_VECTOR_MAX];
+	*nr_descs = 0;
+	*nr_buffers = 0;
+
+	if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt, buf_vec,
+						 nr_descs,
+						 nr_buffers,
+						 src_iovec, dst_iovec,
+						 src_it, dst_it) < 0)) {
+		VHOST_LOG_DATA(DEBUG,
+				"(%d) failed to get enough desc from vring\n",
+				dev->vid);
+		return -1;
+	}
+
+	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
+			dev->vid, vq->last_avail_idx,
+			vq->last_avail_idx + *nr_descs);
+
+	return 0;
+}
+
+static __rte_noinline uint32_t
+virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
+	struct vhost_virtqueue *vq, uint16_t queue_id,
+	struct rte_mbuf **pkts, uint32_t count,
+	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
+{
+	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
+	uint16_t num_buffers;
+	uint16_t num_desc;
+
+	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
+	struct iovec *vec_pool = vq->vec_pool;
+	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
+	struct iovec *src_iovec = vec_pool;
+	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+	struct rte_vhost_iov_iter *src_it = it_pool;
+	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
+	uint16_t slot_idx = 0;
+	uint16_t segs_await = 0;
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
+	uint32_t n_pkts = 0, pkt_err = 0;
+	uint32_t num_async_pkts = 0, num_done_pkts = 0;
+	struct {
+		uint16_t pkt_idx;
+		uint16_t last_avail_idx;
+	} async_pkts_log[MAX_PKT_BURST];
+
+	rte_prefetch0(&vq->desc[vq->last_avail_idx & (vq->size - 1)]);
+
+	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq,
+						pkts[pkt_idx],
+						&num_desc, &num_buffers,
+						src_iovec, dst_iovec,
+						src_it, dst_it) < 0)) {
+			break;
+		}
+
+		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
+			dev->vid, vq->last_avail_idx,
+			vq->last_avail_idx + num_desc);
+
+		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
+			(vq->size - 1);
+		if (src_it->count) {
+			uint16_t from, to;
+
+			async_fill_desc(&tdes[pkt_burst_idx++], src_it, dst_it);
+			pkts_info[slot_idx].descs = num_desc;
+			pkts_info[slot_idx].nr_buffers = num_buffers;
+			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
+			async_pkts_log[num_async_pkts].pkt_idx = pkt_idx;
+			async_pkts_log[num_async_pkts++].last_avail_idx =
+				vq->last_avail_idx;
+			src_iovec += src_it->nr_segs;
+			dst_iovec += dst_it->nr_segs;
+			src_it += 2;
+			dst_it += 2;
+			segs_await += src_it->nr_segs;
+
+			/**
+			 * recover shadow used ring and keep DMA-occupied
+			 * descriptors.
+			 */
+			from = vq->shadow_used_idx - num_buffers;
+			to = vq->async_packed_buffer_idx & (vq->size - 1);
+			if (num_buffers + to <= vq->size) {
+				rte_memcpy(&vq->async_buffers_packed[to],
+					&vq->shadow_used_packed[from],
+					num_buffers *
+					sizeof(struct vring_used_elem_packed));
+			} else {
+				int size = vq->size - to;
+
+				rte_memcpy(&vq->async_buffers_packed[to],
+					&vq->shadow_used_packed[from],
+					size *
+					sizeof(struct vring_used_elem_packed));
+				rte_memcpy(vq->async_buffers_packed,
+					&vq->shadow_used_packed[from +
+					size], (num_buffers - size) *
+					sizeof(struct vring_used_elem_packed));
+			}
+			vq->async_packed_buffer_idx += num_buffers;
+			vq->shadow_used_idx -= num_buffers;
+		} else
+			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
+
+		vq_inc_last_avail_packed(vq, num_desc);
+
+		/*
+		 * conditions to trigger async device transfer:
+		 * - buffered packet number reaches transfer threshold
+		 * - unused async iov number is less than max vhost vector
+		 */
+		if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
+			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await <
+			BUF_VECTOR_MAX))) {
+			n_pkts = vq->async_ops.transfer_data(dev->vid,
+					queue_id, tdes, 0, pkt_burst_idx);
+			src_iovec = vec_pool;
+			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+			src_it = it_pool;
+			dst_it = it_pool + 1;
+			segs_await = 0;
+			vq->async_pkts_inflight_n += n_pkts;
+
+			if (unlikely(n_pkts < pkt_burst_idx)) {
+				/*
+				 * log error packets number here and do actual
+				 * error processing when applications poll
+				 * completion
+				 */
+				pkt_err = pkt_burst_idx - n_pkts;
+				pkt_burst_idx = 0;
+				break;
+			}
+
+			pkt_burst_idx = 0;
+		}
+	}
+
+	if (pkt_burst_idx) {
+		n_pkts = vq->async_ops.transfer_data(dev->vid,
+				queue_id, tdes, 0, pkt_burst_idx);
+		vq->async_pkts_inflight_n += n_pkts;
+
+		if (unlikely(n_pkts < pkt_burst_idx))
+			pkt_err = pkt_burst_idx - n_pkts;
+	}
+
+	do_data_copy_enqueue(dev, vq);
+
+	if (unlikely(pkt_err)) {
+		uint16_t num_buffers = 0;
+
+		num_async_pkts -= pkt_err;
+		/* calculate the sum of descriptors of DMA-error packets. */
+		while (pkt_err-- > 0) {
+			num_buffers +=
+				pkts_info[slot_idx & (vq->size - 1)].nr_buffers;
+			slot_idx--;
+		}
+		vq->async_packed_buffer_idx -= num_buffers;
+		/* recover shadow used ring and available ring */
+		vq->shadow_used_idx -= (vq->last_avail_idx -
+				async_pkts_log[num_async_pkts].last_avail_idx -
+				num_buffers);
+		vq->last_avail_idx =
+			async_pkts_log[num_async_pkts].last_avail_idx;
+		pkt_idx = async_pkts_log[num_async_pkts].pkt_idx;
+		num_done_pkts = pkt_idx - num_async_pkts;
+	}
+
+	vq->async_pkts_idx += num_async_pkts;
+	*comp_count = num_done_pkts;
+
+	if (likely(vq->shadow_used_idx)) {
+		vhost_flush_enqueue_shadow_packed(dev, vq);
+		vhost_vring_call_packed(dev, vq);
+	}
+
+	return pkt_idx;
+}
+
+static __rte_always_inline void
+vhost_update_used_packed(struct virtio_net *dev,
+				  struct vhost_virtqueue *vq,
+				  struct vring_used_elem_packed *shadow_ring,
+				  uint16_t count)
+{
+	if (count == 0)
+		return;
+	int i;
+	uint16_t used_idx = vq->last_used_idx;
+	uint16_t head_idx = vq->last_used_idx;
+	uint16_t head_flags = 0;
+
+	/* Split loop in two to save memory barriers */
+	for (i = 0; i < count; i++) {
+		vq->desc_packed[used_idx].id = shadow_ring[i].id;
+		vq->desc_packed[used_idx].len = shadow_ring[i].len;
+
+		used_idx += shadow_ring[i].count;
+		if (used_idx >= vq->size)
+			used_idx -= vq->size;
+	}
+
+	/* The ordering for storing desc flags needs to be enforced. */
+	rte_atomic_thread_fence(__ATOMIC_RELEASE);
+
+	for (i = 0; i < count; i++) {
+		uint16_t flags;
+
+		if (vq->shadow_used_packed[i].len)
+			flags = VRING_DESC_F_WRITE;
+		else
+			flags = 0;
+
+		if (vq->used_wrap_counter) {
+			flags |= VRING_DESC_F_USED;
+			flags |= VRING_DESC_F_AVAIL;
+		} else {
+			flags &= ~VRING_DESC_F_USED;
+			flags &= ~VRING_DESC_F_AVAIL;
+		}
+
+		if (i > 0) {
+			vq->desc_packed[vq->last_used_idx].flags = flags;
+
+			vhost_log_cache_used_vring(dev, vq,
+					vq->last_used_idx *
+					sizeof(struct vring_packed_desc),
+					sizeof(struct vring_packed_desc));
+		} else {
+			head_idx = vq->last_used_idx;
+			head_flags = flags;
+		}
+
+		vq_inc_last_used_packed(vq, shadow_ring[i].count);
+	}
+
+	vq->desc_packed[head_idx].flags = head_flags;
+
+	vhost_log_cache_used_vring(dev, vq,
+				head_idx *
+				sizeof(struct vring_packed_desc),
+				sizeof(struct vring_packed_desc));
+
+	vhost_log_cache_sync(dev, vq);
+}
+
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count)
 {
 	struct virtio_net *dev = get_device(vid);
 	struct vhost_virtqueue *vq;
-	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
+	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
 	uint16_t start_idx, pkts_idx, vq_size;
 	struct async_inflight_info *pkts_info;
 	uint16_t from, i;
@@ -1680,53 +2021,96 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		goto done;
 	}

-	for (i = 0; i < n_pkts_put; i++) {
-		from = (start_idx + i) & (vq_size - 1);
-		n_descs += pkts_info[from].descs;
-		pkts[i] = pkts_info[from].mbuf;
+	if (vq_is_packed(dev)) {
+		for (i = 0; i < n_pkts_put; i++) {
+			from = (start_idx + i) & (vq_size - 1);
+			n_buffers += pkts_info[from].nr_buffers;
+			pkts[i] = pkts_info[from].mbuf;
+		}
+	} else {
+		for (i = 0; i < n_pkts_put; i++) {
+			from = (start_idx + i) & (vq_size - 1);
+			n_descs += pkts_info[from].descs;
+			pkts[i] = pkts_info[from].mbuf;
+		}
 	}
+
 	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
 	vq->async_pkts_inflight_n -= n_pkts_put;

 	if (likely(vq->enabled && vq->access_ok)) {
-		uint16_t nr_left = n_descs;
 		uint16_t nr_copy;
 		uint16_t to;

 		/* write back completed descriptors to used ring */
-		do {
-			from = vq->last_async_desc_idx & (vq->size - 1);
-			nr_copy = nr_left + from <= vq->size ? nr_left :
-				vq->size - from;
-			to = vq->last_used_idx & (vq->size - 1);
-
-			if (to + nr_copy <= vq->size) {
-				rte_memcpy(&vq->used->ring[to],
+		if (vq_is_packed(dev)) {
+			uint16_t nr_left = n_buffers;
+			uint16_t to;
+			do {
+				from = vq->last_async_buffer_idx &
+								(vq->size - 1);
+				to = (from + nr_left) & (vq->size - 1);
+
+				if (to > from) {
+					vhost_update_used_packed(dev, vq,
+						vq->async_buffers_packed + from,
+						to - from);
+					vq->last_async_buffer_idx += nr_left;
+					nr_left = 0;
+				} else {
+					vhost_update_used_packed(dev, vq,
+						vq->async_buffers_packed + from,
+						vq->size - from);
+					vq->last_async_buffer_idx +=
+								vq->size - from;
+					nr_left -= vq->size - from;
+				}
+			} while (nr_left > 0);
+			vhost_vring_call_packed(dev, vq);
+		} else {
+			uint16_t nr_left = n_descs;
+			do {
+				from = vq->last_async_desc_idx & (vq->size - 1);
+				nr_copy = nr_left + from <= vq->size ? nr_left :
+					vq->size - from;
+				to = vq->last_used_idx & (vq->size - 1);
+
+				if (to + nr_copy <= vq->size) {
+					rte_memcpy(&vq->used->ring[to],
 						&vq->async_descs_split[from],
 						nr_copy *
 						sizeof(struct vring_used_elem));
-			} else {
-				uint16_t size = vq->size - to;
+				} else {
+					uint16_t size = vq->size - to;

-				rte_memcpy(&vq->used->ring[to],
+					rte_memcpy(&vq->used->ring[to],
 						&vq->async_descs_split[from],
 						size *
 						sizeof(struct vring_used_elem));
-				rte_memcpy(vq->used->ring,
+					rte_memcpy(vq->used->ring,
 						&vq->async_descs_split[from +
 						size], (nr_copy - size) *
 						sizeof(struct vring_used_elem));
-			}
+				}
+
+				vq->last_async_desc_idx += nr_copy;
+				vq->last_used_idx += nr_copy;
+				nr_left -= nr_copy;
+			} while (nr_left > 0);
+
+			__atomic_add_fetch(&vq->used->idx, n_descs,
+					__ATOMIC_RELEASE);
+			vhost_vring_call_split(dev, vq);
+		}

-			vq->last_async_desc_idx += nr_copy;
-			vq->last_used_idx += nr_copy;
-			nr_left -= nr_copy;
-		} while (nr_left > 0);

-		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
-		vhost_vring_call_split(dev, vq);
-	} else
-		vq->last_async_desc_idx += n_descs;
+
+	} else {
+		if (vq_is_packed(dev))
+			vq->last_async_buffer_idx += n_buffers;
+		else
+			vq->last_async_desc_idx += n_descs;
+	}

 done:
 	rte_spinlock_unlock(&vq->access_lock);
@@ -1767,9 +2151,10 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
 	if (count == 0)
 		goto out;

-	/* TODO: packed queue not implemented */
 	if (vq_is_packed(dev))
-		nb_tx = 0;
+		nb_tx = virtio_dev_rx_async_submit_packed(dev,
+				vq, queue_id, pkts, count, comp_pkts,
+				comp_count);
 	else
 		nb_tx = virtio_dev_rx_async_submit_split(dev,
 				vq, queue_id, pkts, count, comp_pkts,
--
2.29.2



* Re: [dpdk-dev] [PATCH v2] vhost: add support for packed ring in async vhost
  2021-03-22  6:15 ` [dpdk-dev] [PATCH v2] " Cheng Jiang
@ 2021-03-24  9:19   ` Liu, Yong
  2021-03-29 12:29     ` Jiang, Cheng1
  0 siblings, 1 reply; 60+ messages in thread
From: Liu, Yong @ 2021-03-24  9:19 UTC (permalink / raw)
  To: Jiang, Cheng1, maxime.coquelin, Xia, Chenbo
  Cc: dev, Hu, Jiayu, Yang, YvonneX, Wang, Yinan, Jiang, Cheng1



> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Cheng Jiang
> Sent: Monday, March 22, 2021 2:15 PM
> To: maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Yang, YvonneX
> <yvonnex.yang@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Jiang,
> Cheng1 <cheng1.jiang@intel.com>
> Subject: [dpdk-dev] [PATCH v2] vhost: add support for packed ring in async
> vhost
> 
> For now, the async vhost data path only supports the split ring structure. In
> order to make async vhost compatible with the virtio 1.1 spec, this patch
> enables packed ring support in the async vhost data path.
> 
> Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> ---
> v2:
>   * fix wrong buffer index in rte_vhost_poll_enqueue_completed()
>   * add async_buffers_packed memory free in vhost_free_async_mem()
> 
>  lib/librte_vhost/rte_vhost_async.h |   1 +
>  lib/librte_vhost/vhost.c           |  24 +-
>  lib/librte_vhost/vhost.h           |   7 +-
>  lib/librte_vhost/virtio_net.c      | 447 +++++++++++++++++++++++++++--
>  4 files changed, 441 insertions(+), 38 deletions(-)
> 
> diff --git a/lib/librte_vhost/rte_vhost_async.h
> b/lib/librte_vhost/rte_vhost_async.h
> index c855ff875..6faa31f5a 100644
> --- a/lib/librte_vhost/rte_vhost_async.h
> +++ b/lib/librte_vhost/rte_vhost_async.h
> @@ -89,6 +89,7 @@ struct rte_vhost_async_channel_ops {
>  struct async_inflight_info {
>  	struct rte_mbuf *mbuf;
>  	uint16_t descs; /* num of descs inflight */
> +	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
>  };
> 
>  /**
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index 52ab93d1e..51b44d6f2 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -330,15 +330,20 @@ vhost_free_async_mem(struct vhost_virtqueue
> *vq)
>  {
>  	if (vq->async_pkts_info)
>  		rte_free(vq->async_pkts_info);
> -	if (vq->async_descs_split)
> +	if (vq->async_buffers_packed) {
> +		rte_free(vq->async_buffers_packed);
> +		vq->async_buffers_packed = NULL;
> +	} else {
>  		rte_free(vq->async_descs_split);
> +		vq->async_descs_split = NULL;
> +	}
> +
>  	if (vq->it_pool)
>  		rte_free(vq->it_pool);
>  	if (vq->vec_pool)
>  		rte_free(vq->vec_pool);
> 
>  	vq->async_pkts_info = NULL;
> -	vq->async_descs_split = NULL;
>  	vq->it_pool = NULL;
>  	vq->vec_pool = NULL;
>  }
> @@ -1603,9 +1608,9 @@ int rte_vhost_async_channel_register(int vid,
> uint16_t queue_id,
>  		return -1;
> 
>  	/* packed queue is not supported */
> -	if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
> +	if (unlikely(!f.async_inorder)) {
>  		VHOST_LOG_CONFIG(ERR,
> -			"async copy is not supported on packed queue or
> non-inorder mode "
> +			"async copy is not supported on non-inorder mode "
>  			"(vid %d, qid: %d)\n", vid, queue_id);
>  		return -1;
>  	}
> @@ -1643,10 +1648,17 @@ int rte_vhost_async_channel_register(int vid,
> uint16_t queue_id,
>  	vq->vec_pool = rte_malloc_socket(NULL,
>  			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
>  			RTE_CACHE_LINE_SIZE, node);
> -	vq->async_descs_split = rte_malloc_socket(NULL,
> +	if (vq_is_packed(dev)) {
> +		vq->async_buffers_packed = rte_malloc_socket(NULL,
> +			vq->size * sizeof(struct vring_used_elem_packed),
> +			RTE_CACHE_LINE_SIZE, node);
> +	} else {
> +		vq->async_descs_split = rte_malloc_socket(NULL,
>  			vq->size * sizeof(struct vring_used_elem),
>  			RTE_CACHE_LINE_SIZE, node);
> -	if (!vq->async_descs_split || !vq->async_pkts_info ||
> +	}
> +
> +	if (!vq->async_pkts_info ||
>  		!vq->it_pool || !vq->vec_pool) {
>  		vhost_free_async_mem(vq);
>  		VHOST_LOG_CONFIG(ERR,
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 658f6fc28..d6324fbf8 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -206,9 +206,14 @@ struct vhost_virtqueue {
>  	uint16_t	async_pkts_idx;
>  	uint16_t	async_pkts_inflight_n;
>  	uint16_t	async_last_pkts_n;
> -	struct vring_used_elem  *async_descs_split;
> +	union {
> +		struct vring_used_elem  *async_descs_split;
> +		struct vring_used_elem_packed *async_buffers_packed;
> +	};
>  	uint16_t async_desc_idx;
> +	uint16_t async_packed_buffer_idx;
>  	uint16_t last_async_desc_idx;
> +	uint16_t last_async_buffer_idx;
> 
>  	/* vq async features */
>  	bool		async_inorder;
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 583bf379c..fa8c4f4fe 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -363,8 +363,7 @@
> vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
>  }
> 
>  static __rte_always_inline void
> -vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
> -				   struct vhost_virtqueue *vq,
> +vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
>  				   uint32_t len[],
>  				   uint16_t id[],
>  				   uint16_t count[],
> @@ -382,6 +381,17 @@ vhost_shadow_enqueue_single_packed(struct
> virtio_net *dev,
>  		vq->shadow_aligned_idx += count[i];
>  		vq->shadow_used_idx++;
>  	}
> +}
> +
> +static __rte_always_inline void
> +vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
> +				   struct vhost_virtqueue *vq,
> +				   uint32_t len[],
> +				   uint16_t id[],
> +				   uint16_t count[],
> +				   uint16_t num_buffers)
> +{
> +	vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
> 
>  	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
>  		do_data_copy_enqueue(dev, vq);
> @@ -1633,12 +1643,343 @@ virtio_dev_rx_async_submit_split(struct
> virtio_net *dev,
>  	return pkt_idx;
>  }
> 
> +static __rte_always_inline int
> +vhost_enqueue_async_single_packed(struct virtio_net *dev,
> +			    struct vhost_virtqueue *vq,
> +			    struct rte_mbuf *pkt,
> +			    struct buf_vector *buf_vec,
> +			    uint16_t *nr_descs,
> +			    uint16_t *nr_buffers,
> +			    struct iovec *src_iovec, struct iovec *dst_iovec,
> +			    struct rte_vhost_iov_iter *src_it,
> +			    struct rte_vhost_iov_iter *dst_it)
> +{
> +	uint16_t nr_vec = 0;
> +	uint16_t avail_idx = vq->last_avail_idx;
> +	uint16_t max_tries, tries = 0;
> +	uint16_t buf_id = 0;
> +	uint32_t len = 0;
> +	uint16_t desc_count;
> +	uint32_t size = pkt->pkt_len + sizeof(struct
> virtio_net_hdr_mrg_rxbuf);
> +	uint32_t buffer_len[vq->size];
> +	uint16_t buffer_buf_id[vq->size];
> +	uint16_t buffer_desc_count[vq->size];
> +	*nr_buffers = 0;
> +
> +	if (rxvq_is_mergeable(dev))
> +		max_tries = vq->size - 1;
> +	else
> +		max_tries = 1;
> +
> +	while (size > 0) {
> +		/*
> +		 * if we tried all available ring items, and still
> +		 * can't get enough buf, it means something abnormal
> +		 * happened.
> +		 */
> +		if (unlikely(++tries > max_tries))
> +			return -1;
> +
> +		if (unlikely(fill_vec_buf_packed(dev, vq,
> +						avail_idx, &desc_count,
> +						buf_vec, &nr_vec,
> +						&buf_id, &len,
> +						VHOST_ACCESS_RW) < 0))
> +			return -1;
> +
> +		len = RTE_MIN(len, size);
> +		size -= len;
> +
> +		buffer_len[*nr_buffers] = len;
> +		buffer_buf_id[*nr_buffers] = buf_id;
> +		buffer_desc_count[*nr_buffers] = desc_count;
> +		*nr_buffers += 1;
> +
> +		*nr_descs += desc_count;
> +		avail_idx += desc_count;
> +		if (avail_idx >= vq->size)
> +			avail_idx -= vq->size;
> +	}
> +
> +	if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers,
> +		src_iovec, dst_iovec, src_it, dst_it) < 0)
> +		return -1;
> +
> +	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id,
> +					   buffer_desc_count, *nr_buffers);
> +
> +	return 0;
> +}
> +
> +static __rte_always_inline int16_t
> +virtio_dev_rx_async_single_packed(struct virtio_net *dev,
> +			    struct vhost_virtqueue *vq,
> +			    struct rte_mbuf *pkt,
> +			    uint16_t *nr_descs, uint16_t *nr_buffers,
> +			    struct iovec *src_iovec, struct iovec *dst_iovec,
> +			    struct rte_vhost_iov_iter *src_it,
> +			    struct rte_vhost_iov_iter *dst_it)
> +{
> +	struct buf_vector buf_vec[BUF_VECTOR_MAX];
> +	*nr_descs = 0;
> +	*nr_buffers = 0;
> +
> +	if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt,
> buf_vec,
> +						 nr_descs,
> +						 nr_buffers,
> +						 src_iovec, dst_iovec,
> +						 src_it, dst_it) < 0)) {
> +		VHOST_LOG_DATA(DEBUG,
> +				"(%d) failed to get enough desc from vring\n",
> +				dev->vid);
> +		return -1;
> +	}
> +
> +	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end
> index %d\n",
> +			dev->vid, vq->last_avail_idx,
> +			vq->last_avail_idx + *nr_descs);
> +
> +	return 0;
> +}
> +
> +static __rte_noinline uint32_t
> +virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
> +	struct vhost_virtqueue *vq, uint16_t queue_id,
> +	struct rte_mbuf **pkts, uint32_t count,
> +	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
> +{

Hi Cheng,
There are some common parts in virtio_dev_rx_async_submit_packed and virtio_dev_rx_async_submit_split.
We could abstract those common parts into shared helper functions, which would make the code clearer (see the rough sketch below).

Also, this patch may be too large to review in one piece; please split it into a few parts for easier understanding.
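
Purely as an illustration of the kind of helper I mean (hypothetical code, not taken
from this patch): the transfer-kick block that both submit paths repeat could be
factored out, roughly like this:

static __rte_always_inline uint32_t
async_kick_transfer(struct virtio_net *dev, struct vhost_virtqueue *vq,
		uint16_t queue_id, struct rte_vhost_async_desc *tdes,
		uint32_t nr_xfer)
{
	uint32_t n_pkts;

	/* hand the gathered iov iterators to the async channel */
	n_pkts = vq->async_ops.transfer_data(dev->vid, queue_id,
			tdes, 0, nr_xfer);
	vq->async_pkts_inflight_n += n_pkts;

	/* packets the channel did not accept are logged by the caller
	 * and recovered when completions are polled */
	return nr_xfer - n_pkts;
}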

Thanks,
Marvin

> +	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
> +	uint16_t num_buffers;
> +	uint16_t num_desc;
> +
> +	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
> +	struct iovec *vec_pool = vq->vec_pool;
> +	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
> +	struct iovec *src_iovec = vec_pool;
> +	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
> +	struct rte_vhost_iov_iter *src_it = it_pool;
> +	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
> +	uint16_t slot_idx = 0;
> +	uint16_t segs_await = 0;
> +	struct async_inflight_info *pkts_info = vq->async_pkts_info;
> +	uint32_t n_pkts = 0, pkt_err = 0;
> +	uint32_t num_async_pkts = 0, num_done_pkts = 0;
> +	struct {
> +		uint16_t pkt_idx;
> +		uint16_t last_avail_idx;
> +	} async_pkts_log[MAX_PKT_BURST];
> +
> +	rte_prefetch0(&vq->desc[vq->last_avail_idx & (vq->size - 1)]);
> +
> +	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
> +		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq,
> +						pkts[pkt_idx],
> +						&num_desc, &num_buffers,
> +						src_iovec, dst_iovec,
> +						src_it, dst_it) < 0)) {
> +			break;
> +		}
> +
> +		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end
> index %d\n",
> +			dev->vid, vq->last_avail_idx,
> +			vq->last_avail_idx + num_desc);
> +
> +		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
> +			(vq->size - 1);
> +		if (src_it->count) {
> +			uint16_t from, to;
> +
> +			async_fill_desc(&tdes[pkt_burst_idx++], src_it, dst_it);
> +			pkts_info[slot_idx].descs = num_desc;
> +			pkts_info[slot_idx].nr_buffers = num_buffers;
> +			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
> +			async_pkts_log[num_async_pkts].pkt_idx = pkt_idx;
> +			async_pkts_log[num_async_pkts++].last_avail_idx =
> +				vq->last_avail_idx;
> +			src_iovec += src_it->nr_segs;
> +			dst_iovec += dst_it->nr_segs;
> +			src_it += 2;
> +			dst_it += 2;
> +			segs_await += src_it->nr_segs;
> +
> +			/**
> +			 * recover shadow used ring and keep DMA-occupied
> +			 * descriptors.
> +			 */
> +			from = vq->shadow_used_idx - num_buffers;
> +			to = vq->async_packed_buffer_idx & (vq->size - 1);
> +			if (num_buffers + to <= vq->size) {
> +				rte_memcpy(&vq->async_buffers_packed[to],
> +					&vq->shadow_used_packed[from],
> +					num_buffers *
> +					sizeof(struct
> vring_used_elem_packed));
> +			} else {
> +				int size = vq->size - to;
> +
> +				rte_memcpy(&vq->async_buffers_packed[to],
> +					&vq->shadow_used_packed[from],
> +					size *
> +					sizeof(struct
> vring_used_elem_packed));
> +				rte_memcpy(vq->async_buffers_packed,
> +					&vq->shadow_used_packed[from +
> +					size], (num_buffers - size) *
> +					sizeof(struct
> vring_used_elem_packed));
> +			}
> +			vq->async_packed_buffer_idx += num_buffers;
> +			vq->shadow_used_idx -= num_buffers;
> +		} else
> +			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
> +
> +		vq_inc_last_avail_packed(vq, num_desc);
> +
> +		/*
> +		 * conditions to trigger async device transfer:
> +		 * - buffered packet number reaches transfer threshold
> +		 * - unused async iov number is less than max vhost vector
> +		 */
> +		if (unlikely(pkt_burst_idx >=
> VHOST_ASYNC_BATCH_THRESHOLD ||
> +			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await <
> +			BUF_VECTOR_MAX))) {
> +			n_pkts = vq->async_ops.transfer_data(dev->vid,
> +					queue_id, tdes, 0, pkt_burst_idx);
> +			src_iovec = vec_pool;
> +			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >>
> 1);
> +			src_it = it_pool;
> +			dst_it = it_pool + 1;
> +			segs_await = 0;
> +			vq->async_pkts_inflight_n += n_pkts;
> +
> +			if (unlikely(n_pkts < pkt_burst_idx)) {
> +				/*
> +				 * log error packets number here and do
> actual
> +				 * error processing when applications poll
> +				 * completion
> +				 */
> +				pkt_err = pkt_burst_idx - n_pkts;
> +				pkt_burst_idx = 0;
> +				break;
> +			}
> +
> +			pkt_burst_idx = 0;
> +		}
> +	}
> +
> +	if (pkt_burst_idx) {
> +		n_pkts = vq->async_ops.transfer_data(dev->vid,
> +				queue_id, tdes, 0, pkt_burst_idx);
> +		vq->async_pkts_inflight_n += n_pkts;
> +
> +		if (unlikely(n_pkts < pkt_burst_idx))
> +			pkt_err = pkt_burst_idx - n_pkts;
> +	}
> +
> +	do_data_copy_enqueue(dev, vq);
> +
> +	if (unlikely(pkt_err)) {
> +		uint16_t num_buffers = 0;
> +
> +		num_async_pkts -= pkt_err;
> +		/* calculate the sum of descriptors of DMA-error packets. */
> +		while (pkt_err-- > 0) {
> +			num_buffers +=
> +				pkts_info[slot_idx & (vq->size - 1)].nr_buffers;
> +			slot_idx--;
> +		}
> +		vq->async_packed_buffer_idx -= num_buffers;
> +		/* recover shadow used ring and available ring */
> +		vq->shadow_used_idx -= (vq->last_avail_idx -
> +
> 	async_pkts_log[num_async_pkts].last_avail_idx -
> +				num_buffers);

Could it be possible that vq->last_avail_idx is smaller than async_pkts_log[num_async_pkts].last_avail_idx when operations wrap around the ring's boundary?
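
For example, a hypothetical wrap case (numbers illustrative only):

	/* vq->size = 256; last_avail_idx was 250 when this packet was logged.
	 * The following packets consume 10 more descriptors, and
	 * vq_inc_last_avail_packed() wraps the index at vq->size:
	 *     250 + 10 - 256 = 4
	 * so vq->last_avail_idx (4) ends up numerically smaller than the
	 * logged 250, and the subtraction above underflows as uint16_t.
	 */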

> +		vq->last_avail_idx =
> +			async_pkts_log[num_async_pkts].last_avail_idx;
> +		pkt_idx = async_pkts_log[num_async_pkts].pkt_idx;
> +		num_done_pkts = pkt_idx - num_async_pkts;
> +	}
> +
> +	vq->async_pkts_idx += num_async_pkts;
> +	*comp_count = num_done_pkts;
> +
> +	if (likely(vq->shadow_used_idx)) {
> +		vhost_flush_enqueue_shadow_packed(dev, vq);
> +		vhost_vring_call_packed(dev, vq);
> +	}
> +
> +	return pkt_idx;
> +}
> +
> +static __rte_always_inline void
> +vhost_update_used_packed(struct virtio_net *dev,
> +				  struct vhost_virtqueue *vq,
> +				  struct vring_used_elem_packed
> *shadow_ring,
> +				  uint16_t count)
> +{
> +	if (count == 0)
> +		return;
> +	int i;
> +	uint16_t used_idx = vq->last_used_idx;
> +	uint16_t head_idx = vq->last_used_idx;
> +	uint16_t head_flags = 0;
> +
> +	/* Split loop in two to save memory barriers */
> +	for (i = 0; i < count; i++) {
> +		vq->desc_packed[used_idx].id = shadow_ring[i].id;
> +		vq->desc_packed[used_idx].len = shadow_ring[i].len;
> +
> +		used_idx += shadow_ring[i].count;
> +		if (used_idx >= vq->size)
> +			used_idx -= vq->size;
> +	}
> +
> +	/* The ordering for storing desc flags needs to be enforced. */
> +	rte_atomic_thread_fence(__ATOMIC_RELEASE);
> +
> +	for (i = 0; i < count; i++) {
> +		uint16_t flags;
> +
> +		if (vq->shadow_used_packed[i].len)
> +			flags = VRING_DESC_F_WRITE;
> +		else
> +			flags = 0;
> +
> +		if (vq->used_wrap_counter) {
> +			flags |= VRING_DESC_F_USED;
> +			flags |= VRING_DESC_F_AVAIL;
> +		} else {
> +			flags &= ~VRING_DESC_F_USED;
> +			flags &= ~VRING_DESC_F_AVAIL;
> +		}
> +
> +		if (i > 0) {
> +			vq->desc_packed[vq->last_used_idx].flags = flags;
> +
> +			vhost_log_cache_used_vring(dev, vq,
> +					vq->last_used_idx *
> +					sizeof(struct vring_packed_desc),
> +					sizeof(struct vring_packed_desc));
> +		} else {
> +			head_idx = vq->last_used_idx;
> +			head_flags = flags;
> +		}
> +
> +		vq_inc_last_used_packed(vq, shadow_ring[i].count);
> +	}
> +
> +	vq->desc_packed[head_idx].flags = head_flags;
> +
> +	vhost_log_cache_used_vring(dev, vq,
> +				head_idx *
> +				sizeof(struct vring_packed_desc),
> +				sizeof(struct vring_packed_desc));
> +
> +	vhost_log_cache_sync(dev, vq);
> +}
> +
>  uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
>  		struct rte_mbuf **pkts, uint16_t count)
>  {
>  	struct virtio_net *dev = get_device(vid);
>  	struct vhost_virtqueue *vq;
> -	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
> +	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
>  	uint16_t start_idx, pkts_idx, vq_size;
>  	struct async_inflight_info *pkts_info;
>  	uint16_t from, i;
> @@ -1680,53 +2021,96 @@ uint16_t
> rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
>  		goto done;
>  	}
> 
> -	for (i = 0; i < n_pkts_put; i++) {
> -		from = (start_idx + i) & (vq_size - 1);
> -		n_descs += pkts_info[from].descs;
> -		pkts[i] = pkts_info[from].mbuf;
> +	if (vq_is_packed(dev)) {
> +		for (i = 0; i < n_pkts_put; i++) {
> +			from = (start_idx + i) & (vq_size - 1);
> +			n_buffers += pkts_info[from].nr_buffers;
> +			pkts[i] = pkts_info[from].mbuf;
> +		}
> +	} else {
> +		for (i = 0; i < n_pkts_put; i++) {
> +			from = (start_idx + i) & (vq_size - 1);
> +			n_descs += pkts_info[from].descs;
> +			pkts[i] = pkts_info[from].mbuf;
> +		}
>  	}
> +
>  	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
>  	vq->async_pkts_inflight_n -= n_pkts_put;
> 
>  	if (likely(vq->enabled && vq->access_ok)) {
> -		uint16_t nr_left = n_descs;
>  		uint16_t nr_copy;
>  		uint16_t to;
> 
>  		/* write back completed descriptors to used ring */
> -		do {
> -			from = vq->last_async_desc_idx & (vq->size - 1);
> -			nr_copy = nr_left + from <= vq->size ? nr_left :
> -				vq->size - from;
> -			to = vq->last_used_idx & (vq->size - 1);
> -
> -			if (to + nr_copy <= vq->size) {
> -				rte_memcpy(&vq->used->ring[to],
> +		if (vq_is_packed(dev)) {
> +			uint16_t nr_left = n_buffers;
> +			uint16_t to;
> +			do {
> +				from = vq->last_async_buffer_idx &
> +								(vq->size - 1);
> +				to = (from + nr_left) & (vq->size - 1);
> +
> +				if (to > from) {
> +					vhost_update_used_packed(dev, vq,
> +						vq->async_buffers_packed +
> from,
> +						to - from);
> +					vq->last_async_buffer_idx += nr_left;
> +					nr_left = 0;
> +				} else {
> +					vhost_update_used_packed(dev, vq,
> +						vq->async_buffers_packed +
> from,
> +						vq->size - from);
> +					vq->last_async_buffer_idx +=
> +								vq->size -
> from;
> +					nr_left -= vq->size - from;
> +				}
> +			} while (nr_left > 0);
> +			vhost_vring_call_packed(dev, vq);
> +		} else {
> +			uint16_t nr_left = n_descs;
> +			do {
> +				from = vq->last_async_desc_idx & (vq->size -
> 1);
> +				nr_copy = nr_left + from <= vq->size ? nr_left :
> +					vq->size - from;
> +				to = vq->last_used_idx & (vq->size - 1);
> +
> +				if (to + nr_copy <= vq->size) {
> +					rte_memcpy(&vq->used->ring[to],
>  						&vq-
> >async_descs_split[from],
>  						nr_copy *
>  						sizeof(struct
> vring_used_elem));
> -			} else {
> -				uint16_t size = vq->size - to;
> +				} else {
> +					uint16_t size = vq->size - to;
> 
> -				rte_memcpy(&vq->used->ring[to],
> +					rte_memcpy(&vq->used->ring[to],
>  						&vq->async_descs_split[from],
>  						size *
>  						sizeof(struct vring_used_elem));
> -				rte_memcpy(vq->used->ring,
> +					rte_memcpy(vq->used->ring,
>  						&vq->async_descs_split[from +
>  						size], (nr_copy - size) *
>  						sizeof(struct vring_used_elem));
> -			}
> +				}
> +
> +				vq->last_async_desc_idx += nr_copy;
> +				vq->last_used_idx += nr_copy;
> +				nr_left -= nr_copy;
> +			} while (nr_left > 0);
> +
> +			__atomic_add_fetch(&vq->used->idx, n_descs,
> +					__ATOMIC_RELEASE);
> +			vhost_vring_call_split(dev, vq);
> +		}
> 
> -			vq->last_async_desc_idx += nr_copy;
> -			vq->last_used_idx += nr_copy;
> -			nr_left -= nr_copy;
> -		} while (nr_left > 0);
> 
> -		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
> -		vhost_vring_call_split(dev, vq);
> -	} else
> -		vq->last_async_desc_idx += n_descs;
> +
> +	} else {
> +		if (vq_is_packed(dev))
> +			vq->last_async_buffer_idx += n_buffers;
> +		else
> +			vq->last_async_desc_idx += n_descs;
> +	}
> 
>  done:
>  	rte_spinlock_unlock(&vq->access_lock);
> @@ -1767,9 +2151,10 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
>  	if (count == 0)
>  		goto out;
> 
> -	/* TODO: packed queue not implemented */
>  	if (vq_is_packed(dev))
> -		nb_tx = 0;
> +		nb_tx = virtio_dev_rx_async_submit_packed(dev,
> +				vq, queue_id, pkts, count, comp_pkts,
> +				comp_count);
>  	else
>  		nb_tx = virtio_dev_rx_async_submit_split(dev,
>  				vq, queue_id, pkts, count, comp_pkts,
> --
> 2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v2] vhost: add support for packed ring in async vhost
  2021-03-24  9:19   ` Liu, Yong
@ 2021-03-29 12:29     ` Jiang, Cheng1
  0 siblings, 0 replies; 60+ messages in thread
From: Jiang, Cheng1 @ 2021-03-29 12:29 UTC (permalink / raw)
  To: Liu, Yong, maxime.coquelin, Xia, Chenbo
  Cc: dev, Hu, Jiayu, Yang, YvonneX, Wang, Yinan

Hi,

> -----Original Message-----
> From: Liu, Yong <yong.liu@intel.com>
> Sent: Wednesday, March 24, 2021 5:19 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>; maxime.coquelin@redhat.com;
> Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Yang, YvonneX
> <yvonnex.yang@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Jiang,
> Cheng1 <cheng1.jiang@intel.com>
> Subject: RE: [dpdk-dev] [PATCH v2] vhost: add support for packed ring in
> async vhost
> 
> 
> 
> > -----Original Message-----
> > From: dev <dev-bounces@dpdk.org> On Behalf Of Cheng Jiang
> > Sent: Monday, March 22, 2021 2:15 PM
> > To: maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Yang, YvonneX
> > <yvonnex.yang@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Jiang,
> > Cheng1 <cheng1.jiang@intel.com>
> > Subject: [dpdk-dev] [PATCH v2] vhost: add support for packed ring in
> > async vhost
> >
> > For now async vhost data path only supports split ring structure. In
> > order to make async vhost compatible with virtio 1.1 spec this patch
> > enables packed ring in async vhost data path.
> >
> > Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> > ---
> > v2:
> >   * fix wrong buffer index in rte_vhost_poll_enqueue_completed()
> >   * add async_buffers_packed memory free in vhost_free_async_mem()
> >
> >  lib/librte_vhost/rte_vhost_async.h |   1 +
> >  lib/librte_vhost/vhost.c           |  24 +-
> >  lib/librte_vhost/vhost.h           |   7 +-
> >  lib/librte_vhost/virtio_net.c      | 447 +++++++++++++++++++++++++++--
> >  4 files changed, 441 insertions(+), 38 deletions(-)
> >
> > diff --git a/lib/librte_vhost/rte_vhost_async.h
> > b/lib/librte_vhost/rte_vhost_async.h
> > index c855ff875..6faa31f5a 100644
> > --- a/lib/librte_vhost/rte_vhost_async.h
> > +++ b/lib/librte_vhost/rte_vhost_async.h
> > @@ -89,6 +89,7 @@ struct rte_vhost_async_channel_ops {  struct
> > async_inflight_info {  struct rte_mbuf *mbuf;  uint16_t descs; /* num
> > of descs inflight */
> > +uint16_t nr_buffers; /* num of buffers inflight for packed ring */
> >  };
> >
> >  /**
> > diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c index
> > 52ab93d1e..51b44d6f2 100644
> > --- a/lib/librte_vhost/vhost.c
> > +++ b/lib/librte_vhost/vhost.c
> > @@ -330,15 +330,20 @@ vhost_free_async_mem(struct vhost_virtqueue
> > *vq)
> >  {
> >  if (vq->async_pkts_info)
> >  rte_free(vq->async_pkts_info);
> > -if (vq->async_descs_split)
> > +if (vq->async_buffers_packed) {
> > +rte_free(vq->async_buffers_packed);
> > +vq->async_buffers_packed = NULL;
> > +} else {
> >  rte_free(vq->async_descs_split);
> > +vq->async_descs_split = NULL;
> > +}
> > +
> >  if (vq->it_pool)
> >  rte_free(vq->it_pool);
> >  if (vq->vec_pool)
> >  rte_free(vq->vec_pool);
> >
> >  vq->async_pkts_info = NULL;
> > -vq->async_descs_split = NULL;
> >  vq->it_pool = NULL;
> >  vq->vec_pool = NULL;
> >  }
> > @@ -1603,9 +1608,9 @@ int rte_vhost_async_channel_register(int vid,
> > uint16_t queue_id,  return -1;
> >
> >  /* packed queue is not supported */
> > -if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
> > +if (unlikely(!f.async_inorder)) {
> >  VHOST_LOG_CONFIG(ERR,
> > -"async copy is not supported on packed queue or non-inorder mode "
> > +"async copy is not supported on non-inorder mode "
> >  "(vid %d, qid: %d)\n", vid, queue_id);  return -1;  } @@ -1643,10
> > +1648,17 @@ int rte_vhost_async_channel_register(int vid, uint16_t
> > queue_id,  vq->vec_pool = rte_malloc_socket(NULL,
> VHOST_MAX_ASYNC_VEC
> > * sizeof(struct iovec),  RTE_CACHE_LINE_SIZE, node);
> > -vq->async_descs_split = rte_malloc_socket(NULL,
> > +if (vq_is_packed(dev)) {
> > +vq->async_buffers_packed = rte_malloc_socket(NULL,
> > +vq->size * sizeof(struct vring_used_elem_packed),
> > +RTE_CACHE_LINE_SIZE, node);
> > +} else {
> > +vq->async_descs_split = rte_malloc_socket(NULL,
> >  vq->size * sizeof(struct vring_used_elem),  RTE_CACHE_LINE_SIZE,
> > node); -if (!vq->async_descs_split || !vq->async_pkts_info ||
> > +}
> > +
> > +if (!vq->async_pkts_info ||
> >  !vq->it_pool || !vq->vec_pool) {
> >  vhost_free_async_mem(vq);
> >  VHOST_LOG_CONFIG(ERR,
> > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index
> > 658f6fc28..d6324fbf8 100644
> > --- a/lib/librte_vhost/vhost.h
> > +++ b/lib/librte_vhost/vhost.h
> > @@ -206,9 +206,14 @@ struct vhost_virtqueue {  uint16_tasync_pkts_idx;
> > uint16_tasync_pkts_inflight_n;  uint16_tasync_last_pkts_n; -struct
> > vring_used_elem  *async_descs_split;
> > +union {
> > +struct vring_used_elem  *async_descs_split; struct
> > +vring_used_elem_packed *async_buffers_packed; };
> >  uint16_t async_desc_idx;
> > +uint16_t async_packed_buffer_idx;
> >  uint16_t last_async_desc_idx;
> > +uint16_t last_async_buffer_idx;
> >
> >  /* vq async features */
> >  boolasync_inorder;
> > diff --git a/lib/librte_vhost/virtio_net.c
> > b/lib/librte_vhost/virtio_net.c index 583bf379c..fa8c4f4fe 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -363,8 +363,7 @@
> > vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue
> *vq,
> > }
> >
> >  static __rte_always_inline void
> > -vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
> > -   struct vhost_virtqueue *vq,
> > +vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
> >     uint32_t len[],
> >     uint16_t id[],
> >     uint16_t count[],
> > @@ -382,6 +381,17 @@ vhost_shadow_enqueue_single_packed(struct
> > virtio_net *dev,
> >  vq->shadow_aligned_idx += count[i];
> >  vq->shadow_used_idx++;
> >  }
> > +}
> > +
> > +static __rte_always_inline void
> > +vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
> > +   struct vhost_virtqueue *vq,
> > +   uint32_t len[],
> > +   uint16_t id[],
> > +   uint16_t count[],
> > +   uint16_t num_buffers)
> > +{
> > +vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
> >
> >  if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
> > do_data_copy_enqueue(dev, vq); @@ -1633,12 +1643,343 @@
> > virtio_dev_rx_async_submit_split(struct
> > virtio_net *dev,
> >  return pkt_idx;
> >  }
> >
> > +static __rte_always_inline int
> > +vhost_enqueue_async_single_packed(struct virtio_net *dev,
> > +    struct vhost_virtqueue *vq,
> > +    struct rte_mbuf *pkt,
> > +    struct buf_vector *buf_vec,
> > +    uint16_t *nr_descs,
> > +    uint16_t *nr_buffers,
> > +    struct iovec *src_iovec, struct iovec *dst_iovec,
> > +    struct rte_vhost_iov_iter *src_it,
> > +    struct rte_vhost_iov_iter *dst_it) { uint16_t nr_vec = 0;
> > +uint16_t avail_idx = vq->last_avail_idx; uint16_t max_tries, tries =
> > +0; uint16_t buf_id = 0; uint32_t len = 0; uint16_t desc_count;
> > +uint32_t size = pkt->pkt_len + sizeof(struct
> > virtio_net_hdr_mrg_rxbuf);
> > +uint32_t buffer_len[vq->size];
> > +uint16_t buffer_buf_id[vq->size];
> > +uint16_t buffer_desc_count[vq->size]; *nr_buffers = 0;
> > +
> > +if (rxvq_is_mergeable(dev))
> > +max_tries = vq->size - 1;
> > +else
> > +max_tries = 1;
> > +
> > +while (size > 0) {
> > +/*
> > + * if we tried all available ring items, and still
> > + * can't get enough buf, it means something abnormal
> > + * happened.
> > + */
> > +if (unlikely(++tries > max_tries))
> > +return -1;
> > +
> > +if (unlikely(fill_vec_buf_packed(dev, vq, avail_idx, &desc_count,
> > +buf_vec, &nr_vec, &buf_id, &len,
> > +VHOST_ACCESS_RW) < 0))
> > +return -1;
> > +
> > +len = RTE_MIN(len, size);
> > +size -= len;
> > +
> > +buffer_len[*nr_buffers] = len;
> > +buffer_buf_id[*nr_buffers] = buf_id;
> > +buffer_desc_count[*nr_buffers] = desc_count; *nr_buffers += 1;
> > +
> > +*nr_descs += desc_count;
> > +avail_idx += desc_count;
> > +if (avail_idx >= vq->size)
> > +avail_idx -= vq->size;
> > +}
> > +
> > +if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers,
> > +src_iovec, dst_iovec, src_it, dst_it) < 0) return -1;
> > +
> > +vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id,
> > +   buffer_desc_count, *nr_buffers);
> > +
> > +return 0;
> > +}
> > +
> > +static __rte_always_inline int16_t
> > +virtio_dev_rx_async_single_packed(struct virtio_net *dev,
> > +    struct vhost_virtqueue *vq,
> > +    struct rte_mbuf *pkt,
> > +    uint16_t *nr_descs, uint16_t *nr_buffers,
> > +    struct iovec *src_iovec, struct iovec *dst_iovec,
> > +    struct rte_vhost_iov_iter *src_it,
> > +    struct rte_vhost_iov_iter *dst_it) { struct buf_vector
> > +buf_vec[BUF_VECTOR_MAX]; *nr_descs = 0; *nr_buffers = 0;
> > +
> > +if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt,
> > buf_vec,
> > + nr_descs,
> > + nr_buffers,
> > + src_iovec, dst_iovec,
> > + src_it, dst_it) < 0)) {
> > +VHOST_LOG_DATA(DEBUG,
> > +"(%d) failed to get enough desc from vring\n",
> > +dev->vid);
> > +return -1;
> > +}
> > +
> > +VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end
> > index %d\n",
> > +dev->vid, vq->last_avail_idx,
> > +vq->last_avail_idx + *nr_descs);
> > +
> > +return 0;
> > +}
> > +
> > +static __rte_noinline uint32_t
> > +virtio_dev_rx_async_submit_packed(struct virtio_net *dev, struct
> > +vhost_virtqueue *vq, uint16_t queue_id, struct rte_mbuf **pkts,
> > +uint32_t count, struct rte_mbuf **comp_pkts, uint32_t *comp_count) {
> 
> Hi Cheng,
> There are some common parts in virtio_dev_rx_async_submit_packed and
> virtio_dev_rx_async_submit_split.
> We could abstract those common parts into functions, which would bring
> more clarity.

Sure, but the structures and variables used by the packed ring and the split ring are different, so the common code may not be very suitable for abstraction. I will consider it again, thank you.
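
As an illustration only (the helper below is hypothetical and not part of
this patch): the wrap-around copy from the shadow ring into the async ring
differs between the two paths mainly in the element type, so it could be
expressed as a size-parameterized helper, at the cost of losing the typed
rte_memcpy calls:

/* Hypothetical sketch: copy 'count' elements of 'elem_size' bytes from
 * 'src' into the circular buffer 'dst' of 'ring_size' elements, starting
 * at slot 'to' and wrapping back to slot 0 when necessary. */
static __rte_always_inline void
async_ring_copy(void *dst, uint16_t to, uint16_t ring_size,
		const void *src, uint16_t count, size_t elem_size)
{
	if (to + count <= ring_size) {
		rte_memcpy((char *)dst + (size_t)to * elem_size,
				src, (size_t)count * elem_size);
	} else {
		uint16_t part = ring_size - to;

		rte_memcpy((char *)dst + (size_t)to * elem_size,
				src, (size_t)part * elem_size);
		rte_memcpy(dst,
				(const char *)src + (size_t)part * elem_size,
				(size_t)(count - part) * elem_size);
	}
}

The packed path would pass sizeof(struct vring_used_elem_packed) and the
split path sizeof(struct vring_used_elem); whether that reads better than
two open-coded copies is exactly the trade-off being discussed here.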

> 
> Also, this patch may be too large to review in one piece; please split it
> into a few parts for better understanding.

I'll make it better in the next version.

> 
> Thanks,
> Marvin
> 
> > +uint32_t pkt_idx = 0, pkt_burst_idx = 0; uint16_t num_buffers;
> > +uint16_t num_desc;
> > +
> > +struct rte_vhost_iov_iter *it_pool = vq->it_pool; struct iovec
> > +*vec_pool = vq->vec_pool; struct rte_vhost_async_desc
> > +tdes[MAX_PKT_BURST]; struct iovec *src_iovec = vec_pool; struct iovec
> > +*dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1); struct
> > +rte_vhost_iov_iter *src_it = it_pool; struct rte_vhost_iov_iter
> > +*dst_it = it_pool + 1; uint16_t slot_idx = 0; uint16_t segs_await =
> > +0; struct async_inflight_info *pkts_info = vq->async_pkts_info;
> > +uint32_t n_pkts = 0, pkt_err = 0; uint32_t num_async_pkts = 0,
> > +num_done_pkts = 0; struct { uint16_t pkt_idx; uint16_t
> > +last_avail_idx; } async_pkts_log[MAX_PKT_BURST];
> > +
> > +rte_prefetch0(&vq->desc[vq->last_avail_idx & (vq->size - 1)]);
> > +
> > +for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { if
> > +(unlikely(virtio_dev_rx_async_single_packed(dev, vq, pkts[pkt_idx],
> > +&num_desc, &num_buffers, src_iovec, dst_iovec, src_it, dst_it) < 0))
> > +{ break; }
> > +
> > +VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end
> > index %d\n",
> > +dev->vid, vq->last_avail_idx,
> > +vq->last_avail_idx + num_desc);
> > +
> > +slot_idx = (vq->async_pkts_idx + num_async_pkts) & (vq->size - 1); if
> > +(src_it->count) { uint16_t from, to;
> > +
> > +async_fill_desc(&tdes[pkt_burst_idx++], src_it, dst_it);
> > +pkts_info[slot_idx].descs = num_desc; pkts_info[slot_idx].nr_buffers
> > += num_buffers; pkts_info[slot_idx].mbuf = pkts[pkt_idx];
> > +async_pkts_log[num_async_pkts].pkt_idx = pkt_idx;
> > +async_pkts_log[num_async_pkts++].last_avail_idx =
> > +vq->last_avail_idx;
> > +src_iovec += src_it->nr_segs;
> > +dst_iovec += dst_it->nr_segs;
> > +src_it += 2;
> > +dst_it += 2;
> > +segs_await += src_it->nr_segs;
> > +
> > +/**
> > + * recover shadow used ring and keep DMA-occupied
> > + * descriptors.
> > + */
> > +from = vq->shadow_used_idx - num_buffers; to =
> > +vq->async_packed_buffer_idx & (vq->size - 1); if (num_buffers + to <=
> > +vq->size) { rte_memcpy(&vq->async_buffers_packed[to],
> > +&vq->shadow_used_packed[from],
> > +num_buffers *
> > +sizeof(struct
> > vring_used_elem_packed));
> > +} else {
> > +int size = vq->size - to;
> > +
> > +rte_memcpy(&vq->async_buffers_packed[to],
> > +&vq->shadow_used_packed[from],
> > +size *
> > +sizeof(struct
> > vring_used_elem_packed));
> > +rte_memcpy(vq->async_buffers_packed,
> > +&vq->shadow_used_packed[from +
> > +size], (num_buffers - size) *
> > +sizeof(struct
> > vring_used_elem_packed));
> > +}
> > +vq->async_packed_buffer_idx += num_buffers;
> > +vq->shadow_used_idx -= num_buffers;
> > +} else
> > +comp_pkts[num_done_pkts++] = pkts[pkt_idx];
> > +
> > +vq_inc_last_avail_packed(vq, num_desc);
> > +
> > +/*
> > + * conditions to trigger async device transfer:
> > + * - buffered packet number reaches transfer threshold
> > + * - unused async iov number is less than max vhost vector  */ if
> > +(unlikely(pkt_burst_idx >=
> > VHOST_ASYNC_BATCH_THRESHOLD ||
> > +((VHOST_MAX_ASYNC_VEC >> 1) - segs_await <
> > +BUF_VECTOR_MAX))) {
> > +n_pkts = vq->async_ops.transfer_data(dev->vid,
> > +queue_id, tdes, 0, pkt_burst_idx);
> > +src_iovec = vec_pool;
> > +dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >>
> > 1);
> > +src_it = it_pool;
> > +dst_it = it_pool + 1;
> > +segs_await = 0;
> > +vq->async_pkts_inflight_n += n_pkts;
> > +
> > +if (unlikely(n_pkts < pkt_burst_idx)) {
> > +/*
> > + * log error packets number here and do
> > actual
> > + * error processing when applications poll
> > + * completion
> > + */
> > +pkt_err = pkt_burst_idx - n_pkts;
> > +pkt_burst_idx = 0;
> > +break;
> > +}
> > +
> > +pkt_burst_idx = 0;
> > +}
> > +}
> > +
> > +if (pkt_burst_idx) {
> > +n_pkts = vq->async_ops.transfer_data(dev->vid,
> > +queue_id, tdes, 0, pkt_burst_idx);
> > +vq->async_pkts_inflight_n += n_pkts;
> > +
> > +if (unlikely(n_pkts < pkt_burst_idx)) pkt_err = pkt_burst_idx -
> > +n_pkts; }
> > +
> > +do_data_copy_enqueue(dev, vq);
> > +
> > +if (unlikely(pkt_err)) {
> > +uint16_t num_buffers = 0;
> > +
> > +num_async_pkts -= pkt_err;
> > +/* calculate the sum of descriptors of DMA-error packets. */ while
> > +(pkt_err-- > 0) { num_buffers += pkts_info[slot_idx & (vq->size -
> > +1)].nr_buffers; slot_idx--; }
> > +vq->async_packed_buffer_idx -= num_buffers;
> > +/* recover shadow used ring and available ring */
> > +vq->shadow_used_idx -= (vq->last_avail_idx -
> > +
> > async_pkts_log[num_async_pkts].last_avail_idx -
> > +num_buffers);
> 
> Could it be possible that vq->last_avail_idx is smaller than
> async_pkts_log[num_async_pkts].last_avail_idx when operating near the
> ring's boundary?

Yes, you are right. Will be fixed, thanks.

Cheng
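
For illustration only (not part of the patch): because the packed ring's
last_avail_idx wraps back around at vq->size, the error-recovery path has
to compute the distance between the current and the logged index with
ring-modulo arithmetic instead of a plain subtraction. A minimal sketch,
assuming the two indices are less than one full ring apart:

/* Hypothetical helper: number of slots advanced from 'logged' to 'cur'
 * on a ring of 'ring_size' entries, tolerating a wrap past zero. */
static inline uint16_t
packed_ring_distance(uint16_t cur, uint16_t logged, uint16_t ring_size)
{
	return (uint16_t)((cur + ring_size - logged) % ring_size);
}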

> 
> > +vq->last_avail_idx =
> > +async_pkts_log[num_async_pkts].last_avail_idx;
> > +pkt_idx = async_pkts_log[num_async_pkts].pkt_idx;
> > +num_done_pkts = pkt_idx - num_async_pkts; }
> > +
> > +vq->async_pkts_idx += num_async_pkts;
> > +*comp_count = num_done_pkts;
> > +
> > +if (likely(vq->shadow_used_idx)) {
> > +vhost_flush_enqueue_shadow_packed(dev, vq);
> > +vhost_vring_call_packed(dev, vq); }
> > +
> > +return pkt_idx;
> > +}
> > +
> > +static __rte_always_inline void
> > +vhost_update_used_packed(struct virtio_net *dev,
> > +  struct vhost_virtqueue *vq,
> > +  struct vring_used_elem_packed
> > *shadow_ring,
> > +  uint16_t count)
> > +{
> > +if (count == 0)
> > +return;
> > +int i;
> > +uint16_t used_idx = vq->last_used_idx; uint16_t head_idx =
> > +vq->last_used_idx; uint16_t head_flags = 0;
> > +
> > +/* Split loop in two to save memory barriers */ for (i = 0; i <
> > +count; i++) {
> > +vq->desc_packed[used_idx].id = shadow_ring[i].id;
> > +vq->desc_packed[used_idx].len = shadow_ring[i].len;
> > +
> > +used_idx += shadow_ring[i].count;
> > +if (used_idx >= vq->size)
> > +used_idx -= vq->size;
> > +}
> > +
> > +/* The ordering for storing desc flags needs to be enforced. */
> > +rte_atomic_thread_fence(__ATOMIC_RELEASE);
> > +
> > +for (i = 0; i < count; i++) {
> > +uint16_t flags;
> > +
> > +if (vq->shadow_used_packed[i].len)
> > +flags = VRING_DESC_F_WRITE;
> > +else
> > +flags = 0;
> > +
> > +if (vq->used_wrap_counter) {
> > +flags |= VRING_DESC_F_USED;
> > +flags |= VRING_DESC_F_AVAIL;
> > +} else {
> > +flags &= ~VRING_DESC_F_USED;
> > +flags &= ~VRING_DESC_F_AVAIL;
> > +}
> > +
> > +if (i > 0) {
> > +vq->desc_packed[vq->last_used_idx].flags = flags;
> > +
> > +vhost_log_cache_used_vring(dev, vq,
> > +vq->last_used_idx *
> > +sizeof(struct vring_packed_desc),
> > +sizeof(struct vring_packed_desc));
> > +} else {
> > +head_idx = vq->last_used_idx;
> > +head_flags = flags;
> > +}
> > +
> > +vq_inc_last_used_packed(vq, shadow_ring[i].count); }
> > +
> > +vq->desc_packed[head_idx].flags = head_flags;
> > +
> > +vhost_log_cache_used_vring(dev, vq,
> > +head_idx *
> > +sizeof(struct vring_packed_desc),
> > +sizeof(struct vring_packed_desc));
> > +
> > +vhost_log_cache_sync(dev, vq);
> > +}
> > +
> >  uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
> > struct rte_mbuf **pkts, uint16_t count)  {  struct virtio_net *dev =
> > get_device(vid);  struct vhost_virtqueue *vq; -uint16_t n_pkts_cpl =
> > 0, n_pkts_put = 0, n_descs = 0;
> > +uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
> >  uint16_t start_idx, pkts_idx, vq_size;  struct async_inflight_info
> > *pkts_info;  uint16_t from, i; @@ -1680,53 +2021,96 @@ uint16_t
> > rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,  goto
> > done;  }
> >
> > -for (i = 0; i < n_pkts_put; i++) {
> > -from = (start_idx + i) & (vq_size - 1); -n_descs +=
> > pkts_info[from].descs; -pkts[i] = pkts_info[from].mbuf;
> > +if (vq_is_packed(dev)) {
> > +for (i = 0; i < n_pkts_put; i++) {
> > +from = (start_idx + i) & (vq_size - 1); n_buffers +=
> > +pkts_info[from].nr_buffers; pkts[i] = pkts_info[from].mbuf; } } else
> > +{ for (i = 0; i < n_pkts_put; i++) { from = (start_idx + i) &
> > +(vq_size - 1); n_descs += pkts_info[from].descs; pkts[i] =
> > +pkts_info[from].mbuf; }
> >  }
> > +
> >  vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
> > vq->async_pkts_inflight_n -= n_pkts_put;
> >
> >  if (likely(vq->enabled && vq->access_ok)) { -uint16_t nr_left =
> > n_descs;  uint16_t nr_copy;  uint16_t to;
> >
> >  /* write back completed descriptors to used ring */ -do { -from =
> > vq->last_async_desc_idx & (vq->size - 1); -nr_copy = nr_left + from <=
> > vq->size ? nr_left :
> > -vq->size - from;
> > -to = vq->last_used_idx & (vq->size - 1);
> > -
> > -if (to + nr_copy <= vq->size) {
> > -rte_memcpy(&vq->used->ring[to],
> > +if (vq_is_packed(dev)) {
> > +uint16_t nr_left = n_buffers;
> > +uint16_t to;
> > +do {
> > +from = vq->last_async_buffer_idx &
> > +(vq->size - 1);
> > +to = (from + nr_left) & (vq->size - 1);
> > +
> > +if (to > from) {
> > +vhost_update_used_packed(dev, vq,
> > +vq->async_buffers_packed +
> > from,
> > +to - from);
> > +vq->last_async_buffer_idx += nr_left;
> > +nr_left = 0;
> > +} else {
> > +vhost_update_used_packed(dev, vq,
> > +vq->async_buffers_packed +
> > from,
> > +vq->size - from);
> > +vq->last_async_buffer_idx +=
> > +vq->size -
> > from;
> > +nr_left -= vq->size - from;
> > +}
> > +} while (nr_left > 0);
> > +vhost_vring_call_packed(dev, vq);
> > +} else {
> > +uint16_t nr_left = n_descs;
> > +do {
> > +from = vq->last_async_desc_idx & (vq->size -
> > 1);
> > +nr_copy = nr_left + from <= vq->size ? nr_left :
> > +vq->size - from;
> > +to = vq->last_used_idx & (vq->size - 1);
> > +
> > +if (to + nr_copy <= vq->size) {
> > +rte_memcpy(&vq->used->ring[to],
> >  &vq-
> > >async_descs_split[from],
> >  nr_copy *
> >  sizeof(struct
> > vring_used_elem));
> > -} else {
> > -uint16_t size = vq->size - to;
> > +} else {
> > +uint16_t size = vq->size - to;
> >
> > -rte_memcpy(&vq->used->ring[to],
> > +rte_memcpy(&vq->used->ring[to],
> >  &vq-
> > >async_descs_split[from],
> >  size *
> >  sizeof(struct
> > vring_used_elem));
> > -rte_memcpy(vq->used->ring,
> > +rte_memcpy(vq->used->ring,
> >  &vq->async_descs_split[from
> > +
> >  size], (nr_copy - size) *
> >  sizeof(struct
> > vring_used_elem));
> > -}
> > +}
> > +
> > +vq->last_async_desc_idx += nr_copy;
> > +vq->last_used_idx += nr_copy;
> > +nr_left -= nr_copy;
> > +} while (nr_left > 0);
> > +
> > +__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
> > +vhost_vring_call_split(dev, vq); }
> >
> > -vq->last_async_desc_idx += nr_copy;
> > -vq->last_used_idx += nr_copy;
> > -nr_left -= nr_copy;
> > -} while (nr_left > 0);
> >
> > -__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
> > -vhost_vring_call_split(dev, vq); -} else
> > -vq->last_async_desc_idx += n_descs;
> > +
> > +} else {
> > +if (vq_is_packed(dev))
> > +vq->last_async_buffer_idx += n_buffers;
> > +else
> > +vq->last_async_desc_idx += n_descs;
> > +}
> >
> >  done:
> >  rte_spinlock_unlock(&vq->access_lock);
> > @@ -1767,9 +2151,10 @@ virtio_dev_rx_async_submit(struct virtio_net
> > *dev, uint16_t queue_id,  if (count == 0)  goto out;
> >
> > -/* TODO: packed queue not implemented */  if (vq_is_packed(dev))
> > -nb_tx = 0;
> > +nb_tx = virtio_dev_rx_async_submit_packed(dev,
> > +vq, queue_id, pkts, count, comp_pkts, comp_count);
> >  else
> >  nb_tx = virtio_dev_rx_async_submit_split(dev,
> >  vq, queue_id, pkts, count, comp_pkts,
> > --
> > 2.29.2
> 


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v3] vhost: add support for packed ring in async vhost
  2021-03-17  8:54 [dpdk-dev] [PATCH] vhost: add support for packed ring in async vhost Cheng Jiang
  2021-03-22  6:15 ` [dpdk-dev] [PATCH v2] " Cheng Jiang
@ 2021-03-31 14:06 ` Cheng Jiang
  2021-04-07  6:26   ` Hu, Jiayu
  2021-04-10 10:25 ` [dpdk-dev] [PATCH v4 0/4] " Cheng Jiang
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 60+ messages in thread
From: Cheng Jiang @ 2021-03-31 14:06 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, Cheng Jiang

For now async vhost data path only supports split ring structure. In
order to make async vhost compatible with virtio 1.1 spec this patch
enables packed ring in async vhost data path.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
v3:
  * fix error handler for DMA-copy packet
  * remove variables that are no longer needed
v2:
  * fix wrong buffer index in rte_vhost_poll_enqueue_completed()
  * add async_buffers_packed memory free in vhost_free_async_mem()

 lib/librte_vhost/rte_vhost_async.h |   1 +
 lib/librte_vhost/vhost.c           |  24 +-
 lib/librte_vhost/vhost.h           |   7 +-
 lib/librte_vhost/virtio_net.c      | 463 +++++++++++++++++++++++++++--
 4 files changed, 457 insertions(+), 38 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost_async.h b/lib/librte_vhost/rte_vhost_async.h
index c855ff875..6faa31f5a 100644
--- a/lib/librte_vhost/rte_vhost_async.h
+++ b/lib/librte_vhost/rte_vhost_async.h
@@ -89,6 +89,7 @@ struct rte_vhost_async_channel_ops {
 struct async_inflight_info {
 	struct rte_mbuf *mbuf;
 	uint16_t descs; /* num of descs inflight */
+	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
 };

 /**
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 52ab93d1e..51b44d6f2 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -330,15 +330,20 @@ vhost_free_async_mem(struct vhost_virtqueue *vq)
 {
 	if (vq->async_pkts_info)
 		rte_free(vq->async_pkts_info);
-	if (vq->async_descs_split)
+	if (vq->async_buffers_packed) {
+		rte_free(vq->async_buffers_packed);
+		vq->async_buffers_packed = NULL;
+	} else {
 		rte_free(vq->async_descs_split);
+		vq->async_descs_split = NULL;
+	}
+
 	if (vq->it_pool)
 		rte_free(vq->it_pool);
 	if (vq->vec_pool)
 		rte_free(vq->vec_pool);

 	vq->async_pkts_info = NULL;
-	vq->async_descs_split = NULL;
 	vq->it_pool = NULL;
 	vq->vec_pool = NULL;
 }
@@ -1603,9 +1608,9 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 		return -1;

 	/* packed queue is not supported */
-	if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
+	if (unlikely(!f.async_inorder)) {
 		VHOST_LOG_CONFIG(ERR,
-			"async copy is not supported on packed queue or non-inorder mode "
+			"async copy is not supported on non-inorder mode "
 			"(vid %d, qid: %d)\n", vid, queue_id);
 		return -1;
 	}
@@ -1643,10 +1648,17 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	vq->vec_pool = rte_malloc_socket(NULL,
 			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
 			RTE_CACHE_LINE_SIZE, node);
-	vq->async_descs_split = rte_malloc_socket(NULL,
+	if (vq_is_packed(dev)) {
+		vq->async_buffers_packed = rte_malloc_socket(NULL,
+			vq->size * sizeof(struct vring_used_elem_packed),
+			RTE_CACHE_LINE_SIZE, node);
+	} else {
+		vq->async_descs_split = rte_malloc_socket(NULL,
 			vq->size * sizeof(struct vring_used_elem),
 			RTE_CACHE_LINE_SIZE, node);
-	if (!vq->async_descs_split || !vq->async_pkts_info ||
+	}
+
+	if (!vq->async_pkts_info ||
 		!vq->it_pool || !vq->vec_pool) {
 		vhost_free_async_mem(vq);
 		VHOST_LOG_CONFIG(ERR,
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 658f6fc28..d6324fbf8 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -206,9 +206,14 @@ struct vhost_virtqueue {
 	uint16_t	async_pkts_idx;
 	uint16_t	async_pkts_inflight_n;
 	uint16_t	async_last_pkts_n;
-	struct vring_used_elem  *async_descs_split;
+	union {
+		struct vring_used_elem  *async_descs_split;
+		struct vring_used_elem_packed *async_buffers_packed;
+	};
 	uint16_t async_desc_idx;
+	uint16_t async_packed_buffer_idx;
 	uint16_t last_async_desc_idx;
+	uint16_t last_async_buffer_idx;

 	/* vq async features */
 	bool		async_inorder;
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 583bf379c..fa2dfde02 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -363,8 +363,7 @@ vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
 }

 static __rte_always_inline void
-vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
-				   struct vhost_virtqueue *vq,
+vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
 				   uint32_t len[],
 				   uint16_t id[],
 				   uint16_t count[],
@@ -382,6 +381,17 @@ vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
 		vq->shadow_aligned_idx += count[i];
 		vq->shadow_used_idx++;
 	}
+}
+
+static __rte_always_inline void
+vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
+				   struct vhost_virtqueue *vq,
+				   uint32_t len[],
+				   uint16_t id[],
+				   uint16_t count[],
+				   uint16_t num_buffers)
+{
+	vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);

 	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
 		do_data_copy_enqueue(dev, vq);
@@ -1452,6 +1462,73 @@ virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
 		(vq_size - n_inflight + pkts_idx) & (vq_size - 1);
 }

+static __rte_always_inline void
+vhost_update_used_packed(struct virtio_net *dev,
+				  struct vhost_virtqueue *vq,
+				  struct vring_used_elem_packed *shadow_ring,
+				  uint16_t count)
+{
+	if (count == 0)
+		return;
+	int i;
+	uint16_t used_idx = vq->last_used_idx;
+	uint16_t head_idx = vq->last_used_idx;
+	uint16_t head_flags = 0;
+
+	/* Split loop in two to save memory barriers */
+	for (i = 0; i < count; i++) {
+		vq->desc_packed[used_idx].id = shadow_ring[i].id;
+		vq->desc_packed[used_idx].len = shadow_ring[i].len;
+
+		used_idx += shadow_ring[i].count;
+		if (used_idx >= vq->size)
+			used_idx -= vq->size;
+	}
+
+	/* The ordering for storing desc flags needs to be enforced. */
+	rte_atomic_thread_fence(__ATOMIC_RELEASE);
+
+	for (i = 0; i < count; i++) {
+		uint16_t flags;
+
+		if (vq->shadow_used_packed[i].len)
+			flags = VRING_DESC_F_WRITE;
+		else
+			flags = 0;
+
+		if (vq->used_wrap_counter) {
+			flags |= VRING_DESC_F_USED;
+			flags |= VRING_DESC_F_AVAIL;
+		} else {
+			flags &= ~VRING_DESC_F_USED;
+			flags &= ~VRING_DESC_F_AVAIL;
+		}
+
+		if (i > 0) {
+			vq->desc_packed[vq->last_used_idx].flags = flags;
+
+			vhost_log_cache_used_vring(dev, vq,
+					vq->last_used_idx *
+					sizeof(struct vring_packed_desc),
+					sizeof(struct vring_packed_desc));
+		} else {
+			head_idx = vq->last_used_idx;
+			head_flags = flags;
+		}
+
+		vq_inc_last_used_packed(vq, shadow_ring[i].count);
+	}
+
+	vq->desc_packed[head_idx].flags = head_flags;
+
+	vhost_log_cache_used_vring(dev, vq,
+				head_idx *
+				sizeof(struct vring_packed_desc),
+				sizeof(struct vring_packed_desc));
+
+	vhost_log_cache_sync(dev, vq);
+}
+
 static __rte_noinline uint32_t
 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct vhost_virtqueue *vq, uint16_t queue_id,
@@ -1633,12 +1710,292 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	return pkt_idx;
 }

+static __rte_always_inline int
+vhost_enqueue_async_single_packed(struct virtio_net *dev,
+			    struct vhost_virtqueue *vq,
+			    struct rte_mbuf *pkt,
+			    struct buf_vector *buf_vec,
+			    uint16_t *nr_descs,
+			    uint16_t *nr_buffers,
+			    struct iovec *src_iovec, struct iovec *dst_iovec,
+			    struct rte_vhost_iov_iter *src_it,
+			    struct rte_vhost_iov_iter *dst_it)
+{
+	uint16_t nr_vec = 0;
+	uint16_t avail_idx = vq->last_avail_idx;
+	uint16_t max_tries, tries = 0;
+	uint16_t buf_id = 0;
+	uint32_t len = 0;
+	uint16_t desc_count;
+	uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	uint32_t buffer_len[vq->size];
+	uint16_t buffer_buf_id[vq->size];
+	uint16_t buffer_desc_count[vq->size];
+	*nr_buffers = 0;
+
+	if (rxvq_is_mergeable(dev))
+		max_tries = vq->size - 1;
+	else
+		max_tries = 1;
+
+	while (size > 0) {
+		/*
+		 * if we tried all available ring items, and still
+		 * can't get enough buf, it means something abnormal
+		 * happened.
+		 */
+		if (unlikely(++tries > max_tries))
+			return -1;
+
+		if (unlikely(fill_vec_buf_packed(dev, vq,
+						avail_idx, &desc_count,
+						buf_vec, &nr_vec,
+						&buf_id, &len,
+						VHOST_ACCESS_RW) < 0))
+			return -1;
+
+		len = RTE_MIN(len, size);
+		size -= len;
+
+		buffer_len[*nr_buffers] = len;
+		buffer_buf_id[*nr_buffers] = buf_id;
+		buffer_desc_count[*nr_buffers] = desc_count;
+		*nr_buffers += 1;
+
+		*nr_descs += desc_count;
+		avail_idx += desc_count;
+		if (avail_idx >= vq->size)
+			avail_idx -= vq->size;
+	}
+
+	if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers,
+		src_iovec, dst_iovec, src_it, dst_it) < 0)
+		return -1;
+
+	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id,
+					   buffer_desc_count, *nr_buffers);
+
+	return 0;
+}
+
+static __rte_always_inline int16_t
+virtio_dev_rx_async_single_packed(struct virtio_net *dev,
+			    struct vhost_virtqueue *vq,
+			    struct rte_mbuf *pkt,
+			    uint16_t *nr_descs, uint16_t *nr_buffers,
+			    struct iovec *src_iovec, struct iovec *dst_iovec,
+			    struct rte_vhost_iov_iter *src_it,
+			    struct rte_vhost_iov_iter *dst_it)
+{
+	struct buf_vector buf_vec[BUF_VECTOR_MAX];
+	*nr_descs = 0;
+	*nr_buffers = 0;
+
+	if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt, buf_vec,
+						 nr_descs,
+						 nr_buffers,
+						 src_iovec, dst_iovec,
+						 src_it, dst_it) < 0)) {
+		VHOST_LOG_DATA(DEBUG,
+				"(%d) failed to get enough desc from vring\n",
+				dev->vid);
+		return -1;
+	}
+
+	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
+			dev->vid, vq->last_avail_idx,
+			vq->last_avail_idx + *nr_descs);
+
+	return 0;
+}
+
+static __rte_noinline uint32_t
+virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
+	struct vhost_virtqueue *vq, uint16_t queue_id,
+	struct rte_mbuf **pkts, uint32_t count,
+	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
+{
+	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
+	uint16_t num_buffers;
+	uint16_t num_desc;
+
+	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
+	struct iovec *vec_pool = vq->vec_pool;
+	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
+	struct iovec *src_iovec = vec_pool;
+	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+	struct rte_vhost_iov_iter *src_it = it_pool;
+	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
+	uint16_t slot_idx = 0;
+	uint16_t segs_await = 0;
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
+	uint32_t n_pkts = 0, pkt_err = 0;
+	uint32_t num_async_pkts = 0, num_done_pkts = 0;
+
+	rte_prefetch0(&vq->desc[vq->last_avail_idx & (vq->size - 1)]);
+
+	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq,
+						pkts[pkt_idx],
+						&num_desc, &num_buffers,
+						src_iovec, dst_iovec,
+						src_it, dst_it) < 0)) {
+			break;
+		}
+
+		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
+			dev->vid, vq->last_avail_idx,
+			vq->last_avail_idx + num_desc);
+
+		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
+			(vq->size - 1);
+		if (src_it->count) {
+			uint16_t from, to;
+
+			async_fill_desc(&tdes[pkt_burst_idx++], src_it, dst_it);
+			pkts_info[slot_idx].descs = num_desc;
+			pkts_info[slot_idx].nr_buffers = num_buffers;
+			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
+			num_async_pkts++;
+			src_iovec += src_it->nr_segs;
+			dst_iovec += dst_it->nr_segs;
+			src_it += 2;
+			dst_it += 2;
+			segs_await += src_it->nr_segs;
+
+			/**
+			 * recover shadow used ring and keep DMA-occupied
+			 * descriptors.
+			 */
+			from = vq->shadow_used_idx - num_buffers;
+			to = vq->async_packed_buffer_idx & (vq->size - 1);
+			if (num_buffers + to <= vq->size) {
+				rte_memcpy(&vq->async_buffers_packed[to],
+					&vq->shadow_used_packed[from],
+					num_buffers *
+					sizeof(struct vring_used_elem_packed));
+			} else {
+				int size = vq->size - to;
+
+				rte_memcpy(&vq->async_buffers_packed[to],
+					&vq->shadow_used_packed[from],
+					size *
+					sizeof(struct vring_used_elem_packed));
+				rte_memcpy(vq->async_buffers_packed,
+					&vq->shadow_used_packed[from +
+					size], (num_buffers - size) *
+					sizeof(struct vring_used_elem_packed));
+			}
+			vq->async_packed_buffer_idx += num_buffers;
+			vq->shadow_used_idx -= num_buffers;
+		} else
+			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
+
+		vq_inc_last_avail_packed(vq, num_desc);
+
+		/*
+		 * conditions to trigger async device transfer:
+		 * - buffered packet number reaches transfer threshold
+		 * - unused async iov number is less than max vhost vector
+		 */
+		if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
+			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await <
+			BUF_VECTOR_MAX))) {
+			n_pkts = vq->async_ops.transfer_data(dev->vid,
+					queue_id, tdes, 0, pkt_burst_idx);
+			src_iovec = vec_pool;
+			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+			src_it = it_pool;
+			dst_it = it_pool + 1;
+			segs_await = 0;
+			vq->async_pkts_inflight_n += n_pkts;
+
+			if (unlikely(n_pkts < pkt_burst_idx)) {
+				/*
+				 * log error packets number here and do actual
+				 * error processing when applications poll
+				 * completion
+				 */
+				pkt_err = pkt_burst_idx - n_pkts;
+				pkt_burst_idx = 0;
+				pkt_idx++;
+				break;
+			}
+
+			pkt_burst_idx = 0;
+		}
+	}
+
+	if (pkt_burst_idx) {
+		n_pkts = vq->async_ops.transfer_data(dev->vid,
+				queue_id, tdes, 0, pkt_burst_idx);
+		vq->async_pkts_inflight_n += n_pkts;
+
+		if (unlikely(n_pkts < pkt_burst_idx))
+			pkt_err = pkt_burst_idx - n_pkts;
+	}
+
+	do_data_copy_enqueue(dev, vq);
+
+	if (unlikely(pkt_err)) {
+		uint16_t buffers_err = 0;
+		uint16_t async_buffer_idx;
+		uint16_t i;
+
+		num_async_pkts -= pkt_err;
+		pkt_idx -= pkt_err;
+		/* calculate the sum of buffers of DMA-error packets. */
+		while (pkt_err-- > 0) {
+			buffers_err +=
+				pkts_info[slot_idx & (vq->size - 1)].nr_buffers;
+			slot_idx--;
+		}
+
+		vq->async_packed_buffer_idx -= buffers_err;
+		async_buffer_idx = vq->async_packed_buffer_idx;
+		/* set 0 to the length of descriptors of DMA-error packets */
+		for (i = 0; i < buffers_err; i++) {
+			vq->async_buffers_packed[(async_buffer_idx + i)
+						& (vq->size - 1)].len = 0;
+		}
+		/* write back DMA-error descriptors to used ring */
+		do {
+			uint16_t from = async_buffer_idx & (vq->size - 1);
+			uint16_t to = (from + buffers_err) & (vq->size - 1);
+
+			if (to > from) {
+				vhost_update_used_packed(dev, vq,
+					vq->async_buffers_packed + from,
+					to - from);
+				buffers_err = 0;
+			} else {
+				vhost_update_used_packed(dev, vq,
+					vq->async_buffers_packed + from,
+					vq->size - from);
+				buffers_err -= vq->size - from;
+			}
+		} while (buffers_err > 0);
+		vhost_vring_call_packed(dev, vq);
+		num_done_pkts = pkt_idx - num_async_pkts;
+	}
+
+	vq->async_pkts_idx += num_async_pkts;
+	*comp_count = num_done_pkts;
+
+	if (likely(vq->shadow_used_idx)) {
+		vhost_flush_enqueue_shadow_packed(dev, vq);
+		vhost_vring_call_packed(dev, vq);
+	}
+
+	return pkt_idx;
+}
+
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count)
 {
 	struct virtio_net *dev = get_device(vid);
 	struct vhost_virtqueue *vq;
-	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
+	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
 	uint16_t start_idx, pkts_idx, vq_size;
 	struct async_inflight_info *pkts_info;
 	uint16_t from, i;
@@ -1680,53 +2037,96 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		goto done;
 	}

-	for (i = 0; i < n_pkts_put; i++) {
-		from = (start_idx + i) & (vq_size - 1);
-		n_descs += pkts_info[from].descs;
-		pkts[i] = pkts_info[from].mbuf;
+	if (vq_is_packed(dev)) {
+		for (i = 0; i < n_pkts_put; i++) {
+			from = (start_idx + i) & (vq_size - 1);
+			n_buffers += pkts_info[from].nr_buffers;
+			pkts[i] = pkts_info[from].mbuf;
+		}
+	} else {
+		for (i = 0; i < n_pkts_put; i++) {
+			from = (start_idx + i) & (vq_size - 1);
+			n_descs += pkts_info[from].descs;
+			pkts[i] = pkts_info[from].mbuf;
+		}
 	}
+
 	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
 	vq->async_pkts_inflight_n -= n_pkts_put;

 	if (likely(vq->enabled && vq->access_ok)) {
-		uint16_t nr_left = n_descs;
 		uint16_t nr_copy;
 		uint16_t to;

 		/* write back completed descriptors to used ring */
-		do {
-			from = vq->last_async_desc_idx & (vq->size - 1);
-			nr_copy = nr_left + from <= vq->size ? nr_left :
-				vq->size - from;
-			to = vq->last_used_idx & (vq->size - 1);
-
-			if (to + nr_copy <= vq->size) {
-				rte_memcpy(&vq->used->ring[to],
+		if (vq_is_packed(dev)) {
+			uint16_t nr_left = n_buffers;
+			uint16_t to;
+			do {
+				from = vq->last_async_buffer_idx &
+								(vq->size - 1);
+				to = (from + nr_left) & (vq->size - 1);
+
+				if (to > from) {
+					vhost_update_used_packed(dev, vq,
+						vq->async_buffers_packed + from,
+						to - from);
+					vq->last_async_buffer_idx += nr_left;
+					nr_left = 0;
+				} else {
+					vhost_update_used_packed(dev, vq,
+						vq->async_buffers_packed + from,
+						vq->size - from);
+					vq->last_async_buffer_idx +=
+								vq->size - from;
+					nr_left -= vq->size - from;
+				}
+			} while (nr_left > 0);
+			vhost_vring_call_packed(dev, vq);
+		} else {
+			uint16_t nr_left = n_descs;
+			do {
+				from = vq->last_async_desc_idx & (vq->size - 1);
+				nr_copy = nr_left + from <= vq->size ? nr_left :
+					vq->size - from;
+				to = vq->last_used_idx & (vq->size - 1);
+
+				if (to + nr_copy <= vq->size) {
+					rte_memcpy(&vq->used->ring[to],
 						&vq->async_descs_split[from],
 						nr_copy *
 						sizeof(struct vring_used_elem));
-			} else {
-				uint16_t size = vq->size - to;
+				} else {
+					uint16_t size = vq->size - to;

-				rte_memcpy(&vq->used->ring[to],
+					rte_memcpy(&vq->used->ring[to],
 						&vq->async_descs_split[from],
 						size *
 						sizeof(struct vring_used_elem));
-				rte_memcpy(vq->used->ring,
+					rte_memcpy(vq->used->ring,
 						&vq->async_descs_split[from +
 						size], (nr_copy - size) *
 						sizeof(struct vring_used_elem));
-			}
+				}
+
+				vq->last_async_desc_idx += nr_copy;
+				vq->last_used_idx += nr_copy;
+				nr_left -= nr_copy;
+			} while (nr_left > 0);
+
+			__atomic_add_fetch(&vq->used->idx, n_descs,
+					__ATOMIC_RELEASE);
+			vhost_vring_call_split(dev, vq);
+		}

-			vq->last_async_desc_idx += nr_copy;
-			vq->last_used_idx += nr_copy;
-			nr_left -= nr_copy;
-		} while (nr_left > 0);

-		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
-		vhost_vring_call_split(dev, vq);
-	} else
-		vq->last_async_desc_idx += n_descs;
+
+	} else {
+		if (vq_is_packed(dev))
+			vq->last_async_buffer_idx += n_buffers;
+		else
+			vq->last_async_desc_idx += n_descs;
+	}

 done:
 	rte_spinlock_unlock(&vq->access_lock);
@@ -1767,9 +2167,10 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
 	if (count == 0)
 		goto out;

-	/* TODO: packed queue not implemented */
 	if (vq_is_packed(dev))
-		nb_tx = 0;
+		nb_tx = virtio_dev_rx_async_submit_packed(dev,
+				vq, queue_id, pkts, count, comp_pkts,
+				comp_count);
 	else
 		nb_tx = virtio_dev_rx_async_submit_split(dev,
 				vq, queue_id, pkts, count, comp_pkts,
--
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v3] vhost: add support for packed ring in async vhost
  2021-03-31 14:06 ` [dpdk-dev] [PATCH v3] " Cheng Jiang
@ 2021-04-07  6:26   ` Hu, Jiayu
  2021-04-08 12:01     ` Jiang, Cheng1
  0 siblings, 1 reply; 60+ messages in thread
From: Hu, Jiayu @ 2021-04-07  6:26 UTC (permalink / raw)
  To: Jiang, Cheng1, maxime.coquelin, Xia, Chenbo
  Cc: dev, Yang, YvonneX, Wang, Yinan

Hi Cheng,

Some comments are inline.

> -----Original Message-----
> From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Sent: Wednesday, March 31, 2021 10:06 PM
> To: maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Yang, YvonneX
> <yvonnex.yang@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Jiang,
> Cheng1 <cheng1.jiang@intel.com>
> Subject: [PATCH v3] vhost: add support for packed ring in async vhost
> 
> For now async vhost data path only supports split ring structure. In
> order to make async vhost compatible with virtio 1.1 spec this patch
> enables packed ring in async vhost data path.
> 
> Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> ---
> v3:
>   * fix error handler for DMA-copy packet
>   * remove variables that are no longer needed
> v2:
>   * fix wrong buffer index in rte_vhost_poll_enqueue_completed()
>   * add async_buffers_packed memory free in vhost_free_async_mem()
> 
>  lib/librte_vhost/rte_vhost_async.h |   1 +
>  lib/librte_vhost/vhost.c           |  24 +-
>  lib/librte_vhost/vhost.h           |   7 +-
>  lib/librte_vhost/virtio_net.c      | 463 +++++++++++++++++++++++++++--
>  4 files changed, 457 insertions(+), 38 deletions(-)
> 
> diff --git a/lib/librte_vhost/rte_vhost_async.h
> b/lib/librte_vhost/rte_vhost_async.h
> index c855ff875..6faa31f5a 100644
> --- a/lib/librte_vhost/rte_vhost_async.h
> +++ b/lib/librte_vhost/rte_vhost_async.h
> @@ -89,6 +89,7 @@ struct rte_vhost_async_channel_ops {
>  struct async_inflight_info {
>  	struct rte_mbuf *mbuf;
>  	uint16_t descs; /* num of descs inflight */
> +	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
>  };
> 
>  /**
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index 52ab93d1e..51b44d6f2 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -330,15 +330,20 @@ vhost_free_async_mem(struct vhost_virtqueue
> *vq)
>  {
>  	if (vq->async_pkts_info)
>  		rte_free(vq->async_pkts_info);
> -	if (vq->async_descs_split)
> +	if (vq->async_buffers_packed) {
> +		rte_free(vq->async_buffers_packed);
> +		vq->async_buffers_packed = NULL;
> +	} else {
>  		rte_free(vq->async_descs_split);
> +		vq->async_descs_split = NULL;
> +	}
> +
>  	if (vq->it_pool)
>  		rte_free(vq->it_pool);
>  	if (vq->vec_pool)
>  		rte_free(vq->vec_pool);
> 
>  	vq->async_pkts_info = NULL;
> -	vq->async_descs_split = NULL;
>  	vq->it_pool = NULL;
>  	vq->vec_pool = NULL;
>  }
> @@ -1603,9 +1608,9 @@ int rte_vhost_async_channel_register(int vid,
> uint16_t queue_id,
>  		return -1;
> 
>  	/* packed queue is not supported */
> -	if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
> +	if (unlikely(!f.async_inorder)) {
>  		VHOST_LOG_CONFIG(ERR,
> -			"async copy is not supported on packed queue or
> non-inorder mode "
> +			"async copy is not supported on non-inorder mode "
>  			"(vid %d, qid: %d)\n", vid, queue_id);
>  		return -1;
>  	}
> @@ -1643,10 +1648,17 @@ int rte_vhost_async_channel_register(int vid,
> uint16_t queue_id,
>  	vq->vec_pool = rte_malloc_socket(NULL,
>  			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
>  			RTE_CACHE_LINE_SIZE, node);
> -	vq->async_descs_split = rte_malloc_socket(NULL,
> +	if (vq_is_packed(dev)) {
> +		vq->async_buffers_packed = rte_malloc_socket(NULL,
> +			vq->size * sizeof(struct vring_used_elem_packed),
> +			RTE_CACHE_LINE_SIZE, node);
> +	} else {
> +		vq->async_descs_split = rte_malloc_socket(NULL,
>  			vq->size * sizeof(struct vring_used_elem),
>  			RTE_CACHE_LINE_SIZE, node);
> -	if (!vq->async_descs_split || !vq->async_pkts_info ||
> +	}
> +
> +	if (!vq->async_pkts_info ||

Need to check if malloc fails for async_buffers_packed.
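
A minimal sketch of the check being asked for (not the final code): since
async_buffers_packed and async_descs_split share a union, testing either
member catches a failed allocation for both the packed and the split case,
e.g.:

	if (!vq->async_buffers_packed || !vq->async_pkts_info ||
		!vq->it_pool || !vq->vec_pool) {
		vhost_free_async_mem(vq);
		/* existing error log and return -1 follow here */
	}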

>  		!vq->it_pool || !vq->vec_pool) {
>  		vhost_free_async_mem(vq);
>  		VHOST_LOG_CONFIG(ERR,
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 658f6fc28..d6324fbf8 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -206,9 +206,14 @@ struct vhost_virtqueue {
>  	uint16_t	async_pkts_idx;
>  	uint16_t	async_pkts_inflight_n;
>  	uint16_t	async_last_pkts_n;
> -	struct vring_used_elem  *async_descs_split;
> +	union {
> +		struct vring_used_elem  *async_descs_split;
> +		struct vring_used_elem_packed *async_buffers_packed;
> +	};
>  	uint16_t async_desc_idx;
> +	uint16_t async_packed_buffer_idx;
>  	uint16_t last_async_desc_idx;
> +	uint16_t last_async_buffer_idx;
> 
>  	/* vq async features */
>  	bool		async_inorder;
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 583bf379c..fa2dfde02 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -363,8 +363,7 @@
> vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
>  }
> 
>  static __rte_always_inline void
> -vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
> -				   struct vhost_virtqueue *vq,
> +vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
>  				   uint32_t len[],
>  				   uint16_t id[],
>  				   uint16_t count[],
> @@ -382,6 +381,17 @@ vhost_shadow_enqueue_single_packed(struct
> virtio_net *dev,
>  		vq->shadow_aligned_idx += count[i];
>  		vq->shadow_used_idx++;
>  	}
> +}
> +
> +static __rte_always_inline void
> +vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
> +				   struct vhost_virtqueue *vq,
> +				   uint32_t len[],
> +				   uint16_t id[],
> +				   uint16_t count[],
> +				   uint16_t num_buffers)
> +{
> +	vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
> 
>  	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
>  		do_data_copy_enqueue(dev, vq);
> @@ -1452,6 +1462,73 @@ virtio_dev_rx_async_get_info_idx(uint16_t
> pkts_idx,
>  		(vq_size - n_inflight + pkts_idx) & (vq_size - 1);
>  }
> 
> +static __rte_always_inline void
> +vhost_update_used_packed(struct virtio_net *dev,
> +				  struct vhost_virtqueue *vq,
> +				  struct vring_used_elem_packed
> *shadow_ring,
> +				  uint16_t count)
> +{
> +	if (count == 0)
> +		return;
> +	int i;
> +	uint16_t used_idx = vq->last_used_idx;
> +	uint16_t head_idx = vq->last_used_idx;
> +	uint16_t head_flags = 0;
> +
> +	/* Split loop in two to save memory barriers */
> +	for (i = 0; i < count; i++) {
> +		vq->desc_packed[used_idx].id = shadow_ring[i].id;
> +		vq->desc_packed[used_idx].len = shadow_ring[i].len;
> +
> +		used_idx += shadow_ring[i].count;
> +		if (used_idx >= vq->size)
> +			used_idx -= vq->size;
> +	}
> +
> +	/* The ordering for storing desc flags needs to be enforced. */
> +	rte_atomic_thread_fence(__ATOMIC_RELEASE);
> +
> +	for (i = 0; i < count; i++) {
> +		uint16_t flags;
> +
> +		if (vq->shadow_used_packed[i].len)
> +			flags = VRING_DESC_F_WRITE;
> +		else
> +			flags = 0;
> +
> +		if (vq->used_wrap_counter) {
> +			flags |= VRING_DESC_F_USED;
> +			flags |= VRING_DESC_F_AVAIL;
> +		} else {
> +			flags &= ~VRING_DESC_F_USED;
> +			flags &= ~VRING_DESC_F_AVAIL;
> +		}
> +
> +		if (i > 0) {
> +			vq->desc_packed[vq->last_used_idx].flags = flags;
> +
> +			vhost_log_cache_used_vring(dev, vq,
> +					vq->last_used_idx *
> +					sizeof(struct vring_packed_desc),
> +					sizeof(struct vring_packed_desc));
> +		} else {
> +			head_idx = vq->last_used_idx;
> +			head_flags = flags;
> +		}
> +
> +		vq_inc_last_used_packed(vq, shadow_ring[i].count);
> +	}
> +
> +	vq->desc_packed[head_idx].flags = head_flags;
> +
> +	vhost_log_cache_used_vring(dev, vq,
> +				head_idx *
> +				sizeof(struct vring_packed_desc),
> +				sizeof(struct vring_packed_desc));
> +
> +	vhost_log_cache_sync(dev, vq);

Async enqueue for the packed ring does not support live migration,
so the logging code above is not needed.

> +}
> +
>  static __rte_noinline uint32_t
>  virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  	struct vhost_virtqueue *vq, uint16_t queue_id,
> @@ -1633,12 +1710,292 @@ virtio_dev_rx_async_submit_split(struct
> virtio_net *dev,
>  	return pkt_idx;
>  }
> 
> +static __rte_always_inline int
> +vhost_enqueue_async_single_packed(struct virtio_net *dev,
> +			    struct vhost_virtqueue *vq,
> +			    struct rte_mbuf *pkt,
> +			    struct buf_vector *buf_vec,
> +			    uint16_t *nr_descs,
> +			    uint16_t *nr_buffers,
> +			    struct iovec *src_iovec, struct iovec *dst_iovec,
> +			    struct rte_vhost_iov_iter *src_it,
> +			    struct rte_vhost_iov_iter *dst_it)
> +{
> +	uint16_t nr_vec = 0;
> +	uint16_t avail_idx = vq->last_avail_idx;
> +	uint16_t max_tries, tries = 0;
> +	uint16_t buf_id = 0;
> +	uint32_t len = 0;
> +	uint16_t desc_count;
> +	uint32_t size = pkt->pkt_len + sizeof(struct
> virtio_net_hdr_mrg_rxbuf);
> +	uint32_t buffer_len[vq->size];
> +	uint16_t buffer_buf_id[vq->size];
> +	uint16_t buffer_desc_count[vq->size];
> +	*nr_buffers = 0;
> +
> +	if (rxvq_is_mergeable(dev))
> +		max_tries = vq->size - 1;
> +	else
> +		max_tries = 1;
> +
> +	while (size > 0) {
> +		/*
> +		 * if we tried all available ring items, and still
> +		 * can't get enough buf, it means something abnormal
> +		 * happened.
> +		 */
> +		if (unlikely(++tries > max_tries))
> +			return -1;
> +
> +		if (unlikely(fill_vec_buf_packed(dev, vq,
> +						avail_idx, &desc_count,
> +						buf_vec, &nr_vec,
> +						&buf_id, &len,
> +						VHOST_ACCESS_RW) < 0))
> +			return -1;
> +
> +		len = RTE_MIN(len, size);
> +		size -= len;
> +
> +		buffer_len[*nr_buffers] = len;
> +		buffer_buf_id[*nr_buffers] = buf_id;
> +		buffer_desc_count[*nr_buffers] = desc_count;
> +		*nr_buffers += 1;
> +
> +		*nr_descs += desc_count;
> +		avail_idx += desc_count;
> +		if (avail_idx >= vq->size)
> +			avail_idx -= vq->size;
> +	}
> +
> +	if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers,
> +		src_iovec, dst_iovec, src_it, dst_it) < 0)
> +		return -1;
> +
> +	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id,
> +					   buffer_desc_count, *nr_buffers);
> +
> +	return 0;
> +}
> +
> +static __rte_always_inline int16_t
> +virtio_dev_rx_async_single_packed(struct virtio_net *dev,
> +			    struct vhost_virtqueue *vq,
> +			    struct rte_mbuf *pkt,
> +			    uint16_t *nr_descs, uint16_t *nr_buffers,
> +			    struct iovec *src_iovec, struct iovec *dst_iovec,
> +			    struct rte_vhost_iov_iter *src_it,
> +			    struct rte_vhost_iov_iter *dst_it)
> +{
> +	struct buf_vector buf_vec[BUF_VECTOR_MAX];
> +	*nr_descs = 0;
> +	*nr_buffers = 0;
> +
> +	if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt,
> buf_vec,
> +						 nr_descs,
> +						 nr_buffers,
> +						 src_iovec, dst_iovec,
> +						 src_it, dst_it) < 0)) {
> +		VHOST_LOG_DATA(DEBUG,
> +				"(%d) failed to get enough desc from vring\n",
> +				dev->vid);
> +		return -1;
> +	}
> +
> +	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end
> index %d\n",
> +			dev->vid, vq->last_avail_idx,
> +			vq->last_avail_idx + *nr_descs);
> +
> +	return 0;
> +}
> +
> +static __rte_noinline uint32_t
> +virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
> +	struct vhost_virtqueue *vq, uint16_t queue_id,
> +	struct rte_mbuf **pkts, uint32_t count,
> +	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
> +{
> +	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
> +	uint16_t num_buffers;
> +	uint16_t num_desc;
> +
> +	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
> +	struct iovec *vec_pool = vq->vec_pool;
> +	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
> +	struct iovec *src_iovec = vec_pool;
> +	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
> +	struct rte_vhost_iov_iter *src_it = it_pool;
> +	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
> +	uint16_t slot_idx = 0;
> +	uint16_t segs_await = 0;
> +	struct async_inflight_info *pkts_info = vq->async_pkts_info;
> +	uint32_t n_pkts = 0, pkt_err = 0;
> +	uint32_t num_async_pkts = 0, num_done_pkts = 0;
> +
> +	rte_prefetch0(&vq->desc[vq->last_avail_idx & (vq->size - 1)]);
> +
> +	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
> +		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq,
> +						pkts[pkt_idx],
> +						&num_desc, &num_buffers,
> +						src_iovec, dst_iovec,
> +						src_it, dst_it) < 0)) {
> +			break;
> +		}
> +
> +		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end
> index %d\n",
> +			dev->vid, vq->last_avail_idx,
> +			vq->last_avail_idx + num_desc);
> +
> +		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
> +			(vq->size - 1);
> +		if (src_it->count) {
> +			uint16_t from, to;
> +
> +			async_fill_desc(&tdes[pkt_burst_idx++], src_it, dst_it);
> +			pkts_info[slot_idx].descs = num_desc;
> +			pkts_info[slot_idx].nr_buffers = num_buffers;
> +			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
> +			num_async_pkts++;
> +			src_iovec += src_it->nr_segs;
> +			dst_iovec += dst_it->nr_segs;
> +			src_it += 2;
> +			dst_it += 2;
> +			segs_await += src_it->nr_segs;
> +
> +			/**
> +			 * recover shadow used ring and keep DMA-occupied
> +			 * descriptors.
> +			 */
> +			from = vq->shadow_used_idx - num_buffers;
> +			to = vq->async_packed_buffer_idx & (vq->size - 1);
> +			if (num_buffers + to <= vq->size) {
> +				rte_memcpy(&vq->async_buffers_packed[to],
> +					&vq->shadow_used_packed[from],
> +					num_buffers *
> +					sizeof(struct
> vring_used_elem_packed));
> +			} else {
> +				int size = vq->size - to;
> +
> +				rte_memcpy(&vq->async_buffers_packed[to],
> +					&vq->shadow_used_packed[from],
> +					size *
> +					sizeof(struct
> vring_used_elem_packed));
> +				rte_memcpy(vq->async_buffers_packed,
> +					&vq->shadow_used_packed[from +
> +					size], (num_buffers - size) *
> +					sizeof(struct
> vring_used_elem_packed));
> +			}
> +			vq->async_packed_buffer_idx += num_buffers;
> +			vq->shadow_used_idx -= num_buffers;
> +		} else
> +			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
> +
> +		vq_inc_last_avail_packed(vq, num_desc);
> +
> +		/*
> +		 * conditions to trigger async device transfer:
> +		 * - buffered packet number reaches transfer threshold
> +		 * - unused async iov number is less than max vhost vector
> +		 */
> +		if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
> +			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await <
> +			BUF_VECTOR_MAX))) {
> +			n_pkts = vq->async_ops.transfer_data(dev->vid,
> +					queue_id, tdes, 0, pkt_burst_idx);
> +			src_iovec = vec_pool;
> +			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
> +			src_it = it_pool;
> +			dst_it = it_pool + 1;
> +			segs_await = 0;
> +			vq->async_pkts_inflight_n += n_pkts;
> +
> +			if (unlikely(n_pkts < pkt_burst_idx)) {
> +				/*
> +				 * log error packets number here and do actual
> +				 * error processing when applications poll
> +				 * completion
> +				 */
> +				pkt_err = pkt_burst_idx - n_pkts;
> +				pkt_burst_idx = 0;
> +				pkt_idx++;
> +				break;
> +			}
> +
> +			pkt_burst_idx = 0;
> +		}
> +	}
> +
> +	if (pkt_burst_idx) {
> +		n_pkts = vq->async_ops.transfer_data(dev->vid,
> +				queue_id, tdes, 0, pkt_burst_idx);
> +		vq->async_pkts_inflight_n += n_pkts;
> +
> +		if (unlikely(n_pkts < pkt_burst_idx))
> +			pkt_err = pkt_burst_idx - n_pkts;
> +	}
> +
> +	do_data_copy_enqueue(dev, vq);
> +
> +	if (unlikely(pkt_err)) {
> +		uint16_t buffers_err = 0;
> +		uint16_t async_buffer_idx;
> +		uint16_t i;
> +
> +		num_async_pkts -= pkt_err;
> +		pkt_idx -= pkt_err;
> +		/* calculate the sum of buffers of DMA-error packets. */
> +		while (pkt_err-- > 0) {
> +			buffers_err +=
> +				pkts_info[slot_idx & (vq->size - 1)].nr_buffers;
> +			slot_idx--;
> +		}
> +
> +		vq->async_packed_buffer_idx -= buffers_err;
> +		async_buffer_idx = vq->async_packed_buffer_idx;
> +		/* set 0 to the length of descriptors of DMA-error packets */
> +		for (i = 0; i < buffers_err; i++) {
> +			vq->async_buffers_packed[(async_buffer_idx + i)
> +						& (vq->size - 1)].len = 0;
> +		}
> +		/* write back DMA-error descriptors to used ring */
> +		do {
> +			uint16_t from = async_buffer_idx & (vq->size - 1);
> +			uint16_t to = (from + buffers_err) & (vq->size - 1);
> +
> +			if (to > from) {
> +				vhost_update_used_packed(dev, vq,
> +					vq->async_buffers_packed + from,
> +					to - from);
> +				buffers_err = 0;
> +			} else {
> +				vhost_update_used_packed(dev, vq,
> +					vq->async_buffers_packed + from,
> +					vq->size - from);
> +				buffers_err -= vq->size - from;
> +			}
> +		} while (buffers_err > 0);
> +		vhost_vring_call_packed(dev, vq);

Why notify front-end here?

> +		num_done_pkts = pkt_idx - num_async_pkts;
> +	}
> +
> +	vq->async_pkts_idx += num_async_pkts;
> +	*comp_count = num_done_pkts;
> +
> +	if (likely(vq->shadow_used_idx)) {
> +		vhost_flush_enqueue_shadow_packed(dev, vq);
> +		vhost_vring_call_packed(dev, vq);
> +	}
> +
> +	return pkt_idx;
> +}

virtio_dev_rx_async_submit_packed is too long, and several of its parts
are similar to the split ring code. I think you need to abstract the
common parts into inline functions to make the code easier to read.
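
For example, something along these lines could be a shared helper for the
shadow ring copy (rough sketch only; the name and exact signature are just
a suggestion, not something already in the patch):

/*
 * Rough sketch: copy 'count' ring elements from the shadow ring into the
 * async (DMA) ring, handling wrap-around of the destination at 'ring_size'.
 * The source region is assumed to be contiguous, as in the current code.
 */
static __rte_always_inline void
store_shadow_ring_elems(void *s_ring, void *d_ring, uint16_t ring_size,
		uint16_t s_idx, uint16_t d_idx,
		uint16_t count, uint16_t elem_size)
{
	void *src = (void *)((uintptr_t)s_ring + s_idx * elem_size);
	void *dst = (void *)((uintptr_t)d_ring + d_idx * elem_size);

	if (d_idx + count <= ring_size) {
		rte_memcpy(dst, src, count * elem_size);
	} else {
		uint16_t size = ring_size - d_idx;

		rte_memcpy(dst, src, size * elem_size);
		rte_memcpy(d_ring,
			(void *)((uintptr_t)src + size * elem_size),
			(count - size) * elem_size);
	}
}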

> +
>  uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
>  		struct rte_mbuf **pkts, uint16_t count)
>  {
>  	struct virtio_net *dev = get_device(vid);
>  	struct vhost_virtqueue *vq;
> -	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
> +	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
>  	uint16_t start_idx, pkts_idx, vq_size;
>  	struct async_inflight_info *pkts_info;
>  	uint16_t from, i;
> @@ -1680,53 +2037,96 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
>  		goto done;
>  	}
> 
> -	for (i = 0; i < n_pkts_put; i++) {
> -		from = (start_idx + i) & (vq_size - 1);
> -		n_descs += pkts_info[from].descs;
> -		pkts[i] = pkts_info[from].mbuf;
> +	if (vq_is_packed(dev)) {
> +		for (i = 0; i < n_pkts_put; i++) {
> +			from = (start_idx + i) & (vq_size - 1);
> +			n_buffers += pkts_info[from].nr_buffers;
> +			pkts[i] = pkts_info[from].mbuf;
> +		}
> +	} else {
> +		for (i = 0; i < n_pkts_put; i++) {
> +			from = (start_idx + i) & (vq_size - 1);
> +			n_descs += pkts_info[from].descs;
> +			pkts[i] = pkts_info[from].mbuf;
> +		}
>  	}
> +
>  	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
>  	vq->async_pkts_inflight_n -= n_pkts_put;
> 
>  	if (likely(vq->enabled && vq->access_ok)) {
> -		uint16_t nr_left = n_descs;
>  		uint16_t nr_copy;
>  		uint16_t to;
> 
>  		/* write back completed descriptors to used ring */
> -		do {
> -			from = vq->last_async_desc_idx & (vq->size - 1);
> -			nr_copy = nr_left + from <= vq->size ? nr_left :
> -				vq->size - from;
> -			to = vq->last_used_idx & (vq->size - 1);
> -
> -			if (to + nr_copy <= vq->size) {
> -				rte_memcpy(&vq->used->ring[to],
> +		if (vq_is_packed(dev)) {
> +			uint16_t nr_left = n_buffers;
> +			uint16_t to;
> +			do {
> +				from = vq->last_async_buffer_idx &
> +								(vq->size - 1);
> +				to = (from + nr_left) & (vq->size - 1);
> +
> +				if (to > from) {
> +					vhost_update_used_packed(dev, vq,
> +						vq->async_buffers_packed + from,
> +						to - from);
> +					vq->last_async_buffer_idx += nr_left;
> +					nr_left = 0;
> +				} else {
> +					vhost_update_used_packed(dev, vq,
> +						vq->async_buffers_packed + from,
> +						vq->size - from);
> +					vq->last_async_buffer_idx +=
> +								vq->size - from;
> +					nr_left -= vq->size - from;
> +				}
> +			} while (nr_left > 0);
> +			vhost_vring_call_packed(dev, vq);
> +		} else {
> +			uint16_t nr_left = n_descs;
> +			do {
> +				from = vq->last_async_desc_idx & (vq->size - 1);
> +				nr_copy = nr_left + from <= vq->size ? nr_left :
> +					vq->size - from;
> +				to = vq->last_used_idx & (vq->size - 1);
> +
> +				if (to + nr_copy <= vq->size) {
> +					rte_memcpy(&vq->used->ring[to],
>  						&vq->async_descs_split[from],
>  						nr_copy *
>  						sizeof(struct vring_used_elem));
> -			} else {
> -				uint16_t size = vq->size - to;
> +				} else {
> +					uint16_t size = vq->size - to;
> 
> -				rte_memcpy(&vq->used->ring[to],
> +					rte_memcpy(&vq->used->ring[to],
>  						&vq->async_descs_split[from],
>  						size *
>  						sizeof(struct vring_used_elem));
> -				rte_memcpy(vq->used->ring,
> +					rte_memcpy(vq->used->ring,
>  						&vq->async_descs_split[from +
>  						size], (nr_copy - size) *
>  						sizeof(struct vring_used_elem));
> -			}
> +				}
> +
> +				vq->last_async_desc_idx += nr_copy;
> +				vq->last_used_idx += nr_copy;
> +				nr_left -= nr_copy;
> +			} while (nr_left > 0);
> +
> +			__atomic_add_fetch(&vq->used->idx, n_descs,
> +					__ATOMIC_RELEASE);
> +			vhost_vring_call_split(dev, vq);
> +		}
> 
> -			vq->last_async_desc_idx += nr_copy;
> -			vq->last_used_idx += nr_copy;
> -			nr_left -= nr_copy;
> -		} while (nr_left > 0);
> 
> -		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
> -		vhost_vring_call_split(dev, vq);
> -	} else
> -		vq->last_async_desc_idx += n_descs;
> +
> +	} else {
> +		if (vq_is_packed(dev))
> +			vq->last_async_buffer_idx += n_buffers;
> +		else
> +			vq->last_async_desc_idx += n_descs;
> +	}

rte_vhost_poll_enqueue_completed is too long and not easy to read. Same
suggestion as above.

Thanks,
Jiayu

> 
>  done:
>  	rte_spinlock_unlock(&vq->access_lock);
> @@ -1767,9 +2167,10 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
>  	if (count == 0)
>  		goto out;
> 
> -	/* TODO: packed queue not implemented */
>  	if (vq_is_packed(dev))
> -		nb_tx = 0;
> +		nb_tx = virtio_dev_rx_async_submit_packed(dev,
> +				vq, queue_id, pkts, count, comp_pkts,
> +				comp_count);
>  	else
>  		nb_tx = virtio_dev_rx_async_submit_split(dev,
>  				vq, queue_id, pkts, count, comp_pkts,
> --
> 2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v3] vhost: add support for packed ring in async vhost
  2021-04-07  6:26   ` Hu, Jiayu
@ 2021-04-08 12:01     ` Jiang, Cheng1
  0 siblings, 0 replies; 60+ messages in thread
From: Jiang, Cheng1 @ 2021-04-08 12:01 UTC (permalink / raw)
  To: Hu, Jiayu, maxime.coquelin, Xia, Chenbo; +Cc: dev, Yang, YvonneX, Wang, Yinan

Hi Jiayu,

> -----Original Message-----
> From: Hu, Jiayu <jiayu.hu@intel.com>
> Sent: Wednesday, April 7, 2021 2:27 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>; maxime.coquelin@redhat.com;
> Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Yang, YvonneX <yvonnex.yang@intel.com>; Wang, Yinan
> <yinan.wang@intel.com>
> Subject: RE: [PATCH v3] vhost: add support for packed ring in async vhost
> 
> Hi Cheng,
> 
> Some comments are inline.
> 
> > -----Original Message-----
> > From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > Sent: Wednesday, March 31, 2021 10:06 PM
> > To: maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Yang, YvonneX
> > <yvonnex.yang@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Jiang,
> > Cheng1 <cheng1.jiang@intel.com>
> > Subject: [PATCH v3] vhost: add support for packed ring in async vhost
> >
> > For now async vhost data path only supports split ring structure. In
> > order to make async vhost compatible with virtio 1.1 spec this patch
> > enables packed ring in async vhost data path.
> >
> > Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> > ---
> > v3:
> >   * fix error handler for DMA-copy packet
> >   * remove variables that are no longer needed
> > v2:
> >   * fix wrong buffer index in rte_vhost_poll_enqueue_completed()
> >   * add async_buffers_packed memory free in vhost_free_async_mem()
> >
> >  lib/librte_vhost/rte_vhost_async.h |   1 +
> >  lib/librte_vhost/vhost.c           |  24 +-
> >  lib/librte_vhost/vhost.h           |   7 +-
> >  lib/librte_vhost/virtio_net.c      | 463 +++++++++++++++++++++++++++--
> >  4 files changed, 457 insertions(+), 38 deletions(-)
> >
> > diff --git a/lib/librte_vhost/rte_vhost_async.h
> > b/lib/librte_vhost/rte_vhost_async.h
> > index c855ff875..6faa31f5a 100644
> > --- a/lib/librte_vhost/rte_vhost_async.h
> > +++ b/lib/librte_vhost/rte_vhost_async.h
> > @@ -89,6 +89,7 @@ struct rte_vhost_async_channel_ops {  struct
> > async_inflight_info {  struct rte_mbuf *mbuf;  uint16_t descs; /* num
> > of descs inflight */
> > +uint16_t nr_buffers; /* num of buffers inflight for packed ring */
> >  };
> >
> >  /**
> > diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c index
> > 52ab93d1e..51b44d6f2 100644
> > --- a/lib/librte_vhost/vhost.c
> > +++ b/lib/librte_vhost/vhost.c
> > @@ -330,15 +330,20 @@ vhost_free_async_mem(struct vhost_virtqueue
> > *vq)
> >  {
> >  if (vq->async_pkts_info)
> >  rte_free(vq->async_pkts_info);
> > -if (vq->async_descs_split)
> > +if (vq->async_buffers_packed) {
> > +rte_free(vq->async_buffers_packed);
> > +vq->async_buffers_packed = NULL;
> > +} else {
> >  rte_free(vq->async_descs_split);
> > +vq->async_descs_split = NULL;
> > +}
> > +
> >  if (vq->it_pool)
> >  rte_free(vq->it_pool);
> >  if (vq->vec_pool)
> >  rte_free(vq->vec_pool);
> >
> >  vq->async_pkts_info = NULL;
> > -vq->async_descs_split = NULL;
> >  vq->it_pool = NULL;
> >  vq->vec_pool = NULL;
> >  }
> > @@ -1603,9 +1608,9 @@ int rte_vhost_async_channel_register(int vid,
> > uint16_t queue_id,  return -1;
> >
> >  /* packed queue is not supported */
> > -if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
> > +if (unlikely(!f.async_inorder)) {
> >  VHOST_LOG_CONFIG(ERR,
> > -"async copy is not supported on packed queue or non-inorder mode "
> > +"async copy is not supported on non-inorder mode "
> >  "(vid %d, qid: %d)\n", vid, queue_id);  return -1;  } @@ -1643,10
> > +1648,17 @@ int rte_vhost_async_channel_register(int vid, uint16_t
> > queue_id,  vq->vec_pool = rte_malloc_socket(NULL,
> VHOST_MAX_ASYNC_VEC
> > * sizeof(struct iovec),  RTE_CACHE_LINE_SIZE, node);
> > -vq->async_descs_split = rte_malloc_socket(NULL,
> > +if (vq_is_packed(dev)) {
> > +vq->async_buffers_packed = rte_malloc_socket(NULL,
> > +vq->size * sizeof(struct vring_used_elem_packed),
> > +RTE_CACHE_LINE_SIZE, node);
> > +} else {
> > +vq->async_descs_split = rte_malloc_socket(NULL,
> >  vq->size * sizeof(struct vring_used_elem),  RTE_CACHE_LINE_SIZE,
> > node); -if (!vq->async_descs_split || !vq->async_pkts_info ||
> > +}
> > +
> > +if (!vq->async_pkts_info ||
> 
> Need to check if malloc fails for async_buffers_packed.

Sure, it will be fixed in the next version.
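
Something along these lines is what I have in mind (sketch only, the exact
form may still change in the next version):

	/* Fail registration if the ring-type specific array was not
	 * allocated, in addition to the existing checks. */
	if (!vq->async_pkts_info || !vq->it_pool || !vq->vec_pool ||
		(vq_is_packed(dev) ? !vq->async_buffers_packed :
					!vq->async_descs_split)) {
		vhost_free_async_mem(vq);
		VHOST_LOG_CONFIG(ERR,
			"async register failed: cannot allocate memory for vq data "
			"(vid %d, qid: %d)\n", vid, queue_id);
		return -1;
	}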

> 
> >  !vq->it_pool || !vq->vec_pool) {
> >  vhost_free_async_mem(vq);
> >  VHOST_LOG_CONFIG(ERR,
> > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index
> > 658f6fc28..d6324fbf8 100644
> > --- a/lib/librte_vhost/vhost.h
> > +++ b/lib/librte_vhost/vhost.h
> > @@ -206,9 +206,14 @@ struct vhost_virtqueue {  uint16_tasync_pkts_idx;
> > uint16_tasync_pkts_inflight_n;  uint16_tasync_last_pkts_n; -struct
> > vring_used_elem  *async_descs_split;
> > +union {
> > +struct vring_used_elem  *async_descs_split; struct
> > +vring_used_elem_packed *async_buffers_packed; };
> >  uint16_t async_desc_idx;
> > +uint16_t async_packed_buffer_idx;
> >  uint16_t last_async_desc_idx;
> > +uint16_t last_async_buffer_idx;
> >
> >  /* vq async features */
> >  boolasync_inorder;
> > diff --git a/lib/librte_vhost/virtio_net.c
> > b/lib/librte_vhost/virtio_net.c index 583bf379c..fa2dfde02 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -363,8 +363,7 @@
> > vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue
> *vq,
> > }
> >
> >  static __rte_always_inline void
> > -vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
> > -   struct vhost_virtqueue *vq,
> > +vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
> >     uint32_t len[],
> >     uint16_t id[],
> >     uint16_t count[],
> > @@ -382,6 +381,17 @@ vhost_shadow_enqueue_single_packed(struct
> > virtio_net *dev,
> >  vq->shadow_aligned_idx += count[i];
> >  vq->shadow_used_idx++;
> >  }
> > +}
> > +
> > +static __rte_always_inline void
> > +vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
> > +   struct vhost_virtqueue *vq,
> > +   uint32_t len[],
> > +   uint16_t id[],
> > +   uint16_t count[],
> > +   uint16_t num_buffers)
> > +{
> > +vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
> >
> >  if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
> > do_data_copy_enqueue(dev, vq); @@ -1452,6 +1462,73 @@
> > virtio_dev_rx_async_get_info_idx(uint16_t
> > pkts_idx,
> >  (vq_size - n_inflight + pkts_idx) & (vq_size - 1);  }
> >
> > +static __rte_always_inline void
> > +vhost_update_used_packed(struct virtio_net *dev,
> > +  struct vhost_virtqueue *vq,
> > +  struct vring_used_elem_packed
> > *shadow_ring,
> > +  uint16_t count)
> > +{
> > +if (count == 0)
> > +return;
> > +int i;
> > +uint16_t used_idx = vq->last_used_idx; uint16_t head_idx =
> > +vq->last_used_idx; uint16_t head_flags = 0;
> > +
> > +/* Split loop in two to save memory barriers */ for (i = 0; i <
> > +count; i++) {
> > +vq->desc_packed[used_idx].id = shadow_ring[i].id;
> > +vq->desc_packed[used_idx].len = shadow_ring[i].len;
> > +
> > +used_idx += shadow_ring[i].count;
> > +if (used_idx >= vq->size)
> > +used_idx -= vq->size;
> > +}
> > +
> > +/* The ordering for storing desc flags needs to be enforced. */
> > +rte_atomic_thread_fence(__ATOMIC_RELEASE);
> > +
> > +for (i = 0; i < count; i++) {
> > +uint16_t flags;
> > +
> > +if (vq->shadow_used_packed[i].len)
> > +flags = VRING_DESC_F_WRITE;
> > +else
> > +flags = 0;
> > +
> > +if (vq->used_wrap_counter) {
> > +flags |= VRING_DESC_F_USED;
> > +flags |= VRING_DESC_F_AVAIL;
> > +} else {
> > +flags &= ~VRING_DESC_F_USED;
> > +flags &= ~VRING_DESC_F_AVAIL;
> > +}
> > +
> > +if (i > 0) {
> > +vq->desc_packed[vq->last_used_idx].flags = flags;
> > +
> > +vhost_log_cache_used_vring(dev, vq,
> > +vq->last_used_idx *
> > +sizeof(struct vring_packed_desc),
> > +sizeof(struct vring_packed_desc));
> > +} else {
> > +head_idx = vq->last_used_idx;
> > +head_flags = flags;
> > +}
> > +
> > +vq_inc_last_used_packed(vq, shadow_ring[i].count); }
> > +
> > +vq->desc_packed[head_idx].flags = head_flags;
> > +
> > +vhost_log_cache_used_vring(dev, vq,
> > +head_idx *
> > +sizeof(struct vring_packed_desc),
> > +sizeof(struct vring_packed_desc));
> > +
> > +vhost_log_cache_sync(dev, vq);
> 
> Async enqueue for packed ring does not support live migration, so the
> above code is not needed.

It will be removed.

> 
> > +}
> > +
> >  static __rte_noinline uint32_t
> >  virtio_dev_rx_async_submit_split(struct virtio_net *dev,  struct
> > vhost_virtqueue *vq, uint16_t queue_id, @@ -1633,12 +1710,292 @@
> > virtio_dev_rx_async_submit_split(struct
> > virtio_net *dev,
> >  return pkt_idx;
> >  }
> >
> > +static __rte_always_inline int
> > +vhost_enqueue_async_single_packed(struct virtio_net *dev,
> > +    struct vhost_virtqueue *vq,
> > +    struct rte_mbuf *pkt,
> > +    struct buf_vector *buf_vec,
> > +    uint16_t *nr_descs,
> > +    uint16_t *nr_buffers,
> > +    struct iovec *src_iovec, struct iovec *dst_iovec,
> > +    struct rte_vhost_iov_iter *src_it,
> > +    struct rte_vhost_iov_iter *dst_it) { uint16_t nr_vec = 0;
> > +uint16_t avail_idx = vq->last_avail_idx; uint16_t max_tries, tries =
> > +0; uint16_t buf_id = 0; uint32_t len = 0; uint16_t desc_count;
> > +uint32_t size = pkt->pkt_len + sizeof(struct
> > virtio_net_hdr_mrg_rxbuf);
> > +uint32_t buffer_len[vq->size];
> > +uint16_t buffer_buf_id[vq->size];
> > +uint16_t buffer_desc_count[vq->size]; *nr_buffers = 0;
> > +
> > +if (rxvq_is_mergeable(dev))
> > +max_tries = vq->size - 1;
> > +else
> > +max_tries = 1;
> > +
> > +while (size > 0) {
> > +/*
> > + * if we tried all available ring items, and still
> > + * can't get enough buf, it means something abnormal
> > + * happened.
> > + */
> > +if (unlikely(++tries > max_tries))
> > +return -1;
> > +
> > +if (unlikely(fill_vec_buf_packed(dev, vq, avail_idx, &desc_count,
> > +buf_vec, &nr_vec, &buf_id, &len,
> > +VHOST_ACCESS_RW) < 0))
> > +return -1;
> > +
> > +len = RTE_MIN(len, size);
> > +size -= len;
> > +
> > +buffer_len[*nr_buffers] = len;
> > +buffer_buf_id[*nr_buffers] = buf_id;
> > +buffer_desc_count[*nr_buffers] = desc_count; *nr_buffers += 1;
> > +
> > +*nr_descs += desc_count;
> > +avail_idx += desc_count;
> > +if (avail_idx >= vq->size)
> > +avail_idx -= vq->size;
> > +}
> > +
> > +if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers,
> > +src_iovec, dst_iovec, src_it, dst_it) < 0) return -1;
> > +
> > +vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id,
> > +   buffer_desc_count, *nr_buffers);
> > +
> > +return 0;
> > +}
> > +
> > +static __rte_always_inline int16_t
> > +virtio_dev_rx_async_single_packed(struct virtio_net *dev,
> > +    struct vhost_virtqueue *vq,
> > +    struct rte_mbuf *pkt,
> > +    uint16_t *nr_descs, uint16_t *nr_buffers,
> > +    struct iovec *src_iovec, struct iovec *dst_iovec,
> > +    struct rte_vhost_iov_iter *src_it,
> > +    struct rte_vhost_iov_iter *dst_it) { struct buf_vector
> > +buf_vec[BUF_VECTOR_MAX]; *nr_descs = 0; *nr_buffers = 0;
> > +
> > +if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt,
> > buf_vec,
> > + nr_descs,
> > + nr_buffers,
> > + src_iovec, dst_iovec,
> > + src_it, dst_it) < 0)) {
> > +VHOST_LOG_DATA(DEBUG,
> > +"(%d) failed to get enough desc from vring\n",
> > +dev->vid);
> > +return -1;
> > +}
> > +
> > +VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end
> > index %d\n",
> > +dev->vid, vq->last_avail_idx,
> > +vq->last_avail_idx + *nr_descs);
> > +
> > +return 0;
> > +}
> > +
> > +static __rte_noinline uint32_t
> > +virtio_dev_rx_async_submit_packed(struct virtio_net *dev, struct
> > +vhost_virtqueue *vq, uint16_t queue_id, struct rte_mbuf **pkts,
> > +uint32_t count, struct rte_mbuf **comp_pkts, uint32_t *comp_count) {
> > +uint32_t pkt_idx = 0, pkt_burst_idx = 0; uint16_t num_buffers;
> > +uint16_t num_desc;
> > +
> > +struct rte_vhost_iov_iter *it_pool = vq->it_pool; struct iovec
> > +*vec_pool = vq->vec_pool; struct rte_vhost_async_desc
> > +tdes[MAX_PKT_BURST]; struct iovec *src_iovec = vec_pool; struct iovec
> > +*dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1); struct
> > +rte_vhost_iov_iter *src_it = it_pool; struct rte_vhost_iov_iter
> > +*dst_it = it_pool + 1; uint16_t slot_idx = 0; uint16_t segs_await =
> > +0; struct async_inflight_info *pkts_info = vq->async_pkts_info;
> > +uint32_t n_pkts = 0, pkt_err = 0; uint32_t num_async_pkts = 0,
> > +num_done_pkts = 0;
> > +
> > +rte_prefetch0(&vq->desc[vq->last_avail_idx & (vq->size - 1)]);
> > +
> > +for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { if
> > +(unlikely(virtio_dev_rx_async_single_packed(dev, vq, pkts[pkt_idx],
> > +&num_desc, &num_buffers, src_iovec, dst_iovec, src_it, dst_it) < 0))
> > +{ break; }
> > +
> > +VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end
> > index %d\n",
> > +dev->vid, vq->last_avail_idx,
> > +vq->last_avail_idx + num_desc);
> > +
> > +slot_idx = (vq->async_pkts_idx + num_async_pkts) & (vq->size - 1); if
> > +(src_it->count) { uint16_t from, to;
> > +
> > +async_fill_desc(&tdes[pkt_burst_idx++], src_it, dst_it);
> > +pkts_info[slot_idx].descs = num_desc; pkts_info[slot_idx].nr_buffers
> > += num_buffers; pkts_info[slot_idx].mbuf = pkts[pkt_idx];
> > +num_async_pkts++; src_iovec += src_it->nr_segs; dst_iovec +=
> > +dst_it->nr_segs; src_it += 2; dst_it += 2; segs_await +=
> > +src_it->nr_segs;
> > +
> > +/**
> > + * recover shadow used ring and keep DMA-occupied
> > + * descriptors.
> > + */
> > +from = vq->shadow_used_idx - num_buffers;
> > +to = vq->async_packed_buffer_idx & (vq->size - 1);
> > +if (num_buffers + to <= vq->size) {
> > +rte_memcpy(&vq->async_buffers_packed[to],
> > +&vq->shadow_used_packed[from],
> > +num_buffers *
> > +sizeof(struct
> > vring_used_elem_packed));
> > +} else {
> > +int size = vq->size - to;
> > +
> > +rte_memcpy(&vq->async_buffers_packed[to],
> > +&vq->shadow_used_packed[from],
> > +size *
> > +sizeof(struct
> > vring_used_elem_packed));
> > +rte_memcpy(vq->async_buffers_packed,
> > +&vq->shadow_used_packed[from +
> > +size], (num_buffers - size) *
> > +sizeof(struct
> > vring_used_elem_packed));
> > +}
> > +vq->async_packed_buffer_idx += num_buffers;
> > +vq->shadow_used_idx -= num_buffers;
> > +} else
> > +comp_pkts[num_done_pkts++] = pkts[pkt_idx];
> > +
> > +vq_inc_last_avail_packed(vq, num_desc);
> > +
> > +/*
> > + * conditions to trigger async device transfer:
> > + * - buffered packet number reaches transfer threshold
> > + * - unused async iov number is less than max vhost vector
> > + */
> > +if (unlikely(pkt_burst_idx >=
> > VHOST_ASYNC_BATCH_THRESHOLD ||
> > +((VHOST_MAX_ASYNC_VEC >> 1) - segs_await <
> > +BUF_VECTOR_MAX))) {
> > +n_pkts = vq->async_ops.transfer_data(dev->vid,
> > +queue_id, tdes, 0, pkt_burst_idx);
> > +src_iovec = vec_pool;
> > +dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >>
> > 1);
> > +src_it = it_pool;
> > +dst_it = it_pool + 1;
> > +segs_await = 0;
> > +vq->async_pkts_inflight_n += n_pkts;
> > +
> > +if (unlikely(n_pkts < pkt_burst_idx)) {
> > +/*
> > + * log error packets number here and do
> > actual
> > + * error processing when applications poll
> > + * completion
> > + */
> > +pkt_err = pkt_burst_idx - n_pkts;
> > +pkt_burst_idx = 0;
> > +pkt_idx++;
> > +break;
> > +}
> > +
> > +pkt_burst_idx = 0;
> > +}
> > +}
> > +
> > +if (pkt_burst_idx) {
> > +n_pkts = vq->async_ops.transfer_data(dev->vid,
> > +queue_id, tdes, 0, pkt_burst_idx);
> > +vq->async_pkts_inflight_n += n_pkts;
> > +
> > +if (unlikely(n_pkts < pkt_burst_idx))
> > +pkt_err = pkt_burst_idx - n_pkts;
> > +}
> > +
> > +do_data_copy_enqueue(dev, vq);
> > +
> > +if (unlikely(pkt_err)) {
> > +uint16_t buffers_err = 0;
> > +uint16_t async_buffer_idx;
> > +uint16_t i;
> > +
> > +num_async_pkts -= pkt_err;
> > +pkt_idx -= pkt_err;
> > +/* calculate the sum of buffers of DMA-error packets. */
> > +while (pkt_err-- > 0) {
> > +buffers_err +=
> > +pkts_info[slot_idx & (vq->size - 1)].nr_buffers;
> > +slot_idx--;
> > +}
> > +
> > +vq->async_packed_buffer_idx -= buffers_err;
> > +async_buffer_idx = vq->async_packed_buffer_idx;
> > +/* set 0 to the length of descriptors of DMA-error packets */
> > +for (i = 0; i < buffers_err; i++) {
> > +vq->async_buffers_packed[(async_buffer_idx + i)
> > +& (vq->size - 1)].len = 0;
> > +}
> > +/* write back DMA-error descriptors to used ring */
> > +do {
> > +uint16_t from = async_buffer_idx & (vq->size - 1);
> > +uint16_t to = (from + buffers_err) & (vq->size - 1);
> > +
> > +if (to > from) {
> > +vhost_update_used_packed(dev, vq,
> > +vq->async_buffers_packed + from,
> > +to - from);
> > +buffers_err = 0;
> > +} else {
> > +vhost_update_used_packed(dev, vq,
> > +vq->async_buffers_packed + from,
> > +vq->size - from);
> > +buffers_err -= vq->size - from;
> > +}
> > +} while (buffers_err > 0);
> > +vhost_vring_call_packed(dev, vq);
> 
> Why notify front-end here?

The error handling method will be changed in the next version, so this notification will be removed.

> 
> > +num_done_pkts = pkt_idx - num_async_pkts;
> > +}
> > +
> > +vq->async_pkts_idx += num_async_pkts;
> > +*comp_count = num_done_pkts;
> > +
> > +if (likely(vq->shadow_used_idx)) {
> > +vhost_flush_enqueue_shadow_packed(dev, vq);
> > +vhost_vring_call_packed(dev, vq);
> > +}
> > +
> > +return pkt_idx;
> > +}
> 
> virtio_dev_rx_async_submit_packed is too long, and several of its parts
> are similar to the split ring code. I think you need to abstract the
> common parts into inline functions to make the code easier to read.

I'm not sure which parts can easily be factored into separate functions.
Maybe we can have a discussion offline.

> 
> > +
> >  uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
> >  struct rte_mbuf **pkts, uint16_t count)
> >  {
> >  struct virtio_net *dev = get_device(vid);
> >  struct vhost_virtqueue *vq;
> > -uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
> > +uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
> >  uint16_t start_idx, pkts_idx, vq_size;
> >  struct async_inflight_info *pkts_info;
> >  uint16_t from, i;
> > @@ -1680,53 +2037,96 @@ uint16_t
> > rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
> >  goto done;
> >  }
> >
> > -for (i = 0; i < n_pkts_put; i++) {
> > -from = (start_idx + i) & (vq_size - 1);
> > -n_descs += pkts_info[from].descs;
> > -pkts[i] = pkts_info[from].mbuf;
> > +if (vq_is_packed(dev)) {
> > +for (i = 0; i < n_pkts_put; i++) {
> > +from = (start_idx + i) & (vq_size - 1);
> > +n_buffers += pkts_info[from].nr_buffers;
> > +pkts[i] = pkts_info[from].mbuf;
> > +}
> > +} else {
> > +for (i = 0; i < n_pkts_put; i++) {
> > +from = (start_idx + i) & (vq_size - 1);
> > +n_descs += pkts_info[from].descs;
> > +pkts[i] = pkts_info[from].mbuf;
> > +}
> >  }
> > +
> >  vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
> >  vq->async_pkts_inflight_n -= n_pkts_put;
> >
> >  if (likely(vq->enabled && vq->access_ok)) {
> > -uint16_t nr_left = n_descs;
> >  uint16_t nr_copy;
> >  uint16_t to;
> >
> >  /* write back completed descriptors to used ring */
> > -do {
> > -from = vq->last_async_desc_idx & (vq->size - 1);
> > -nr_copy = nr_left + from <= vq->size ? nr_left :
> > -vq->size - from;
> > -to = vq->last_used_idx & (vq->size - 1);
> > -
> > -if (to + nr_copy <= vq->size) {
> > -rte_memcpy(&vq->used->ring[to],
> > +if (vq_is_packed(dev)) {
> > +uint16_t nr_left = n_buffers;
> > +uint16_t to;
> > +do {
> > +from = vq->last_async_buffer_idx &
> > +(vq->size - 1);
> > +to = (from + nr_left) & (vq->size - 1);
> > +
> > +if (to > from) {
> > +vhost_update_used_packed(dev, vq,
> > +vq->async_buffers_packed +
> > from,
> > +to - from);
> > +vq->last_async_buffer_idx += nr_left;
> > +nr_left = 0;
> > +} else {
> > +vhost_update_used_packed(dev, vq,
> > +vq->async_buffers_packed +
> > from,
> > +vq->size - from);
> > +vq->last_async_buffer_idx +=
> > +vq->size -
> > from;
> > +nr_left -= vq->size - from;
> > +}
> > +} while (nr_left > 0);
> > +vhost_vring_call_packed(dev, vq);
> > +} else {
> > +uint16_t nr_left = n_descs;
> > +do {
> > +from = vq->last_async_desc_idx & (vq->size -
> > 1);
> > +nr_copy = nr_left + from <= vq->size ? nr_left :
> > +vq->size - from;
> > +to = vq->last_used_idx & (vq->size - 1);
> > +
> > +if (to + nr_copy <= vq->size) {
> > +rte_memcpy(&vq->used->ring[to],
> >  &vq-
> > >async_descs_split[from],
> >  nr_copy *
> >  sizeof(struct
> > vring_used_elem));
> > -} else {
> > -uint16_t size = vq->size - to;
> > +} else {
> > +uint16_t size = vq->size - to;
> >
> > -rte_memcpy(&vq->used->ring[to],
> > +rte_memcpy(&vq->used->ring[to],
> >  &vq-
> > >async_descs_split[from],
> >  size *
> >  sizeof(struct
> > vring_used_elem));
> > -rte_memcpy(vq->used->ring,
> > +rte_memcpy(vq->used->ring,
> >  &vq->async_descs_split[from
> > +
> >  size], (nr_copy - size) *
> >  sizeof(struct
> > vring_used_elem));
> > -}
> > +}
> > +
> > +vq->last_async_desc_idx += nr_copy;
> > +vq->last_used_idx += nr_copy;
> > +nr_left -= nr_copy;
> > +} while (nr_left > 0);
> > +
> > +__atomic_add_fetch(&vq->used->idx, n_descs,
> > +__ATOMIC_RELEASE);
> > +vhost_vring_call_split(dev, vq);
> > +}
> >
> > -vq->last_async_desc_idx += nr_copy;
> > -vq->last_used_idx += nr_copy;
> > -nr_left -= nr_copy;
> > -} while (nr_left > 0);
> >
> > -__atomic_add_fetch(&vq->used->idx, n_descs,
> > __ATOMIC_RELEASE);
> > -vhost_vring_call_split(dev, vq);
> > -} else
> > -vq->last_async_desc_idx += n_descs;
> > +
> > +} else {
> > +if (vq_is_packed(dev))
> > +vq->last_async_buffer_idx += n_buffers;
> > +else
> > +vq->last_async_desc_idx += n_descs;
> > +}
> 
> rte_vhost_poll_enqueue_completed is too long and not easy to read. Same
> suggestion as above.
> 

I can try to factor some of the code into helper functions, but I'm not
sure whether that is necessary; I will discuss it with you later. A rough
sketch of one option for the packed ring write-back is below.
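
Something like this could work for the packed ring write-back (rough
sketch only, the helper name is just a placeholder):

/* Write back 'n_buffers' completed packed-ring buffers to the used
 * ring, handling wrap-around of the async buffers array. */
static __rte_always_inline void
write_back_completed_descs_packed(struct virtio_net *dev,
				struct vhost_virtqueue *vq, uint16_t n_buffers)
{
	uint16_t nr_left = n_buffers;
	uint16_t from, to;

	do {
		from = vq->last_async_buffer_idx & (vq->size - 1);
		to = (from + nr_left) & (vq->size - 1);

		if (to > from) {
			vhost_update_used_packed(dev, vq,
				vq->async_buffers_packed + from, to - from);
			vq->last_async_buffer_idx += nr_left;
			nr_left = 0;
		} else {
			vhost_update_used_packed(dev, vq,
				vq->async_buffers_packed + from,
				vq->size - from);
			vq->last_async_buffer_idx += vq->size - from;
			nr_left -= vq->size - from;
		}
	} while (nr_left > 0);
}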

Thanks,
Cheng

> Thanks,
> Jiayu
> 
> >
> >  done:
> >  rte_spinlock_unlock(&vq->access_lock);
> > @@ -1767,9 +2167,10 @@ virtio_dev_rx_async_submit(struct virtio_net
> > *dev, uint16_t queue_id,
> >  if (count == 0)
> >  goto out;
> >
> > -/* TODO: packed queue not implemented */
> >  if (vq_is_packed(dev))
> > -nb_tx = 0;
> > +nb_tx = virtio_dev_rx_async_submit_packed(dev,
> > +vq, queue_id, pkts, count, comp_pkts,
> > +comp_count);
> >  else
> >  nb_tx = virtio_dev_rx_async_submit_split(dev,
> >  vq, queue_id, pkts, count, comp_pkts,
> > --
> > 2.29.2
> 


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v4 0/4] add support for packed ring in async vhost
  2021-03-17  8:54 [dpdk-dev] [PATCH] vhost: add support for packed ring in async vhost Cheng Jiang
  2021-03-22  6:15 ` [dpdk-dev] [PATCH v2] " Cheng Jiang
  2021-03-31 14:06 ` [dpdk-dev] [PATCH v3] " Cheng Jiang
@ 2021-04-10 10:25 ` Cheng Jiang
  2021-04-10 10:25   ` [dpdk-dev] [PATCH v4 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
                     ` (4 more replies)
  2021-04-12 11:34 ` [dpdk-dev] [PATCH v5 0/4] add support for packed ring in async vhost Cheng Jiang
                   ` (4 subsequent siblings)
  7 siblings, 5 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-10 10:25 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, Cheng Jiang

For now async vhost data path only supports split ring structure. In
order to make async vhost compatible with virtio 1.1 spec, this patch
set cleans up the async split ring code and enables packed ring in the
async vhost data path. The batch data path is also enabled for packed
ring.

v4:
  * change the patch structure
  * clean code for async split ring
  * reuse some code from split ring
  * change the error handler for DMA-copy packet
  * add check for malloc
  * remove useless code
  * add doc update
v3:
  * fix error handler for DMA-copy packet
v2:
  * fix wrong buffer index in rte_vhost_poll_enqueue_completed()
  * add async_buffers_packed memory free in vhost_free_async_mem()

Cheng Jiang (4):
  vhost: abstract and reorganize async split ring code
  vhost: add support for packed ring in async vhost
  vhost: add batch datapath for async vhost packed ring
  doc: add release note for vhost async packed ring

 doc/guides/rel_notes/release_21_05.rst |   4 +
 lib/librte_vhost/rte_vhost_async.h     |   1 +
 lib/librte_vhost/vhost.c               |  27 +-
 lib/librte_vhost/vhost.h               |   7 +-
 lib/librte_vhost/virtio_net.c          | 603 ++++++++++++++++++++++---
 5 files changed, 560 insertions(+), 82 deletions(-)

--
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v4 1/4] vhost: abstract and reorganize async split ring code
  2021-04-10 10:25 ` [dpdk-dev] [PATCH v4 0/4] " Cheng Jiang
@ 2021-04-10 10:25   ` Cheng Jiang
  2021-04-10 10:25   ` Cheng Jiang
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-10 10:25 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, Cheng Jiang

In order to improve code efficiency and readability when async packed
ring support is enabled, this patch abstracts some functions such as
shadow_ring_store and write_back_completed_descs_split, and improves
the efficiency of some pointer offset calculations.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 lib/librte_vhost/virtio_net.c | 146 +++++++++++++++++++---------------
 1 file changed, 84 insertions(+), 62 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index ff3987860..69553e7c3 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1458,6 +1458,29 @@ virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
 		(vq_size - n_inflight + pkts_idx) & (vq_size - 1);
 }
 
+static __rte_always_inline void
+shadow_ring_store(struct vhost_virtqueue *vq,  void *shadow_ring, void *d_ring,
+		uint16_t s_idx, uint16_t d_idx,
+		uint16_t count, uint16_t elem_size)
+{
+	if (s_idx + count <= vq->size) {
+		rte_memcpy((void *)((uintptr_t)d_ring + d_idx * elem_size),
+			(void *)((uintptr_t)shadow_ring + s_idx * elem_size),
+			count * elem_size);
+	} else {
+		uint16_t size = vq->size - d_idx;
+
+		rte_memcpy((void *)((uintptr_t)d_ring + d_idx * elem_size),
+			(void *)((uintptr_t)shadow_ring + s_idx * elem_size),
+			size * elem_size);
+
+		rte_memcpy((void *)((uintptr_t)d_ring),
+			(void *)((uintptr_t)shadow_ring +
+				(s_idx + size) * elem_size),
+			(count - size) * elem_size);
+	}
+}
+
 static __rte_noinline uint32_t
 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct vhost_virtqueue *vq, uint16_t queue_id,
@@ -1478,6 +1501,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
 	uint16_t slot_idx = 0;
 	uint16_t segs_await = 0;
+	uint16_t iovec_idx = 0, it_idx = 0;
 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
 	uint32_t n_pkts = 0, pkt_err = 0;
 	uint32_t num_async_pkts = 0, num_done_pkts = 0;
@@ -1513,27 +1537,32 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 
 		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx],
 				buf_vec, nr_vec, num_buffers,
-				src_iovec, dst_iovec, src_it, dst_it) < 0) {
+				&src_iovec[iovec_idx],
+				&dst_iovec[iovec_idx],
+				&src_it[it_idx],
+				&dst_it[it_idx]) < 0) {
 			vq->shadow_used_idx -= num_buffers;
 			break;
 		}
 
 		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
 			(vq->size - 1);
-		if (src_it->count) {
+		if (src_it[it_idx].count) {
 			uint16_t from, to;
 
-			async_fill_desc(&tdes[pkt_burst_idx++], src_it, dst_it);
+			async_fill_desc(&tdes[pkt_burst_idx++],
+				&src_it[it_idx],
+				&dst_it[it_idx]);
 			pkts_info[slot_idx].descs = num_buffers;
 			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
 			async_pkts_log[num_async_pkts].pkt_idx = pkt_idx;
 			async_pkts_log[num_async_pkts++].last_avail_idx =
 				vq->last_avail_idx;
-			src_iovec += src_it->nr_segs;
-			dst_iovec += dst_it->nr_segs;
-			src_it += 2;
-			dst_it += 2;
-			segs_await += src_it->nr_segs;
+
+			iovec_idx += src_it[it_idx].nr_segs;
+			it_idx += 2;
+
+			segs_await += src_it[it_idx].nr_segs;
 
 			/**
 			 * recover shadow used ring and keep DMA-occupied
@@ -1541,23 +1570,12 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			 */
 			from = vq->shadow_used_idx - num_buffers;
 			to = vq->async_desc_idx & (vq->size - 1);
-			if (num_buffers + to <= vq->size) {
-				rte_memcpy(&vq->async_descs_split[to],
-						&vq->shadow_used_split[from],
-						num_buffers *
-						sizeof(struct vring_used_elem));
-			} else {
-				int size = vq->size - to;
-
-				rte_memcpy(&vq->async_descs_split[to],
-						&vq->shadow_used_split[from],
-						size *
-						sizeof(struct vring_used_elem));
-				rte_memcpy(vq->async_descs_split,
-						&vq->shadow_used_split[from +
-						size], (num_buffers - size) *
-					   sizeof(struct vring_used_elem));
-			}
+
+			shadow_ring_store(vq, vq->shadow_used_split,
+					vq->async_descs_split,
+					from, to, num_buffers,
+					sizeof(struct vring_used_elem));
+
 			vq->async_desc_idx += num_buffers;
 			vq->shadow_used_idx -= num_buffers;
 		} else
@@ -1575,10 +1593,9 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			BUF_VECTOR_MAX))) {
 			n_pkts = vq->async_ops.transfer_data(dev->vid,
 					queue_id, tdes, 0, pkt_burst_idx);
-			src_iovec = vec_pool;
-			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
-			src_it = it_pool;
-			dst_it = it_pool + 1;
+			iovec_idx = 0;
+			it_idx = 0;
+
 			segs_await = 0;
 			vq->async_pkts_inflight_n += n_pkts;
 
@@ -1639,6 +1656,43 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	return pkt_idx;
 }
 
+static __rte_always_inline void
+write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
+{
+	uint16_t nr_left = n_descs;
+	uint16_t nr_copy;
+	uint16_t to, from;
+
+	do {
+		from = vq->last_async_desc_idx & (vq->size - 1);
+		nr_copy = nr_left + from <= vq->size ? nr_left :
+			vq->size - from;
+		to = vq->last_used_idx & (vq->size - 1);
+
+		if (to + nr_copy <= vq->size) {
+			rte_memcpy(&vq->used->ring[to],
+					&vq->async_descs_split[from],
+					nr_copy *
+					sizeof(struct vring_used_elem));
+		} else {
+			uint16_t size = vq->size - to;
+
+			rte_memcpy(&vq->used->ring[to],
+					&vq->async_descs_split[from],
+					size *
+					sizeof(struct vring_used_elem));
+			rte_memcpy(vq->used->ring,
+					&vq->async_descs_split[from +
+					size], (nr_copy - size) *
+					sizeof(struct vring_used_elem));
+		}
+
+		vq->last_async_desc_idx += nr_copy;
+		vq->last_used_idx += nr_copy;
+		nr_left -= nr_copy;
+	} while (nr_left > 0);
+}
+
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count)
 {
@@ -1695,39 +1749,7 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 	vq->async_pkts_inflight_n -= n_pkts_put;
 
 	if (likely(vq->enabled && vq->access_ok)) {
-		uint16_t nr_left = n_descs;
-		uint16_t nr_copy;
-		uint16_t to;
-
-		/* write back completed descriptors to used ring */
-		do {
-			from = vq->last_async_desc_idx & (vq->size - 1);
-			nr_copy = nr_left + from <= vq->size ? nr_left :
-				vq->size - from;
-			to = vq->last_used_idx & (vq->size - 1);
-
-			if (to + nr_copy <= vq->size) {
-				rte_memcpy(&vq->used->ring[to],
-						&vq->async_descs_split[from],
-						nr_copy *
-						sizeof(struct vring_used_elem));
-			} else {
-				uint16_t size = vq->size - to;
-
-				rte_memcpy(&vq->used->ring[to],
-						&vq->async_descs_split[from],
-						size *
-						sizeof(struct vring_used_elem));
-				rte_memcpy(vq->used->ring,
-						&vq->async_descs_split[from +
-						size], (nr_copy - size) *
-						sizeof(struct vring_used_elem));
-			}
-
-			vq->last_async_desc_idx += nr_copy;
-			vq->last_used_idx += nr_copy;
-			nr_left -= nr_copy;
-		} while (nr_left > 0);
+		write_back_completed_descs_split(vq, n_descs);
 
 		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
 		vhost_vring_call_split(dev, vq);
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v4 1/4] vhost: abstract and reorganize async split ring code
  2021-04-10 10:25 ` [dpdk-dev] [PATCH v4 0/4] " Cheng Jiang
  2021-04-10 10:25   ` [dpdk-dev] [PATCH v4 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
@ 2021-04-10 10:25   ` Cheng Jiang
  2021-04-10 10:25   ` [dpdk-dev] [PATCH v4 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-10 10:25 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, Cheng Jiang

In order to improve code efficiency and readability when async packed
ring support is enabled, this patch abstracts some functions such as
shadow_ring_store and write_back_completed_descs_split, and improves
the efficiency of some pointer offset calculations.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 lib/librte_vhost/virtio_net.c | 146 +++++++++++++++++++---------------
 1 file changed, 84 insertions(+), 62 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index ff3987860..69553e7c3 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1458,6 +1458,29 @@ virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
 		(vq_size - n_inflight + pkts_idx) & (vq_size - 1);
 }
 
+static __rte_always_inline void
+shadow_ring_store(struct vhost_virtqueue *vq,  void *shadow_ring, void *d_ring,
+		uint16_t s_idx, uint16_t d_idx,
+		uint16_t count, uint16_t elem_size)
+{
+	if (s_idx + count <= vq->size) {
+		rte_memcpy((void *)((uintptr_t)d_ring + d_idx * elem_size),
+			(void *)((uintptr_t)shadow_ring + s_idx * elem_size),
+			count * elem_size);
+	} else {
+		uint16_t size = vq->size - d_idx;
+
+		rte_memcpy((void *)((uintptr_t)d_ring + d_idx * elem_size),
+			(void *)((uintptr_t)shadow_ring + s_idx * elem_size),
+			size * elem_size);
+
+		rte_memcpy((void *)((uintptr_t)d_ring),
+			(void *)((uintptr_t)shadow_ring +
+				(s_idx + size) * elem_size),
+			(count - size) * elem_size);
+	}
+}
+
 static __rte_noinline uint32_t
 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct vhost_virtqueue *vq, uint16_t queue_id,
@@ -1478,6 +1501,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
 	uint16_t slot_idx = 0;
 	uint16_t segs_await = 0;
+	uint16_t iovec_idx = 0, it_idx = 0;
 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
 	uint32_t n_pkts = 0, pkt_err = 0;
 	uint32_t num_async_pkts = 0, num_done_pkts = 0;
@@ -1513,27 +1537,32 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 
 		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx],
 				buf_vec, nr_vec, num_buffers,
-				src_iovec, dst_iovec, src_it, dst_it) < 0) {
+				&src_iovec[iovec_idx],
+				&dst_iovec[iovec_idx],
+				&src_it[it_idx],
+				&dst_it[it_idx]) < 0) {
 			vq->shadow_used_idx -= num_buffers;
 			break;
 		}
 
 		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
 			(vq->size - 1);
-		if (src_it->count) {
+		if (src_it[it_idx].count) {
 			uint16_t from, to;
 
-			async_fill_desc(&tdes[pkt_burst_idx++], src_it, dst_it);
+			async_fill_desc(&tdes[pkt_burst_idx++],
+				&src_it[it_idx],
+				&dst_it[it_idx]);
 			pkts_info[slot_idx].descs = num_buffers;
 			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
 			async_pkts_log[num_async_pkts].pkt_idx = pkt_idx;
 			async_pkts_log[num_async_pkts++].last_avail_idx =
 				vq->last_avail_idx;
-			src_iovec += src_it->nr_segs;
-			dst_iovec += dst_it->nr_segs;
-			src_it += 2;
-			dst_it += 2;
-			segs_await += src_it->nr_segs;
+
+			iovec_idx += src_it[it_idx].nr_segs;
+			it_idx += 2;
+
+			segs_await += src_it[it_idx].nr_segs;
 
 			/**
 			 * recover shadow used ring and keep DMA-occupied
@@ -1541,23 +1570,12 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			 */
 			from = vq->shadow_used_idx - num_buffers;
 			to = vq->async_desc_idx & (vq->size - 1);
-			if (num_buffers + to <= vq->size) {
-				rte_memcpy(&vq->async_descs_split[to],
-						&vq->shadow_used_split[from],
-						num_buffers *
-						sizeof(struct vring_used_elem));
-			} else {
-				int size = vq->size - to;
-
-				rte_memcpy(&vq->async_descs_split[to],
-						&vq->shadow_used_split[from],
-						size *
-						sizeof(struct vring_used_elem));
-				rte_memcpy(vq->async_descs_split,
-						&vq->shadow_used_split[from +
-						size], (num_buffers - size) *
-					   sizeof(struct vring_used_elem));
-			}
+
+			shadow_ring_store(vq, vq->shadow_used_split,
+					vq->async_descs_split,
+					from, to, num_buffers,
+					sizeof(struct vring_used_elem));
+
 			vq->async_desc_idx += num_buffers;
 			vq->shadow_used_idx -= num_buffers;
 		} else
@@ -1575,10 +1593,9 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			BUF_VECTOR_MAX))) {
 			n_pkts = vq->async_ops.transfer_data(dev->vid,
 					queue_id, tdes, 0, pkt_burst_idx);
-			src_iovec = vec_pool;
-			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
-			src_it = it_pool;
-			dst_it = it_pool + 1;
+			iovec_idx = 0;
+			it_idx = 0;
+
 			segs_await = 0;
 			vq->async_pkts_inflight_n += n_pkts;
 
@@ -1639,6 +1656,43 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	return pkt_idx;
 }
 
+static __rte_always_inline void
+write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
+{
+	uint16_t nr_left = n_descs;
+	uint16_t nr_copy;
+	uint16_t to, from;
+
+	do {
+		from = vq->last_async_desc_idx & (vq->size - 1);
+		nr_copy = nr_left + from <= vq->size ? nr_left :
+			vq->size - from;
+		to = vq->last_used_idx & (vq->size - 1);
+
+		if (to + nr_copy <= vq->size) {
+			rte_memcpy(&vq->used->ring[to],
+					&vq->async_descs_split[from],
+					nr_copy *
+					sizeof(struct vring_used_elem));
+		} else {
+			uint16_t size = vq->size - to;
+
+			rte_memcpy(&vq->used->ring[to],
+					&vq->async_descs_split[from],
+					size *
+					sizeof(struct vring_used_elem));
+			rte_memcpy(vq->used->ring,
+					&vq->async_descs_split[from +
+					size], (nr_copy - size) *
+					sizeof(struct vring_used_elem));
+		}
+
+		vq->last_async_desc_idx += nr_copy;
+		vq->last_used_idx += nr_copy;
+		nr_left -= nr_copy;
+	} while (nr_left > 0);
+}
+
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count)
 {
@@ -1695,39 +1749,7 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 	vq->async_pkts_inflight_n -= n_pkts_put;
 
 	if (likely(vq->enabled && vq->access_ok)) {
-		uint16_t nr_left = n_descs;
-		uint16_t nr_copy;
-		uint16_t to;
-
-		/* write back completed descriptors to used ring */
-		do {
-			from = vq->last_async_desc_idx & (vq->size - 1);
-			nr_copy = nr_left + from <= vq->size ? nr_left :
-				vq->size - from;
-			to = vq->last_used_idx & (vq->size - 1);
-
-			if (to + nr_copy <= vq->size) {
-				rte_memcpy(&vq->used->ring[to],
-						&vq->async_descs_split[from],
-						nr_copy *
-						sizeof(struct vring_used_elem));
-			} else {
-				uint16_t size = vq->size - to;
-
-				rte_memcpy(&vq->used->ring[to],
-						&vq->async_descs_split[from],
-						size *
-						sizeof(struct vring_used_elem));
-				rte_memcpy(vq->used->ring,
-						&vq->async_descs_split[from +
-						size], (nr_copy - size) *
-						sizeof(struct vring_used_elem));
-			}
-
-			vq->last_async_desc_idx += nr_copy;
-			vq->last_used_idx += nr_copy;
-			nr_left -= nr_copy;
-		} while (nr_left > 0);
+		write_back_completed_descs_split(vq, n_descs);
 
 		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
 		vhost_vring_call_split(dev, vq);
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v4 2/4] vhost: add support for packed ring in async vhost
  2021-04-10 10:25 ` [dpdk-dev] [PATCH v4 0/4] " Cheng Jiang
  2021-04-10 10:25   ` [dpdk-dev] [PATCH v4 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
  2021-04-10 10:25   ` Cheng Jiang
@ 2021-04-10 10:25   ` Cheng Jiang
  2021-04-10 10:25   ` [dpdk-dev] [PATCH v4 3/4] vhost: add batch datapath for async vhost packed ring Cheng Jiang
  2021-04-10 10:25   ` [dpdk-dev] [PATCH v4 4/4] doc: add release note for vhost async " Cheng Jiang
  4 siblings, 0 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-10 10:25 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, Cheng Jiang

For now async vhost data path only supports split ring structure. In
order to make async vhost compatible with virtio 1.1 spec this patch
enables packed ring in async vhost data path.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 lib/librte_vhost/rte_vhost_async.h |   1 +
 lib/librte_vhost/vhost.c           |  27 +-
 lib/librte_vhost/vhost.h           |   7 +-
 lib/librte_vhost/virtio_net.c      | 428 ++++++++++++++++++++++++++++-
 4 files changed, 441 insertions(+), 22 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost_async.h b/lib/librte_vhost/rte_vhost_async.h
index c855ff875..6faa31f5a 100644
--- a/lib/librte_vhost/rte_vhost_async.h
+++ b/lib/librte_vhost/rte_vhost_async.h
@@ -89,6 +89,7 @@ struct rte_vhost_async_channel_ops {
 struct async_inflight_info {
 	struct rte_mbuf *mbuf;
 	uint16_t descs; /* num of descs inflight */
+	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
 };
 
 /**
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index a70fe01d8..8c9935c0f 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -342,15 +342,21 @@ vhost_free_async_mem(struct vhost_virtqueue *vq)
 {
 	if (vq->async_pkts_info)
 		rte_free(vq->async_pkts_info);
-	if (vq->async_descs_split)
+	if (vq->async_buffers_packed) {
+		rte_free(vq->async_buffers_packed);
+		vq->async_buffers_packed = NULL;
+	}
+	if (vq->async_descs_split) {
 		rte_free(vq->async_descs_split);
+		vq->async_descs_split = NULL;
+	}
+
 	if (vq->it_pool)
 		rte_free(vq->it_pool);
 	if (vq->vec_pool)
 		rte_free(vq->vec_pool);
 
 	vq->async_pkts_info = NULL;
-	vq->async_descs_split = NULL;
 	vq->it_pool = NULL;
 	vq->vec_pool = NULL;
 }
@@ -1627,9 +1633,9 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 		return -1;
 
 	/* packed queue is not supported */
-	if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
+	if (unlikely(!f.async_inorder)) {
 		VHOST_LOG_CONFIG(ERR,
-			"async copy is not supported on packed queue or non-inorder mode "
+			"async copy is not supported on non-inorder mode "
 			"(vid %d, qid: %d)\n", vid, queue_id);
 		return -1;
 	}
@@ -1667,11 +1673,18 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	vq->vec_pool = rte_malloc_socket(NULL,
 			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
 			RTE_CACHE_LINE_SIZE, node);
-	vq->async_descs_split = rte_malloc_socket(NULL,
+	if (vq_is_packed(dev)) {
+		vq->async_buffers_packed = rte_malloc_socket(NULL,
+			vq->size * sizeof(struct vring_used_elem_packed),
+			RTE_CACHE_LINE_SIZE, node);
+	} else {
+		vq->async_descs_split = rte_malloc_socket(NULL,
 			vq->size * sizeof(struct vring_used_elem),
 			RTE_CACHE_LINE_SIZE, node);
-	if (!vq->async_descs_split || !vq->async_pkts_info ||
-		!vq->it_pool || !vq->vec_pool) {
+	}
+
+	if (!vq->async_buffers_packed || !vq->async_descs_split ||
+		!vq->async_pkts_info || !vq->it_pool || !vq->vec_pool) {
 		vhost_free_async_mem(vq);
 		VHOST_LOG_CONFIG(ERR,
 				"async register failed: cannot allocate memory for vq data "
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index f628714c2..fe131ae8f 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -201,9 +201,14 @@ struct vhost_virtqueue {
 	uint16_t	async_pkts_idx;
 	uint16_t	async_pkts_inflight_n;
 	uint16_t	async_last_pkts_n;
-	struct vring_used_elem  *async_descs_split;
+	union {
+		struct vring_used_elem  *async_descs_split;
+		struct vring_used_elem_packed *async_buffers_packed;
+	};
 	uint16_t async_desc_idx;
+	uint16_t async_packed_buffer_idx;
 	uint16_t last_async_desc_idx;
+	uint16_t last_async_buffer_idx;
 
 	/* vq async features */
 	bool		async_inorder;
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 69553e7c3..2b8b873ca 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -363,8 +363,7 @@ vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
 }
 
 static __rte_always_inline void
-vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
-				   struct vhost_virtqueue *vq,
+vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
 				   uint32_t len[],
 				   uint16_t id[],
 				   uint16_t count[],
@@ -382,6 +381,17 @@ vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
 		vq->shadow_aligned_idx += count[i];
 		vq->shadow_used_idx++;
 	}
+}
+
+static __rte_always_inline void
+vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
+				   struct vhost_virtqueue *vq,
+				   uint32_t len[],
+				   uint16_t id[],
+				   uint16_t count[],
+				   uint16_t num_buffers)
+{
+	vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
 
 	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
 		do_data_copy_enqueue(dev, vq);
@@ -1481,6 +1491,61 @@ shadow_ring_store(struct vhost_virtqueue *vq,  void *shadow_ring, void *d_ring,
 	}
 }
 
+static __rte_always_inline void
+vhost_update_used_packed(struct vhost_virtqueue *vq,
+			struct vring_used_elem_packed *shadow_ring,
+			uint16_t count)
+{
+	if (count == 0)
+		return;
+	int i;
+	uint16_t used_idx = vq->last_used_idx;
+	uint16_t head_idx = vq->last_used_idx;
+	uint16_t head_flags = 0;
+
+	/* Split loop in two to save memory barriers */
+	for (i = 0; i < count; i++) {
+		vq->desc_packed[used_idx].id = shadow_ring[i].id;
+		vq->desc_packed[used_idx].len = shadow_ring[i].len;
+
+		used_idx += shadow_ring[i].count;
+		if (used_idx >= vq->size)
+			used_idx -= vq->size;
+	}
+
+	/* The ordering for storing desc flags needs to be enforced. */
+	rte_atomic_thread_fence(__ATOMIC_RELEASE);
+
+	for (i = 0; i < count; i++) {
+		uint16_t flags;
+
+		if (vq->shadow_used_packed[i].len)
+			flags = VRING_DESC_F_WRITE;
+		else
+			flags = 0;
+
+		if (vq->used_wrap_counter) {
+			flags |= VRING_DESC_F_USED;
+			flags |= VRING_DESC_F_AVAIL;
+		} else {
+			flags &= ~VRING_DESC_F_USED;
+			flags &= ~VRING_DESC_F_AVAIL;
+		}
+
+		if (i > 0) {
+			vq->desc_packed[vq->last_used_idx].flags = flags;
+
+		} else {
+			head_idx = vq->last_used_idx;
+			head_flags = flags;
+		}
+
+		vq_inc_last_used_packed(vq, shadow_ring[i].count);
+	}
+
+	vq->desc_packed[head_idx].flags = head_flags;
+}
+
 static __rte_noinline uint32_t
 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct vhost_virtqueue *vq, uint16_t queue_id,
@@ -1656,6 +1721,293 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	return pkt_idx;
 }
 
+static __rte_always_inline int
+vhost_enqueue_async_single_packed(struct virtio_net *dev,
+			    struct vhost_virtqueue *vq,
+			    struct rte_mbuf *pkt,
+			    struct buf_vector *buf_vec,
+			    uint16_t *nr_descs,
+			    uint16_t *nr_buffers,
+			    struct vring_packed_desc *async_descs,
+			    struct iovec *src_iovec, struct iovec *dst_iovec,
+			    struct rte_vhost_iov_iter *src_it,
+			    struct rte_vhost_iov_iter *dst_it)
+{
+	uint16_t nr_vec = 0;
+	uint16_t avail_idx = vq->last_avail_idx;
+	uint16_t max_tries, tries = 0;
+	uint16_t buf_id = 0;
+	uint32_t len = 0;
+	uint16_t desc_count = 0;
+	uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	uint32_t buffer_len[vq->size];
+	uint16_t buffer_buf_id[vq->size];
+	uint16_t buffer_desc_count[vq->size];
+	*nr_buffers = 0;
+
+	if (rxvq_is_mergeable(dev))
+		max_tries = vq->size - 1;
+	else
+		max_tries = 1;
+
+	while (size > 0) {
+		/*
+		 * if we tried all available ring items, and still
+		 * can't get enough buf, it means something abnormal
+		 * happened.
+		 */
+		if (unlikely(++tries > max_tries))
+			return -1;
+
+		if (unlikely(fill_vec_buf_packed(dev, vq,
+						avail_idx, &desc_count,
+						buf_vec, &nr_vec,
+						&buf_id, &len,
+						VHOST_ACCESS_RW) < 0))
+			return -1;
+
+		len = RTE_MIN(len, size);
+		size -= len;
+
+		buffer_len[*nr_buffers] = len;
+		buffer_buf_id[*nr_buffers] = buf_id;
+		buffer_desc_count[*nr_buffers] = desc_count;
+		*nr_buffers += 1;
+
+		*nr_descs += desc_count;
+		avail_idx += desc_count;
+		if (avail_idx >= vq->size)
+			avail_idx -= vq->size;
+	}
+
+	if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers,
+		src_iovec, dst_iovec, src_it, dst_it) < 0)
+		return -1;
+	/* store descriptors for DMA */
+	if (avail_idx >= *nr_descs)
+		rte_memcpy(async_descs,
+			&vq->desc_packed[vq->last_avail_idx],
+			*nr_descs * sizeof(struct vring_packed_desc));
+	else {
+		uint16_t nr_copy = vq->size - vq->last_avail_idx;
+		rte_memcpy(async_descs,
+			&vq->desc_packed[vq->last_avail_idx],
+			nr_copy * sizeof(struct vring_packed_desc));
+		rte_memcpy(async_descs + nr_copy,
+			vq->desc_packed, (*nr_descs - nr_copy) *
+			sizeof(struct vring_packed_desc));
+	}
+
+	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id,
+					   buffer_desc_count, *nr_buffers);
+
+	return 0;
+}
+
+static __rte_always_inline int16_t
+virtio_dev_rx_async_single_packed(struct virtio_net *dev,
+			    struct vhost_virtqueue *vq,
+			    struct rte_mbuf *pkt,
+			    uint16_t *nr_descs, uint16_t *nr_buffers,
+			    struct vring_packed_desc *async_descs,
+			    struct iovec *src_iovec, struct iovec *dst_iovec,
+			    struct rte_vhost_iov_iter *src_it,
+			    struct rte_vhost_iov_iter *dst_it)
+{
+	struct buf_vector buf_vec[BUF_VECTOR_MAX];
+	*nr_descs = 0;
+	*nr_buffers = 0;
+
+	if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt, buf_vec,
+						 nr_descs,
+						 nr_buffers,
+						 async_descs,
+						 src_iovec, dst_iovec,
+						 src_it, dst_it) < 0)) {
+		VHOST_LOG_DATA(DEBUG,
+				"(%d) failed to get enough desc from vring\n",
+				dev->vid);
+		return -1;
+	}
+
+	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
+			dev->vid, vq->last_avail_idx,
+			vq->last_avail_idx + *nr_descs);
+
+	return 0;
+}
+
+static __rte_noinline uint32_t
+virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
+	struct vhost_virtqueue *vq, uint16_t queue_id,
+	struct rte_mbuf **pkts, uint32_t count,
+	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
+{
+	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
+	uint16_t async_descs_idx = 0;
+	uint16_t num_buffers;
+	uint16_t num_desc;
+
+	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
+	struct iovec *vec_pool = vq->vec_pool;
+	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
+	struct iovec *src_iovec = vec_pool;
+	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+	struct rte_vhost_iov_iter *src_it = it_pool;
+	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
+	uint16_t slot_idx = 0;
+	uint16_t segs_await = 0;
+	uint16_t iovec_idx = 0, it_idx = 0;
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
+	uint32_t n_pkts = 0, pkt_err = 0;
+	uint32_t num_async_pkts = 0, num_done_pkts = 0;
+	struct vring_packed_desc async_descs[vq->size];
+
+	rte_prefetch0(&vq->desc_packed[vq->last_avail_idx & (vq->size - 1)]);
+
+	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq,
+						pkts[pkt_idx],
+						&num_desc, &num_buffers,
+						&async_descs[async_descs_idx],
+						&src_iovec[iovec_idx],
+						&dst_iovec[iovec_idx],
+						&src_it[it_idx],
+						&dst_it[it_idx]) < 0))
+			break;
+
+		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
+			dev->vid, vq->last_avail_idx,
+			vq->last_avail_idx + num_desc);
+
+		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
+			(vq->size - 1);
+		if (src_it[it_idx].count) {
+			uint16_t from, to;
+
+			async_descs_idx += num_desc;
+			async_fill_desc(&tdes[pkt_burst_idx++], &src_it[it_idx],
+					&dst_it[it_idx]);
+			pkts_info[slot_idx].descs = num_desc;
+			pkts_info[slot_idx].nr_buffers = num_buffers;
+			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
+			num_async_pkts++;
+			iovec_idx += src_it[it_idx].nr_segs;
+			it_idx += 2;
+			segs_await += src_it->nr_segs;
+
+			/**
+			 * recover shadow used ring and keep DMA-occupied
+			 * descriptors.
+			 */
+			from = vq->shadow_used_idx - num_buffers;
+			to = vq->async_packed_buffer_idx & (vq->size - 1);
+			shadow_ring_store(vq, vq->shadow_used_packed,
+					vq->async_buffers_packed,
+					from, to, num_buffers,
+					sizeof(struct vring_used_elem_packed));
+
+			vq->async_packed_buffer_idx += num_buffers;
+			vq->shadow_used_idx -= num_buffers;
+		} else
+			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
+
+		vq_inc_last_avail_packed(vq, num_desc);
+
+		/*
+		 * conditions to trigger async device transfer:
+		 * - buffered packet number reaches transfer threshold
+		 * - unused async iov number is less than max vhost vector
+		 */
+		if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
+			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await <
+			BUF_VECTOR_MAX))) {
+			n_pkts = vq->async_ops.transfer_data(dev->vid,
+					queue_id, tdes, 0, pkt_burst_idx);
+			iovec_idx = 0;
+			it_idx = 0;
+			segs_await = 0;
+			vq->async_pkts_inflight_n += n_pkts;
+
+			if (unlikely(n_pkts < pkt_burst_idx)) {
+				/*
+				 * log error packets number here and do actual
+				 * error processing when applications poll
+				 * completion
+				 */
+				pkt_err = pkt_burst_idx - n_pkts;
+				pkt_burst_idx = 0;
+				pkt_idx++;
+				break;
+			}
+
+			pkt_burst_idx = 0;
+		}
+	}
+
+	if (pkt_burst_idx) {
+		n_pkts = vq->async_ops.transfer_data(dev->vid,
+				queue_id, tdes, 0, pkt_burst_idx);
+		vq->async_pkts_inflight_n += n_pkts;
+
+		if (unlikely(n_pkts < pkt_burst_idx))
+			pkt_err = pkt_burst_idx - n_pkts;
+	}
+
+	do_data_copy_enqueue(dev, vq);
+
+	if (unlikely(pkt_err)) {
+		uint16_t descs_err = 0;
+		uint16_t buffers_err = 0;
+
+		num_async_pkts -= pkt_err;
+		pkt_idx -= pkt_err;
+	/* calculate the sum of buffers and descs of DMA-error packets. */
+		while (pkt_err-- > 0) {
+			descs_err +=
+				pkts_info[slot_idx & (vq->size - 1)].descs;
+			buffers_err +=
+				pkts_info[slot_idx & (vq->size - 1)].nr_buffers;
+			slot_idx--;
+		}
+
+		vq->async_packed_buffer_idx -= buffers_err;
+
+		if (vq->last_avail_idx >= descs_err) {
+			vq->last_avail_idx -= descs_err;
+
+			rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
+				&async_descs[async_descs_idx - descs_err],
+				descs_err * sizeof(struct vring_packed_desc));
+		} else {
+			uint16_t nr_copy;
+
+			vq->last_avail_idx = vq->last_avail_idx + vq->size
+						- descs_err;
+			nr_copy = vq->size - vq->last_avail_idx;
+			rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
+				&async_descs[async_descs_idx - descs_err],
+				nr_copy * sizeof(struct vring_packed_desc));
+			descs_err -= nr_copy;
+			rte_memcpy(vq->desc_packed,
+				&async_descs[async_descs_idx - descs_err],
+				descs_err * sizeof(struct vring_packed_desc));
+			vq->avail_wrap_counter ^= 1;
+		}
+
+		num_done_pkts = pkt_idx - num_async_pkts;
+	}
+	vq->async_pkts_idx += num_async_pkts;
+	*comp_count = num_done_pkts;
+
+	if (likely(vq->shadow_used_idx)) {
+		vhost_flush_enqueue_shadow_packed(dev, vq);
+		vhost_vring_call_packed(dev, vq);
+	}
+
+	return pkt_idx;
+}
+
 static __rte_always_inline void
 write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
 {
@@ -1693,12 +2045,39 @@ write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
 	} while (nr_left > 0);
 }
 
+static __rte_always_inline void
+write_back_completed_descs_packed(struct vhost_virtqueue *vq,
+				uint16_t n_buffers)
+{
+	uint16_t nr_left = n_buffers;
+	uint16_t from, to;
+	do {
+		from = vq->last_async_buffer_idx &
+						(vq->size - 1);
+		to = (from + nr_left) & (vq->size - 1);
+		if (to > from) {
+			vhost_update_used_packed(vq,
+				vq->async_buffers_packed + from,
+				to - from);
+			vq->last_async_buffer_idx += nr_left;
+			nr_left = 0;
+		} else {
+			vhost_update_used_packed(vq,
+				vq->async_buffers_packed + from,
+				vq->size - from);
+			vq->last_async_buffer_idx +=
+						vq->size - from;
+			nr_left -= vq->size - from;
+		}
+	} while (nr_left > 0);
+}
+
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count)
 {
 	struct virtio_net *dev = get_device(vid);
 	struct vhost_virtqueue *vq;
-	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
+	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
 	uint16_t start_idx, pkts_idx, vq_size;
 	struct async_inflight_info *pkts_info;
 	uint16_t from, i;
@@ -1740,21 +2119,41 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		goto done;
 	}
 
-	for (i = 0; i < n_pkts_put; i++) {
-		from = (start_idx + i) & (vq_size - 1);
-		n_descs += pkts_info[from].descs;
-		pkts[i] = pkts_info[from].mbuf;
+	if (vq_is_packed(dev)) {
+		for (i = 0; i < n_pkts_put; i++) {
+			from = (start_idx + i) & (vq_size - 1);
+			n_buffers += pkts_info[from].nr_buffers;
+			pkts[i] = pkts_info[from].mbuf;
+		}
+	} else {
+		for (i = 0; i < n_pkts_put; i++) {
+			from = (start_idx + i) & (vq_size - 1);
+			n_descs += pkts_info[from].descs;
+			pkts[i] = pkts_info[from].mbuf;
+		}
 	}
+
 	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
 	vq->async_pkts_inflight_n -= n_pkts_put;
 
 	if (likely(vq->enabled && vq->access_ok)) {
-		write_back_completed_descs_split(vq, n_descs);
+		if (vq_is_packed(dev)) {
+			write_back_completed_descs_packed(vq, n_buffers);
 
-		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
-		vhost_vring_call_split(dev, vq);
-	} else
-		vq->last_async_desc_idx += n_descs;
+			vhost_vring_call_packed(dev, vq);
+		} else {
+			write_back_completed_descs_split(vq, n_descs);
+
+			__atomic_add_fetch(&vq->used->idx, n_descs,
+					__ATOMIC_RELEASE);
+			vhost_vring_call_split(dev, vq);
+		}
+	} else {
+		if (vq_is_packed(dev))
+			vq->last_async_buffer_idx += n_buffers;
+		else
+			vq->last_async_desc_idx += n_descs;
+	}
 
 done:
 	rte_spinlock_unlock(&vq->access_lock);
@@ -1795,9 +2194,10 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
 	if (count == 0)
 		goto out;
 
-	/* TODO: packed queue not implemented */
 	if (vq_is_packed(dev))
-		nb_tx = 0;
+		nb_tx = virtio_dev_rx_async_submit_packed(dev,
+				vq, queue_id, pkts, count, comp_pkts,
+				comp_count);
 	else
 		nb_tx = virtio_dev_rx_async_submit_split(dev,
 				vq, queue_id, pkts, count, comp_pkts,
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v4 3/4] vhost: add batch datapath for async vhost packed ring
  2021-04-10 10:25 ` [dpdk-dev] [PATCH v4 0/4] " Cheng Jiang
                     ` (2 preceding siblings ...)
  2021-04-10 10:25   ` [dpdk-dev] [PATCH v4 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
@ 2021-04-10 10:25   ` Cheng Jiang
  2021-04-10 10:25   ` [dpdk-dev] [PATCH v4 4/4] doc: add release note for vhost async " Cheng Jiang
  4 siblings, 0 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-10 10:25 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, Cheng Jiang

Add batch datapath for async vhost packed ring to improve the
performance of small packets.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 lib/librte_vhost/virtio_net.c | 43 +++++++++++++++++++++++++++++++----
 1 file changed, 38 insertions(+), 5 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 2b8b873ca..c98fe6dbb 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1721,6 +1721,29 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	return pkt_idx;
 }
 
+static __rte_always_inline int
+virtio_dev_rx_async_batch_packed(struct virtio_net *dev,
+			   struct vhost_virtqueue *vq,
+			   struct rte_mbuf **pkts,
+			   struct rte_mbuf **comp_pkts, uint32_t *pkt_done)
+{
+	uint16_t i;
+	uint32_t cpy_threshold = vq->async_threshold;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (unlikely(pkts[i]->pkt_len >= cpy_threshold))
+			return -1;
+	}
+	if (!virtio_dev_rx_batch_packed(dev, vq, pkts)) {
+		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+			comp_pkts[(*pkt_done)++] = pkts[i];
+
+		return 0;
+	}
+
+	return -1;
+}
+
 static __rte_always_inline int
 vhost_enqueue_async_single_packed(struct virtio_net *dev,
 			    struct vhost_virtqueue *vq,
@@ -1844,6 +1867,7 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
 {
 	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
+	uint32_t remained = count;
 	uint16_t async_descs_idx = 0;
 	uint16_t num_buffers;
 	uint16_t num_desc;
@@ -1863,9 +1887,17 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 	uint32_t num_async_pkts = 0, num_done_pkts = 0;
 	struct vring_packed_desc async_descs[vq->size];
 
-	rte_prefetch0(&vq->desc_packed[vq->last_avail_idx & (vq->size - 1)]);
-
-	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+	do {
+		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx &
+							(vq->size - 1)]);
+		if (remained >= PACKED_BATCH_SIZE) {
+			if (!virtio_dev_rx_async_batch_packed(dev, vq,
+				&pkts[pkt_idx], comp_pkts, &num_done_pkts)) {
+				pkt_idx += PACKED_BATCH_SIZE;
+				remained -= PACKED_BATCH_SIZE;
+				continue;
+			}
+		}
 		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq,
 						pkts[pkt_idx],
 						&num_desc, &num_buffers,
@@ -1912,6 +1944,8 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 		} else
 			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
 
+		pkt_idx++;
+		remained--;
 		vq_inc_last_avail_packed(vq, num_desc);
 
 		/*
@@ -1937,13 +1971,12 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 				 */
 				pkt_err = pkt_burst_idx - n_pkts;
 				pkt_burst_idx = 0;
-				pkt_idx++;
 				break;
 			}
 
 			pkt_burst_idx = 0;
 		}
-	}
+	} while (pkt_idx < count);
 
 	if (pkt_burst_idx) {
 		n_pkts = vq->async_ops.transfer_data(dev->vid,
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v4 4/4] doc: add release note for vhost async packed ring
  2021-04-10 10:25 ` [dpdk-dev] [PATCH v4 0/4] " Cheng Jiang
                     ` (3 preceding siblings ...)
  2021-04-10 10:25   ` [dpdk-dev] [PATCH v4 3/4] vhost: add batch datapath for async vhost packed ring Cheng Jiang
@ 2021-04-10 10:25   ` Cheng Jiang
  4 siblings, 0 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-10 10:25 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, Cheng Jiang

Add release note for the support of vhost async packed ring.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 doc/guides/rel_notes/release_21_05.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/guides/rel_notes/release_21_05.rst b/doc/guides/rel_notes/release_21_05.rst
index 374d6d98e..eb5200669 100644
--- a/doc/guides/rel_notes/release_21_05.rst
+++ b/doc/guides/rel_notes/release_21_05.rst
@@ -131,6 +131,10 @@ New Features
   * Added command to display Rx queue used descriptor count.
     ``show port (port_id) rxq (queue_id) desc used count``
 
+* **Added support for vhost async packed ring data path.**
+
+  Added packed ring support for async vhost.
+
 
 Removed Items
 -------------
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v5 0/4] add support for packed ring in async vhost
  2021-03-17  8:54 [dpdk-dev] [PATCH] vhost: add support for packed ring in async vhost Cheng Jiang
                   ` (2 preceding siblings ...)
  2021-04-10 10:25 ` [dpdk-dev] [PATCH v4 0/4] " Cheng Jiang
@ 2021-04-12 11:34 ` Cheng Jiang
  2021-04-12 11:34   ` [dpdk-dev] [PATCH v5 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
                     ` (3 more replies)
  2021-04-13 14:55 ` [dpdk-dev] [PATCH v6 0/4] add support for packed ring in async vhost Cheng Jiang
                   ` (3 subsequent siblings)
  7 siblings, 4 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-12 11:34 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

For now the async vhost data path only supports the split ring
structure. In order to make async vhost compatible with the virtio 1.1
spec, this patch set cleans up the async split ring code and enables
packed ring in the async vhost data path. The batch datapath is also
enabled in the async vhost packed ring.

v5:
 * clean some code for the packed ring datapath
 * fix an index error in shadow_ring_store()
v4:
  * change the patch structure
  * clean code for async split ring
  * reuse some code from split ring
  * change the error handler for DMA-copy packet
  * add check for malloc
  * remove useless code
  * add doc update
v3:
  * fix error handler for DMA-copy packet
v2:
  * fix wrong buffer index in rte_vhost_poll_enqueue_completed()
  * add async_buffers_packed memory free in vhost_free_async_mem()

Cheng Jiang (4):
  vhost: abstract and reorganize async split ring code
  vhost: add support for packed ring in async vhost
  vhost: add batch datapath for async vhost packed ring
  doc: add release note for vhost async packed ring

 doc/guides/rel_notes/release_21_05.rst |   4 +
 lib/librte_vhost/rte_vhost_async.h     |   1 +
 lib/librte_vhost/vhost.c               |  27 +-
 lib/librte_vhost/vhost.h               |   7 +-
 lib/librte_vhost/virtio_net.c          | 613 +++++++++++++++++++++----
 5 files changed, 567 insertions(+), 85 deletions(-)

--
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v5 1/4] vhost: abstract and reorganize async split ring code
  2021-04-12 11:34 ` [dpdk-dev] [PATCH v5 0/4] add support for packed ring in async vhost Cheng Jiang
@ 2021-04-12 11:34   ` Cheng Jiang
  2021-04-13  2:44     ` Hu, Jiayu
  2021-04-13  7:11     ` Maxime Coquelin
  2021-04-12 11:34   ` [dpdk-dev] [PATCH v5 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
                     ` (2 subsequent siblings)
  3 siblings, 2 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-12 11:34 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

In order to improve code efficiency and readability when async packed
ring support is enabled. This patch abstract some functions like
shadow_ring_store and write_back_completed_descs_split. And improve
the efficiency of some pointer offset calculation.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 lib/librte_vhost/virtio_net.c | 146 +++++++++++++++++++---------------
 1 file changed, 84 insertions(+), 62 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index ff3987860..c43ab0093 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1458,6 +1458,29 @@ virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
 		(vq_size - n_inflight + pkts_idx) & (vq_size - 1);
 }
 
+static __rte_always_inline void
+shadow_ring_store(struct vhost_virtqueue *vq,  void *shadow_ring, void *d_ring,
+		uint16_t s_idx, uint16_t d_idx,
+		uint16_t count, uint16_t elem_size)
+{
+	if (d_idx + count <= vq->size) {
+		rte_memcpy((void *)((uintptr_t)d_ring + d_idx * elem_size),
+			(void *)((uintptr_t)shadow_ring + s_idx * elem_size),
+			count * elem_size);
+	} else {
+		uint16_t size = vq->size - d_idx;
+
+		rte_memcpy((void *)((uintptr_t)d_ring + d_idx * elem_size),
+			(void *)((uintptr_t)shadow_ring + s_idx * elem_size),
+			size * elem_size);
+
+		rte_memcpy((void *)((uintptr_t)d_ring),
+			(void *)((uintptr_t)shadow_ring +
+				(s_idx + size) * elem_size),
+			(count - size) * elem_size);
+	}
+}
+
 static __rte_noinline uint32_t
 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct vhost_virtqueue *vq, uint16_t queue_id,
@@ -1478,6 +1501,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
 	uint16_t slot_idx = 0;
 	uint16_t segs_await = 0;
+	uint16_t iovec_idx = 0, it_idx = 0;
 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
 	uint32_t n_pkts = 0, pkt_err = 0;
 	uint32_t num_async_pkts = 0, num_done_pkts = 0;
@@ -1513,27 +1537,32 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 
 		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx],
 				buf_vec, nr_vec, num_buffers,
-				src_iovec, dst_iovec, src_it, dst_it) < 0) {
+				&src_iovec[iovec_idx],
+				&dst_iovec[iovec_idx],
+				&src_it[it_idx],
+				&dst_it[it_idx]) < 0) {
 			vq->shadow_used_idx -= num_buffers;
 			break;
 		}
 
 		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
 			(vq->size - 1);
-		if (src_it->count) {
+		if (src_it[it_idx].count) {
 			uint16_t from, to;
 
-			async_fill_desc(&tdes[pkt_burst_idx++], src_it, dst_it);
+			async_fill_desc(&tdes[pkt_burst_idx++],
+				&src_it[it_idx],
+				&dst_it[it_idx]);
 			pkts_info[slot_idx].descs = num_buffers;
 			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
 			async_pkts_log[num_async_pkts].pkt_idx = pkt_idx;
 			async_pkts_log[num_async_pkts++].last_avail_idx =
 				vq->last_avail_idx;
-			src_iovec += src_it->nr_segs;
-			dst_iovec += dst_it->nr_segs;
-			src_it += 2;
-			dst_it += 2;
-			segs_await += src_it->nr_segs;
+
+			iovec_idx += src_it[it_idx].nr_segs;
+			it_idx += 2;
+
+			segs_await += src_it[it_idx].nr_segs;
 
 			/**
 			 * recover shadow used ring and keep DMA-occupied
@@ -1541,23 +1570,12 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			 */
 			from = vq->shadow_used_idx - num_buffers;
 			to = vq->async_desc_idx & (vq->size - 1);
-			if (num_buffers + to <= vq->size) {
-				rte_memcpy(&vq->async_descs_split[to],
-						&vq->shadow_used_split[from],
-						num_buffers *
-						sizeof(struct vring_used_elem));
-			} else {
-				int size = vq->size - to;
-
-				rte_memcpy(&vq->async_descs_split[to],
-						&vq->shadow_used_split[from],
-						size *
-						sizeof(struct vring_used_elem));
-				rte_memcpy(vq->async_descs_split,
-						&vq->shadow_used_split[from +
-						size], (num_buffers - size) *
-					   sizeof(struct vring_used_elem));
-			}
+
+			shadow_ring_store(vq, vq->shadow_used_split,
+					vq->async_descs_split,
+					from, to, num_buffers,
+					sizeof(struct vring_used_elem));
+
 			vq->async_desc_idx += num_buffers;
 			vq->shadow_used_idx -= num_buffers;
 		} else
@@ -1575,10 +1593,9 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			BUF_VECTOR_MAX))) {
 			n_pkts = vq->async_ops.transfer_data(dev->vid,
 					queue_id, tdes, 0, pkt_burst_idx);
-			src_iovec = vec_pool;
-			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
-			src_it = it_pool;
-			dst_it = it_pool + 1;
+			iovec_idx = 0;
+			it_idx = 0;
+
 			segs_await = 0;
 			vq->async_pkts_inflight_n += n_pkts;
 
@@ -1639,6 +1656,43 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	return pkt_idx;
 }
 
+static __rte_always_inline void
+write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
+{
+	uint16_t nr_left = n_descs;
+	uint16_t nr_copy;
+	uint16_t to, from;
+
+	do {
+		from = vq->last_async_desc_idx & (vq->size - 1);
+		nr_copy = nr_left + from <= vq->size ? nr_left :
+			vq->size - from;
+		to = vq->last_used_idx & (vq->size - 1);
+
+		if (to + nr_copy <= vq->size) {
+			rte_memcpy(&vq->used->ring[to],
+					&vq->async_descs_split[from],
+					nr_copy *
+					sizeof(struct vring_used_elem));
+		} else {
+			uint16_t size = vq->size - to;
+
+			rte_memcpy(&vq->used->ring[to],
+					&vq->async_descs_split[from],
+					size *
+					sizeof(struct vring_used_elem));
+			rte_memcpy(vq->used->ring,
+					&vq->async_descs_split[from +
+					size], (nr_copy - size) *
+					sizeof(struct vring_used_elem));
+		}
+
+		vq->last_async_desc_idx += nr_copy;
+		vq->last_used_idx += nr_copy;
+		nr_left -= nr_copy;
+	} while (nr_left > 0);
+}
+
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count)
 {
@@ -1695,39 +1749,7 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 	vq->async_pkts_inflight_n -= n_pkts_put;
 
 	if (likely(vq->enabled && vq->access_ok)) {
-		uint16_t nr_left = n_descs;
-		uint16_t nr_copy;
-		uint16_t to;
-
-		/* write back completed descriptors to used ring */
-		do {
-			from = vq->last_async_desc_idx & (vq->size - 1);
-			nr_copy = nr_left + from <= vq->size ? nr_left :
-				vq->size - from;
-			to = vq->last_used_idx & (vq->size - 1);
-
-			if (to + nr_copy <= vq->size) {
-				rte_memcpy(&vq->used->ring[to],
-						&vq->async_descs_split[from],
-						nr_copy *
-						sizeof(struct vring_used_elem));
-			} else {
-				uint16_t size = vq->size - to;
-
-				rte_memcpy(&vq->used->ring[to],
-						&vq->async_descs_split[from],
-						size *
-						sizeof(struct vring_used_elem));
-				rte_memcpy(vq->used->ring,
-						&vq->async_descs_split[from +
-						size], (nr_copy - size) *
-						sizeof(struct vring_used_elem));
-			}
-
-			vq->last_async_desc_idx += nr_copy;
-			vq->last_used_idx += nr_copy;
-			nr_left -= nr_copy;
-		} while (nr_left > 0);
+		write_back_completed_descs_split(vq, n_descs);
 
 		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
 		vhost_vring_call_split(dev, vq);
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v5 2/4] vhost: add support for packed ring in async vhost
  2021-04-12 11:34 ` [dpdk-dev] [PATCH v5 0/4] add support for packed ring in async vhost Cheng Jiang
  2021-04-12 11:34   ` [dpdk-dev] [PATCH v5 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
@ 2021-04-12 11:34   ` Cheng Jiang
  2021-04-13  8:36     ` Maxime Coquelin
  2021-04-12 11:34   ` [dpdk-dev] [PATCH v5 3/4] vhost: add batch datapath for async vhost packed ring Cheng Jiang
  2021-04-12 11:34   ` [dpdk-dev] [PATCH v5 4/4] doc: add release note for vhost async " Cheng Jiang
  3 siblings, 1 reply; 60+ messages in thread
From: Cheng Jiang @ 2021-04-12 11:34 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

For now async vhost data path only supports split ring structure. In
order to make async vhost compatible with virtio 1.1 spec this patch
enables packed ring in async vhost data path.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 lib/librte_vhost/rte_vhost_async.h |   1 +
 lib/librte_vhost/vhost.c           |  27 +-
 lib/librte_vhost/vhost.h           |   7 +-
 lib/librte_vhost/virtio_net.c      | 438 +++++++++++++++++++++++++++--
 4 files changed, 448 insertions(+), 25 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost_async.h b/lib/librte_vhost/rte_vhost_async.h
index c855ff875..6faa31f5a 100644
--- a/lib/librte_vhost/rte_vhost_async.h
+++ b/lib/librte_vhost/rte_vhost_async.h
@@ -89,6 +89,7 @@ struct rte_vhost_async_channel_ops {
 struct async_inflight_info {
 	struct rte_mbuf *mbuf;
 	uint16_t descs; /* num of descs inflight */
+	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
 };
 
 /**
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index a70fe01d8..8c9935c0f 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -342,15 +342,21 @@ vhost_free_async_mem(struct vhost_virtqueue *vq)
 {
 	if (vq->async_pkts_info)
 		rte_free(vq->async_pkts_info);
-	if (vq->async_descs_split)
+	if (vq->async_buffers_packed) {
+		rte_free(vq->async_buffers_packed);
+		vq->async_buffers_packed = NULL;
+	}
+	if (vq->async_descs_split) {
 		rte_free(vq->async_descs_split);
+		vq->async_descs_split = NULL;
+	}
+
 	if (vq->it_pool)
 		rte_free(vq->it_pool);
 	if (vq->vec_pool)
 		rte_free(vq->vec_pool);
 
 	vq->async_pkts_info = NULL;
-	vq->async_descs_split = NULL;
 	vq->it_pool = NULL;
 	vq->vec_pool = NULL;
 }
@@ -1627,9 +1633,9 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 		return -1;
 
 	/* packed queue is not supported */
-	if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
+	if (unlikely(!f.async_inorder)) {
 		VHOST_LOG_CONFIG(ERR,
-			"async copy is not supported on packed queue or non-inorder mode "
+			"async copy is not supported on non-inorder mode "
 			"(vid %d, qid: %d)\n", vid, queue_id);
 		return -1;
 	}
@@ -1667,11 +1673,18 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	vq->vec_pool = rte_malloc_socket(NULL,
 			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
 			RTE_CACHE_LINE_SIZE, node);
-	vq->async_descs_split = rte_malloc_socket(NULL,
+	if (vq_is_packed(dev)) {
+		vq->async_buffers_packed = rte_malloc_socket(NULL,
+			vq->size * sizeof(struct vring_used_elem_packed),
+			RTE_CACHE_LINE_SIZE, node);
+	} else {
+		vq->async_descs_split = rte_malloc_socket(NULL,
 			vq->size * sizeof(struct vring_used_elem),
 			RTE_CACHE_LINE_SIZE, node);
-	if (!vq->async_descs_split || !vq->async_pkts_info ||
-		!vq->it_pool || !vq->vec_pool) {
+	}
+
+	if (!vq->async_buffers_packed || !vq->async_descs_split ||
+		!vq->async_pkts_info || !vq->it_pool || !vq->vec_pool) {
 		vhost_free_async_mem(vq);
 		VHOST_LOG_CONFIG(ERR,
 				"async register failed: cannot allocate memory for vq data "
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index f628714c2..fe131ae8f 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -201,9 +201,14 @@ struct vhost_virtqueue {
 	uint16_t	async_pkts_idx;
 	uint16_t	async_pkts_inflight_n;
 	uint16_t	async_last_pkts_n;
-	struct vring_used_elem  *async_descs_split;
+	union {
+		struct vring_used_elem  *async_descs_split;
+		struct vring_used_elem_packed *async_buffers_packed;
+	};
 	uint16_t async_desc_idx;
+	uint16_t async_packed_buffer_idx;
 	uint16_t last_async_desc_idx;
+	uint16_t last_async_buffer_idx;
 
 	/* vq async features */
 	bool		async_inorder;
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index c43ab0093..410be9678 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -363,14 +363,14 @@ vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
 }
 
 static __rte_always_inline void
-vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
-				   struct vhost_virtqueue *vq,
-				   uint32_t len[],
-				   uint16_t id[],
-				   uint16_t count[],
+vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
+				   uint32_t *len,
+				   uint16_t *id,
+				   uint16_t *count,
 				   uint16_t num_buffers)
 {
 	uint16_t i;
+
 	for (i = 0; i < num_buffers; i++) {
 		/* enqueue shadow flush action aligned with batch num */
 		if (!vq->shadow_used_idx)
@@ -382,6 +382,17 @@ vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
 		vq->shadow_aligned_idx += count[i];
 		vq->shadow_used_idx++;
 	}
+}
+
+static __rte_always_inline void
+vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
+				   struct vhost_virtqueue *vq,
+				   uint32_t *len,
+				   uint16_t *id,
+				   uint16_t *count,
+				   uint16_t num_buffers)
+{
+	vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
 
 	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
 		do_data_copy_enqueue(dev, vq);
@@ -1481,6 +1492,62 @@ shadow_ring_store(struct vhost_virtqueue *vq,  void *shadow_ring, void *d_ring,
 	}
 }
 
+static __rte_always_inline void
+vhost_update_used_packed(struct vhost_virtqueue *vq,
+			struct vring_used_elem_packed *shadow_ring,
+			uint16_t count)
+{
+	if (count == 0)
+		return;
+
+	int i;
+	uint16_t used_idx = vq->last_used_idx;
+	uint16_t head_idx = vq->last_used_idx;
+	uint16_t head_flags = 0;
+
+	/* Split loop in two to save memory barriers */
+	for (i = 0; i < count; i++) {
+		vq->desc_packed[used_idx].id = shadow_ring[i].id;
+		vq->desc_packed[used_idx].len = shadow_ring[i].len;
+
+		used_idx += shadow_ring[i].count;
+		if (used_idx >= vq->size)
+			used_idx -= vq->size;
+	}
+
+	/* The ordering for storing desc flags needs to be enforced. */
+	rte_atomic_thread_fence(__ATOMIC_RELEASE);
+
+	for (i = 0; i < count; i++) {
+		uint16_t flags;
+
+		if (vq->shadow_used_packed[i].len)
+			flags = VRING_DESC_F_WRITE;
+		else
+			flags = 0;
+
+		if (vq->used_wrap_counter) {
+			flags |= VRING_DESC_F_USED;
+			flags |= VRING_DESC_F_AVAIL;
+		} else {
+			flags &= ~VRING_DESC_F_USED;
+			flags &= ~VRING_DESC_F_AVAIL;
+		}
+
+		if (i > 0) {
+			vq->desc_packed[vq->last_used_idx].flags = flags;
+
+		} else {
+			head_idx = vq->last_used_idx;
+			head_flags = flags;
+		}
+
+		vq_inc_last_used_packed(vq, shadow_ring[i].count);
+	}
+
+	vq->desc_packed[head_idx].flags = head_flags;
+}
+
 static __rte_noinline uint32_t
 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct vhost_virtqueue *vq, uint16_t queue_id,
@@ -1656,6 +1723,294 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	return pkt_idx;
 }
 
+static __rte_always_inline int
+vhost_enqueue_async_single_packed(struct virtio_net *dev,
+			    struct vhost_virtqueue *vq,
+			    struct rte_mbuf *pkt,
+			    struct buf_vector *buf_vec,
+			    uint16_t *nr_descs,
+			    uint16_t *nr_buffers,
+			    struct vring_packed_desc *async_descs,
+			    struct iovec *src_iovec, struct iovec *dst_iovec,
+			    struct rte_vhost_iov_iter *src_it,
+			    struct rte_vhost_iov_iter *dst_it)
+{
+	uint16_t nr_vec = 0;
+	uint16_t avail_idx = vq->last_avail_idx;
+	uint16_t max_tries, tries = 0;
+	uint16_t buf_id = 0;
+	uint32_t len = 0;
+	uint16_t desc_count = 0;
+	uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	uint32_t buffer_len[vq->size];
+	uint16_t buffer_buf_id[vq->size];
+	uint16_t buffer_desc_count[vq->size];
+	*nr_buffers = 0;
+
+	if (rxvq_is_mergeable(dev))
+		max_tries = vq->size - 1;
+	else
+		max_tries = 1;
+
+	while (size > 0) {
+		/*
+		 * if we tried all available ring items, and still
+		 * can't get enough buf, it means something abnormal
+		 * happened.
+		 */
+		if (unlikely(++tries > max_tries))
+			return -1;
+
+		if (unlikely(fill_vec_buf_packed(dev, vq,
+						avail_idx, &desc_count,
+						buf_vec, &nr_vec,
+						&buf_id, &len,
+						VHOST_ACCESS_RW) < 0))
+			return -1;
+
+		len = RTE_MIN(len, size);
+		size -= len;
+
+		buffer_len[*nr_buffers] = len;
+		buffer_buf_id[*nr_buffers] = buf_id;
+		buffer_desc_count[*nr_buffers] = desc_count;
+		*nr_buffers += 1;
+
+		*nr_descs += desc_count;
+		avail_idx += desc_count;
+		if (avail_idx >= vq->size)
+			avail_idx -= vq->size;
+	}
+
+	if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers,
+		src_iovec, dst_iovec, src_it, dst_it) < 0)
+		return -1;
+	/* store descriptors for DMA */
+	if (avail_idx >= *nr_descs)
+		rte_memcpy(async_descs,
+			&vq->desc_packed[vq->last_avail_idx],
+			*nr_descs * sizeof(struct vring_packed_desc));
+	else {
+		uint16_t nr_copy = vq->size - vq->last_avail_idx;
+		rte_memcpy(async_descs,
+			&vq->desc_packed[vq->last_avail_idx],
+			nr_copy * sizeof(struct vring_packed_desc));
+		rte_memcpy(async_descs + nr_copy,
+			vq->desc_packed, (*nr_descs - nr_copy) *
+			sizeof(struct vring_packed_desc));
+	}
+
+	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id,
+					   buffer_desc_count, *nr_buffers);
+
+	return 0;
+}
+
+static __rte_always_inline int16_t
+virtio_dev_rx_async_single_packed(struct virtio_net *dev,
+			    struct vhost_virtqueue *vq,
+			    struct rte_mbuf *pkt,
+			    uint16_t *nr_descs, uint16_t *nr_buffers,
+			    struct vring_packed_desc *async_descs,
+			    struct iovec *src_iovec, struct iovec *dst_iovec,
+			    struct rte_vhost_iov_iter *src_it,
+			    struct rte_vhost_iov_iter *dst_it)
+{
+	struct buf_vector buf_vec[BUF_VECTOR_MAX];
+	*nr_descs = 0;
+	*nr_buffers = 0;
+
+	if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt, buf_vec,
+						 nr_descs,
+						 nr_buffers,
+						 async_descs,
+						 src_iovec, dst_iovec,
+						 src_it, dst_it) < 0)) {
+		VHOST_LOG_DATA(DEBUG,
+				"(%d) failed to get enough desc from vring\n",
+				dev->vid);
+		return -1;
+	}
+
+	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
+			dev->vid, vq->last_avail_idx,
+			vq->last_avail_idx + *nr_descs);
+
+	return 0;
+}
+
+static __rte_noinline uint32_t
+virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
+	struct vhost_virtqueue *vq, uint16_t queue_id,
+	struct rte_mbuf **pkts, uint32_t count,
+	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
+{
+	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
+	uint16_t async_descs_idx = 0;
+	uint16_t num_buffers;
+	uint16_t num_desc;
+
+	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
+	struct iovec *vec_pool = vq->vec_pool;
+	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
+	struct iovec *src_iovec = vec_pool;
+	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+	struct rte_vhost_iov_iter *src_it = it_pool;
+	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
+	uint16_t slot_idx = 0;
+	uint16_t segs_await = 0;
+	uint16_t iovec_idx = 0, it_idx = 0;
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
+	uint32_t n_pkts = 0, pkt_err = 0;
+	uint32_t num_async_pkts = 0, num_done_pkts = 0;
+	struct vring_packed_desc async_descs[vq->size];
+
+	rte_prefetch0(&vq->desc_packed[vq->last_avail_idx & (vq->size - 1)]);
+
+	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq,
+						pkts[pkt_idx],
+						&num_desc, &num_buffers,
+						&async_descs[async_descs_idx],
+						&src_iovec[iovec_idx],
+						&dst_iovec[iovec_idx],
+						&src_it[it_idx],
+						&dst_it[it_idx]) < 0))
+			break;
+
+		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
+			dev->vid, vq->last_avail_idx,
+			vq->last_avail_idx + num_desc);
+
+		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
+			(vq->size - 1);
+		if (src_it[it_idx].count) {
+			uint16_t from, to;
+
+			async_descs_idx += num_desc;
+			async_fill_desc(&tdes[pkt_burst_idx++], &src_it[it_idx],
+					&dst_it[it_idx]);
+			pkts_info[slot_idx].descs = num_desc;
+			pkts_info[slot_idx].nr_buffers = num_buffers;
+			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
+			num_async_pkts++;
+			iovec_idx += src_it[it_idx].nr_segs;
+			it_idx += 2;
+
+			segs_await += src_it[it_idx].nr_segs;
+
+			/**
+			 * recover shadow used ring and keep DMA-occupied
+			 * descriptors.
+			 */
+			from = vq->shadow_used_idx - num_buffers;
+			to = vq->async_packed_buffer_idx & (vq->size - 1);
+			shadow_ring_store(vq, vq->shadow_used_packed,
+					vq->async_buffers_packed,
+					from, to, num_buffers,
+					sizeof(struct vring_used_elem_packed));
+
+			vq->async_packed_buffer_idx += num_buffers;
+			vq->shadow_used_idx -= num_buffers;
+		} else
+			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
+
+		vq_inc_last_avail_packed(vq, num_desc);
+
+		/*
+		 * conditions to trigger async device transfer:
+		 * - buffered packet number reaches transfer threshold
+		 * - unused async iov number is less than max vhost vector
+		 */
+		if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
+			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await <
+			BUF_VECTOR_MAX))) {
+			n_pkts = vq->async_ops.transfer_data(dev->vid,
+					queue_id, tdes, 0, pkt_burst_idx);
+			iovec_idx = 0;
+			it_idx = 0;
+			segs_await = 0;
+			vq->async_pkts_inflight_n += n_pkts;
+
+			if (unlikely(n_pkts < pkt_burst_idx)) {
+				/*
+				 * log error packets number here and do actual
+				 * error processing when applications poll
+				 * completion
+				 */
+				pkt_err = pkt_burst_idx - n_pkts;
+				pkt_burst_idx = 0;
+				pkt_idx++;
+				break;
+			}
+
+			pkt_burst_idx = 0;
+		}
+	}
+
+	if (pkt_burst_idx) {
+		n_pkts = vq->async_ops.transfer_data(dev->vid,
+				queue_id, tdes, 0, pkt_burst_idx);
+		vq->async_pkts_inflight_n += n_pkts;
+
+		if (unlikely(n_pkts < pkt_burst_idx))
+			pkt_err = pkt_burst_idx - n_pkts;
+	}
+
+	do_data_copy_enqueue(dev, vq);
+
+	if (unlikely(pkt_err)) {
+		uint16_t descs_err = 0;
+		uint16_t buffers_err = 0;
+
+		num_async_pkts -= pkt_err;
+		pkt_idx -= pkt_err;
+	/* calculate the sum of buffers and descs of DMA-error packets. */
+		while (pkt_err-- > 0) {
+			descs_err +=
+				pkts_info[slot_idx & (vq->size - 1)].descs;
+			buffers_err +=
+				pkts_info[slot_idx & (vq->size - 1)].nr_buffers;
+			slot_idx--;
+		}
+
+		vq->async_packed_buffer_idx -= buffers_err;
+
+		if (vq->last_avail_idx >= descs_err) {
+			vq->last_avail_idx -= descs_err;
+
+			rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
+				&async_descs[async_descs_idx - descs_err],
+				descs_err * sizeof(struct vring_packed_desc));
+		} else {
+			uint16_t nr_copy;
+
+			vq->last_avail_idx = vq->last_avail_idx + vq->size
+						- descs_err;
+			nr_copy = vq->size - vq->last_avail_idx;
+			rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
+				&async_descs[async_descs_idx - descs_err],
+				nr_copy * sizeof(struct vring_packed_desc));
+			descs_err -= nr_copy;
+			rte_memcpy(vq->desc_packed,
+				&async_descs[async_descs_idx - descs_err],
+				descs_err * sizeof(struct vring_packed_desc));
+			vq->avail_wrap_counter ^= 1;
+		}
+
+		num_done_pkts = pkt_idx - num_async_pkts;
+	}
+	vq->async_pkts_idx += num_async_pkts;
+	*comp_count = num_done_pkts;
+
+	if (likely(vq->shadow_used_idx)) {
+		vhost_flush_enqueue_shadow_packed(dev, vq);
+		vhost_vring_call_packed(dev, vq);
+	}
+
+	return pkt_idx;
+}
+
 static __rte_always_inline void
 write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
 {
@@ -1693,12 +2048,40 @@ write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
 	} while (nr_left > 0);
 }
 
+static __rte_always_inline void
+write_back_completed_descs_packed(struct vhost_virtqueue *vq,
+				uint16_t n_buffers)
+{
+	uint16_t nr_left = n_buffers;
+	uint16_t from, to;
+
+	do {
+		from = vq->last_async_buffer_idx &
+						(vq->size - 1);
+		to = (from + nr_left) & (vq->size - 1);
+		if (to > from) {
+			vhost_update_used_packed(vq,
+				vq->async_buffers_packed + from,
+				to - from);
+			vq->last_async_buffer_idx += nr_left;
+			nr_left = 0;
+		} else {
+			vhost_update_used_packed(vq,
+				vq->async_buffers_packed + from,
+				vq->size - from);
+			vq->last_async_buffer_idx +=
+						vq->size - from;
+			nr_left -= vq->size - from;
+		}
+	} while (nr_left > 0);
+}
+
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count)
 {
 	struct virtio_net *dev = get_device(vid);
 	struct vhost_virtqueue *vq;
-	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
+	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
 	uint16_t start_idx, pkts_idx, vq_size;
 	struct async_inflight_info *pkts_info;
 	uint16_t from, i;
@@ -1740,21 +2123,41 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		goto done;
 	}
 
-	for (i = 0; i < n_pkts_put; i++) {
-		from = (start_idx + i) & (vq_size - 1);
-		n_descs += pkts_info[from].descs;
-		pkts[i] = pkts_info[from].mbuf;
+	if (vq_is_packed(dev)) {
+		for (i = 0; i < n_pkts_put; i++) {
+			from = (start_idx + i) & (vq_size - 1);
+			n_buffers += pkts_info[from].nr_buffers;
+			pkts[i] = pkts_info[from].mbuf;
+		}
+	} else {
+		for (i = 0; i < n_pkts_put; i++) {
+			from = (start_idx + i) & (vq_size - 1);
+			n_descs += pkts_info[from].descs;
+			pkts[i] = pkts_info[from].mbuf;
+		}
 	}
+
 	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
 	vq->async_pkts_inflight_n -= n_pkts_put;
 
 	if (likely(vq->enabled && vq->access_ok)) {
-		write_back_completed_descs_split(vq, n_descs);
+		if (vq_is_packed(dev)) {
+			write_back_completed_descs_packed(vq, n_buffers);
 
-		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
-		vhost_vring_call_split(dev, vq);
-	} else
-		vq->last_async_desc_idx += n_descs;
+			vhost_vring_call_packed(dev, vq);
+		} else {
+			write_back_completed_descs_split(vq, n_descs);
+
+			__atomic_add_fetch(&vq->used->idx, n_descs,
+					__ATOMIC_RELEASE);
+			vhost_vring_call_split(dev, vq);
+		}
+	} else {
+		if (vq_is_packed(dev))
+			vq->last_async_buffer_idx += n_buffers;
+		else
+			vq->last_async_desc_idx += n_descs;
+	}
 
 done:
 	rte_spinlock_unlock(&vq->access_lock);
@@ -1795,9 +2198,10 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
 	if (count == 0)
 		goto out;
 
-	/* TODO: packed queue not implemented */
 	if (vq_is_packed(dev))
-		nb_tx = 0;
+		nb_tx = virtio_dev_rx_async_submit_packed(dev,
+				vq, queue_id, pkts, count, comp_pkts,
+				comp_count);
 	else
 		nb_tx = virtio_dev_rx_async_submit_split(dev,
 				vq, queue_id, pkts, count, comp_pkts,
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v5 3/4] vhost: add batch datapath for async vhost packed ring
  2021-04-12 11:34 ` [dpdk-dev] [PATCH v5 0/4] add support for packed ring in async vhost Cheng Jiang
  2021-04-12 11:34   ` [dpdk-dev] [PATCH v5 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
  2021-04-12 11:34   ` [dpdk-dev] [PATCH v5 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
@ 2021-04-12 11:34   ` Cheng Jiang
  2021-04-12 11:34   ` [dpdk-dev] [PATCH v5 4/4] doc: add release note for vhost async " Cheng Jiang
  3 siblings, 0 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-12 11:34 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

Add batch datapath for async vhost packed ring to improve the
performance of small packets.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 lib/librte_vhost/virtio_net.c | 43 +++++++++++++++++++++++++++++++----
 1 file changed, 38 insertions(+), 5 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 410be9678..854f7afd6 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1723,6 +1723,29 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	return pkt_idx;
 }
 
+static __rte_always_inline int
+virtio_dev_rx_async_batch_packed(struct virtio_net *dev,
+			   struct vhost_virtqueue *vq,
+			   struct rte_mbuf **pkts,
+			   struct rte_mbuf **comp_pkts, uint32_t *pkt_done)
+{
+	uint16_t i;
+	uint32_t cpy_threshold = vq->async_threshold;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (unlikely(pkts[i]->pkt_len >= cpy_threshold))
+			return -1;
+	}
+	if (!virtio_dev_rx_batch_packed(dev, vq, pkts)) {
+		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+			comp_pkts[(*pkt_done)++] = pkts[i];
+
+		return 0;
+	}
+
+	return -1;
+}
+
 static __rte_always_inline int
 vhost_enqueue_async_single_packed(struct virtio_net *dev,
 			    struct vhost_virtqueue *vq,
@@ -1846,6 +1869,7 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
 {
 	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
+	uint32_t remained = count;
 	uint16_t async_descs_idx = 0;
 	uint16_t num_buffers;
 	uint16_t num_desc;
@@ -1865,9 +1889,17 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 	uint32_t num_async_pkts = 0, num_done_pkts = 0;
 	struct vring_packed_desc async_descs[vq->size];
 
-	rte_prefetch0(&vq->desc_packed[vq->last_avail_idx & (vq->size - 1)]);
-
-	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+	do {
+		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx &
+							(vq->size - 1)]);
+		if (remained >= PACKED_BATCH_SIZE) {
+			if (!virtio_dev_rx_async_batch_packed(dev, vq,
+				&pkts[pkt_idx], comp_pkts, &num_done_pkts)) {
+				pkt_idx += PACKED_BATCH_SIZE;
+				remained -= PACKED_BATCH_SIZE;
+				continue;
+			}
+		}
 		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq,
 						pkts[pkt_idx],
 						&num_desc, &num_buffers,
@@ -1915,6 +1947,8 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 		} else
 			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
 
+		pkt_idx++;
+		remained--;
 		vq_inc_last_avail_packed(vq, num_desc);
 
 		/*
@@ -1940,13 +1974,12 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 				 */
 				pkt_err = pkt_burst_idx - n_pkts;
 				pkt_burst_idx = 0;
-				pkt_idx++;
 				break;
 			}
 
 			pkt_burst_idx = 0;
 		}
-	}
+	} while (pkt_idx < count);
 
 	if (pkt_burst_idx) {
 		n_pkts = vq->async_ops.transfer_data(dev->vid,
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v5 4/4] doc: add release note for vhost async packed ring
  2021-04-12 11:34 ` [dpdk-dev] [PATCH v5 0/4] add support for packed ring in async vhost Cheng Jiang
                     ` (2 preceding siblings ...)
  2021-04-12 11:34   ` [dpdk-dev] [PATCH v5 3/4] vhost: add batch datapath for async vhost packed ring Cheng Jiang
@ 2021-04-12 11:34   ` Cheng Jiang
  3 siblings, 0 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-12 11:34 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

Add release note for the support of vhost async packed ring.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 doc/guides/rel_notes/release_21_05.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/guides/rel_notes/release_21_05.rst b/doc/guides/rel_notes/release_21_05.rst
index 374d6d98e..eb5200669 100644
--- a/doc/guides/rel_notes/release_21_05.rst
+++ b/doc/guides/rel_notes/release_21_05.rst
@@ -131,6 +131,10 @@ New Features
   * Added command to display Rx queue used descriptor count.
     ``show port (port_id) rxq (queue_id) desc used count``
 
+* **Added support for vhost async packed ring data path.**
+
+  Added packed ring support for async vhost.
+
 
 Removed Items
 -------------
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v5 1/4] vhost: abstract and reorganize async split ring code
  2021-04-12 11:34   ` [dpdk-dev] [PATCH v5 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
@ 2021-04-13  2:44     ` Hu, Jiayu
  2021-04-13  3:26       ` Jiang, Cheng1
  2021-04-13  7:11     ` Maxime Coquelin
  1 sibling, 1 reply; 60+ messages in thread
From: Hu, Jiayu @ 2021-04-13  2:44 UTC (permalink / raw)
  To: Jiang, Cheng1, maxime.coquelin, Xia, Chenbo
  Cc: dev, Yang, YvonneX, Wang, Yinan, Liu, Yong

Hi Cheng,

Some comments inline.

> -----Original Message-----
> From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Sent: Monday, April 12, 2021 7:34 PM
> To: maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Yang, YvonneX
> <yvonnex.yang@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Liu,
> Yong <yong.liu@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>
> Subject: [PATCH v5 1/4] vhost: abstract and reorganize async split ring code
> 
> In order to improve code efficiency and readability when async packed
> ring support is enabled. This patch abstract some functions like
> shadow_ring_store and write_back_completed_descs_split. And improve
> the efficiency of some pointer offset calculation.

The grammar of the commit log needs improvement, as there is a typo and
an incomplete sentence.

> 
> Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> ---
>  lib/librte_vhost/virtio_net.c | 146 +++++++++++++++++++---------------
>  1 file changed, 84 insertions(+), 62 deletions(-)
> 
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index ff3987860..c43ab0093 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -1458,6 +1458,29 @@ virtio_dev_rx_async_get_info_idx(uint16_t
> pkts_idx,
>  		(vq_size - n_inflight + pkts_idx) & (vq_size - 1);
>  }
> 
> +static __rte_always_inline void
> +shadow_ring_store(struct vhost_virtqueue *vq,  void *shadow_ring, void
> *d_ring,
> +		uint16_t s_idx, uint16_t d_idx,
> +		uint16_t count, uint16_t elem_size)
> +{
> +	if (d_idx + count <= vq->size) {
> +		rte_memcpy((void *)((uintptr_t)d_ring + d_idx * elem_size),
> +			(void *)((uintptr_t)shadow_ring + s_idx * elem_size),
> +			count * elem_size);
> +	} else {
> +		uint16_t size = vq->size - d_idx;
> +
> +		rte_memcpy((void *)((uintptr_t)d_ring + d_idx * elem_size),
> +			(void *)((uintptr_t)shadow_ring + s_idx * elem_size),
> +			size * elem_size);
> +
> +		rte_memcpy((void *)((uintptr_t)d_ring),
> +			(void *)((uintptr_t)shadow_ring +
> +				(s_idx + size) * elem_size),
> +			(count - size) * elem_size);
> +	}
> +}
> +
>  static __rte_noinline uint32_t
>  virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  	struct vhost_virtqueue *vq, uint16_t queue_id,
> @@ -1478,6 +1501,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net
> *dev,
>  	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
>  	uint16_t slot_idx = 0;
>  	uint16_t segs_await = 0;
> +	uint16_t iovec_idx = 0, it_idx = 0;
>  	struct async_inflight_info *pkts_info = vq->async_pkts_info;
>  	uint32_t n_pkts = 0, pkt_err = 0;
>  	uint32_t num_async_pkts = 0, num_done_pkts = 0;
> @@ -1513,27 +1537,32 @@ virtio_dev_rx_async_submit_split(struct
> virtio_net *dev,
> 
>  		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx],
>  				buf_vec, nr_vec, num_buffers,
> -				src_iovec, dst_iovec, src_it, dst_it) < 0) {
> +				&src_iovec[iovec_idx],
> +				&dst_iovec[iovec_idx],
> +				&src_it[it_idx],
> +				&dst_it[it_idx]) < 0) {

When using an index, it's strange to get the src and dst iov_iter from src_it and dst_it
respectively, as they are not the start addresses of two separate iov_iter arrays but have
overlapping elements. IMO, there is no need to use src/dst_it, as the iterators can simply be
indexed as it_pool[it_idx] and it_pool[it_idx + 1].
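Something like the sketch below would do (untested, just to illustrate the
idea; it only reuses the existing it_pool/it_idx/iovec_idx variables of
virtio_dev_rx_async_submit_split):

	/* take both iterators straight from it_pool, no src_it/dst_it needed */
	if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx],
			buf_vec, nr_vec, num_buffers,
			&src_iovec[iovec_idx], &dst_iovec[iovec_idx],
			&it_pool[it_idx], &it_pool[it_idx + 1]) < 0) {
		vq->shadow_used_idx -= num_buffers;
		break;
	}
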

>  			vq->shadow_used_idx -= num_buffers;
>  			break;
>  		}
> 
>  		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
>  			(vq->size - 1);
> -		if (src_it->count) {
> +		if (src_it[it_idx].count) {
>  			uint16_t from, to;
> 
> -			async_fill_desc(&tdes[pkt_burst_idx++], src_it, dst_it);
> +			async_fill_desc(&tdes[pkt_burst_idx++],
> +				&src_it[it_idx],
> +				&dst_it[it_idx]);
>  			pkts_info[slot_idx].descs = num_buffers;
>  			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
>  			async_pkts_log[num_async_pkts].pkt_idx = pkt_idx;
>  			async_pkts_log[num_async_pkts++].last_avail_idx =
>  				vq->last_avail_idx;
> -			src_iovec += src_it->nr_segs;
> -			dst_iovec += dst_it->nr_segs;
> -			src_it += 2;
> -			dst_it += 2;
> -			segs_await += src_it->nr_segs;
> +
> +			iovec_idx += src_it[it_idx].nr_segs;
> +			it_idx += 2;
> +
> +			segs_await += src_it[it_idx].nr_segs;
> 
>  			/**
>  			 * recover shadow used ring and keep DMA-occupied
> @@ -1541,23 +1570,12 @@ virtio_dev_rx_async_submit_split(struct
> virtio_net *dev,
>  			 */
>  			from = vq->shadow_used_idx - num_buffers;
>  			to = vq->async_desc_idx & (vq->size - 1);
> -			if (num_buffers + to <= vq->size) {
> -				rte_memcpy(&vq->async_descs_split[to],
> -						&vq-
> >shadow_used_split[from],
> -						num_buffers *
> -						sizeof(struct
> vring_used_elem));
> -			} else {
> -				int size = vq->size - to;
> -
> -				rte_memcpy(&vq->async_descs_split[to],
> -						&vq-
> >shadow_used_split[from],
> -						size *
> -						sizeof(struct
> vring_used_elem));
> -				rte_memcpy(vq->async_descs_split,
> -						&vq-
> >shadow_used_split[from +
> -						size], (num_buffers - size) *
> -					   sizeof(struct vring_used_elem));
> -			}
> +
> +			shadow_ring_store(vq, vq->shadow_used_split,
> +					vq->async_descs_split,
> +					from, to, num_buffers,
> +					sizeof(struct vring_used_elem));

This function is used to store the DMA-occupied descriptors, but "shadow_ring_store" is not
a good name for it. In addition, I think there is no need to pass vq as a parameter; what you
need is the size of the shadow ring and of the async desc ring.
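For example, something along these lines (an untested sketch; the name
"store_dma_desc_info" is only a suggestion, and the ring size is passed
explicitly instead of vq):

static __rte_always_inline void
store_dma_desc_info(void *shadow_ring, void *d_ring, uint16_t ring_size,
		uint16_t s_idx, uint16_t d_idx,
		uint16_t count, uint16_t elem_size)
{
	if (d_idx + count <= ring_size) {
		/* the copy does not wrap around the destination ring */
		rte_memcpy((void *)((uintptr_t)d_ring + d_idx * elem_size),
			(void *)((uintptr_t)shadow_ring + s_idx * elem_size),
			count * elem_size);
	} else {
		/* wrap-around: fill the tail of d_ring, then its head */
		uint16_t size = ring_size - d_idx;

		rte_memcpy((void *)((uintptr_t)d_ring + d_idx * elem_size),
			(void *)((uintptr_t)shadow_ring + s_idx * elem_size),
			size * elem_size);
		rte_memcpy(d_ring,
			(void *)((uintptr_t)shadow_ring +
				(s_idx + size) * elem_size),
			(count - size) * elem_size);
	}
}
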

Thanks,
Jiayu
> +
>  			vq->async_desc_idx += num_buffers;
>  			vq->shadow_used_idx -= num_buffers;
>  		} else
> @@ -1575,10 +1593,9 @@ virtio_dev_rx_async_submit_split(struct
> virtio_net *dev,
>  			BUF_VECTOR_MAX))) {
>  			n_pkts = vq->async_ops.transfer_data(dev->vid,
>  					queue_id, tdes, 0, pkt_burst_idx);
> -			src_iovec = vec_pool;
> -			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >>
> 1);
> -			src_it = it_pool;
> -			dst_it = it_pool + 1;
> +			iovec_idx = 0;
> +			it_idx = 0;
> +
>  			segs_await = 0;
>  			vq->async_pkts_inflight_n += n_pkts;
> 
> @@ -1639,6 +1656,43 @@ virtio_dev_rx_async_submit_split(struct
> virtio_net *dev,
>  	return pkt_idx;
>  }
> 
> +static __rte_always_inline void
> +write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t
> n_descs)
> +{
> +	uint16_t nr_left = n_descs;
> +	uint16_t nr_copy;
> +	uint16_t to, from;
> +
> +	do {
> +		from = vq->last_async_desc_idx & (vq->size - 1);
> +		nr_copy = nr_left + from <= vq->size ? nr_left :
> +			vq->size - from;
> +		to = vq->last_used_idx & (vq->size - 1);
> +
> +		if (to + nr_copy <= vq->size) {
> +			rte_memcpy(&vq->used->ring[to],
> +					&vq->async_descs_split[from],
> +					nr_copy *
> +					sizeof(struct vring_used_elem));
> +		} else {
> +			uint16_t size = vq->size - to;
> +
> +			rte_memcpy(&vq->used->ring[to],
> +					&vq->async_descs_split[from],
> +					size *
> +					sizeof(struct vring_used_elem));
> +			rte_memcpy(vq->used->ring,
> +					&vq->async_descs_split[from +
> +					size], (nr_copy - size) *
> +					sizeof(struct vring_used_elem));
> +		}
> +
> +		vq->last_async_desc_idx += nr_copy;
> +		vq->last_used_idx += nr_copy;
> +		nr_left -= nr_copy;
> +	} while (nr_left > 0);
> +}
> +
>  uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
>  		struct rte_mbuf **pkts, uint16_t count)
>  {
> @@ -1695,39 +1749,7 @@ uint16_t rte_vhost_poll_enqueue_completed(int
> vid, uint16_t queue_id,
>  	vq->async_pkts_inflight_n -= n_pkts_put;
> 
>  	if (likely(vq->enabled && vq->access_ok)) {
> -		uint16_t nr_left = n_descs;
> -		uint16_t nr_copy;
> -		uint16_t to;
> -
> -		/* write back completed descriptors to used ring */
> -		do {
> -			from = vq->last_async_desc_idx & (vq->size - 1);
> -			nr_copy = nr_left + from <= vq->size ? nr_left :
> -				vq->size - from;
> -			to = vq->last_used_idx & (vq->size - 1);
> -
> -			if (to + nr_copy <= vq->size) {
> -				rte_memcpy(&vq->used->ring[to],
> -						&vq-
> >async_descs_split[from],
> -						nr_copy *
> -						sizeof(struct
> vring_used_elem));
> -			} else {
> -				uint16_t size = vq->size - to;
> -
> -				rte_memcpy(&vq->used->ring[to],
> -						&vq-
> >async_descs_split[from],
> -						size *
> -						sizeof(struct
> vring_used_elem));
> -				rte_memcpy(vq->used->ring,
> -						&vq->async_descs_split[from
> +
> -						size], (nr_copy - size) *
> -						sizeof(struct
> vring_used_elem));
> -			}
> -
> -			vq->last_async_desc_idx += nr_copy;
> -			vq->last_used_idx += nr_copy;
> -			nr_left -= nr_copy;
> -		} while (nr_left > 0);
> +		write_back_completed_descs_split(vq, n_descs);
> 
>  		__atomic_add_fetch(&vq->used->idx, n_descs,
> __ATOMIC_RELEASE);
>  		vhost_vring_call_split(dev, vq);
> --
> 2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v5 1/4] vhost: abstract and reorganize async split ring code
  2021-04-13  2:44     ` Hu, Jiayu
@ 2021-04-13  3:26       ` Jiang, Cheng1
  0 siblings, 0 replies; 60+ messages in thread
From: Jiang, Cheng1 @ 2021-04-13  3:26 UTC (permalink / raw)
  To: Hu, Jiayu, maxime.coquelin, Xia, Chenbo
  Cc: dev, Yang, YvonneX, Wang, Yinan, Liu, Yong

Hi Jiayu,

> -----Original Message-----
> From: Hu, Jiayu <jiayu.hu@intel.com>
> Sent: Tuesday, April 13, 2021 10:44 AM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>; maxime.coquelin@redhat.com;
> Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Yang, YvonneX <yvonnex.yang@intel.com>; Wang, Yinan
> <yinan.wang@intel.com>; Liu, Yong <yong.liu@intel.com>
> Subject: RE: [PATCH v5 1/4] vhost: abstract and reorganize async split ring
> code
> 
> Hi Cheng,
> 
> Some comments inline.
> 
> > -----Original Message-----
> > From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > Sent: Monday, April 12, 2021 7:34 PM
> > To: maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Yang, YvonneX
> > <yvonnex.yang@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Liu,
> > Yong <yong.liu@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>
> > Subject: [PATCH v5 1/4] vhost: abstract and reorganize async split
> > ring code
> >
> > In order to improve code efficiency and readability when async packed
> > ring support is enabled. This patch abstract some functions like
> > shadow_ring_store and write_back_completed_descs_split. And improve
> > the efficiency of some pointer offset calculation.
> 
> Need to improve grammar for commit log, as there is typo and incomplete
> sentence.
> 

Sure, I'll fix it in the next version.

> >
> > Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> > ---
> >  lib/librte_vhost/virtio_net.c | 146
> > +++++++++++++++++++---------------
> >  1 file changed, 84 insertions(+), 62 deletions(-)
> >
> > diff --git a/lib/librte_vhost/virtio_net.c
> > b/lib/librte_vhost/virtio_net.c index ff3987860..c43ab0093 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -1458,6 +1458,29 @@ virtio_dev_rx_async_get_info_idx(uint16_t
> > pkts_idx,
> >  (vq_size - n_inflight + pkts_idx) & (vq_size - 1);  }
> >
> > +static __rte_always_inline void
> > +shadow_ring_store(struct vhost_virtqueue *vq,  void *shadow_ring,
> > +void
> > *d_ring,
> > +uint16_t s_idx, uint16_t d_idx,
> > +uint16_t count, uint16_t elem_size)
> > +{
> > +if (d_idx + count <= vq->size) {
> > +rte_memcpy((void *)((uintptr_t)d_ring + d_idx * elem_size), (void
> > +*)((uintptr_t)shadow_ring + s_idx * elem_size), count * elem_size); }
> > +else { uint16_t size = vq->size - d_idx;
> > +
> > +rte_memcpy((void *)((uintptr_t)d_ring + d_idx * elem_size), (void
> > +*)((uintptr_t)shadow_ring + s_idx * elem_size), size * elem_size);
> > +
> > +rte_memcpy((void *)((uintptr_t)d_ring), (void
> > +*)((uintptr_t)shadow_ring + (s_idx + size) * elem_size), (count -
> > +size) * elem_size); } }
> > +
> >  static __rte_noinline uint32_t
> >  virtio_dev_rx_async_submit_split(struct virtio_net *dev,  struct
> > vhost_virtqueue *vq, uint16_t queue_id, @@ -1478,6 +1501,7 @@
> > virtio_dev_rx_async_submit_split(struct virtio_net *dev,  struct
> > rte_vhost_iov_iter *dst_it = it_pool + 1;  uint16_t slot_idx = 0;
> > uint16_t segs_await = 0;
> > +uint16_t iovec_idx = 0, it_idx = 0;
> >  struct async_inflight_info *pkts_info = vq->async_pkts_info;
> > uint32_t n_pkts = 0, pkt_err = 0;  uint32_t num_async_pkts = 0,
> > num_done_pkts = 0; @@ -1513,27 +1537,32 @@
> > virtio_dev_rx_async_submit_split(struct
> > virtio_net *dev,
> >
> >  if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx],  buf_vec, nr_vec,
> > num_buffers, -src_iovec, dst_iovec, src_it, dst_it) < 0) {
> > +&src_iovec[iovec_idx],
> > +&dst_iovec[iovec_idx],
> > +&src_it[it_idx],
> > +&dst_it[it_idx]) < 0) {
> 
> When using an index, it's strange to get the src and dst iov_iter from dst_it and src_it
> respectively, as they are not the start addresses of two separate iov_iter arrays
> but have overlapping elements. IMO, there is no need to use src/dst_it, as
> they can be simply indexed by it_pool[it_idx] and it_pool[it_idx+1].

Yes, I think it makes sense. I'll fix it in the next version.
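Something like this, if I understand your suggestion correctly (just a sketch, I will
finalize it in the next version):

	if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx],
			buf_vec, nr_vec, num_buffers,
			&src_iovec[iovec_idx], &dst_iovec[iovec_idx],
			&it_pool[it_idx], &it_pool[it_idx + 1]) < 0) {
		vq->shadow_used_idx -= num_buffers;
		break;
	}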

> 
> >  vq->shadow_used_idx -= num_buffers;
> >  break;
> >  }
> >
> >  slot_idx = (vq->async_pkts_idx + num_async_pkts) &  (vq->size - 1);
> > -if (src_it->count) {
> > +if (src_it[it_idx].count) {
> >  uint16_t from, to;
> >
> > -async_fill_desc(&tdes[pkt_burst_idx++], src_it, dst_it);
> > +async_fill_desc(&tdes[pkt_burst_idx++],
> > +&src_it[it_idx],
> > +&dst_it[it_idx]);
> >  pkts_info[slot_idx].descs = num_buffers;  pkts_info[slot_idx].mbuf =
> > pkts[pkt_idx];  async_pkts_log[num_async_pkts].pkt_idx = pkt_idx;
> > async_pkts_log[num_async_pkts++].last_avail_idx =  vq->last_avail_idx;
> > -src_iovec += src_it->nr_segs; -dst_iovec += dst_it->nr_segs; -src_it
> > += 2; -dst_it += 2; -segs_await += src_it->nr_segs;
> > +
> > +iovec_idx += src_it[it_idx].nr_segs;
> > +it_idx += 2;
> > +
> > +segs_await += src_it[it_idx].nr_segs;
> >
> >  /**
> >   * recover shadow used ring and keep DMA-occupied @@ -1541,23
> > +1570,12 @@ virtio_dev_rx_async_submit_split(struct
> > virtio_net *dev,
> >   */
> >  from = vq->shadow_used_idx - num_buffers;  to = vq->async_desc_idx &
> > (vq->size - 1); -if (num_buffers + to <= vq->size) {
> > -rte_memcpy(&vq->async_descs_split[to],
> > -&vq-
> > >shadow_used_split[from],
> > -num_buffers *
> > -sizeof(struct
> > vring_used_elem));
> > -} else {
> > -int size = vq->size - to;
> > -
> > -rte_memcpy(&vq->async_descs_split[to],
> > -&vq-
> > >shadow_used_split[from],
> > -size *
> > -sizeof(struct
> > vring_used_elem));
> > -rte_memcpy(vq->async_descs_split,
> > -&vq-
> > >shadow_used_split[from +
> > -size], (num_buffers - size) *
> > -   sizeof(struct vring_used_elem));
> > -}
> > +
> > +shadow_ring_store(vq, vq->shadow_used_split,
> > +vq->async_descs_split,
> > +from, to, num_buffers,
> > +sizeof(struct vring_used_elem));
> 
> This function is to store DMA-occupied desc, but " shadow_ring_store" is not
> a good name for it. In addition, I think there is no need to pass vq as a
> parameter. What you need is the size of shadow ring and async desc ring.

Ok, I think we can use the name "store_dma_desc_info()".
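And drop the vq parameter in favor of the destination ring size, roughly like this
(just a sketch, the exact prototype can still be adjusted):

static __rte_always_inline void
store_dma_desc_info(void *s_ring, void *d_ring, uint16_t ring_size,
		uint16_t s_idx, uint16_t d_idx, uint16_t count, uint16_t elem_size)
{
	void *src = (void *)((uintptr_t)s_ring + s_idx * elem_size);
	void *dst = (void *)((uintptr_t)d_ring + d_idx * elem_size);

	if (d_idx + count <= ring_size) {
		rte_memcpy(dst, src, count * elem_size);
	} else {
		/* copy in two parts, wrapping around the end of the async ring */
		uint16_t size = ring_size - d_idx;

		rte_memcpy(dst, src, size * elem_size);
		rte_memcpy(d_ring,
			(void *)((uintptr_t)s_ring + (s_idx + size) * elem_size),
			(count - size) * elem_size);
	}
}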

Thanks a lot.
Cheng

> 
> Thanks,
> Jiayu
> > +
> >  vq->async_desc_idx += num_buffers;
> >  vq->shadow_used_idx -= num_buffers;
> >  } else
> > @@ -1575,10 +1593,9 @@ virtio_dev_rx_async_submit_split(struct
> > virtio_net *dev,
> >  BUF_VECTOR_MAX))) {
> >  n_pkts = vq->async_ops.transfer_data(dev->vid,
> >  queue_id, tdes, 0, pkt_burst_idx);
> > -src_iovec = vec_pool;
> > -dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1); -src_it = it_pool;
> > -dst_it = it_pool + 1;
> > +iovec_idx = 0;
> > +it_idx = 0;
> > +
> >  segs_await = 0;
> >  vq->async_pkts_inflight_n += n_pkts;
> >
> > @@ -1639,6 +1656,43 @@ virtio_dev_rx_async_submit_split(struct
> > virtio_net *dev,
> >  return pkt_idx;
> >  }
> >
> > +static __rte_always_inline void
> > +write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t
> > n_descs)
> > +{
> > +uint16_t nr_left = n_descs;
> > +uint16_t nr_copy;
> > +uint16_t to, from;
> > +
> > +do {
> > +from = vq->last_async_desc_idx & (vq->size - 1); nr_copy = nr_left +
> > +from <= vq->size ? nr_left :
> > +vq->size - from;
> > +to = vq->last_used_idx & (vq->size - 1);
> > +
> > +if (to + nr_copy <= vq->size) {
> > +rte_memcpy(&vq->used->ring[to],
> > +&vq->async_descs_split[from],
> > +nr_copy *
> > +sizeof(struct vring_used_elem));
> > +} else {
> > +uint16_t size = vq->size - to;
> > +
> > +rte_memcpy(&vq->used->ring[to],
> > +&vq->async_descs_split[from],
> > +size *
> > +sizeof(struct vring_used_elem));
> > +rte_memcpy(vq->used->ring,
> > +&vq->async_descs_split[from +
> > +size], (nr_copy - size) *
> > +sizeof(struct vring_used_elem));
> > +}
> > +
> > +vq->last_async_desc_idx += nr_copy;
> > +vq->last_used_idx += nr_copy;
> > +nr_left -= nr_copy;
> > +} while (nr_left > 0);
> > +}
> > +
> >  uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
> > struct rte_mbuf **pkts, uint16_t count)  { @@ -1695,39 +1749,7 @@
> > uint16_t rte_vhost_poll_enqueue_completed(int
> > vid, uint16_t queue_id,
> >  vq->async_pkts_inflight_n -= n_pkts_put;
> >
> >  if (likely(vq->enabled && vq->access_ok)) { -uint16_t nr_left =
> > n_descs; -uint16_t nr_copy; -uint16_t to;
> > -
> > -/* write back completed descriptors to used ring */ -do { -from =
> > vq->last_async_desc_idx & (vq->size - 1); -nr_copy = nr_left + from <=
> > vq->size ? nr_left :
> > -vq->size - from;
> > -to = vq->last_used_idx & (vq->size - 1);
> > -
> > -if (to + nr_copy <= vq->size) {
> > -rte_memcpy(&vq->used->ring[to],
> > -&vq-
> > >async_descs_split[from],
> > -nr_copy *
> > -sizeof(struct
> > vring_used_elem));
> > -} else {
> > -uint16_t size = vq->size - to;
> > -
> > -rte_memcpy(&vq->used->ring[to],
> > -&vq-
> > >async_descs_split[from],
> > -size *
> > -sizeof(struct
> > vring_used_elem));
> > -rte_memcpy(vq->used->ring,
> > -&vq->async_descs_split[from
> > +
> > -size], (nr_copy - size) *
> > -sizeof(struct
> > vring_used_elem));
> > -}
> > -
> > -vq->last_async_desc_idx += nr_copy;
> > -vq->last_used_idx += nr_copy;
> > -nr_left -= nr_copy;
> > -} while (nr_left > 0);
> > +write_back_completed_descs_split(vq, n_descs);
> >
> >  __atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
> > vhost_vring_call_split(dev, vq);
> > --
> > 2.29.2
> 


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v5 1/4] vhost: abstract and reorganize async split ring code
  2021-04-12 11:34   ` [dpdk-dev] [PATCH v5 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
  2021-04-13  2:44     ` Hu, Jiayu
@ 2021-04-13  7:11     ` Maxime Coquelin
  2021-04-13  9:06       ` Jiang, Cheng1
  1 sibling, 1 reply; 60+ messages in thread
From: Maxime Coquelin @ 2021-04-13  7:11 UTC (permalink / raw)
  To: Cheng Jiang, chenbo.xia; +Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu

Hi Cheng,

On 4/12/21 1:34 PM, Cheng Jiang wrote:
> In order to improve code efficiency and readability when async packed
> ring support is enabled. This patch abstract some functions like
> shadow_ring_store and write_back_completed_descs_split. And improve
> the efficiency of some pointer offset calculation.
> 
> Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> ---
>  lib/librte_vhost/virtio_net.c | 146 +++++++++++++++++++---------------
>  1 file changed, 84 insertions(+), 62 deletions(-)
> 
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index ff3987860..c43ab0093 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -1458,6 +1458,29 @@ virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
>  		(vq_size - n_inflight + pkts_idx) & (vq_size - 1);
>  }
>  
> +static __rte_always_inline void
> +shadow_ring_store(struct vhost_virtqueue *vq,  void *shadow_ring, void *d_ring,
> +		uint16_t s_idx, uint16_t d_idx,
> +		uint16_t count, uint16_t elem_size)
> +{
> +	if (d_idx + count <= vq->size) {
> +		rte_memcpy((void *)((uintptr_t)d_ring + d_idx * elem_size),
> +			(void *)((uintptr_t)shadow_ring + s_idx * elem_size),
> +			count * elem_size);
> +	} else {
> +		uint16_t size = vq->size - d_idx;
> +
> +		rte_memcpy((void *)((uintptr_t)d_ring + d_idx * elem_size),
> +			(void *)((uintptr_t)shadow_ring + s_idx * elem_size),
> +			size * elem_size);
> +
> +		rte_memcpy((void *)((uintptr_t)d_ring),
> +			(void *)((uintptr_t)shadow_ring +
> +				(s_idx + size) * elem_size),
> +			(count - size) * elem_size);
> +	}
> +}
> +
>  static __rte_noinline uint32_t
>  virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  	struct vhost_virtqueue *vq, uint16_t queue_id,
> @@ -1478,6 +1501,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
>  	uint16_t slot_idx = 0;
>  	uint16_t segs_await = 0;
> +	uint16_t iovec_idx = 0, it_idx = 0;
>  	struct async_inflight_info *pkts_info = vq->async_pkts_info;
>  	uint32_t n_pkts = 0, pkt_err = 0;
>  	uint32_t num_async_pkts = 0, num_done_pkts = 0;
> @@ -1513,27 +1537,32 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  
>  		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx],
>  				buf_vec, nr_vec, num_buffers,
> -				src_iovec, dst_iovec, src_it, dst_it) < 0) {
> +				&src_iovec[iovec_idx],
> +				&dst_iovec[iovec_idx],
> +				&src_it[it_idx],
> +				&dst_it[it_idx]) < 0) {
>  			vq->shadow_used_idx -= num_buffers;
>  			break;
>  		}
>  
>  		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
>  			(vq->size - 1);
> -		if (src_it->count) {
> +		if (src_it[it_idx].count) {
>  			uint16_t from, to;
>  
> -			async_fill_desc(&tdes[pkt_burst_idx++], src_it, dst_it);
> +			async_fill_desc(&tdes[pkt_burst_idx++],
> +				&src_it[it_idx],
> +				&dst_it[it_idx]);
>  			pkts_info[slot_idx].descs = num_buffers;
>  			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
>  			async_pkts_log[num_async_pkts].pkt_idx = pkt_idx;
>  			async_pkts_log[num_async_pkts++].last_avail_idx =
>  				vq->last_avail_idx;
> -			src_iovec += src_it->nr_segs;
> -			dst_iovec += dst_it->nr_segs;
> -			src_it += 2;
> -			dst_it += 2;
> -			segs_await += src_it->nr_segs;
> +
> +			iovec_idx += src_it[it_idx].nr_segs;
> +			it_idx += 2;
> +
> +			segs_await += src_it[it_idx].nr_segs;
>  
>  			/**
>  			 * recover shadow used ring and keep DMA-occupied
> @@ -1541,23 +1570,12 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  			 */
>  			from = vq->shadow_used_idx - num_buffers;
>  			to = vq->async_desc_idx & (vq->size - 1);
> -			if (num_buffers + to <= vq->size) {
> -				rte_memcpy(&vq->async_descs_split[to],
> -						&vq->shadow_used_split[from],
> -						num_buffers *
> -						sizeof(struct vring_used_elem));
> -			} else {
> -				int size = vq->size - to;
> -
> -				rte_memcpy(&vq->async_descs_split[to],
> -						&vq->shadow_used_split[from],
> -						size *
> -						sizeof(struct vring_used_elem));
> -				rte_memcpy(vq->async_descs_split,
> -						&vq->shadow_used_split[from +
> -						size], (num_buffers - size) *
> -					   sizeof(struct vring_used_elem));
> -			}
> +
> +			shadow_ring_store(vq, vq->shadow_used_split,
> +					vq->async_descs_split,
> +					from, to, num_buffers,
> +					sizeof(struct vring_used_elem));
> +

I'm not convinced by this rework.

I think it is good to create a dedicated function for this to simplify
this huge virtio_dev_rx_async_submit_split() function. But we should
have a dedicated version for split ring. Having a single function for
both split and packed ring does not improve readability, and is unlikely
to improve performance.
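For the split ring path, a dedicated helper could be as simple as something like
this (only a sketch to illustrate what I mean, the naming is up to you):

static __rte_always_inline void
store_dma_desc_info_split(struct vring_used_elem *s_ring,
		struct vring_used_elem *d_ring, uint16_t ring_size,
		uint16_t s_idx, uint16_t d_idx, uint16_t count)
{
	size_t elem_size = sizeof(struct vring_used_elem);

	if (d_idx + count <= ring_size) {
		rte_memcpy(&d_ring[d_idx], &s_ring[s_idx], count * elem_size);
	} else {
		/* wrap around the end of the async desc ring */
		uint16_t size = ring_size - d_idx;

		rte_memcpy(&d_ring[d_idx], &s_ring[s_idx], size * elem_size);
		rte_memcpy(&d_ring[0], &s_ring[s_idx + size],
				(count - size) * elem_size);
	}
}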

>  			vq->async_desc_idx += num_buffers;
>  			vq->shadow_used_idx -= num_buffers;
>  		} else
> @@ -1575,10 +1593,9 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  			BUF_VECTOR_MAX))) {
>  			n_pkts = vq->async_ops.transfer_data(dev->vid,
>  					queue_id, tdes, 0, pkt_burst_idx);
> -			src_iovec = vec_pool;
> -			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
> -			src_it = it_pool;
> -			dst_it = it_pool + 1;
> +			iovec_idx = 0;
> +			it_idx = 0;
> +
>  			segs_await = 0;
>  			vq->async_pkts_inflight_n += n_pkts;
>  
> @@ -1639,6 +1656,43 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  	return pkt_idx;
>  }
>  
> +static __rte_always_inline void
> +write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
> +{
> +	uint16_t nr_left = n_descs;
> +	uint16_t nr_copy;
> +	uint16_t to, from;
> +
> +	do {
> +		from = vq->last_async_desc_idx & (vq->size - 1);
> +		nr_copy = nr_left + from <= vq->size ? nr_left :
> +			vq->size - from;
> +		to = vq->last_used_idx & (vq->size - 1);
> +
> +		if (to + nr_copy <= vq->size) {
> +			rte_memcpy(&vq->used->ring[to],
> +					&vq->async_descs_split[from],
> +					nr_copy *
> +					sizeof(struct vring_used_elem));
> +		} else {
> +			uint16_t size = vq->size - to;
> +
> +			rte_memcpy(&vq->used->ring[to],
> +					&vq->async_descs_split[from],
> +					size *
> +					sizeof(struct vring_used_elem));
> +			rte_memcpy(vq->used->ring,
&vq->used->ring[0] for consistency
> +					&vq->async_descs_split[from +
> +					size], (nr_copy - size) *
> +					sizeof(struct vring_used_elem));

Lines can now be up to 100 chars.
Please take the opportunity to indent properly so that parts of different
arguments do not end up on the same line. It will help readability.
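For example, with 100 chars something like this fits (formatting illustration only):

			rte_memcpy(&vq->used->ring[0], &vq->async_descs_split[from + size],
					(nr_copy - size) * sizeof(struct vring_used_elem));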

> +		}
> +
> +		vq->last_async_desc_idx += nr_copy;
> +		vq->last_used_idx += nr_copy;
> +		nr_left -= nr_copy;
> +	} while (nr_left > 0);
> +}
> +
>  uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
>  		struct rte_mbuf **pkts, uint16_t count)
>  {
> @@ -1695,39 +1749,7 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
>  	vq->async_pkts_inflight_n -= n_pkts_put;
>  
>  	if (likely(vq->enabled && vq->access_ok)) {
> -		uint16_t nr_left = n_descs;
> -		uint16_t nr_copy;
> -		uint16_t to;
> -
> -		/* write back completed descriptors to used ring */
> -		do {
> -			from = vq->last_async_desc_idx & (vq->size - 1);
> -			nr_copy = nr_left + from <= vq->size ? nr_left :
> -				vq->size - from;
> -			to = vq->last_used_idx & (vq->size - 1);
> -
> -			if (to + nr_copy <= vq->size) {
> -				rte_memcpy(&vq->used->ring[to],
> -						&vq->async_descs_split[from],
> -						nr_copy *
> -						sizeof(struct vring_used_elem));
> -			} else {
> -				uint16_t size = vq->size - to;
> -
> -				rte_memcpy(&vq->used->ring[to],
> -						&vq->async_descs_split[from],
> -						size *
> -						sizeof(struct vring_used_elem));
> -				rte_memcpy(vq->used->ring,
> -						&vq->async_descs_split[from +
> -						size], (nr_copy - size) *
> -						sizeof(struct vring_used_elem));
> -			}
> -
> -			vq->last_async_desc_idx += nr_copy;
> -			vq->last_used_idx += nr_copy;
> -			nr_left -= nr_copy;
> -		} while (nr_left > 0);
> +		write_back_completed_descs_split(vq, n_descs);
>  
>  		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
>  		vhost_vring_call_split(dev, vq);
> 


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v5 2/4] vhost: add support for packed ring in async vhost
  2021-04-12 11:34   ` [dpdk-dev] [PATCH v5 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
@ 2021-04-13  8:36     ` Maxime Coquelin
  2021-04-13 11:48       ` Jiang, Cheng1
  0 siblings, 1 reply; 60+ messages in thread
From: Maxime Coquelin @ 2021-04-13  8:36 UTC (permalink / raw)
  To: Cheng Jiang, chenbo.xia; +Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu



On 4/12/21 1:34 PM, Cheng Jiang wrote:
> For now async vhost data path only supports split ring structure. In
> order to make async vhost compatible with virtio 1.1 spec this patch
> enables packed ring in async vhost data path.
> 
> Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> ---
>  lib/librte_vhost/rte_vhost_async.h |   1 +
>  lib/librte_vhost/vhost.c           |  27 +-
>  lib/librte_vhost/vhost.h           |   7 +-
>  lib/librte_vhost/virtio_net.c      | 438 +++++++++++++++++++++++++++--
>  4 files changed, 448 insertions(+), 25 deletions(-)
> 
> diff --git a/lib/librte_vhost/rte_vhost_async.h b/lib/librte_vhost/rte_vhost_async.h
> index c855ff875..6faa31f5a 100644
> --- a/lib/librte_vhost/rte_vhost_async.h
> +++ b/lib/librte_vhost/rte_vhost_async.h
> @@ -89,6 +89,7 @@ struct rte_vhost_async_channel_ops {
>  struct async_inflight_info {
>  	struct rte_mbuf *mbuf;
>  	uint16_t descs; /* num of descs inflight */
> +	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
>  };
>  
>  /**
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index a70fe01d8..8c9935c0f 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -342,15 +342,21 @@ vhost_free_async_mem(struct vhost_virtqueue *vq)
>  {
>  	if (vq->async_pkts_info)
>  		rte_free(vq->async_pkts_info);
> -	if (vq->async_descs_split)
> +	if (vq->async_buffers_packed) {
> +		rte_free(vq->async_buffers_packed);
> +		vq->async_buffers_packed = NULL;
> +	}
> +	if (vq->async_descs_split) {

You can remove the check, rte_free is safe with NULL pointers.
You can do the same for the other ones in this function.
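i.e. something like (just a sketch):

static void
vhost_free_async_mem(struct vhost_virtqueue *vq)
{
	rte_free(vq->async_pkts_info);

	/*
	 * async_buffers_packed and async_descs_split share a union, so free
	 * one member and clear it first; the second rte_free() is then a
	 * harmless rte_free(NULL).
	 */
	rte_free(vq->async_buffers_packed);
	vq->async_buffers_packed = NULL;
	rte_free(vq->async_descs_split);
	vq->async_descs_split = NULL;

	rte_free(vq->it_pool);
	rte_free(vq->vec_pool);

	vq->async_pkts_info = NULL;
	vq->it_pool = NULL;
	vq->vec_pool = NULL;
}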

>  		rte_free(vq->async_descs_split);
> +		vq->async_descs_split = NULL;
> +	}
> +
>  	if (vq->it_pool)
>  		rte_free(vq->it_pool);
>  	if (vq->vec_pool)
>  		rte_free(vq->vec_pool);
>  
>  	vq->async_pkts_info = NULL;
> -	vq->async_descs_split = NULL;
>  	vq->it_pool = NULL;
>  	vq->vec_pool = NULL;
>  }
> @@ -1627,9 +1633,9 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
>  		return -1;
>  
>  	/* packed queue is not supported */
> -	if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
> +	if (unlikely(!f.async_inorder)) {
>  		VHOST_LOG_CONFIG(ERR,
> -			"async copy is not supported on packed queue or non-inorder mode "
> +			"async copy is not supported on non-inorder mode "
>  			"(vid %d, qid: %d)\n", vid, queue_id);
>  		return -1;
>  	}
> @@ -1667,11 +1673,18 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
>  	vq->vec_pool = rte_malloc_socket(NULL,
>  			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
>  			RTE_CACHE_LINE_SIZE, node);
> -	vq->async_descs_split = rte_malloc_socket(NULL,
> +	if (vq_is_packed(dev)) {
> +		vq->async_buffers_packed = rte_malloc_socket(NULL,
> +			vq->size * sizeof(struct vring_used_elem_packed),
> +			RTE_CACHE_LINE_SIZE, node);
> +	} else {
> +		vq->async_descs_split = rte_malloc_socket(NULL,
>  			vq->size * sizeof(struct vring_used_elem),
>  			RTE_CACHE_LINE_SIZE, node);
> -	if (!vq->async_descs_split || !vq->async_pkts_info ||
> -		!vq->it_pool || !vq->vec_pool) {
> +	}
> +
> +	if (!vq->async_buffers_packed || !vq->async_descs_split ||
> +		!vq->async_pkts_info || !vq->it_pool || !vq->vec_pool) {
>  		vhost_free_async_mem(vq);
>  		VHOST_LOG_CONFIG(ERR,
>  				"async register failed: cannot allocate memory for vq data "
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index f628714c2..fe131ae8f 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -201,9 +201,14 @@ struct vhost_virtqueue {
>  	uint16_t	async_pkts_idx;
>  	uint16_t	async_pkts_inflight_n;
>  	uint16_t	async_last_pkts_n;
> -	struct vring_used_elem  *async_descs_split;
> +	union {
> +		struct vring_used_elem  *async_descs_split;
> +		struct vring_used_elem_packed *async_buffers_packed;
> +	};
>  	uint16_t async_desc_idx;
> +	uint16_t async_packed_buffer_idx;

Don't duplicate variable names; async_desc_idx can be reused for the packed
ring. Also, they represent the same thing, so why use desc in one
case and buffer in the other?

>  	uint16_t last_async_desc_idx;
> +	uint16_t last_async_buffer_idx;

Same remark here.

>  	/* vq async features */
>  	bool		async_inorder;
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index c43ab0093..410be9678 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -363,14 +363,14 @@ vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
>  }
>  
>  static __rte_always_inline void
> -vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
> -				   struct vhost_virtqueue *vq,
> -				   uint32_t len[],
> -				   uint16_t id[],
> -				   uint16_t count[],
> +vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
> +				   uint32_t *len,
> +				   uint16_t *id,
> +				   uint16_t *count,
>  				   uint16_t num_buffers)
>  {
>  	uint16_t i;
> +
>  	for (i = 0; i < num_buffers; i++) {
>  		/* enqueue shadow flush action aligned with batch num */
>  		if (!vq->shadow_used_idx)
> @@ -382,6 +382,17 @@ vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
>  		vq->shadow_aligned_idx += count[i];
>  		vq->shadow_used_idx++;
>  	}
> +}
> +
> +static __rte_always_inline void
> +vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
> +				   struct vhost_virtqueue *vq,
> +				   uint32_t *len,
> +				   uint16_t *id,
> +				   uint16_t *count,
> +				   uint16_t num_buffers)
> +{
> +	vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
>  
>  	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
>  		do_data_copy_enqueue(dev, vq);
> @@ -1481,6 +1492,62 @@ shadow_ring_store(struct vhost_virtqueue *vq,  void *shadow_ring, void *d_ring,
>  	}
>  }
>  
> +static __rte_always_inline void
> +vhost_update_used_packed(struct vhost_virtqueue *vq,
> +			struct vring_used_elem_packed *shadow_ring,
> +			uint16_t count)
> +{
> +	if (count == 0)
> +		return;

Move this after the variable declarations.

> +
> +	int i;
> +	uint16_t used_idx = vq->last_used_idx;
> +	uint16_t head_idx = vq->last_used_idx;
> +	uint16_t head_flags = 0;
> +
> +	/* Split loop in two to save memory barriers */
> +	for (i = 0; i < count; i++) {
> +		vq->desc_packed[used_idx].id = shadow_ring[i].id;
> +		vq->desc_packed[used_idx].len = shadow_ring[i].len;
> +
> +		used_idx += shadow_ring[i].count;
> +		if (used_idx >= vq->size)
> +			used_idx -= vq->size;
> +	}
> +
> +	/* The ordering for storing desc flags needs to be enforced. */
> +	rte_atomic_thread_fence(__ATOMIC_RELEASE);
> +
> +	for (i = 0; i < count; i++) {
> +		uint16_t flags;
> +
> +		if (vq->shadow_used_packed[i].len)
> +			flags = VRING_DESC_F_WRITE;
> +		else
> +			flags = 0;
> +
> +		if (vq->used_wrap_counter) {
> +			flags |= VRING_DESC_F_USED;
> +			flags |= VRING_DESC_F_AVAIL;
> +		} else {
> +			flags &= ~VRING_DESC_F_USED;
> +			flags &= ~VRING_DESC_F_AVAIL;
> +		}
> +
> +		if (i > 0) {
> +			vq->desc_packed[vq->last_used_idx].flags = flags;
> +
> +		} else {
> +			head_idx = vq->last_used_idx;
> +			head_flags = flags;
> +		}
> +
> +		vq_inc_last_used_packed(vq, shadow_ring[i].count);
> +	}
> +
> +	vq->desc_packed[head_idx].flags = head_flags;
> +}
> +
>  static __rte_noinline uint32_t
>  virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  	struct vhost_virtqueue *vq, uint16_t queue_id,
> @@ -1656,6 +1723,294 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  	return pkt_idx;
>  }
>  
> +static __rte_always_inline int
> +vhost_enqueue_async_single_packed(struct virtio_net *dev,
> +			    struct vhost_virtqueue *vq,
> +			    struct rte_mbuf *pkt,
> +			    struct buf_vector *buf_vec,
> +			    uint16_t *nr_descs,
> +			    uint16_t *nr_buffers,
> +			    struct vring_packed_desc *async_descs,
> +			    struct iovec *src_iovec, struct iovec *dst_iovec,
> +			    struct rte_vhost_iov_iter *src_it,
> +			    struct rte_vhost_iov_iter *dst_it)
> +{
> +	uint16_t nr_vec = 0;
> +	uint16_t avail_idx = vq->last_avail_idx;
> +	uint16_t max_tries, tries = 0;
> +	uint16_t buf_id = 0;
> +	uint32_t len = 0;
> +	uint16_t desc_count = 0;
> +	uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
> +	uint32_t buffer_len[vq->size];
> +	uint16_t buffer_buf_id[vq->size];
> +	uint16_t buffer_desc_count[vq->size];
> +	*nr_buffers = 0;
> +
> +	if (rxvq_is_mergeable(dev))
> +		max_tries = vq->size - 1;
> +	else
> +		max_tries = 1;
> +
> +	while (size > 0) {
> +		/*
> +		 * if we tried all available ring items, and still
> +		 * can't get enough buf, it means something abnormal
> +		 * happened.
> +		 */
> +		if (unlikely(++tries > max_tries))
> +			return -1;
> +
> +		if (unlikely(fill_vec_buf_packed(dev, vq,
> +						avail_idx, &desc_count,
> +						buf_vec, &nr_vec,
> +						&buf_id, &len,
> +						VHOST_ACCESS_RW) < 0))
> +			return -1;
> +
> +		len = RTE_MIN(len, size);
> +		size -= len;
> +
> +		buffer_len[*nr_buffers] = len;
> +		buffer_buf_id[*nr_buffers] = buf_id;
> +		buffer_desc_count[*nr_buffers] = desc_count;
> +		*nr_buffers += 1;
> +
> +		*nr_descs += desc_count;
> +		avail_idx += desc_count;
> +		if (avail_idx >= vq->size)
> +			avail_idx -= vq->size;
> +	}
> +
> +	if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers,
> +		src_iovec, dst_iovec, src_it, dst_it) < 0)
> +		return -1;
> +	/* store descriptors for DMA */
> +	if (avail_idx >= *nr_descs)
> +		rte_memcpy(async_descs,
> +			&vq->desc_packed[vq->last_avail_idx],
> +			*nr_descs * sizeof(struct vring_packed_desc));

Please add brackets for the 'if' since there are brackets for the 'else'.

> +	else {
> +		uint16_t nr_copy = vq->size - vq->last_avail_idx;
> +		rte_memcpy(async_descs,
> +			&vq->desc_packed[vq->last_avail_idx],
> +			nr_copy * sizeof(struct vring_packed_desc));
> +		rte_memcpy(async_descs + nr_copy,
> +			vq->desc_packed, (*nr_descs - nr_copy) *
> +			sizeof(struct vring_packed_desc));
> +	}
> +
> +	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id,
> +					   buffer_desc_count, *nr_buffers);
> +
> +	return 0;
> +}
> +
> +static __rte_always_inline int16_t
> +virtio_dev_rx_async_single_packed(struct virtio_net *dev,
> +			    struct vhost_virtqueue *vq,
> +			    struct rte_mbuf *pkt,
> +			    uint16_t *nr_descs, uint16_t *nr_buffers,
> +			    struct vring_packed_desc *async_descs,
> +			    struct iovec *src_iovec, struct iovec *dst_iovec,
> +			    struct rte_vhost_iov_iter *src_it,
> +			    struct rte_vhost_iov_iter *dst_it)
> +{
> +	struct buf_vector buf_vec[BUF_VECTOR_MAX];
> +	*nr_descs = 0;
> +	*nr_buffers = 0;
> +
> +	if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt, buf_vec,
> +						 nr_descs,
> +						 nr_buffers,
> +						 async_descs,
> +						 src_iovec, dst_iovec,
> +						 src_it, dst_it) < 0)) {
> +		VHOST_LOG_DATA(DEBUG,
> +				"(%d) failed to get enough desc from vring\n",
> +				dev->vid);
> +		return -1;
> +	}
> +
> +	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
> +			dev->vid, vq->last_avail_idx,
> +			vq->last_avail_idx + *nr_descs);
> +
> +	return 0;
> +}
> +
> +static __rte_noinline uint32_t
> +virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
> +	struct vhost_virtqueue *vq, uint16_t queue_id,
> +	struct rte_mbuf **pkts, uint32_t count,
> +	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
> +{
> +	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
> +	uint16_t async_descs_idx = 0;
> +	uint16_t num_buffers;
> +	uint16_t num_desc;
> +
> +	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
> +	struct iovec *vec_pool = vq->vec_pool;
> +	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
> +	struct iovec *src_iovec = vec_pool;
> +	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
> +	struct rte_vhost_iov_iter *src_it = it_pool;
> +	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
> +	uint16_t slot_idx = 0;
> +	uint16_t segs_await = 0;
> +	uint16_t iovec_idx = 0, it_idx = 0;
> +	struct async_inflight_info *pkts_info = vq->async_pkts_info;
> +	uint32_t n_pkts = 0, pkt_err = 0;
> +	uint32_t num_async_pkts = 0, num_done_pkts = 0;
> +	struct vring_packed_desc async_descs[vq->size];
> +
> +	rte_prefetch0(&vq->desc_packed[vq->last_avail_idx & (vq->size - 1)]);

The size of the ring is not necessarily a power of two with packed ring.
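As an illustration only (this helper is not part of the patch), an explicit
compare-and-subtract works for any ring size, as long as the index is only
advanced by less than the ring size between wraps, which is what
vq_inc_last_avail_packed()/vq_inc_last_used_packed() already do. For
last_avail_idx itself no mask is needed at all, since it is always kept below
vq->size:

static __rte_always_inline uint16_t
wrap_packed_ring_idx(uint16_t idx, uint16_t ring_size)
{
	return idx >= ring_size ? idx - ring_size : idx;
}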

> +
> +	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
> +		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq,
> +						pkts[pkt_idx],
> +						&num_desc, &num_buffers,
> +						&async_descs[async_descs_idx],
> +						&src_iovec[iovec_idx],
> +						&dst_iovec[iovec_idx],
> +						&src_it[it_idx],
> +						&dst_it[it_idx]) < 0))
> +			break;
> +
> +		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
> +			dev->vid, vq->last_avail_idx,
> +			vq->last_avail_idx + num_desc);
> +
> +		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
> +			(vq->size - 1);

Same here.

> +		if (src_it[it_idx].count) {
> +			uint16_t from, to;
> +
> +			async_descs_idx += num_desc;
> +			async_fill_desc(&tdes[pkt_burst_idx++], &src_it[it_idx],
> +					&dst_it[it_idx]);
> +			pkts_info[slot_idx].descs = num_desc;
> +			pkts_info[slot_idx].nr_buffers = num_buffers;
> +			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
> +			num_async_pkts++;
> +			iovec_idx += src_it[it_idx].nr_segs;
> +			it_idx += 2;
> +
> +			segs_await += src_it[it_idx].nr_segs;
> +
> +			/**
> +			 * recover shadow used ring and keep DMA-occupied
> +			 * descriptors.
> +			 */
> +			from = vq->shadow_used_idx - num_buffers;
> +			to = vq->async_packed_buffer_idx & (vq->size - 1);
> +			shadow_ring_store(vq, vq->shadow_used_packed,
> +					vq->async_buffers_packed,
> +					from, to, num_buffers,
> +					sizeof(struct vring_used_elem_packed));
> +
> +			vq->async_packed_buffer_idx += num_buffers;
> +			vq->shadow_used_idx -= num_buffers;
> +		} else

Brackets needed.

> +			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
> +
> +		vq_inc_last_avail_packed(vq, num_desc);
> +
> +		/*
> +		 * conditions to trigger async device transfer:
> +		 * - buffered packet number reaches transfer threshold
> +		 * - unused async iov number is less than max vhost vector
> +		 */
> +		if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
> +			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await <
> +			BUF_VECTOR_MAX))) {
> +			n_pkts = vq->async_ops.transfer_data(dev->vid,
> +					queue_id, tdes, 0, pkt_burst_idx);
> +			iovec_idx = 0;
> +			it_idx = 0;
> +			segs_await = 0;
> +			vq->async_pkts_inflight_n += n_pkts;
> +
> +			if (unlikely(n_pkts < pkt_burst_idx)) {
> +				/*
> +				 * log error packets number here and do actual
> +				 * error processing when applications poll
> +				 * completion
> +				 */
> +				pkt_err = pkt_burst_idx - n_pkts;
> +				pkt_burst_idx = 0;
> +				pkt_idx++;
> +				break;
> +			}
> +
> +			pkt_burst_idx = 0;
> +		}
> +	}
> +
> +	if (pkt_burst_idx) {
> +		n_pkts = vq->async_ops.transfer_data(dev->vid,
> +				queue_id, tdes, 0, pkt_burst_idx);
> +		vq->async_pkts_inflight_n += n_pkts;
> +
> +		if (unlikely(n_pkts < pkt_burst_idx))
> +			pkt_err = pkt_burst_idx - n_pkts;
> +	}
> +
> +	do_data_copy_enqueue(dev, vq);
> +
> +	if (unlikely(pkt_err)) {
> +		uint16_t descs_err = 0;
> +		uint16_t buffers_err = 0;
> +
> +		num_async_pkts -= pkt_err;
> +		pkt_idx -= pkt_err;
> +	/* calculate the sum of buffers and descs of DMA-error packets. */
> +		while (pkt_err-- > 0) {
> +			descs_err +=
> +				pkts_info[slot_idx & (vq->size - 1)].descs;

The size of the ring is not necessarily a power of two with packed ring.

> +			buffers_err +=
> +				pkts_info[slot_idx & (vq->size - 1)].nr_buffers;

Ditto.

> +			slot_idx--;
> +		}
> +
> +		vq->async_packed_buffer_idx -= buffers_err;
> +
> +		if (vq->last_avail_idx >= descs_err) {
> +			vq->last_avail_idx -= descs_err;
> +
> +			rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
> +				&async_descs[async_descs_idx - descs_err],
> +				descs_err * sizeof(struct vring_packed_desc));
> +		} else {
> +			uint16_t nr_copy;
> +
> +			vq->last_avail_idx = vq->last_avail_idx + vq->size
> +						- descs_err;
> +			nr_copy = vq->size - vq->last_avail_idx;
> +			rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
> +				&async_descs[async_descs_idx - descs_err],
> +				nr_copy * sizeof(struct vring_packed_desc));
> +			descs_err -= nr_copy;
> +			rte_memcpy(vq->desc_packed,
> +				&async_descs[async_descs_idx - descs_err],
> +				descs_err * sizeof(struct vring_packed_desc));
> +			vq->avail_wrap_counter ^= 1;
> +		}
> +
> +		num_done_pkts = pkt_idx - num_async_pkts;
> +	}

This error handling could be moved into a dedicated function.
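Something like the below, for instance (only a sketch of the split-out; the name
and exact parameter list are up to you, and it keeps the existing
"& (vq->size - 1)" masking, which has the power-of-two issue mentioned above):

static __rte_always_inline void
dma_error_handler_packed(struct vhost_virtqueue *vq,
		struct vring_packed_desc *async_descs, uint16_t async_descs_idx,
		uint16_t slot_idx, uint32_t nr_err, uint32_t *pkt_idx,
		uint32_t *num_async_pkts, uint32_t *num_done_pkts)
{
	uint16_t descs_err = 0;
	uint16_t buffers_err = 0;
	struct async_inflight_info *pkts_info = vq->async_pkts_info;

	*num_async_pkts -= nr_err;
	*pkt_idx -= nr_err;
	/* calculate the sum of descs and buffers of the DMA-error packets */
	while (nr_err-- > 0) {
		descs_err += pkts_info[slot_idx & (vq->size - 1)].descs;
		buffers_err += pkts_info[slot_idx & (vq->size - 1)].nr_buffers;
		slot_idx--;
	}

	vq->async_packed_buffer_idx -= buffers_err;

	if (vq->last_avail_idx >= descs_err) {
		vq->last_avail_idx -= descs_err;
		rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
			&async_descs[async_descs_idx - descs_err],
			descs_err * sizeof(struct vring_packed_desc));
	} else {
		uint16_t nr_copy;

		vq->last_avail_idx = vq->last_avail_idx + vq->size - descs_err;
		nr_copy = vq->size - vq->last_avail_idx;
		rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
			&async_descs[async_descs_idx - descs_err],
			nr_copy * sizeof(struct vring_packed_desc));
		descs_err -= nr_copy;
		rte_memcpy(vq->desc_packed,
			&async_descs[async_descs_idx - descs_err],
			descs_err * sizeof(struct vring_packed_desc));
		vq->avail_wrap_counter ^= 1;
	}

	*num_done_pkts = *pkt_idx - *num_async_pkts;
}

The caller then shrinks to:

	if (unlikely(pkt_err))
		dma_error_handler_packed(vq, async_descs, async_descs_idx,
				slot_idx, pkt_err, &pkt_idx,
				&num_async_pkts, &num_done_pkts);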

> +	vq->async_pkts_idx += num_async_pkts;
> +	*comp_count = num_done_pkts;
> +
> +	if (likely(vq->shadow_used_idx)) {
> +		vhost_flush_enqueue_shadow_packed(dev, vq);
> +		vhost_vring_call_packed(dev, vq);
> +	}
> +
> +	return pkt_idx;
> +}

The above function is very big and complex; it should be possible to split
it into several smaller ones to make it maintainable.

> +
>  static __rte_always_inline void
>  write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
>  {
> @@ -1693,12 +2048,40 @@ write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
>  	} while (nr_left > 0);
>  }
>  
> +static __rte_always_inline void
> +write_back_completed_descs_packed(struct vhost_virtqueue *vq,
> +				uint16_t n_buffers)
> +{
> +	uint16_t nr_left = n_buffers;
> +	uint16_t from, to;
> +
> +	do {
> +		from = vq->last_async_buffer_idx &
> +						(vq->size - 1);
> +		to = (from + nr_left) & (vq->size - 1);

The size of the ring is not necessarily a power of two with packed ring.

> +		if (to > from) {
> +			vhost_update_used_packed(vq,
> +				vq->async_buffers_packed + from,
> +				to - from);
> +			vq->last_async_buffer_idx += nr_left;
> +			nr_left = 0;
> +		} else {
> +			vhost_update_used_packed(vq,
> +				vq->async_buffers_packed + from,
> +				vq->size - from);
> +			vq->last_async_buffer_idx +=
> +						vq->size - from;
> +			nr_left -= vq->size - from;
> +		}
> +	} while (nr_left > 0);
> +}
> +
>  uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
>  		struct rte_mbuf **pkts, uint16_t count)
>  {
>  	struct virtio_net *dev = get_device(vid);
>  	struct vhost_virtqueue *vq;
> -	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
> +	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
>  	uint16_t start_idx, pkts_idx, vq_size;
>  	struct async_inflight_info *pkts_info;
>  	uint16_t from, i;
> @@ -1740,21 +2123,41 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
>  		goto done;
>  	}
>  
> -	for (i = 0; i < n_pkts_put; i++) {
> -		from = (start_idx + i) & (vq_size - 1);
> -		n_descs += pkts_info[from].descs;
> -		pkts[i] = pkts_info[from].mbuf;
> +	if (vq_is_packed(dev)) {
> +		for (i = 0; i < n_pkts_put; i++) {
> +			from = (start_idx + i) & (vq_size - 1);

Unlike split ring, packed ring size is not necessarily a power of 2.

> +			n_buffers += pkts_info[from].nr_buffers;
> +			pkts[i] = pkts_info[from].mbuf;
> +		}
> +	} else {
> +		for (i = 0; i < n_pkts_put; i++) {
> +			from = (start_idx + i) & (vq_size - 1);
> +			n_descs += pkts_info[from].descs;
> +			pkts[i] = pkts_info[from].mbuf;
> +		}
>  	}
> +
>  	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
>  	vq->async_pkts_inflight_n -= n_pkts_put;
>  
>  	if (likely(vq->enabled && vq->access_ok)) {
> -		write_back_completed_descs_split(vq, n_descs);
> +		if (vq_is_packed(dev)) {
> +			write_back_completed_descs_packed(vq, n_buffers);
>  
> -		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
> -		vhost_vring_call_split(dev, vq);
> -	} else
> -		vq->last_async_desc_idx += n_descs;
> +			vhost_vring_call_packed(dev, vq);
> +		} else {
> +			write_back_completed_descs_split(vq, n_descs);
> +
> +			__atomic_add_fetch(&vq->used->idx, n_descs,
> +					__ATOMIC_RELEASE);
> +			vhost_vring_call_split(dev, vq);
> +		}
> +	} else {
> +		if (vq_is_packed(dev))
> +			vq->last_async_buffer_idx += n_buffers;
> +		else
> +			vq->last_async_desc_idx += n_descs;
> +	}
>  
>  done:
>  	rte_spinlock_unlock(&vq->access_lock);
> @@ -1795,9 +2198,10 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
>  	if (count == 0)
>  		goto out;
>  
> -	/* TODO: packed queue not implemented */
>  	if (vq_is_packed(dev))
> -		nb_tx = 0;
> +		nb_tx = virtio_dev_rx_async_submit_packed(dev,
> +				vq, queue_id, pkts, count, comp_pkts,
> +				comp_count);
>  	else
>  		nb_tx = virtio_dev_rx_async_submit_split(dev,
>  				vq, queue_id, pkts, count, comp_pkts,
> 


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v5 1/4] vhost: abstract and reorganize async split ring code
  2021-04-13  7:11     ` Maxime Coquelin
@ 2021-04-13  9:06       ` Jiang, Cheng1
  0 siblings, 0 replies; 60+ messages in thread
From: Jiang, Cheng1 @ 2021-04-13  9:06 UTC (permalink / raw)
  To: Maxime Coquelin, Xia, Chenbo
  Cc: dev, Hu, Jiayu, Yang, YvonneX, Wang, Yinan, Liu, Yong

Hi Maxime,

> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Tuesday, April 13, 2021 3:12 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>; Xia, Chenbo
> <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Yang, YvonneX
> <yvonnex.yang@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Liu,
> Yong <yong.liu@intel.com>
> Subject: Re: [PATCH v5 1/4] vhost: abstract and reorganize async split ring
> code
> 
> Hi Cheng,
> 
> On 4/12/21 1:34 PM, Cheng Jiang wrote:
> > In order to improve code efficiency and readability when async packed
> > ring support is enabled. This patch abstract some functions like
> > shadow_ring_store and write_back_completed_descs_split. And improve
> > the efficiency of some pointer offset calculation.
> >
> > Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> > ---
> >  lib/librte_vhost/virtio_net.c | 146
> > +++++++++++++++++++---------------
> >  1 file changed, 84 insertions(+), 62 deletions(-)
> >
> > diff --git a/lib/librte_vhost/virtio_net.c
> > b/lib/librte_vhost/virtio_net.c index ff3987860..c43ab0093 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -1458,6 +1458,29 @@ virtio_dev_rx_async_get_info_idx(uint16_t
> pkts_idx,
> >  		(vq_size - n_inflight + pkts_idx) & (vq_size - 1);  }
> >
> > +static __rte_always_inline void
> > +shadow_ring_store(struct vhost_virtqueue *vq,  void *shadow_ring, void
> *d_ring,
> > +		uint16_t s_idx, uint16_t d_idx,
> > +		uint16_t count, uint16_t elem_size) {
> > +	if (d_idx + count <= vq->size) {
> > +		rte_memcpy((void *)((uintptr_t)d_ring + d_idx * elem_size),
> > +			(void *)((uintptr_t)shadow_ring + s_idx * elem_size),
> > +			count * elem_size);
> > +	} else {
> > +		uint16_t size = vq->size - d_idx;
> > +
> > +		rte_memcpy((void *)((uintptr_t)d_ring + d_idx * elem_size),
> > +			(void *)((uintptr_t)shadow_ring + s_idx * elem_size),
> > +			size * elem_size);
> > +
> > +		rte_memcpy((void *)((uintptr_t)d_ring),
> > +			(void *)((uintptr_t)shadow_ring +
> > +				(s_idx + size) * elem_size),
> > +			(count - size) * elem_size);
> > +	}
> > +}
> > +
> >  static __rte_noinline uint32_t
> >  virtio_dev_rx_async_submit_split(struct virtio_net *dev,
> >  	struct vhost_virtqueue *vq, uint16_t queue_id, @@ -1478,6 +1501,7
> @@
> > virtio_dev_rx_async_submit_split(struct virtio_net *dev,
> >  	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
> >  	uint16_t slot_idx = 0;
> >  	uint16_t segs_await = 0;
> > +	uint16_t iovec_idx = 0, it_idx = 0;
> >  	struct async_inflight_info *pkts_info = vq->async_pkts_info;
> >  	uint32_t n_pkts = 0, pkt_err = 0;
> >  	uint32_t num_async_pkts = 0, num_done_pkts = 0; @@ -1513,27
> +1537,32
> > @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
> >
> >  		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx],
> >  				buf_vec, nr_vec, num_buffers,
> > -				src_iovec, dst_iovec, src_it, dst_it) < 0) {
> > +				&src_iovec[iovec_idx],
> > +				&dst_iovec[iovec_idx],
> > +				&src_it[it_idx],
> > +				&dst_it[it_idx]) < 0) {
> >  			vq->shadow_used_idx -= num_buffers;
> >  			break;
> >  		}
> >
> >  		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
> >  			(vq->size - 1);
> > -		if (src_it->count) {
> > +		if (src_it[it_idx].count) {
> >  			uint16_t from, to;
> >
> > -			async_fill_desc(&tdes[pkt_burst_idx++], src_it,
> dst_it);
> > +			async_fill_desc(&tdes[pkt_burst_idx++],
> > +				&src_it[it_idx],
> > +				&dst_it[it_idx]);
> >  			pkts_info[slot_idx].descs = num_buffers;
> >  			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
> >  			async_pkts_log[num_async_pkts].pkt_idx = pkt_idx;
> >  			async_pkts_log[num_async_pkts++].last_avail_idx =
> >  				vq->last_avail_idx;
> > -			src_iovec += src_it->nr_segs;
> > -			dst_iovec += dst_it->nr_segs;
> > -			src_it += 2;
> > -			dst_it += 2;
> > -			segs_await += src_it->nr_segs;
> > +
> > +			iovec_idx += src_it[it_idx].nr_segs;
> > +			it_idx += 2;
> > +
> > +			segs_await += src_it[it_idx].nr_segs;
> >
> >  			/**
> >  			 * recover shadow used ring and keep DMA-occupied
> @@ -1541,23
> > +1570,12 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
> >  			 */
> >  			from = vq->shadow_used_idx - num_buffers;
> >  			to = vq->async_desc_idx & (vq->size - 1);
> > -			if (num_buffers + to <= vq->size) {
> > -				rte_memcpy(&vq->async_descs_split[to],
> > -						&vq-
> >shadow_used_split[from],
> > -						num_buffers *
> > -						sizeof(struct
> vring_used_elem));
> > -			} else {
> > -				int size = vq->size - to;
> > -
> > -				rte_memcpy(&vq->async_descs_split[to],
> > -						&vq-
> >shadow_used_split[from],
> > -						size *
> > -						sizeof(struct
> vring_used_elem));
> > -				rte_memcpy(vq->async_descs_split,
> > -						&vq-
> >shadow_used_split[from +
> > -						size], (num_buffers - size) *
> > -					   sizeof(struct vring_used_elem));
> > -			}
> > +
> > +			shadow_ring_store(vq, vq->shadow_used_split,
> > +					vq->async_descs_split,
> > +					from, to, num_buffers,
> > +					sizeof(struct vring_used_elem));
> > +
> 
> I'm not convinced with this rework.
> 
> I think it is good to create a dedicated function for this to simplify this huge
> virtio_dev_rx_async_submit_split() function. But we should have a
> dedicated version for split ring. Having a single function for both split and
> packed ring does not improve readability, and unlikely improve performance.

Sure, I agree with you. I will use two separate functions for split and packed rings in the next version.

> 
> >  			vq->async_desc_idx += num_buffers;
> >  			vq->shadow_used_idx -= num_buffers;
> >  		} else
> > @@ -1575,10 +1593,9 @@ virtio_dev_rx_async_submit_split(struct
> virtio_net *dev,
> >  			BUF_VECTOR_MAX))) {
> >  			n_pkts = vq->async_ops.transfer_data(dev->vid,
> >  					queue_id, tdes, 0, pkt_burst_idx);
> > -			src_iovec = vec_pool;
> > -			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >>
> 1);
> > -			src_it = it_pool;
> > -			dst_it = it_pool + 1;
> > +			iovec_idx = 0;
> > +			it_idx = 0;
> > +
> >  			segs_await = 0;
> >  			vq->async_pkts_inflight_n += n_pkts;
> >
> > @@ -1639,6 +1656,43 @@ virtio_dev_rx_async_submit_split(struct
> virtio_net *dev,
> >  	return pkt_idx;
> >  }
> >
> > +static __rte_always_inline void
> > +write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t
> > +n_descs) {
> > +	uint16_t nr_left = n_descs;
> > +	uint16_t nr_copy;
> > +	uint16_t to, from;
> > +
> > +	do {
> > +		from = vq->last_async_desc_idx & (vq->size - 1);
> > +		nr_copy = nr_left + from <= vq->size ? nr_left :
> > +			vq->size - from;
> > +		to = vq->last_used_idx & (vq->size - 1);
> > +
> > +		if (to + nr_copy <= vq->size) {
> > +			rte_memcpy(&vq->used->ring[to],
> > +					&vq->async_descs_split[from],
> > +					nr_copy *
> > +					sizeof(struct vring_used_elem));
> > +		} else {
> > +			uint16_t size = vq->size - to;
> > +
> > +			rte_memcpy(&vq->used->ring[to],
> > +					&vq->async_descs_split[from],
> > +					size *
> > +					sizeof(struct vring_used_elem));
> > +			rte_memcpy(vq->used->ring,
> &vq->used->ring[0] for consistency

It will be fixed in the next version.

> > +					&vq->async_descs_split[from +
> > +					size], (nr_copy - size) *
> > +					sizeof(struct vring_used_elem));
> 
> Lines can now be up to 100 chars.
> Please take the opportunity to indent properly not to have parts of each args
> being put on the same line. It will help readability.

Ok, glad to know. I will fix them in the next version.

Thanks a lot.
Cheng

> 
> > +		}
> > +
> > +		vq->last_async_desc_idx += nr_copy;
> > +		vq->last_used_idx += nr_copy;
> > +		nr_left -= nr_copy;
> > +	} while (nr_left > 0);
> > +}
> > +
> >  uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
> >  		struct rte_mbuf **pkts, uint16_t count)  { @@ -1695,39
> +1749,7 @@
> > uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
> >  	vq->async_pkts_inflight_n -= n_pkts_put;
> >
> >  	if (likely(vq->enabled && vq->access_ok)) {
> > -		uint16_t nr_left = n_descs;
> > -		uint16_t nr_copy;
> > -		uint16_t to;
> > -
> > -		/* write back completed descriptors to used ring */
> > -		do {
> > -			from = vq->last_async_desc_idx & (vq->size - 1);
> > -			nr_copy = nr_left + from <= vq->size ? nr_left :
> > -				vq->size - from;
> > -			to = vq->last_used_idx & (vq->size - 1);
> > -
> > -			if (to + nr_copy <= vq->size) {
> > -				rte_memcpy(&vq->used->ring[to],
> > -						&vq-
> >async_descs_split[from],
> > -						nr_copy *
> > -						sizeof(struct
> vring_used_elem));
> > -			} else {
> > -				uint16_t size = vq->size - to;
> > -
> > -				rte_memcpy(&vq->used->ring[to],
> > -						&vq-
> >async_descs_split[from],
> > -						size *
> > -						sizeof(struct
> vring_used_elem));
> > -				rte_memcpy(vq->used->ring,
> > -						&vq->async_descs_split[from
> +
> > -						size], (nr_copy - size) *
> > -						sizeof(struct
> vring_used_elem));
> > -			}
> > -
> > -			vq->last_async_desc_idx += nr_copy;
> > -			vq->last_used_idx += nr_copy;
> > -			nr_left -= nr_copy;
> > -		} while (nr_left > 0);
> > +		write_back_completed_descs_split(vq, n_descs);
> >
> >  		__atomic_add_fetch(&vq->used->idx, n_descs,
> __ATOMIC_RELEASE);
> >  		vhost_vring_call_split(dev, vq);
> >


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v5 2/4] vhost: add support for packed ring in async vhost
  2021-04-13  8:36     ` Maxime Coquelin
@ 2021-04-13 11:48       ` Jiang, Cheng1
  2021-04-13 13:08         ` Maxime Coquelin
  0 siblings, 1 reply; 60+ messages in thread
From: Jiang, Cheng1 @ 2021-04-13 11:48 UTC (permalink / raw)
  To: Maxime Coquelin, Xia, Chenbo
  Cc: dev, Hu, Jiayu, Yang, YvonneX, Wang, Yinan, Liu, Yong

Hi Maxime,

> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Tuesday, April 13, 2021 4:37 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>; Xia, Chenbo
> <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Yang, YvonneX
> <yvonnex.yang@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Liu,
> Yong <yong.liu@intel.com>
> Subject: Re: [PATCH v5 2/4] vhost: add support for packed ring in async vhost
> 
> 
> 
> On 4/12/21 1:34 PM, Cheng Jiang wrote:
> > For now async vhost data path only supports split ring structure. In
> > order to make async vhost compatible with virtio 1.1 spec this patch
> > enables packed ring in async vhost data path.
> >
> > Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> > ---
> >  lib/librte_vhost/rte_vhost_async.h |   1 +
> >  lib/librte_vhost/vhost.c           |  27 +-
> >  lib/librte_vhost/vhost.h           |   7 +-
> >  lib/librte_vhost/virtio_net.c      | 438 +++++++++++++++++++++++++++--
> >  4 files changed, 448 insertions(+), 25 deletions(-)
> >
> > diff --git a/lib/librte_vhost/rte_vhost_async.h
> > b/lib/librte_vhost/rte_vhost_async.h
> > index c855ff875..6faa31f5a 100644
> > --- a/lib/librte_vhost/rte_vhost_async.h
> > +++ b/lib/librte_vhost/rte_vhost_async.h
> > @@ -89,6 +89,7 @@ struct rte_vhost_async_channel_ops {  struct
> > async_inflight_info {
> >  	struct rte_mbuf *mbuf;
> >  	uint16_t descs; /* num of descs inflight */
> > +	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
> >  };
> >
> >  /**
> > diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c index
> > a70fe01d8..8c9935c0f 100644
> > --- a/lib/librte_vhost/vhost.c
> > +++ b/lib/librte_vhost/vhost.c
> > @@ -342,15 +342,21 @@ vhost_free_async_mem(struct vhost_virtqueue
> *vq)
> > {
> >  	if (vq->async_pkts_info)
> >  		rte_free(vq->async_pkts_info);
> > -	if (vq->async_descs_split)
> > +	if (vq->async_buffers_packed) {
> > +		rte_free(vq->async_buffers_packed);
> > +		vq->async_buffers_packed = NULL;
> > +	}
> > +	if (vq->async_descs_split) {
> 
> You can remove the check, rte_free is safe with NULL pointers.
> You can do the same for the other ones in this function.

OK, it will be fixed.

> 
> >  		rte_free(vq->async_descs_split);
> > +		vq->async_descs_split = NULL;
> > +	}
> > +
> >  	if (vq->it_pool)
> >  		rte_free(vq->it_pool);
> >  	if (vq->vec_pool)
> >  		rte_free(vq->vec_pool);
> >
> >  	vq->async_pkts_info = NULL;
> > -	vq->async_descs_split = NULL;
> >  	vq->it_pool = NULL;
> >  	vq->vec_pool = NULL;
> >  }
> > @@ -1627,9 +1633,9 @@ int rte_vhost_async_channel_register(int vid,
> uint16_t queue_id,
> >  		return -1;
> >
> >  	/* packed queue is not supported */
> > -	if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
> > +	if (unlikely(!f.async_inorder)) {
> >  		VHOST_LOG_CONFIG(ERR,
> > -			"async copy is not supported on packed queue or
> non-inorder mode "
> > +			"async copy is not supported on non-inorder mode "
> >  			"(vid %d, qid: %d)\n", vid, queue_id);
> >  		return -1;
> >  	}
> > @@ -1667,11 +1673,18 @@ int rte_vhost_async_channel_register(int vid,
> uint16_t queue_id,
> >  	vq->vec_pool = rte_malloc_socket(NULL,
> >  			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
> >  			RTE_CACHE_LINE_SIZE, node);
> > -	vq->async_descs_split = rte_malloc_socket(NULL,
> > +	if (vq_is_packed(dev)) {
> > +		vq->async_buffers_packed = rte_malloc_socket(NULL,
> > +			vq->size * sizeof(struct vring_used_elem_packed),
> > +			RTE_CACHE_LINE_SIZE, node);
> > +	} else {
> > +		vq->async_descs_split = rte_malloc_socket(NULL,
> >  			vq->size * sizeof(struct vring_used_elem),
> >  			RTE_CACHE_LINE_SIZE, node);
> > -	if (!vq->async_descs_split || !vq->async_pkts_info ||
> > -		!vq->it_pool || !vq->vec_pool) {
> > +	}
> > +
> > +	if (!vq->async_buffers_packed || !vq->async_descs_split ||
> > +		!vq->async_pkts_info || !vq->it_pool || !vq->vec_pool) {
> >  		vhost_free_async_mem(vq);
> >  		VHOST_LOG_CONFIG(ERR,
> >  				"async register failed: cannot allocate
> memory for vq data "
> > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index
> > f628714c2..fe131ae8f 100644
> > --- a/lib/librte_vhost/vhost.h
> > +++ b/lib/librte_vhost/vhost.h
> > @@ -201,9 +201,14 @@ struct vhost_virtqueue {
> >  	uint16_t	async_pkts_idx;
> >  	uint16_t	async_pkts_inflight_n;
> >  	uint16_t	async_last_pkts_n;
> > -	struct vring_used_elem  *async_descs_split;
> > +	union {
> > +		struct vring_used_elem  *async_descs_split;
> > +		struct vring_used_elem_packed *async_buffers_packed;
> > +	};
> >  	uint16_t async_desc_idx;
> > +	uint16_t async_packed_buffer_idx;
> 
> Don't duplicate variable names, async_desc_idx can be reused for packed
> ring. Also, they are representing the same thing, why use desc in one case
> and buffer in the other?

The main reason is that the unit of the packed used ring is a buffer, which can contain many descs.
I think using desc_idx would cause ambiguity, but if you think I should reuse desc_idx, I have no problem with that.

> 
> >  	uint16_t last_async_desc_idx;
> > +	uint16_t last_async_buffer_idx;
> 
> Same remark here.
> 
> >  	/* vq async features */
> >  	bool		async_inorder;
> > diff --git a/lib/librte_vhost/virtio_net.c
> > b/lib/librte_vhost/virtio_net.c index c43ab0093..410be9678 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -363,14 +363,14 @@
> > vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue
> *vq,
> > }
> >
> >  static __rte_always_inline void
> > -vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
> > -				   struct vhost_virtqueue *vq,
> > -				   uint32_t len[],
> > -				   uint16_t id[],
> > -				   uint16_t count[],
> > +vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
> > +				   uint32_t *len,
> > +				   uint16_t *id,
> > +				   uint16_t *count,
> >  				   uint16_t num_buffers)
> >  {
> >  	uint16_t i;
> > +
> >  	for (i = 0; i < num_buffers; i++) {
> >  		/* enqueue shadow flush action aligned with batch num */
> >  		if (!vq->shadow_used_idx)
> > @@ -382,6 +382,17 @@ vhost_shadow_enqueue_single_packed(struct
> virtio_net *dev,
> >  		vq->shadow_aligned_idx += count[i];
> >  		vq->shadow_used_idx++;
> >  	}
> > +}
> > +
> > +static __rte_always_inline void
> > +vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
> > +				   struct vhost_virtqueue *vq,
> > +				   uint32_t *len,
> > +				   uint16_t *id,
> > +				   uint16_t *count,
> > +				   uint16_t num_buffers)
> > +{
> > +	vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
> >
> >  	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
> >  		do_data_copy_enqueue(dev, vq);
> > @@ -1481,6 +1492,62 @@ shadow_ring_store(struct vhost_virtqueue *vq,
> void *shadow_ring, void *d_ring,
> >  	}
> >  }
> >
> > +static __rte_always_inline void
> > +vhost_update_used_packed(struct vhost_virtqueue *vq,
> > +			struct vring_used_elem_packed *shadow_ring,
> > +			uint16_t count)
> > +{
> > +	if (count == 0)
> > +		return;
> 
> Move this after the variable declarations.

Sure.

> 
> > +
> > +	int i;
> > +	uint16_t used_idx = vq->last_used_idx;
> > +	uint16_t head_idx = vq->last_used_idx;
> > +	uint16_t head_flags = 0;
> > +
> > +	/* Split loop in two to save memory barriers */
> > +	for (i = 0; i < count; i++) {
> > +		vq->desc_packed[used_idx].id = shadow_ring[i].id;
> > +		vq->desc_packed[used_idx].len = shadow_ring[i].len;
> > +
> > +		used_idx += shadow_ring[i].count;
> > +		if (used_idx >= vq->size)
> > +			used_idx -= vq->size;
> > +	}
> > +
> > +	/* The ordering for storing desc flags needs to be enforced. */
> > +	rte_atomic_thread_fence(__ATOMIC_RELEASE);
> > +
> > +	for (i = 0; i < count; i++) {
> > +		uint16_t flags;
> > +
> > +		if (vq->shadow_used_packed[i].len)
> > +			flags = VRING_DESC_F_WRITE;
> > +		else
> > +			flags = 0;
> > +
> > +		if (vq->used_wrap_counter) {
> > +			flags |= VRING_DESC_F_USED;
> > +			flags |= VRING_DESC_F_AVAIL;
> > +		} else {
> > +			flags &= ~VRING_DESC_F_USED;
> > +			flags &= ~VRING_DESC_F_AVAIL;
> > +		}
> > +
> > +		if (i > 0) {
> > +			vq->desc_packed[vq->last_used_idx].flags = flags;
> > +
> > +		} else {
> > +			head_idx = vq->last_used_idx;
> > +			head_flags = flags;
> > +		}
> > +
> > +		vq_inc_last_used_packed(vq, shadow_ring[i].count);
> > +	}
> > +
> > +	vq->desc_packed[head_idx].flags = head_flags; }
> > +
> >  static __rte_noinline uint32_t
> >  virtio_dev_rx_async_submit_split(struct virtio_net *dev,
> >  	struct vhost_virtqueue *vq, uint16_t queue_id, @@ -1656,6
> +1723,294
> > @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
> >  	return pkt_idx;
> >  }
> >
> > +static __rte_always_inline int
> > +vhost_enqueue_async_single_packed(struct virtio_net *dev,
> > +			    struct vhost_virtqueue *vq,
> > +			    struct rte_mbuf *pkt,
> > +			    struct buf_vector *buf_vec,
> > +			    uint16_t *nr_descs,
> > +			    uint16_t *nr_buffers,
> > +			    struct vring_packed_desc *async_descs,
> > +			    struct iovec *src_iovec, struct iovec *dst_iovec,
> > +			    struct rte_vhost_iov_iter *src_it,
> > +			    struct rte_vhost_iov_iter *dst_it) {
> > +	uint16_t nr_vec = 0;
> > +	uint16_t avail_idx = vq->last_avail_idx;
> > +	uint16_t max_tries, tries = 0;
> > +	uint16_t buf_id = 0;
> > +	uint32_t len = 0;
> > +	uint16_t desc_count = 0;
> > +	uint32_t size = pkt->pkt_len + sizeof(struct
> virtio_net_hdr_mrg_rxbuf);
> > +	uint32_t buffer_len[vq->size];
> > +	uint16_t buffer_buf_id[vq->size];
> > +	uint16_t buffer_desc_count[vq->size];
> > +	*nr_buffers = 0;
> > +
> > +	if (rxvq_is_mergeable(dev))
> > +		max_tries = vq->size - 1;
> > +	else
> > +		max_tries = 1;
> > +
> > +	while (size > 0) {
> > +		/*
> > +		 * if we tried all available ring items, and still
> > +		 * can't get enough buf, it means something abnormal
> > +		 * happened.
> > +		 */
> > +		if (unlikely(++tries > max_tries))
> > +			return -1;
> > +
> > +		if (unlikely(fill_vec_buf_packed(dev, vq,
> > +						avail_idx, &desc_count,
> > +						buf_vec, &nr_vec,
> > +						&buf_id, &len,
> > +						VHOST_ACCESS_RW) < 0))
> > +			return -1;
> > +
> > +		len = RTE_MIN(len, size);
> > +		size -= len;
> > +
> > +		buffer_len[*nr_buffers] = len;
> > +		buffer_buf_id[*nr_buffers] = buf_id;
> > +		buffer_desc_count[*nr_buffers] = desc_count;
> > +		*nr_buffers += 1;
> > +
> > +		*nr_descs += desc_count;
> > +		avail_idx += desc_count;
> > +		if (avail_idx >= vq->size)
> > +			avail_idx -= vq->size;
> > +	}
> > +
> > +	if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers,
> > +		src_iovec, dst_iovec, src_it, dst_it) < 0)
> > +		return -1;
> > +	/* store descriptors for DMA */
> > +	if (avail_idx >= *nr_descs)
> > +		rte_memcpy(async_descs,
> > +			&vq->desc_packed[vq->last_avail_idx],
> > +			*nr_descs * sizeof(struct vring_packed_desc));
> 
> Please add brackets for the 'if' since there are brackets for the 'else'.

Sure, sorry for that.

> 
> > +	else {
> > +		uint16_t nr_copy = vq->size - vq->last_avail_idx;
> > +		rte_memcpy(async_descs,
> > +			&vq->desc_packed[vq->last_avail_idx],
> > +			nr_copy * sizeof(struct vring_packed_desc));
> > +		rte_memcpy(async_descs + nr_copy,
> > +			vq->desc_packed, (*nr_descs - nr_copy) *
> > +			sizeof(struct vring_packed_desc));
> > +	}
> > +
> > +	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id,
> > +					   buffer_desc_count, *nr_buffers);
> > +
> > +	return 0;
> > +}
> > +
> > +static __rte_always_inline int16_t
> > +virtio_dev_rx_async_single_packed(struct virtio_net *dev,
> > +			    struct vhost_virtqueue *vq,
> > +			    struct rte_mbuf *pkt,
> > +			    uint16_t *nr_descs, uint16_t *nr_buffers,
> > +			    struct vring_packed_desc *async_descs,
> > +			    struct iovec *src_iovec, struct iovec *dst_iovec,
> > +			    struct rte_vhost_iov_iter *src_it,
> > +			    struct rte_vhost_iov_iter *dst_it) {
> > +	struct buf_vector buf_vec[BUF_VECTOR_MAX];
> > +	*nr_descs = 0;
> > +	*nr_buffers = 0;
> > +
> > +	if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt,
> buf_vec,
> > +						 nr_descs,
> > +						 nr_buffers,
> > +						 async_descs,
> > +						 src_iovec, dst_iovec,
> > +						 src_it, dst_it) < 0)) {
> > +		VHOST_LOG_DATA(DEBUG,
> > +				"(%d) failed to get enough desc from vring\n",
> > +				dev->vid);
> > +		return -1;
> > +	}
> > +
> > +	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end
> index %d\n",
> > +			dev->vid, vq->last_avail_idx,
> > +			vq->last_avail_idx + *nr_descs);
> > +
> > +	return 0;
> > +}
> > +
> > +static __rte_noinline uint32_t
> > +virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
> > +	struct vhost_virtqueue *vq, uint16_t queue_id,
> > +	struct rte_mbuf **pkts, uint32_t count,
> > +	struct rte_mbuf **comp_pkts, uint32_t *comp_count) {
> > +	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
> > +	uint16_t async_descs_idx = 0;
> > +	uint16_t num_buffers;
> > +	uint16_t num_desc;
> > +
> > +	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
> > +	struct iovec *vec_pool = vq->vec_pool;
> > +	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
> > +	struct iovec *src_iovec = vec_pool;
> > +	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >>
> 1);
> > +	struct rte_vhost_iov_iter *src_it = it_pool;
> > +	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
> > +	uint16_t slot_idx = 0;
> > +	uint16_t segs_await = 0;
> > +	uint16_t iovec_idx = 0, it_idx = 0;
> > +	struct async_inflight_info *pkts_info = vq->async_pkts_info;
> > +	uint32_t n_pkts = 0, pkt_err = 0;
> > +	uint32_t num_async_pkts = 0, num_done_pkts = 0;
> > +	struct vring_packed_desc async_descs[vq->size];
> > +
> > +	rte_prefetch0(&vq->desc_packed[vq->last_avail_idx & (vq->size -
> > +1)]);
> 
> The size of the ring is not necessarily a power of two with packed ring.

Since the size of the ring is not necessarily a power of two,
maybe I can use something like
idx % vq->size ?
I'm not sure if that is a good way to do it.
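Just to show what I mean, here is a minimal standalone sketch (not vhost
code, names made up) comparing the mask wrap with the modulo wrap:

#include <stdint.h>
#include <stdio.h>

/* Generic wrap that is correct for any ring size. */
static inline uint32_t
wrap_idx(uint32_t idx, uint32_t ring_size)
{
	return idx % ring_size;
}

int
main(void)
{
	uint32_t idx = 500;
	uint32_t pow2 = 256;	/* power-of-two ring size */
	uint32_t other = 384;	/* packed ring may also use sizes like this */

	/* The mask only matches the modulo when the size is a power of two. */
	printf("size %u: mask=%u mod=%u\n", pow2, idx & (pow2 - 1),
		wrap_idx(idx, pow2));
	printf("size %u: mask=%u mod=%u\n", other, idx & (other - 1),
		wrap_idx(idx, other));
	return 0;
}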

> 
> > +
> > +	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
> > +		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq,
> > +						pkts[pkt_idx],
> > +						&num_desc, &num_buffers,
> > +
> 	&async_descs[async_descs_idx],
> > +						&src_iovec[iovec_idx],
> > +						&dst_iovec[iovec_idx],
> > +						&src_it[it_idx],
> > +						&dst_it[it_idx]) < 0))
> > +			break;
> > +
> > +		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end
> index %d\n",
> > +			dev->vid, vq->last_avail_idx,
> > +			vq->last_avail_idx + num_desc);
> > +
> > +		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
> > +			(vq->size - 1);
> 
> Same here.

Sure.

> 
> > +		if (src_it[it_idx].count) {
> > +			uint16_t from, to;
> > +
> > +			async_descs_idx += num_desc;
> > +			async_fill_desc(&tdes[pkt_burst_idx++],
> &src_it[it_idx],
> > +					&dst_it[it_idx]);
> > +			pkts_info[slot_idx].descs = num_desc;
> > +			pkts_info[slot_idx].nr_buffers = num_buffers;
> > +			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
> > +			num_async_pkts++;
> > +			iovec_idx += src_it[it_idx].nr_segs;
> > +			it_idx += 2;
> > +
> > +			segs_await += src_it[it_idx].nr_segs;
> > +
> > +			/**
> > +			 * recover shadow used ring and keep DMA-occupied
> > +			 * descriptors.
> > +			 */
> > +			from = vq->shadow_used_idx - num_buffers;
> > +			to = vq->async_packed_buffer_idx & (vq->size - 1);
> > +			shadow_ring_store(vq, vq->shadow_used_packed,
> > +					vq->async_buffers_packed,
> > +					from, to, num_buffers,
> > +					sizeof(struct
> vring_used_elem_packed));
> > +
> > +			vq->async_packed_buffer_idx += num_buffers;
> > +			vq->shadow_used_idx -= num_buffers;
> > +		} else
> 
> Brackets needed.

Sure.

> 
> > +			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
> > +
> > +		vq_inc_last_avail_packed(vq, num_desc);
> > +
> > +		/*
> > +		 * conditions to trigger async device transfer:
> > +		 * - buffered packet number reaches transfer threshold
> > +		 * - unused async iov number is less than max vhost vector
> > +		 */
> > +		if (unlikely(pkt_burst_idx >=
> VHOST_ASYNC_BATCH_THRESHOLD ||
> > +			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await <
> > +			BUF_VECTOR_MAX))) {
> > +			n_pkts = vq->async_ops.transfer_data(dev->vid,
> > +					queue_id, tdes, 0, pkt_burst_idx);
> > +			iovec_idx = 0;
> > +			it_idx = 0;
> > +			segs_await = 0;
> > +			vq->async_pkts_inflight_n += n_pkts;
> > +
> > +			if (unlikely(n_pkts < pkt_burst_idx)) {
> > +				/*
> > +				 * log error packets number here and do
> actual
> > +				 * error processing when applications poll
> > +				 * completion
> > +				 */
> > +				pkt_err = pkt_burst_idx - n_pkts;
> > +				pkt_burst_idx = 0;
> > +				pkt_idx++;
> > +				break;
> > +			}
> > +
> > +			pkt_burst_idx = 0;
> > +		}
> > +	}
> > +
> > +	if (pkt_burst_idx) {
> > +		n_pkts = vq->async_ops.transfer_data(dev->vid,
> > +				queue_id, tdes, 0, pkt_burst_idx);
> > +		vq->async_pkts_inflight_n += n_pkts;
> > +
> > +		if (unlikely(n_pkts < pkt_burst_idx))
> > +			pkt_err = pkt_burst_idx - n_pkts;
> > +	}
> > +
> > +	do_data_copy_enqueue(dev, vq);
> > +
> > +	if (unlikely(pkt_err)) {
> > +		uint16_t descs_err = 0;
> > +		uint16_t buffers_err = 0;
> > +
> > +		num_async_pkts -= pkt_err;
> > +		pkt_idx -= pkt_err;
> > +	/* calculate the sum of buffers and descs of DMA-error packets. */
> > +		while (pkt_err-- > 0) {
> > +			descs_err +=
> > +				pkts_info[slot_idx & (vq->size - 1)].descs;
> 
> The size of the ring is not necessarily a power of two with packed ring.

Will be fixed.

> 
> > +			buffers_err +=
> > +				pkts_info[slot_idx & (vq->size -
> 1)].nr_buffers;
> 
> Ditto.

Will be fixed.

> 
> > +			slot_idx--;
> > +		}
> > +
> > +		vq->async_packed_buffer_idx -= buffers_err;
> > +
> > +		if (vq->last_avail_idx >= descs_err) {
> > +			vq->last_avail_idx -= descs_err;
> > +
> > +			rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
> > +				&async_descs[async_descs_idx - descs_err],
> > +				descs_err * sizeof(struct
> vring_packed_desc));
> > +		} else {
> > +			uint16_t nr_copy;
> > +
> > +			vq->last_avail_idx = vq->last_avail_idx + vq->size
> > +						- descs_err;
> > +			nr_copy = vq->size - vq->last_avail_idx;
> > +			rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
> > +				&async_descs[async_descs_idx - descs_err],
> > +				nr_copy * sizeof(struct vring_packed_desc));
> > +			descs_err -= nr_copy;
> > +			rte_memcpy(vq->desc_packed,
> > +				&async_descs[async_descs_idx - descs_err],
> > +				descs_err * sizeof(struct
> vring_packed_desc));
> > +			vq->avail_wrap_counter ^= 1;
> > +		}
> > +
> > +		num_done_pkts = pkt_idx - num_async_pkts;
> > +	}
> 
> This error handling could be moved to a dedicated function.

Sure, will fix it in the next version.

> 
> > +	vq->async_pkts_idx += num_async_pkts;
> > +	*comp_count = num_done_pkts;
> > +
> > +	if (likely(vq->shadow_used_idx)) {
> > +		vhost_flush_enqueue_shadow_packed(dev, vq);
> > +		vhost_vring_call_packed(dev, vq);
> > +	}
> > +
> > +	return pkt_idx;
> > +}
> 
> The function above is very big and complex; it should be possible to split it
> into several smaller ones to make it maintainable.

I think moving the error handling code out will make it smaller.
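Something like the standalone sketch below is the shape I have in mind (stub
types, names not final, and the restore of the DMA-shadowed descriptors is
left out for brevity):

#include <stdint.h>

/* Stubs mirroring only the fields the rollback touches. */
struct pkt_info_stub {
	uint16_t descs;
	uint16_t nr_buffers;
};

struct vq_stub {
	uint16_t size;
	uint16_t last_avail_idx;
	uint16_t async_buffer_idx;
	uint8_t  avail_wrap_counter;
};

/* Roll back ring state for the packets the DMA engine did not accept. */
static void
dma_error_handler_packed(struct vq_stub *vq, struct pkt_info_stub *pkts_info,
			uint16_t slot_idx, uint32_t pkt_err)
{
	uint16_t descs_err = 0;
	uint16_t buffers_err = 0;

	while (pkt_err-- > 0) {
		descs_err += pkts_info[slot_idx % vq->size].descs;
		buffers_err += pkts_info[slot_idx % vq->size].nr_buffers;
		slot_idx = slot_idx == 0 ? vq->size - 1 : slot_idx - 1;
	}

	vq->async_buffer_idx -= buffers_err;

	if (vq->last_avail_idx >= descs_err) {
		vq->last_avail_idx -= descs_err;
	} else {
		/* Wrapped past the ring start: adjust index, flip wrap counter. */
		vq->last_avail_idx = vq->last_avail_idx + vq->size - descs_err;
		vq->avail_wrap_counter ^= 1;
	}
}

int
main(void)
{
	struct pkt_info_stub info[4] = { {2, 1}, {1, 1}, {3, 2}, {1, 1} };
	struct vq_stub vq = { .size = 4, .last_avail_idx = 2,
			      .async_buffer_idx = 5, .avail_wrap_counter = 1 };

	dma_error_handler_packed(&vq, info, 3, 2);	/* undo the last 2 packets */
	return 0;
}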

Thanks.
Cheng

> 
> > +
> >  static __rte_always_inline void
> >  write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t
> > n_descs)  { @@ -1693,12 +2048,40 @@
> > write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t
> n_descs)
> >  	} while (nr_left > 0);
> >  }
> >
> > +static __rte_always_inline void
> > +write_back_completed_descs_packed(struct vhost_virtqueue *vq,
> > +				uint16_t n_buffers)
> > +{
> > +	uint16_t nr_left = n_buffers;
> > +	uint16_t from, to;
> > +
> > +	do {
> > +		from = vq->last_async_buffer_idx &
> > +						(vq->size - 1);
> > +		to = (from + nr_left) & (vq->size - 1);
> 
> The size of the ring is not necessarily a power of two with packed ring.

Sure.

> 
> > +		if (to > from) {
> > +			vhost_update_used_packed(vq,
> > +				vq->async_buffers_packed + from,
> > +				to - from);
> > +			vq->last_async_buffer_idx += nr_left;
> > +			nr_left = 0;
> > +		} else {
> > +			vhost_update_used_packed(vq,
> > +				vq->async_buffers_packed + from,
> > +				vq->size - from);
> > +			vq->last_async_buffer_idx +=
> > +						vq->size - from;
> > +			nr_left -= vq->size - from;
> > +		}
> > +	} while (nr_left > 0);
> > +}
> > +
> >  uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
> >  		struct rte_mbuf **pkts, uint16_t count)  {
> >  	struct virtio_net *dev = get_device(vid);
> >  	struct vhost_virtqueue *vq;
> > -	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
> > +	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
> >  	uint16_t start_idx, pkts_idx, vq_size;
> >  	struct async_inflight_info *pkts_info;
> >  	uint16_t from, i;
> > @@ -1740,21 +2123,41 @@ uint16_t
> rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
> >  		goto done;
> >  	}
> >
> > -	for (i = 0; i < n_pkts_put; i++) {
> > -		from = (start_idx + i) & (vq_size - 1);
> > -		n_descs += pkts_info[from].descs;
> > -		pkts[i] = pkts_info[from].mbuf;
> > +	if (vq_is_packed(dev)) {
> > +		for (i = 0; i < n_pkts_put; i++) {
> > +			from = (start_idx + i) & (vq_size - 1);
> 
> Unlike split ring, packed ring size is not necessarily a power of 2.

Sure.
Thanks.

> 
> > +			n_buffers += pkts_info[from].nr_buffers;
> > +			pkts[i] = pkts_info[from].mbuf;
> > +		}
> > +	} else {
> > +		for (i = 0; i < n_pkts_put; i++) {
> > +			from = (start_idx + i) & (vq_size - 1);
> > +			n_descs += pkts_info[from].descs;
> > +			pkts[i] = pkts_info[from].mbuf;
> > +		}
> >  	}
> > +
> >  	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
> >  	vq->async_pkts_inflight_n -= n_pkts_put;
> >
> >  	if (likely(vq->enabled && vq->access_ok)) {
> > -		write_back_completed_descs_split(vq, n_descs);
> > +		if (vq_is_packed(dev)) {
> > +			write_back_completed_descs_packed(vq,
> n_buffers);
> >
> > -		__atomic_add_fetch(&vq->used->idx, n_descs,
> __ATOMIC_RELEASE);
> > -		vhost_vring_call_split(dev, vq);
> > -	} else
> > -		vq->last_async_desc_idx += n_descs;
> > +			vhost_vring_call_packed(dev, vq);
> > +		} else {
> > +			write_back_completed_descs_split(vq, n_descs);
> > +
> > +			__atomic_add_fetch(&vq->used->idx, n_descs,
> > +					__ATOMIC_RELEASE);
> > +			vhost_vring_call_split(dev, vq);
> > +		}
> > +	} else {
> > +		if (vq_is_packed(dev))
> > +			vq->last_async_buffer_idx += n_buffers;
> > +		else
> > +			vq->last_async_desc_idx += n_descs;
> > +	}
> >
> >  done:
> >  	rte_spinlock_unlock(&vq->access_lock);
> > @@ -1795,9 +2198,10 @@ virtio_dev_rx_async_submit(struct virtio_net
> *dev, uint16_t queue_id,
> >  	if (count == 0)
> >  		goto out;
> >
> > -	/* TODO: packed queue not implemented */
> >  	if (vq_is_packed(dev))
> > -		nb_tx = 0;
> > +		nb_tx = virtio_dev_rx_async_submit_packed(dev,
> > +				vq, queue_id, pkts, count, comp_pkts,
> > +				comp_count);
> >  	else
> >  		nb_tx = virtio_dev_rx_async_submit_split(dev,
> >  				vq, queue_id, pkts, count, comp_pkts,
> >


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v5 2/4] vhost: add support for packed ring in async vhost
  2021-04-13 11:48       ` Jiang, Cheng1
@ 2021-04-13 13:08         ` Maxime Coquelin
  2021-04-13 13:50           ` Jiang, Cheng1
  0 siblings, 1 reply; 60+ messages in thread
From: Maxime Coquelin @ 2021-04-13 13:08 UTC (permalink / raw)
  To: Jiang, Cheng1, Xia, Chenbo
  Cc: dev, Hu, Jiayu, Yang, YvonneX, Wang, Yinan, Liu, Yong



On 4/13/21 1:48 PM, Jiang, Cheng1 wrote:
> Hi Maxime,
> 
>> -----Original Message-----
>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>> Sent: Tuesday, April 13, 2021 4:37 PM
>> To: Jiang, Cheng1 <cheng1.jiang@intel.com>; Xia, Chenbo
>> <chenbo.xia@intel.com>
>> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Yang, YvonneX
>> <yvonnex.yang@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Liu,
>> Yong <yong.liu@intel.com>
>> Subject: Re: [PATCH v5 2/4] vhost: add support for packed ring in async vhost
>>
>>
>>
>> On 4/12/21 1:34 PM, Cheng Jiang wrote:
>>> For now async vhost data path only supports split ring structure. In
>>> order to make async vhost compatible with virtio 1.1 spec this patch
>>> enables packed ring in async vhost data path.
>>>
>>> Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
>>> ---
>>>  lib/librte_vhost/rte_vhost_async.h |   1 +
>>>  lib/librte_vhost/vhost.c           |  27 +-
>>>  lib/librte_vhost/vhost.h           |   7 +-
>>>  lib/librte_vhost/virtio_net.c      | 438 +++++++++++++++++++++++++++--
>>>  4 files changed, 448 insertions(+), 25 deletions(-)
>>>
>>> diff --git a/lib/librte_vhost/rte_vhost_async.h
>>> b/lib/librte_vhost/rte_vhost_async.h
>>> index c855ff875..6faa31f5a 100644
>>> --- a/lib/librte_vhost/rte_vhost_async.h
>>> +++ b/lib/librte_vhost/rte_vhost_async.h
>>> @@ -89,6 +89,7 @@ struct rte_vhost_async_channel_ops {  struct
>>> async_inflight_info {
>>>  	struct rte_mbuf *mbuf;
>>>  	uint16_t descs; /* num of descs inflight */
>>> +	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
>>>  };
>>>
>>>  /**
>>> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c index
>>> a70fe01d8..8c9935c0f 100644
>>> --- a/lib/librte_vhost/vhost.c
>>> +++ b/lib/librte_vhost/vhost.c
>>> @@ -342,15 +342,21 @@ vhost_free_async_mem(struct vhost_virtqueue
>> *vq)
>>> {
>>>  	if (vq->async_pkts_info)
>>>  		rte_free(vq->async_pkts_info);
>>> -	if (vq->async_descs_split)
>>> +	if (vq->async_buffers_packed) {
>>> +		rte_free(vq->async_buffers_packed);
>>> +		vq->async_buffers_packed = NULL;
>>> +	}
>>> +	if (vq->async_descs_split) {
>>
>> You can remove the check, rte_free is safe with NULL pointers.
>> You can do the same for the other ones in this function.
> 
> OK, it will be fixed.
> 
>>
>>>  		rte_free(vq->async_descs_split);
>>> +		vq->async_descs_split = NULL;
>>> +	}
>>> +
>>>  	if (vq->it_pool)
>>>  		rte_free(vq->it_pool);
>>>  	if (vq->vec_pool)
>>>  		rte_free(vq->vec_pool);
>>>
>>>  	vq->async_pkts_info = NULL;
>>> -	vq->async_descs_split = NULL;
>>>  	vq->it_pool = NULL;
>>>  	vq->vec_pool = NULL;
>>>  }
>>> @@ -1627,9 +1633,9 @@ int rte_vhost_async_channel_register(int vid,
>> uint16_t queue_id,
>>>  		return -1;
>>>
>>>  	/* packed queue is not supported */
>>> -	if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
>>> +	if (unlikely(!f.async_inorder)) {
>>>  		VHOST_LOG_CONFIG(ERR,
>>> -			"async copy is not supported on packed queue or
>> non-inorder mode "
>>> +			"async copy is not supported on non-inorder mode "
>>>  			"(vid %d, qid: %d)\n", vid, queue_id);
>>>  		return -1;
>>>  	}
>>> @@ -1667,11 +1673,18 @@ int rte_vhost_async_channel_register(int vid,
>> uint16_t queue_id,
>>>  	vq->vec_pool = rte_malloc_socket(NULL,
>>>  			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
>>>  			RTE_CACHE_LINE_SIZE, node);
>>> -	vq->async_descs_split = rte_malloc_socket(NULL,
>>> +	if (vq_is_packed(dev)) {
>>> +		vq->async_buffers_packed = rte_malloc_socket(NULL,
>>> +			vq->size * sizeof(struct vring_used_elem_packed),
>>> +			RTE_CACHE_LINE_SIZE, node);
>>> +	} else {
>>> +		vq->async_descs_split = rte_malloc_socket(NULL,
>>>  			vq->size * sizeof(struct vring_used_elem),
>>>  			RTE_CACHE_LINE_SIZE, node);
>>> -	if (!vq->async_descs_split || !vq->async_pkts_info ||
>>> -		!vq->it_pool || !vq->vec_pool) {
>>> +	}
>>> +
>>> +	if (!vq->async_buffers_packed || !vq->async_descs_split ||
>>> +		!vq->async_pkts_info || !vq->it_pool || !vq->vec_pool) {
>>>  		vhost_free_async_mem(vq);
>>>  		VHOST_LOG_CONFIG(ERR,
>>>  				"async register failed: cannot allocate
>> memory for vq data "
>>> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index
>>> f628714c2..fe131ae8f 100644
>>> --- a/lib/librte_vhost/vhost.h
>>> +++ b/lib/librte_vhost/vhost.h
>>> @@ -201,9 +201,14 @@ struct vhost_virtqueue {
>>>  	uint16_t	async_pkts_idx;
>>>  	uint16_t	async_pkts_inflight_n;
>>>  	uint16_t	async_last_pkts_n;
>>> -	struct vring_used_elem  *async_descs_split;
>>> +	union {
>>> +		struct vring_used_elem  *async_descs_split;
>>> +		struct vring_used_elem_packed *async_buffers_packed;
>>> +	};
>>>  	uint16_t async_desc_idx;
>>> +	uint16_t async_packed_buffer_idx;
>>
>> Don't duplicate variable names, async_desc_idx can be reused for packed
>> ring. Also, they represent the same thing; why use desc in one case and
>> buffer in the other?
> 
> The main reason is that the unit of the packed used ring is a buffer, which can contain many descriptors.
> I think using desc_idx would cause ambiguity, but if you think I should reuse desc_idx, I have no problem with that.

OK, in this case please use a union to avoid wasting memory.
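For instance, something like this sketch (field names are just for
illustration, not a requirement):

#include <stdint.h>
#include <stdio.h>

/* Only the async index fields, to show the idea. */
struct vq_async_idx_sketch {
	union {
		uint16_t async_desc_idx;	/* split ring */
		uint16_t async_buffer_idx;	/* packed ring */
	};
	union {
		uint16_t last_async_desc_idx;	/* split ring */
		uint16_t last_async_buffer_idx;	/* packed ring */
	};
};

int
main(void)
{
	/* Both names alias the same storage, so no extra memory is used. */
	printf("%zu bytes\n", sizeof(struct vq_async_idx_sketch));
	return 0;
}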

>>
>>>  	uint16_t last_async_desc_idx;
>>> +	uint16_t last_async_buffer_idx;
>>
>> Same remark here.
>>
>>>  	/* vq async features */
>>>  	bool		async_inorder;
>>> diff --git a/lib/librte_vhost/virtio_net.c
>>> b/lib/librte_vhost/virtio_net.c index c43ab0093..410be9678 100644
>>> --- a/lib/librte_vhost/virtio_net.c
>>> +++ b/lib/librte_vhost/virtio_net.c
>>> @@ -363,14 +363,14 @@
>>> vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue
>> *vq,
>>> }
>>>
>>>  static __rte_always_inline void
>>> -vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
>>> -				   struct vhost_virtqueue *vq,
>>> -				   uint32_t len[],
>>> -				   uint16_t id[],
>>> -				   uint16_t count[],
>>> +vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
>>> +				   uint32_t *len,
>>> +				   uint16_t *id,
>>> +				   uint16_t *count,
>>>  				   uint16_t num_buffers)
>>>  {
>>>  	uint16_t i;
>>> +
>>>  	for (i = 0; i < num_buffers; i++) {
>>>  		/* enqueue shadow flush action aligned with batch num */
>>>  		if (!vq->shadow_used_idx)
>>> @@ -382,6 +382,17 @@ vhost_shadow_enqueue_single_packed(struct
>> virtio_net *dev,
>>>  		vq->shadow_aligned_idx += count[i];
>>>  		vq->shadow_used_idx++;
>>>  	}
>>> +}
>>> +
>>> +static __rte_always_inline void
>>> +vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
>>> +				   struct vhost_virtqueue *vq,
>>> +				   uint32_t *len,
>>> +				   uint16_t *id,
>>> +				   uint16_t *count,
>>> +				   uint16_t num_buffers)
>>> +{
>>> +	vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
>>>
>>>  	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
>>>  		do_data_copy_enqueue(dev, vq);
>>> @@ -1481,6 +1492,62 @@ shadow_ring_store(struct vhost_virtqueue *vq,
>> void *shadow_ring, void *d_ring,
>>>  	}
>>>  }
>>>
>>> +static __rte_always_inline void
>>> +vhost_update_used_packed(struct vhost_virtqueue *vq,
>>> +			struct vring_used_elem_packed *shadow_ring,
>>> +			uint16_t count)
>>> +{
>>> +	if (count == 0)
>>> +		return;
>>
>> Move this after the variable declarations.
> 
> Sure.
> 
>>
>>> +
>>> +	int i;
>>> +	uint16_t used_idx = vq->last_used_idx;
>>> +	uint16_t head_idx = vq->last_used_idx;
>>> +	uint16_t head_flags = 0;
>>> +
>>> +	/* Split loop in two to save memory barriers */
>>> +	for (i = 0; i < count; i++) {
>>> +		vq->desc_packed[used_idx].id = shadow_ring[i].id;
>>> +		vq->desc_packed[used_idx].len = shadow_ring[i].len;
>>> +
>>> +		used_idx += shadow_ring[i].count;
>>> +		if (used_idx >= vq->size)
>>> +			used_idx -= vq->size;
>>> +	}
>>> +
>>> +	/* The ordering for storing desc flags needs to be enforced. */
>>> +	rte_atomic_thread_fence(__ATOMIC_RELEASE);
>>> +
>>> +	for (i = 0; i < count; i++) {
>>> +		uint16_t flags;
>>> +
>>> +		if (vq->shadow_used_packed[i].len)
>>> +			flags = VRING_DESC_F_WRITE;
>>> +		else
>>> +			flags = 0;
>>> +
>>> +		if (vq->used_wrap_counter) {
>>> +			flags |= VRING_DESC_F_USED;
>>> +			flags |= VRING_DESC_F_AVAIL;
>>> +		} else {
>>> +			flags &= ~VRING_DESC_F_USED;
>>> +			flags &= ~VRING_DESC_F_AVAIL;
>>> +		}
>>> +
>>> +		if (i > 0) {
>>> +			vq->desc_packed[vq->last_used_idx].flags = flags;
>>> +
>>> +		} else {
>>> +			head_idx = vq->last_used_idx;
>>> +			head_flags = flags;
>>> +		}
>>> +
>>> +		vq_inc_last_used_packed(vq, shadow_ring[i].count);
>>> +	}
>>> +
>>> +	vq->desc_packed[head_idx].flags = head_flags; }
>>> +
>>>  static __rte_noinline uint32_t
>>>  virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>>>  	struct vhost_virtqueue *vq, uint16_t queue_id, @@ -1656,6
>> +1723,294
>>> @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>>>  	return pkt_idx;
>>>  }
>>>
>>> +static __rte_always_inline int
>>> +vhost_enqueue_async_single_packed(struct virtio_net *dev,
>>> +			    struct vhost_virtqueue *vq,
>>> +			    struct rte_mbuf *pkt,
>>> +			    struct buf_vector *buf_vec,
>>> +			    uint16_t *nr_descs,
>>> +			    uint16_t *nr_buffers,
>>> +			    struct vring_packed_desc *async_descs,
>>> +			    struct iovec *src_iovec, struct iovec *dst_iovec,
>>> +			    struct rte_vhost_iov_iter *src_it,
>>> +			    struct rte_vhost_iov_iter *dst_it) {
>>> +	uint16_t nr_vec = 0;
>>> +	uint16_t avail_idx = vq->last_avail_idx;
>>> +	uint16_t max_tries, tries = 0;
>>> +	uint16_t buf_id = 0;
>>> +	uint32_t len = 0;
>>> +	uint16_t desc_count = 0;
>>> +	uint32_t size = pkt->pkt_len + sizeof(struct
>> virtio_net_hdr_mrg_rxbuf);
>>> +	uint32_t buffer_len[vq->size];
>>> +	uint16_t buffer_buf_id[vq->size];
>>> +	uint16_t buffer_desc_count[vq->size];
>>> +	*nr_buffers = 0;
>>> +
>>> +	if (rxvq_is_mergeable(dev))
>>> +		max_tries = vq->size - 1;
>>> +	else
>>> +		max_tries = 1;
>>> +
>>> +	while (size > 0) {
>>> +		/*
>>> +		 * if we tried all available ring items, and still
>>> +		 * can't get enough buf, it means something abnormal
>>> +		 * happened.
>>> +		 */
>>> +		if (unlikely(++tries > max_tries))
>>> +			return -1;
>>> +
>>> +		if (unlikely(fill_vec_buf_packed(dev, vq,
>>> +						avail_idx, &desc_count,
>>> +						buf_vec, &nr_vec,
>>> +						&buf_id, &len,
>>> +						VHOST_ACCESS_RW) < 0))
>>> +			return -1;
>>> +
>>> +		len = RTE_MIN(len, size);
>>> +		size -= len;
>>> +
>>> +		buffer_len[*nr_buffers] = len;
>>> +		buffer_buf_id[*nr_buffers] = buf_id;
>>> +		buffer_desc_count[*nr_buffers] = desc_count;
>>> +		*nr_buffers += 1;
>>> +
>>> +		*nr_descs += desc_count;
>>> +		avail_idx += desc_count;
>>> +		if (avail_idx >= vq->size)
>>> +			avail_idx -= vq->size;
>>> +	}
>>> +
>>> +	if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers,
>>> +		src_iovec, dst_iovec, src_it, dst_it) < 0)
>>> +		return -1;
>>> +	/* store descriptors for DMA */
>>> +	if (avail_idx >= *nr_descs)
>>> +		rte_memcpy(async_descs,
>>> +			&vq->desc_packed[vq->last_avail_idx],
>>> +			*nr_descs * sizeof(struct vring_packed_desc));
>>
>> Please add brackets for the 'if' since there are brackets for the 'else'.
> 
> Sure, sorry for that.
> 
>>
>>> +	else {
>>> +		uint16_t nr_copy = vq->size - vq->last_avail_idx;
>>> +		rte_memcpy(async_descs,
>>> +			&vq->desc_packed[vq->last_avail_idx],
>>> +			nr_copy * sizeof(struct vring_packed_desc));
>>> +		rte_memcpy(async_descs + nr_copy,
>>> +			vq->desc_packed, (*nr_descs - nr_copy) *
>>> +			sizeof(struct vring_packed_desc));
>>> +	}
>>> +
>>> +	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id,
>>> +					   buffer_desc_count, *nr_buffers);
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static __rte_always_inline int16_t
>>> +virtio_dev_rx_async_single_packed(struct virtio_net *dev,
>>> +			    struct vhost_virtqueue *vq,
>>> +			    struct rte_mbuf *pkt,
>>> +			    uint16_t *nr_descs, uint16_t *nr_buffers,
>>> +			    struct vring_packed_desc *async_descs,
>>> +			    struct iovec *src_iovec, struct iovec *dst_iovec,
>>> +			    struct rte_vhost_iov_iter *src_it,
>>> +			    struct rte_vhost_iov_iter *dst_it) {
>>> +	struct buf_vector buf_vec[BUF_VECTOR_MAX];
>>> +	*nr_descs = 0;
>>> +	*nr_buffers = 0;
>>> +
>>> +	if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt,
>> buf_vec,
>>> +						 nr_descs,
>>> +						 nr_buffers,
>>> +						 async_descs,
>>> +						 src_iovec, dst_iovec,
>>> +						 src_it, dst_it) < 0)) {
>>> +		VHOST_LOG_DATA(DEBUG,
>>> +				"(%d) failed to get enough desc from vring\n",
>>> +				dev->vid);
>>> +		return -1;
>>> +	}
>>> +
>>> +	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end
>> index %d\n",
>>> +			dev->vid, vq->last_avail_idx,
>>> +			vq->last_avail_idx + *nr_descs);
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static __rte_noinline uint32_t
>>> +virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
>>> +	struct vhost_virtqueue *vq, uint16_t queue_id,
>>> +	struct rte_mbuf **pkts, uint32_t count,
>>> +	struct rte_mbuf **comp_pkts, uint32_t *comp_count) {
>>> +	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
>>> +	uint16_t async_descs_idx = 0;
>>> +	uint16_t num_buffers;
>>> +	uint16_t num_desc;
>>> +
>>> +	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
>>> +	struct iovec *vec_pool = vq->vec_pool;
>>> +	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
>>> +	struct iovec *src_iovec = vec_pool;
>>> +	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >>
>> 1);
>>> +	struct rte_vhost_iov_iter *src_it = it_pool;
>>> +	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
>>> +	uint16_t slot_idx = 0;
>>> +	uint16_t segs_await = 0;
>>> +	uint16_t iovec_idx = 0, it_idx = 0;
>>> +	struct async_inflight_info *pkts_info = vq->async_pkts_info;
>>> +	uint32_t n_pkts = 0, pkt_err = 0;
>>> +	uint32_t num_async_pkts = 0, num_done_pkts = 0;
>>> +	struct vring_packed_desc async_descs[vq->size];
>>> +
>>> +	rte_prefetch0(&vq->desc_packed[vq->last_avail_idx & (vq->size -
>>> +1)]);
>>
>> The size of the ring is not necessarily a power of two with packed ring.
> 
> Since the size of the ring is not necessarily a power of two,
> maybe I can use something like
> idx % vq->size ?
> I'm not sure if that is a good way to do it.

In this case it is OK.

>>
>>> +
>>> +	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
>>> +		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq,
>>> +						pkts[pkt_idx],
>>> +						&num_desc, &num_buffers,
>>> +
>> 	&async_descs[async_descs_idx],
>>> +						&src_iovec[iovec_idx],
>>> +						&dst_iovec[iovec_idx],
>>> +						&src_it[it_idx],
>>> +						&dst_it[it_idx]) < 0))
>>> +			break;
>>> +
>>> +		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end
>> index %d\n",
>>> +			dev->vid, vq->last_avail_idx,
>>> +			vq->last_avail_idx + num_desc);
>>> +
>>> +		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
>>> +			(vq->size - 1);
>>
>> Same here.
> 
> Sure.
> 
>>
>>> +		if (src_it[it_idx].count) {
>>> +			uint16_t from, to;
>>> +
>>> +			async_descs_idx += num_desc;
>>> +			async_fill_desc(&tdes[pkt_burst_idx++],
>> &src_it[it_idx],
>>> +					&dst_it[it_idx]);
>>> +			pkts_info[slot_idx].descs = num_desc;
>>> +			pkts_info[slot_idx].nr_buffers = num_buffers;
>>> +			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
>>> +			num_async_pkts++;
>>> +			iovec_idx += src_it[it_idx].nr_segs;
>>> +			it_idx += 2;
>>> +
>>> +			segs_await += src_it[it_idx].nr_segs;
>>> +
>>> +			/**
>>> +			 * recover shadow used ring and keep DMA-occupied
>>> +			 * descriptors.
>>> +			 */
>>> +			from = vq->shadow_used_idx - num_buffers;
>>> +			to = vq->async_packed_buffer_idx & (vq->size - 1);
>>> +			shadow_ring_store(vq, vq->shadow_used_packed,
>>> +					vq->async_buffers_packed,
>>> +					from, to, num_buffers,
>>> +					sizeof(struct
>> vring_used_elem_packed));
>>> +
>>> +			vq->async_packed_buffer_idx += num_buffers;
>>> +			vq->shadow_used_idx -= num_buffers;
>>> +		} else
>>
>> Brackets needed.
> 
> Sure.
> 
>>
>>> +			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
>>> +
>>> +		vq_inc_last_avail_packed(vq, num_desc);
>>> +
>>> +		/*
>>> +		 * conditions to trigger async device transfer:
>>> +		 * - buffered packet number reaches transfer threshold
>>> +		 * - unused async iov number is less than max vhost vector
>>> +		 */
>>> +		if (unlikely(pkt_burst_idx >=
>> VHOST_ASYNC_BATCH_THRESHOLD ||
>>> +			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await <
>>> +			BUF_VECTOR_MAX))) {
>>> +			n_pkts = vq->async_ops.transfer_data(dev->vid,
>>> +					queue_id, tdes, 0, pkt_burst_idx);
>>> +			iovec_idx = 0;
>>> +			it_idx = 0;
>>> +			segs_await = 0;
>>> +			vq->async_pkts_inflight_n += n_pkts;
>>> +
>>> +			if (unlikely(n_pkts < pkt_burst_idx)) {
>>> +				/*
>>> +				 * log error packets number here and do
>> actual
>>> +				 * error processing when applications poll
>>> +				 * completion
>>> +				 */
>>> +				pkt_err = pkt_burst_idx - n_pkts;
>>> +				pkt_burst_idx = 0;
>>> +				pkt_idx++;
>>> +				break;
>>> +			}
>>> +
>>> +			pkt_burst_idx = 0;
>>> +		}
>>> +	}
>>> +
>>> +	if (pkt_burst_idx) {
>>> +		n_pkts = vq->async_ops.transfer_data(dev->vid,
>>> +				queue_id, tdes, 0, pkt_burst_idx);
>>> +		vq->async_pkts_inflight_n += n_pkts;
>>> +
>>> +		if (unlikely(n_pkts < pkt_burst_idx))
>>> +			pkt_err = pkt_burst_idx - n_pkts;
>>> +	}
>>> +
>>> +	do_data_copy_enqueue(dev, vq);
>>> +
>>> +	if (unlikely(pkt_err)) {
>>> +		uint16_t descs_err = 0;
>>> +		uint16_t buffers_err = 0;
>>> +
>>> +		num_async_pkts -= pkt_err;
>>> +		pkt_idx -= pkt_err;
>>> +	/* calculate the sum of buffers and descs of DMA-error packets. */
>>> +		while (pkt_err-- > 0) {
>>> +			descs_err +=
>>> +				pkts_info[slot_idx & (vq->size - 1)].descs;
>>
>> The size of the ring is not necessarily a power of two with packed ring.
> 
> Will be fixed.
> 
>>
>>> +			buffers_err +=
>>> +				pkts_info[slot_idx & (vq->size -
>> 1)].nr_buffers;
>>
>> Ditto.
> 
> Will be fixed.
> 
>>
>>> +			slot_idx--;
>>> +		}
>>> +
>>> +		vq->async_packed_buffer_idx -= buffers_err;
>>> +
>>> +		if (vq->last_avail_idx >= descs_err) {
>>> +			vq->last_avail_idx -= descs_err;
>>> +
>>> +			rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
>>> +				&async_descs[async_descs_idx - descs_err],
>>> +				descs_err * sizeof(struct
>> vring_packed_desc));
>>> +		} else {
>>> +			uint16_t nr_copy;
>>> +
>>> +			vq->last_avail_idx = vq->last_avail_idx + vq->size
>>> +						- descs_err;
>>> +			nr_copy = vq->size - vq->last_avail_idx;
>>> +			rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
>>> +				&async_descs[async_descs_idx - descs_err],
>>> +				nr_copy * sizeof(struct vring_packed_desc));
>>> +			descs_err -= nr_copy;
>>> +			rte_memcpy(vq->desc_packed,
>>> +				&async_descs[async_descs_idx - descs_err],
>>> +				descs_err * sizeof(struct
>> vring_packed_desc));
>>> +			vq->avail_wrap_counter ^= 1;
>>> +		}
>>> +
>>> +		num_done_pkts = pkt_idx - num_async_pkts;
>>> +	}
>>
>> This error handling could be moved to a dedicated function.
> 
> Sure, will fix it in the next version.
> 
>>
>>> +	vq->async_pkts_idx += num_async_pkts;
>>> +	*comp_count = num_done_pkts;
>>> +
>>> +	if (likely(vq->shadow_used_idx)) {
>>> +		vhost_flush_enqueue_shadow_packed(dev, vq);
>>> +		vhost_vring_call_packed(dev, vq);
>>> +	}
>>> +
>>> +	return pkt_idx;
>>> +}
>>
>> The function above is very big and complex; it should be possible to split it
>> into several smaller ones to make it maintainable.
> 
> I think moving the error handling code out will make it smaller.
> 
> Thanks.
> Cheng
> 
>>
>>> +
>>>  static __rte_always_inline void
>>>  write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t
>>> n_descs)  { @@ -1693,12 +2048,40 @@
>>> write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t
>> n_descs)
>>>  	} while (nr_left > 0);
>>>  }
>>>
>>> +static __rte_always_inline void
>>> +write_back_completed_descs_packed(struct vhost_virtqueue *vq,
>>> +				uint16_t n_buffers)
>>> +{
>>> +	uint16_t nr_left = n_buffers;
>>> +	uint16_t from, to;
>>> +
>>> +	do {
>>> +		from = vq->last_async_buffer_idx &
>>> +						(vq->size - 1);
>>> +		to = (from + nr_left) & (vq->size - 1);
>>
>> The size of the ring is not necessarily a power of two with packed ring.
> 
> Sure.
> 
>>
>>> +		if (to > from) {
>>> +			vhost_update_used_packed(vq,
>>> +				vq->async_buffers_packed + from,
>>> +				to - from);
>>> +			vq->last_async_buffer_idx += nr_left;
>>> +			nr_left = 0;
>>> +		} else {
>>> +			vhost_update_used_packed(vq,
>>> +				vq->async_buffers_packed + from,
>>> +				vq->size - from);
>>> +			vq->last_async_buffer_idx +=
>>> +						vq->size - from;
>>> +			nr_left -= vq->size - from;
>>> +		}
>>> +	} while (nr_left > 0);
>>> +}
>>> +
>>>  uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
>>>  		struct rte_mbuf **pkts, uint16_t count)  {
>>>  	struct virtio_net *dev = get_device(vid);
>>>  	struct vhost_virtqueue *vq;
>>> -	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
>>> +	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
>>>  	uint16_t start_idx, pkts_idx, vq_size;
>>>  	struct async_inflight_info *pkts_info;
>>>  	uint16_t from, i;
>>> @@ -1740,21 +2123,41 @@ uint16_t
>> rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
>>>  		goto done;
>>>  	}
>>>
>>> -	for (i = 0; i < n_pkts_put; i++) {
>>> -		from = (start_idx + i) & (vq_size - 1);
>>> -		n_descs += pkts_info[from].descs;
>>> -		pkts[i] = pkts_info[from].mbuf;
>>> +	if (vq_is_packed(dev)) {
>>> +		for (i = 0; i < n_pkts_put; i++) {
>>> +			from = (start_idx + i) & (vq_size - 1);
>>
>> Unlike split ring, packed ring size is not necessarily a power of 2.
> 
> Sure.
> Thanks.
> 
>>
>>> +			n_buffers += pkts_info[from].nr_buffers;
>>> +			pkts[i] = pkts_info[from].mbuf;
>>> +		}
>>> +	} else {
>>> +		for (i = 0; i < n_pkts_put; i++) {
>>> +			from = (start_idx + i) & (vq_size - 1);
>>> +			n_descs += pkts_info[from].descs;
>>> +			pkts[i] = pkts_info[from].mbuf;
>>> +		}
>>>  	}
>>> +
>>>  	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
>>>  	vq->async_pkts_inflight_n -= n_pkts_put;
>>>
>>>  	if (likely(vq->enabled && vq->access_ok)) {
>>> -		write_back_completed_descs_split(vq, n_descs);
>>> +		if (vq_is_packed(dev)) {
>>> +			write_back_completed_descs_packed(vq,
>> n_buffers);
>>>
>>> -		__atomic_add_fetch(&vq->used->idx, n_descs,
>> __ATOMIC_RELEASE);
>>> -		vhost_vring_call_split(dev, vq);
>>> -	} else
>>> -		vq->last_async_desc_idx += n_descs;
>>> +			vhost_vring_call_packed(dev, vq);
>>> +		} else {
>>> +			write_back_completed_descs_split(vq, n_descs);
>>> +
>>> +			__atomic_add_fetch(&vq->used->idx, n_descs,
>>> +					__ATOMIC_RELEASE);
>>> +			vhost_vring_call_split(dev, vq);
>>> +		}
>>> +	} else {
>>> +		if (vq_is_packed(dev))
>>> +			vq->last_async_buffer_idx += n_buffers;
>>> +		else
>>> +			vq->last_async_desc_idx += n_descs;
>>> +	}
>>>
>>>  done:
>>>  	rte_spinlock_unlock(&vq->access_lock);
>>> @@ -1795,9 +2198,10 @@ virtio_dev_rx_async_submit(struct virtio_net
>> *dev, uint16_t queue_id,
>>>  	if (count == 0)
>>>  		goto out;
>>>
>>> -	/* TODO: packed queue not implemented */
>>>  	if (vq_is_packed(dev))
>>> -		nb_tx = 0;
>>> +		nb_tx = virtio_dev_rx_async_submit_packed(dev,
>>> +				vq, queue_id, pkts, count, comp_pkts,
>>> +				comp_count);
>>>  	else
>>>  		nb_tx = virtio_dev_rx_async_submit_split(dev,
>>>  				vq, queue_id, pkts, count, comp_pkts,
>>>
> 


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v5 2/4] vhost: add support for packed ring in async vhost
  2021-04-13 13:08         ` Maxime Coquelin
@ 2021-04-13 13:50           ` Jiang, Cheng1
  0 siblings, 0 replies; 60+ messages in thread
From: Jiang, Cheng1 @ 2021-04-13 13:50 UTC (permalink / raw)
  To: Maxime Coquelin, Xia, Chenbo
  Cc: dev, Hu, Jiayu, Yang, YvonneX, Wang, Yinan, Liu, Yong



> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Tuesday, April 13, 2021 9:08 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>; Xia, Chenbo
> <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Yang, YvonneX
> <yvonnex.yang@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Liu,
> Yong <yong.liu@intel.com>
> Subject: Re: [PATCH v5 2/4] vhost: add support for packed ring in async vhost
> 
> 
> 
> On 4/13/21 1:48 PM, Jiang, Cheng1 wrote:
> > Hi Maxime,
> >
> >> -----Original Message-----
> >> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >> Sent: Tuesday, April 13, 2021 4:37 PM
> >> To: Jiang, Cheng1 <cheng1.jiang@intel.com>; Xia, Chenbo
> >> <chenbo.xia@intel.com>
> >> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Yang, YvonneX
> >> <yvonnex.yang@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Liu,
> >> Yong <yong.liu@intel.com>
> >> Subject: Re: [PATCH v5 2/4] vhost: add support for packed ring in
> >> async vhost
> >>
> >>
> >>
> >> On 4/12/21 1:34 PM, Cheng Jiang wrote:
> >>> For now async vhost data path only supports split ring structure. In
> >>> order to make async vhost compatible with virtio 1.1 spec this patch
> >>> enables packed ring in async vhost data path.
> >>>
> >>> Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> >>> ---
> >>>  lib/librte_vhost/rte_vhost_async.h |   1 +
> >>>  lib/librte_vhost/vhost.c           |  27 +-
> >>>  lib/librte_vhost/vhost.h           |   7 +-
> >>>  lib/librte_vhost/virtio_net.c      | 438
> +++++++++++++++++++++++++++--
> >>>  4 files changed, 448 insertions(+), 25 deletions(-)
> >>>
> >>> diff --git a/lib/librte_vhost/rte_vhost_async.h
> >>> b/lib/librte_vhost/rte_vhost_async.h
> >>> index c855ff875..6faa31f5a 100644
> >>> --- a/lib/librte_vhost/rte_vhost_async.h
> >>> +++ b/lib/librte_vhost/rte_vhost_async.h
> >>> @@ -89,6 +89,7 @@ struct rte_vhost_async_channel_ops {  struct
> >>> async_inflight_info {
> >>>  	struct rte_mbuf *mbuf;
> >>>  	uint16_t descs; /* num of descs inflight */
> >>> +	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
> >>>  };
> >>>
> >>>  /**
> >>> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> >>> index a70fe01d8..8c9935c0f 100644
> >>> --- a/lib/librte_vhost/vhost.c
> >>> +++ b/lib/librte_vhost/vhost.c
> >>> @@ -342,15 +342,21 @@ vhost_free_async_mem(struct
> vhost_virtqueue
> >> *vq)
> >>> {
> >>>  	if (vq->async_pkts_info)
> >>>  		rte_free(vq->async_pkts_info);
> >>> -	if (vq->async_descs_split)
> >>> +	if (vq->async_buffers_packed) {
> >>> +		rte_free(vq->async_buffers_packed);
> >>> +		vq->async_buffers_packed = NULL;
> >>> +	}
> >>> +	if (vq->async_descs_split) {
> >>
> >> You can remove the check, rte_free is safe with NULL pointers.
> >> You can do the same for the other ones in this function.
> >
> > OK, it will be fixed.
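
For reference, the shape I have in mind is roughly the standalone sketch
below (plain free() and a stub struct are used here only to keep it
compilable on its own; the real code keeps rte_free() and struct
vhost_virtqueue):

#include <stdlib.h>

/* Stand-ins for the vq pointers; the two async pointers share a union,
 * as in the patch, so freeing one name covers the other. */
struct vq_ptrs_stub {
	void *async_pkts_info;
	union {
		void *async_descs_split;
		void *async_buffers_packed;
	};
	void *it_pool;
	void *vec_pool;
};

static void
free_async_mem_sketch(struct vq_ptrs_stub *vq)
{
	/* free(NULL), like rte_free(NULL), is a no-op: no checks needed. */
	free(vq->async_pkts_info);
	free(vq->async_descs_split);	/* same storage as async_buffers_packed */
	free(vq->it_pool);
	free(vq->vec_pool);

	vq->async_pkts_info = NULL;
	vq->async_descs_split = NULL;	/* clears async_buffers_packed too */
	vq->it_pool = NULL;
	vq->vec_pool = NULL;
}

int
main(void)
{
	struct vq_ptrs_stub vq = { 0 };

	vq.async_pkts_info = malloc(16);
	vq.async_buffers_packed = malloc(16);
	free_async_mem_sketch(&vq);	/* it_pool/vec_pool are NULL: still fine */
	return 0;
}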
> >
> >>
> >>>  		rte_free(vq->async_descs_split);
> >>> +		vq->async_descs_split = NULL;
> >>> +	}
> >>> +
> >>>  	if (vq->it_pool)
> >>>  		rte_free(vq->it_pool);
> >>>  	if (vq->vec_pool)
> >>>  		rte_free(vq->vec_pool);
> >>>
> >>>  	vq->async_pkts_info = NULL;
> >>> -	vq->async_descs_split = NULL;
> >>>  	vq->it_pool = NULL;
> >>>  	vq->vec_pool = NULL;
> >>>  }
> >>> @@ -1627,9 +1633,9 @@ int rte_vhost_async_channel_register(int vid,
> >> uint16_t queue_id,
> >>>  		return -1;
> >>>
> >>>  	/* packed queue is not supported */
> >>> -	if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
> >>> +	if (unlikely(!f.async_inorder)) {
> >>>  		VHOST_LOG_CONFIG(ERR,
> >>> -			"async copy is not supported on packed queue or
> >> non-inorder mode "
> >>> +			"async copy is not supported on non-inorder mode "
> >>>  			"(vid %d, qid: %d)\n", vid, queue_id);
> >>>  		return -1;
> >>>  	}
> >>> @@ -1667,11 +1673,18 @@ int rte_vhost_async_channel_register(int
> >>> vid,
> >> uint16_t queue_id,
> >>>  	vq->vec_pool = rte_malloc_socket(NULL,
> >>>  			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
> >>>  			RTE_CACHE_LINE_SIZE, node);
> >>> -	vq->async_descs_split = rte_malloc_socket(NULL,
> >>> +	if (vq_is_packed(dev)) {
> >>> +		vq->async_buffers_packed = rte_malloc_socket(NULL,
> >>> +			vq->size * sizeof(struct vring_used_elem_packed),
> >>> +			RTE_CACHE_LINE_SIZE, node);
> >>> +	} else {
> >>> +		vq->async_descs_split = rte_malloc_socket(NULL,
> >>>  			vq->size * sizeof(struct vring_used_elem),
> >>>  			RTE_CACHE_LINE_SIZE, node);
> >>> -	if (!vq->async_descs_split || !vq->async_pkts_info ||
> >>> -		!vq->it_pool || !vq->vec_pool) {
> >>> +	}
> >>> +
> >>> +	if (!vq->async_buffers_packed || !vq->async_descs_split ||
> >>> +		!vq->async_pkts_info || !vq->it_pool || !vq->vec_pool) {
> >>>  		vhost_free_async_mem(vq);
> >>>  		VHOST_LOG_CONFIG(ERR,
> >>>  				"async register failed: cannot allocate
> >> memory for vq data "
> >>> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> >>> index f628714c2..fe131ae8f 100644
> >>> --- a/lib/librte_vhost/vhost.h
> >>> +++ b/lib/librte_vhost/vhost.h
> >>> @@ -201,9 +201,14 @@ struct vhost_virtqueue {
> >>>  	uint16_t	async_pkts_idx;
> >>>  	uint16_t	async_pkts_inflight_n;
> >>>  	uint16_t	async_last_pkts_n;
> >>> -	struct vring_used_elem  *async_descs_split;
> >>> +	union {
> >>> +		struct vring_used_elem  *async_descs_split;
> >>> +		struct vring_used_elem_packed *async_buffers_packed;
> >>> +	};
> >>>  	uint16_t async_desc_idx;
> >>> +	uint16_t async_packed_buffer_idx;
> >>
> >> Don't duplicate variable names, async_desc_idx can be reused for
> >> packed ring. Also, they represent the same thing; why use desc
> >> in one case and buffer in the other?
> >
> > The main reason is that the unit of the packed used ring is a buffer, which
> > can contain many descriptors.
> > I think using desc_idx would cause ambiguity, but if you think I should
> > reuse desc_idx, I have no problem with that.
> 
> OK, in this case please use a union to avoid wasting memory.

Sure, I'll fix it.

> 
> >>
> >>>  	uint16_t last_async_desc_idx;
> >>> +	uint16_t last_async_buffer_idx;
> >>
> >> Same remark here.
> >>
> >>>  	/* vq async features */
> >>>  	bool		async_inorder;
> >>> diff --git a/lib/librte_vhost/virtio_net.c
> >>> b/lib/librte_vhost/virtio_net.c index c43ab0093..410be9678 100644
> >>> --- a/lib/librte_vhost/virtio_net.c
> >>> +++ b/lib/librte_vhost/virtio_net.c
> >>> @@ -363,14 +363,14 @@
> >>> vhost_shadow_dequeue_single_packed_inorder(struct
> vhost_virtqueue
> >> *vq,
> >>> }
> >>>
> >>>  static __rte_always_inline void
> >>> -vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
> >>> -				   struct vhost_virtqueue *vq,
> >>> -				   uint32_t len[],
> >>> -				   uint16_t id[],
> >>> -				   uint16_t count[],
> >>> +vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
> >>> +				   uint32_t *len,
> >>> +				   uint16_t *id,
> >>> +				   uint16_t *count,
> >>>  				   uint16_t num_buffers)
> >>>  {
> >>>  	uint16_t i;
> >>> +
> >>>  	for (i = 0; i < num_buffers; i++) {
> >>>  		/* enqueue shadow flush action aligned with batch num */
> >>>  		if (!vq->shadow_used_idx)
> >>> @@ -382,6 +382,17 @@ vhost_shadow_enqueue_single_packed(struct
> >> virtio_net *dev,
> >>>  		vq->shadow_aligned_idx += count[i];
> >>>  		vq->shadow_used_idx++;
> >>>  	}
> >>> +}
> >>> +
> >>> +static __rte_always_inline void
> >>> +vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
> >>> +				   struct vhost_virtqueue *vq,
> >>> +				   uint32_t *len,
> >>> +				   uint16_t *id,
> >>> +				   uint16_t *count,
> >>> +				   uint16_t num_buffers)
> >>> +{
> >>> +	vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
> >>>
> >>>  	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
> >>>  		do_data_copy_enqueue(dev, vq);
> >>> @@ -1481,6 +1492,62 @@ shadow_ring_store(struct vhost_virtqueue
> *vq,
> >> void *shadow_ring, void *d_ring,
> >>>  	}
> >>>  }
> >>>
> >>> +static __rte_always_inline void
> >>> +vhost_update_used_packed(struct vhost_virtqueue *vq,
> >>> +			struct vring_used_elem_packed *shadow_ring,
> >>> +			uint16_t count)
> >>> +{
> >>> +	if (count == 0)
> >>> +		return;
> >>
> >> Move this after the variable declarations.
> >
> > Sure.
> >
> >>
> >>> +
> >>> +	int i;
> >>> +	uint16_t used_idx = vq->last_used_idx;
> >>> +	uint16_t head_idx = vq->last_used_idx;
> >>> +	uint16_t head_flags = 0;
> >>> +
> >>> +	/* Split loop in two to save memory barriers */
> >>> +	for (i = 0; i < count; i++) {
> >>> +		vq->desc_packed[used_idx].id = shadow_ring[i].id;
> >>> +		vq->desc_packed[used_idx].len = shadow_ring[i].len;
> >>> +
> >>> +		used_idx += shadow_ring[i].count;
> >>> +		if (used_idx >= vq->size)
> >>> +			used_idx -= vq->size;
> >>> +	}
> >>> +
> >>> +	/* The ordering for storing desc flags needs to be enforced. */
> >>> +	rte_atomic_thread_fence(__ATOMIC_RELEASE);
> >>> +
> >>> +	for (i = 0; i < count; i++) {
> >>> +		uint16_t flags;
> >>> +
> >>> +		if (vq->shadow_used_packed[i].len)
> >>> +			flags = VRING_DESC_F_WRITE;
> >>> +		else
> >>> +			flags = 0;
> >>> +
> >>> +		if (vq->used_wrap_counter) {
> >>> +			flags |= VRING_DESC_F_USED;
> >>> +			flags |= VRING_DESC_F_AVAIL;
> >>> +		} else {
> >>> +			flags &= ~VRING_DESC_F_USED;
> >>> +			flags &= ~VRING_DESC_F_AVAIL;
> >>> +		}
> >>> +
> >>> +		if (i > 0) {
> >>> +			vq->desc_packed[vq->last_used_idx].flags = flags;
> >>> +
> >>> +		} else {
> >>> +			head_idx = vq->last_used_idx;
> >>> +			head_flags = flags;
> >>> +		}
> >>> +
> >>> +		vq_inc_last_used_packed(vq, shadow_ring[i].count);
> >>> +	}
> >>> +
> >>> +	vq->desc_packed[head_idx].flags = head_flags; }
> >>> +
> >>>  static __rte_noinline uint32_t
> >>>  virtio_dev_rx_async_submit_split(struct virtio_net *dev,
> >>>  	struct vhost_virtqueue *vq, uint16_t queue_id, @@ -1656,6
> >> +1723,294
> >>> @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
> >>>  	return pkt_idx;
> >>>  }
> >>>
> >>> +static __rte_always_inline int
> >>> +vhost_enqueue_async_single_packed(struct virtio_net *dev,
> >>> +			    struct vhost_virtqueue *vq,
> >>> +			    struct rte_mbuf *pkt,
> >>> +			    struct buf_vector *buf_vec,
> >>> +			    uint16_t *nr_descs,
> >>> +			    uint16_t *nr_buffers,
> >>> +			    struct vring_packed_desc *async_descs,
> >>> +			    struct iovec *src_iovec, struct iovec *dst_iovec,
> >>> +			    struct rte_vhost_iov_iter *src_it,
> >>> +			    struct rte_vhost_iov_iter *dst_it) {
> >>> +	uint16_t nr_vec = 0;
> >>> +	uint16_t avail_idx = vq->last_avail_idx;
> >>> +	uint16_t max_tries, tries = 0;
> >>> +	uint16_t buf_id = 0;
> >>> +	uint32_t len = 0;
> >>> +	uint16_t desc_count = 0;
> >>> +	uint32_t size = pkt->pkt_len + sizeof(struct
> >> virtio_net_hdr_mrg_rxbuf);
> >>> +	uint32_t buffer_len[vq->size];
> >>> +	uint16_t buffer_buf_id[vq->size];
> >>> +	uint16_t buffer_desc_count[vq->size];
> >>> +	*nr_buffers = 0;
> >>> +
> >>> +	if (rxvq_is_mergeable(dev))
> >>> +		max_tries = vq->size - 1;
> >>> +	else
> >>> +		max_tries = 1;
> >>> +
> >>> +	while (size > 0) {
> >>> +		/*
> >>> +		 * if we tried all available ring items, and still
> >>> +		 * can't get enough buf, it means something abnormal
> >>> +		 * happened.
> >>> +		 */
> >>> +		if (unlikely(++tries > max_tries))
> >>> +			return -1;
> >>> +
> >>> +		if (unlikely(fill_vec_buf_packed(dev, vq,
> >>> +						avail_idx, &desc_count,
> >>> +						buf_vec, &nr_vec,
> >>> +						&buf_id, &len,
> >>> +						VHOST_ACCESS_RW) < 0))
> >>> +			return -1;
> >>> +
> >>> +		len = RTE_MIN(len, size);
> >>> +		size -= len;
> >>> +
> >>> +		buffer_len[*nr_buffers] = len;
> >>> +		buffer_buf_id[*nr_buffers] = buf_id;
> >>> +		buffer_desc_count[*nr_buffers] = desc_count;
> >>> +		*nr_buffers += 1;
> >>> +
> >>> +		*nr_descs += desc_count;
> >>> +		avail_idx += desc_count;
> >>> +		if (avail_idx >= vq->size)
> >>> +			avail_idx -= vq->size;
> >>> +	}
> >>> +
> >>> +	if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers,
> >>> +		src_iovec, dst_iovec, src_it, dst_it) < 0)
> >>> +		return -1;
> >>> +	/* store descriptors for DMA */
> >>> +	if (avail_idx >= *nr_descs)
> >>> +		rte_memcpy(async_descs,
> >>> +			&vq->desc_packed[vq->last_avail_idx],
> >>> +			*nr_descs * sizeof(struct vring_packed_desc));
> >>
> >> Please add brackets for the 'if' since there are for the 'else'.
> >
> > Sure, sorry for that.
> >
> >>
> >>> +	else {
> >>> +		uint16_t nr_copy = vq->size - vq->last_avail_idx;
> >>> +		rte_memcpy(async_descs,
> >>> +			&vq->desc_packed[vq->last_avail_idx],
> >>> +			nr_copy * sizeof(struct vring_packed_desc));
> >>> +		rte_memcpy(async_descs + nr_copy,
> >>> +			vq->desc_packed, (*nr_descs - nr_copy) *
> >>> +			sizeof(struct vring_packed_desc));
> >>> +	}
> >>> +
> >>> +	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id,
> >>> +					   buffer_desc_count, *nr_buffers);
> >>> +
> >>> +	return 0;
> >>> +}
> >>> +
> >>> +static __rte_always_inline int16_t
> >>> +virtio_dev_rx_async_single_packed(struct virtio_net *dev,
> >>> +			    struct vhost_virtqueue *vq,
> >>> +			    struct rte_mbuf *pkt,
> >>> +			    uint16_t *nr_descs, uint16_t *nr_buffers,
> >>> +			    struct vring_packed_desc *async_descs,
> >>> +			    struct iovec *src_iovec, struct iovec *dst_iovec,
> >>> +			    struct rte_vhost_iov_iter *src_it,
> >>> +			    struct rte_vhost_iov_iter *dst_it) {
> >>> +	struct buf_vector buf_vec[BUF_VECTOR_MAX];
> >>> +	*nr_descs = 0;
> >>> +	*nr_buffers = 0;
> >>> +
> >>> +	if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt,
> >> buf_vec,
> >>> +						 nr_descs,
> >>> +						 nr_buffers,
> >>> +						 async_descs,
> >>> +						 src_iovec, dst_iovec,
> >>> +						 src_it, dst_it) < 0)) {
> >>> +		VHOST_LOG_DATA(DEBUG,
> >>> +				"(%d) failed to get enough desc from vring\n",
> >>> +				dev->vid);
> >>> +		return -1;
> >>> +	}
> >>> +
> >>> +	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end
> >> index %d\n",
> >>> +			dev->vid, vq->last_avail_idx,
> >>> +			vq->last_avail_idx + *nr_descs);
> >>> +
> >>> +	return 0;
> >>> +}
> >>> +
> >>> +static __rte_noinline uint32_t
> >>> +virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
> >>> +	struct vhost_virtqueue *vq, uint16_t queue_id,
> >>> +	struct rte_mbuf **pkts, uint32_t count,
> >>> +	struct rte_mbuf **comp_pkts, uint32_t *comp_count) {
> >>> +	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
> >>> +	uint16_t async_descs_idx = 0;
> >>> +	uint16_t num_buffers;
> >>> +	uint16_t num_desc;
> >>> +
> >>> +	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
> >>> +	struct iovec *vec_pool = vq->vec_pool;
> >>> +	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
> >>> +	struct iovec *src_iovec = vec_pool;
> >>> +	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >>
> >> 1);
> >>> +	struct rte_vhost_iov_iter *src_it = it_pool;
> >>> +	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
> >>> +	uint16_t slot_idx = 0;
> >>> +	uint16_t segs_await = 0;
> >>> +	uint16_t iovec_idx = 0, it_idx = 0;
> >>> +	struct async_inflight_info *pkts_info = vq->async_pkts_info;
> >>> +	uint32_t n_pkts = 0, pkt_err = 0;
> >>> +	uint32_t num_async_pkts = 0, num_done_pkts = 0;
> >>> +	struct vring_packed_desc async_descs[vq->size];
> >>> +
> >>> +	rte_prefetch0(&vq->desc_packed[vq->last_avail_idx & (vq->size -
> >>> +1)]);
> >>
> >> The size of the ring is not necessarily a power of two with packed ring.
> >
> > Since the size of the ring is not necessarily a power of two, maybe I can
> > use something like idx % vq->size ?
> > I'm not sure if that is a good way to do it.
> 
> In this case it is OK.

OK. I'll fix them in the next version.

Thanks a lot.
Cheng

> 
> >>
> >>> +
> >>> +	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
> >>> +		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq,
> >>> +						pkts[pkt_idx],
> >>> +						&num_desc, &num_buffers,
> >>> +						&async_descs[async_descs_idx],
> >>> +						&src_iovec[iovec_idx],
> >>> +						&dst_iovec[iovec_idx],
> >>> +						&src_it[it_idx],
> >>> +						&dst_it[it_idx]) < 0))
> >>> +			break;
> >>> +
> >>> +		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
> >>> +			dev->vid, vq->last_avail_idx,
> >>> +			vq->last_avail_idx + num_desc);
> >>> +
> >>> +		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
> >>> +			(vq->size - 1);
> >>
> >> Same here.
> >
> > Sure.
> >
> >>
> >>> +		if (src_it[it_idx].count) {
> >>> +			uint16_t from, to;
> >>> +
> >>> +			async_descs_idx += num_desc;
> >>> +			async_fill_desc(&tdes[pkt_burst_idx++], &src_it[it_idx],
> >>> +					&dst_it[it_idx]);
> >>> +			pkts_info[slot_idx].descs = num_desc;
> >>> +			pkts_info[slot_idx].nr_buffers = num_buffers;
> >>> +			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
> >>> +			num_async_pkts++;
> >>> +			iovec_idx += src_it[it_idx].nr_segs;
> >>> +			it_idx += 2;
> >>> +
> >>> +			segs_await += src_it[it_idx].nr_segs;
> >>> +
> >>> +			/**
> >>> +			 * recover shadow used ring and keep DMA-occupied
> >>> +			 * descriptors.
> >>> +			 */
> >>> +			from = vq->shadow_used_idx - num_buffers;
> >>> +			to = vq->async_packed_buffer_idx & (vq->size - 1);
> >>> +			shadow_ring_store(vq, vq->shadow_used_packed,
> >>> +					vq->async_buffers_packed,
> >>> +					from, to, num_buffers,
> >>> +					sizeof(struct vring_used_elem_packed));
> >>> +
> >>> +			vq->async_packed_buffer_idx += num_buffers;
> >>> +			vq->shadow_used_idx -= num_buffers;
> >>> +		} else
> >>
> >> Brackets needed.
> >
> > Sure.
> >
> >>
> >>> +			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
> >>> +
> >>> +		vq_inc_last_avail_packed(vq, num_desc);
> >>> +
> >>> +		/*
> >>> +		 * conditions to trigger async device transfer:
> >>> +		 * - buffered packet number reaches transfer threshold
> >>> +		 * - unused async iov number is less than max vhost vector
> >>> +		 */
> >>> +		if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
> >>> +			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await <
> >>> +			BUF_VECTOR_MAX))) {
> >>> +			n_pkts = vq->async_ops.transfer_data(dev->vid,
> >>> +					queue_id, tdes, 0, pkt_burst_idx);
> >>> +			iovec_idx = 0;
> >>> +			it_idx = 0;
> >>> +			segs_await = 0;
> >>> +			vq->async_pkts_inflight_n += n_pkts;
> >>> +
> >>> +			if (unlikely(n_pkts < pkt_burst_idx)) {
> >>> +				/*
> >>> +				 * log error packets number here and do actual
> >>> +				 * error processing when applications poll
> >>> +				 * completion
> >>> +				 */
> >>> +				pkt_err = pkt_burst_idx - n_pkts;
> >>> +				pkt_burst_idx = 0;
> >>> +				pkt_idx++;
> >>> +				break;
> >>> +			}
> >>> +
> >>> +			pkt_burst_idx = 0;
> >>> +		}
> >>> +	}
> >>> +
> >>> +	if (pkt_burst_idx) {
> >>> +		n_pkts = vq->async_ops.transfer_data(dev->vid,
> >>> +				queue_id, tdes, 0, pkt_burst_idx);
> >>> +		vq->async_pkts_inflight_n += n_pkts;
> >>> +
> >>> +		if (unlikely(n_pkts < pkt_burst_idx))
> >>> +			pkt_err = pkt_burst_idx - n_pkts;
> >>> +	}
> >>> +
> >>> +	do_data_copy_enqueue(dev, vq);
> >>> +
> >>> +	if (unlikely(pkt_err)) {
> >>> +		uint16_t descs_err = 0;
> >>> +		uint16_t buffers_err = 0;
> >>> +
> >>> +		num_async_pkts -= pkt_err;
> >>> +		pkt_idx -= pkt_err;
> >>> +	/* calculate the sum of buffers and descs of DMA-error packets. */
> >>> +		while (pkt_err-- > 0) {
> >>> +			descs_err +=
> >>> +				pkts_info[slot_idx & (vq->size - 1)].descs;
> >>
> >> The size of the ring is not necessarily a power of two with packed ring.
> >
> > Will be fixed.
> >
> >>
> >>> +			buffers_err +=
> >>> +				pkts_info[slot_idx & (vq->size - 1)].nr_buffers;
> >>
> >> Ditto.
> >
> > Will be fixed.
> >
> >>
> >>> +			slot_idx--;
> >>> +		}
> >>> +
> >>> +		vq->async_packed_buffer_idx -= buffers_err;
> >>> +
> >>> +		if (vq->last_avail_idx >= descs_err) {
> >>> +			vq->last_avail_idx -= descs_err;
> >>> +
> >>> +			rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
> >>> +				&async_descs[async_descs_idx - descs_err],
> >>> +				descs_err * sizeof(struct vring_packed_desc));
> >>> +		} else {
> >>> +			uint16_t nr_copy;
> >>> +
> >>> +			vq->last_avail_idx = vq->last_avail_idx + vq->size
> >>> +						- descs_err;
> >>> +			nr_copy = vq->size - vq->last_avail_idx;
> >>> +			rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
> >>> +				&async_descs[async_descs_idx - descs_err],
> >>> +				nr_copy * sizeof(struct vring_packed_desc));
> >>> +			descs_err -= nr_copy;
> >>> +			rte_memcpy(vq->desc_packed,
> >>> +				&async_descs[async_descs_idx - descs_err],
> >>> +				descs_err * sizeof(struct vring_packed_desc));
> >>> +			vq->avail_wrap_counter ^= 1;
> >>> +		}
> >>> +
> >>> +		num_done_pkts = pkt_idx - num_async_pkts;
> >>> +	}
> >>
> >> This error handling could be moved in a dedicated function.
> >
> > Sure, will fix it in the next version.
> >
> >>
> >>> +	vq->async_pkts_idx += num_async_pkts;
> >>> +	*comp_count = num_done_pkts;
> >>> +
> >>> +	if (likely(vq->shadow_used_idx)) {
> >>> +		vhost_flush_enqueue_shadow_packed(dev, vq);
> >>> +		vhost_vring_call_packed(dev, vq);
> >>> +	}
> >>> +
> >>> +	return pkt_idx;
> >>> +}
> >>
> >> Above function is very big and complex, it should be possible to
> >> split it in several ones to make it maintainable.
> >
> > I think moving the error handling code out will make it smaller.
> >
> > Thanks.
> > Cheng
> >
> >>
> >>> +
> >>>  static __rte_always_inline void
> >>>  write_back_completed_descs_split(struct vhost_virtqueue *vq,
> >>> uint16_t n_descs)
> >>>  {
> >>> @@ -1693,12 +2048,40 @@ write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
> >>>  	} while (nr_left > 0);
> >>>  }
> >>>
> >>> +static __rte_always_inline void
> >>> +write_back_completed_descs_packed(struct vhost_virtqueue *vq,
> >>> +				uint16_t n_buffers)
> >>> +{
> >>> +	uint16_t nr_left = n_buffers;
> >>> +	uint16_t from, to;
> >>> +
> >>> +	do {
> >>> +		from = vq->last_async_buffer_idx &
> >>> +						(vq->size - 1);
> >>> +		to = (from + nr_left) & (vq->size - 1);
> >>
> >> The size of the ring is not necessarily a power of two with packed ring.
> >
> > Sure.
> >
> >>
> >>> +		if (to > from) {
> >>> +			vhost_update_used_packed(vq,
> >>> +				vq->async_buffers_packed + from,
> >>> +				to - from);
> >>> +			vq->last_async_buffer_idx += nr_left;
> >>> +			nr_left = 0;
> >>> +		} else {
> >>> +			vhost_update_used_packed(vq,
> >>> +				vq->async_buffers_packed + from,
> >>> +				vq->size - from);
> >>> +			vq->last_async_buffer_idx +=
> >>> +						vq->size - from;
> >>> +			nr_left -= vq->size - from;
> >>> +		}
> >>> +	} while (nr_left > 0);
> >>> +}
> >>> +
> >>>  uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
> >>>  		struct rte_mbuf **pkts, uint16_t count)  {
> >>>  	struct virtio_net *dev = get_device(vid);
> >>>  	struct vhost_virtqueue *vq;
> >>> -	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
> >>> +	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
> >>>  	uint16_t start_idx, pkts_idx, vq_size;
> >>>  	struct async_inflight_info *pkts_info;
> >>>  	uint16_t from, i;
> >>> @@ -1740,21 +2123,41 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
> >>>  		goto done;
> >>>  	}
> >>>
> >>> -	for (i = 0; i < n_pkts_put; i++) {
> >>> -		from = (start_idx + i) & (vq_size - 1);
> >>> -		n_descs += pkts_info[from].descs;
> >>> -		pkts[i] = pkts_info[from].mbuf;
> >>> +	if (vq_is_packed(dev)) {
> >>> +		for (i = 0; i < n_pkts_put; i++) {
> >>> +			from = (start_idx + i) & (vq_size - 1);
> >>
> >> Unlike split ring, packed ring size is not necessarily a power of 2.
> >
> > Sure.
> > Thanks.
> >
> >>
> >>> +			n_buffers += pkts_info[from].nr_buffers;
> >>> +			pkts[i] = pkts_info[from].mbuf;
> >>> +		}
> >>> +	} else {
> >>> +		for (i = 0; i < n_pkts_put; i++) {
> >>> +			from = (start_idx + i) & (vq_size - 1);
> >>> +			n_descs += pkts_info[from].descs;
> >>> +			pkts[i] = pkts_info[from].mbuf;
> >>> +		}
> >>>  	}
> >>> +
> >>>  	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
> >>>  	vq->async_pkts_inflight_n -= n_pkts_put;
> >>>
> >>>  	if (likely(vq->enabled && vq->access_ok)) {
> >>> -		write_back_completed_descs_split(vq, n_descs);
> >>> +		if (vq_is_packed(dev)) {
> >>> +			write_back_completed_descs_packed(vq, n_buffers);
> >>>
> >>> -		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
> >>> -		vhost_vring_call_split(dev, vq);
> >>> -	} else
> >>> -		vq->last_async_desc_idx += n_descs;
> >>> +			vhost_vring_call_packed(dev, vq);
> >>> +		} else {
> >>> +			write_back_completed_descs_split(vq, n_descs);
> >>> +
> >>> +			__atomic_add_fetch(&vq->used->idx, n_descs,
> >>> +					__ATOMIC_RELEASE);
> >>> +			vhost_vring_call_split(dev, vq);
> >>> +		}
> >>> +	} else {
> >>> +		if (vq_is_packed(dev))
> >>> +			vq->last_async_buffer_idx += n_buffers;
> >>> +		else
> >>> +			vq->last_async_desc_idx += n_descs;
> >>> +	}
> >>>
> >>>  done:
> >>>  	rte_spinlock_unlock(&vq->access_lock);
> >>> @@ -1795,9 +2198,10 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
> >>>  	if (count == 0)
> >>>  		goto out;
> >>>
> >>> -	/* TODO: packed queue not implemented */
> >>>  	if (vq_is_packed(dev))
> >>> -		nb_tx = 0;
> >>> +		nb_tx = virtio_dev_rx_async_submit_packed(dev,
> >>> +				vq, queue_id, pkts, count, comp_pkts,
> >>> +				comp_count);
> >>>  	else
> >>>  		nb_tx = virtio_dev_rx_async_submit_split(dev,
> >>>  				vq, queue_id, pkts, count, comp_pkts,
> >>>
> >
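
To tie the review comments together: the ring-size remarks and the
duplicated two-branch rte_memcpy both concern the same wrap-around copy
pattern, which later revisions in this thread factor into
store_dma_desc_info_split()/store_dma_desc_info_packed(). A generic,
self-contained sketch of that pattern (illustrative only; plain memcpy
instead of rte_memcpy, and the names below are not from the patch):

#include <stdint.h>
#include <string.h>

/* Copy 'count' elements of 'elem_size' bytes into a ring of 'ring_size'
 * elements starting at slot 'd_idx', splitting the copy in two when the
 * destination region crosses the end of the ring. ring_size is not
 * assumed to be a power of two. */
static inline void
ring_store(void *d_ring, const void *src, uint16_t ring_size,
	   uint16_t d_idx, uint16_t count, size_t elem_size)
{
	char *dst = (char *)d_ring;

	if (d_idx + count <= ring_size) {
		/* Destination is contiguous: one copy. */
		memcpy(dst + (size_t)d_idx * elem_size, src,
		       (size_t)count * elem_size);
	} else {
		/* Destination wraps: copy up to the ring end, then
		 * continue from slot 0. */
		uint16_t tail = ring_size - d_idx;

		memcpy(dst + (size_t)d_idx * elem_size, src,
		       (size_t)tail * elem_size);
		memcpy(dst, (const char *)src + (size_t)tail * elem_size,
		       (size_t)(count - tail) * elem_size);
	}
}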


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v6 0/4] add support for packed ring in async vhost
  2021-03-17  8:54 [dpdk-dev] [PATCH] vhost: add support for packed ring in async vhost Cheng Jiang
                   ` (3 preceding siblings ...)
  2021-04-12 11:34 ` [dpdk-dev] [PATCH v5 0/4] add support for packed ring in async vhost Cheng Jiang
@ 2021-04-13 14:55 ` Cheng Jiang
  2021-04-13 14:55   ` [dpdk-dev] [PATCH v6 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
                     ` (3 more replies)
  2021-04-14  6:13 ` [dpdk-dev] [PATCH v7 0/4] add support for packed ring in async vhost Cheng Jiang
                   ` (2 subsequent siblings)
  7 siblings, 4 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-13 14:55 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

For now the async vhost data path only supports the split ring
structure. In order to make async vhost compatible with the virtio 1.1
spec, this patch set cleans up the async split ring code and enables
packed ring in the async vhost data path. The batch datapath is also
enabled for the async vhost packed ring.

v6:
 * fix some typos in commit log
 * improve index usage
 * remove shadow_ring_store()
 * add store_dma_desc_info_split() store_dma_desc_info_packed()
 * remove some checks in vhost_free_async_mem()
 * change index calculation since the size isn't necessarily a power of 2
 * move error handling in a dedicated function
 * clean codes
v5:
 * clean some codes for packed ring datapath
 * fix an index error in shadow_ring_store()
v4:
  * change the patch structure
  * clean code for async split ring
  * reuse some code from split ring
  * change the error handler for DMA-copy packet
  * add check for malloc
  * remove useless code
  * add doc update
v3:
  * fix error handler for DMA-copy packet
v2:
  * fix wrong buffer index in rte_vhost_poll_enqueue_completed()
  * add async_buffers_packed memory free in vhost_free_async_mem()

Cheng Jiang (4):
  vhost: abstract and reorganize async split ring code
  vhost: add support for packed ring in async vhost
  vhost: add batch datapath for async vhost packed ring
  doc: add release note for vhost async packed ring

 doc/guides/rel_notes/release_21_05.rst |   4 +
 lib/librte_vhost/rte_vhost_async.h     |   1 +
 lib/librte_vhost/vhost.c               |  37 +-
 lib/librte_vhost/vhost.h               |  15 +-
 lib/librte_vhost/virtio_net.c          | 593 +++++++++++++++++++++----
 5 files changed, 550 insertions(+), 100 deletions(-)

--
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v6 1/4] vhost: abstract and reorganize async split ring code
  2021-04-13 14:55 ` [dpdk-dev] [PATCH v6 0/4] add support for packed ring in async vhost Cheng Jiang
@ 2021-04-13 14:55   ` Cheng Jiang
  2021-04-13 14:55   ` [dpdk-dev] [PATCH v6 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-13 14:55 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

This patch puts some code of the async vhost split ring into inline
functions to improve the readability of the code. It also changes the
iterators from pointer style to index style to make the code more concise.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 lib/librte_vhost/virtio_net.c | 132 +++++++++++++++++-----------------
 1 file changed, 66 insertions(+), 66 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index ff3987860..438bdafd1 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1458,6 +1458,22 @@ virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
 		(vq_size - n_inflight + pkts_idx) & (vq_size - 1);
 }
 
+static __rte_always_inline void
+store_dma_desc_info_split(struct vring_used_elem *s_ring, struct vring_used_elem *d_ring,
+		uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
+{
+	uint16_t elem_size = sizeof(struct vring_used_elem);
+
+	if (d_idx + count <= ring_size) {
+		rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
+	} else {
+		uint16_t size = ring_size - d_idx;
+
+		rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
+		rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
+	}
+}
+
 static __rte_noinline uint32_t
 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct vhost_virtqueue *vq, uint16_t queue_id,
@@ -1474,10 +1490,9 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
 	struct iovec *src_iovec = vec_pool;
 	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
-	struct rte_vhost_iov_iter *src_it = it_pool;
-	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
 	uint16_t slot_idx = 0;
 	uint16_t segs_await = 0;
+	uint16_t iovec_idx = 0, it_idx = 0;
 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
 	uint32_t n_pkts = 0, pkt_err = 0;
 	uint32_t num_async_pkts = 0, num_done_pkts = 0;
@@ -1511,29 +1526,30 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			dev->vid, vq->last_avail_idx,
 			vq->last_avail_idx + num_buffers);
 
-		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx],
-				buf_vec, nr_vec, num_buffers,
-				src_iovec, dst_iovec, src_it, dst_it) < 0) {
+		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx], buf_vec, nr_vec, num_buffers,
+				&src_iovec[iovec_idx], &dst_iovec[iovec_idx],
+				&it_pool[it_idx], &it_pool[it_idx + 1]) < 0) {
 			vq->shadow_used_idx -= num_buffers;
 			break;
 		}
 
 		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
 			(vq->size - 1);
-		if (src_it->count) {
+		if (it_pool[it_idx].count) {
 			uint16_t from, to;
 
-			async_fill_desc(&tdes[pkt_burst_idx++], src_it, dst_it);
+			async_fill_desc(&tdes[pkt_burst_idx++],
+				&it_pool[it_idx], &it_pool[it_idx + 1]);
 			pkts_info[slot_idx].descs = num_buffers;
 			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
 			async_pkts_log[num_async_pkts].pkt_idx = pkt_idx;
 			async_pkts_log[num_async_pkts++].last_avail_idx =
 				vq->last_avail_idx;
-			src_iovec += src_it->nr_segs;
-			dst_iovec += dst_it->nr_segs;
-			src_it += 2;
-			dst_it += 2;
-			segs_await += src_it->nr_segs;
+
+			iovec_idx += it_pool[it_idx].nr_segs;
+			it_idx += 2;
+
+			segs_await += it_pool[it_idx].nr_segs;
 
 			/**
 			 * recover shadow used ring and keep DMA-occupied
@@ -1541,23 +1557,10 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			 */
 			from = vq->shadow_used_idx - num_buffers;
 			to = vq->async_desc_idx & (vq->size - 1);
-			if (num_buffers + to <= vq->size) {
-				rte_memcpy(&vq->async_descs_split[to],
-						&vq->shadow_used_split[from],
-						num_buffers *
-						sizeof(struct vring_used_elem));
-			} else {
-				int size = vq->size - to;
-
-				rte_memcpy(&vq->async_descs_split[to],
-						&vq->shadow_used_split[from],
-						size *
-						sizeof(struct vring_used_elem));
-				rte_memcpy(vq->async_descs_split,
-						&vq->shadow_used_split[from +
-						size], (num_buffers - size) *
-					   sizeof(struct vring_used_elem));
-			}
+
+			store_dma_desc_info_split(vq->shadow_used_split,
+					vq->async_descs_split, vq->size, from, to, num_buffers);
+
 			vq->async_desc_idx += num_buffers;
 			vq->shadow_used_idx -= num_buffers;
 		} else
@@ -1575,10 +1578,9 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			BUF_VECTOR_MAX))) {
 			n_pkts = vq->async_ops.transfer_data(dev->vid,
 					queue_id, tdes, 0, pkt_burst_idx);
-			src_iovec = vec_pool;
-			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
-			src_it = it_pool;
-			dst_it = it_pool + 1;
+			iovec_idx = 0;
+			it_idx = 0;
+
 			segs_await = 0;
 			vq->async_pkts_inflight_n += n_pkts;
 
@@ -1639,6 +1641,36 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	return pkt_idx;
 }
 
+static __rte_always_inline void
+write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
+{
+	uint16_t nr_left = n_descs;
+	uint16_t nr_copy;
+	uint16_t to, from;
+
+	do {
+		from = vq->last_async_desc_idx & (vq->size - 1);
+		nr_copy = nr_left + from <= vq->size ? nr_left : vq->size - from;
+		to = vq->last_used_idx & (vq->size - 1);
+
+		if (to + nr_copy <= vq->size) {
+			rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
+					nr_copy * sizeof(struct vring_used_elem));
+		} else {
+			uint16_t size = vq->size - to;
+
+			rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
+					size * sizeof(struct vring_used_elem));
+			rte_memcpy(&vq->used->ring[0], &vq->async_descs_split[from + size],
+					(nr_copy - size) * sizeof(struct vring_used_elem));
+		}
+
+		vq->last_async_desc_idx += nr_copy;
+		vq->last_used_idx += nr_copy;
+		nr_left -= nr_copy;
+	} while (nr_left > 0);
+}
+
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count)
 {
@@ -1695,39 +1727,7 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 	vq->async_pkts_inflight_n -= n_pkts_put;
 
 	if (likely(vq->enabled && vq->access_ok)) {
-		uint16_t nr_left = n_descs;
-		uint16_t nr_copy;
-		uint16_t to;
-
-		/* write back completed descriptors to used ring */
-		do {
-			from = vq->last_async_desc_idx & (vq->size - 1);
-			nr_copy = nr_left + from <= vq->size ? nr_left :
-				vq->size - from;
-			to = vq->last_used_idx & (vq->size - 1);
-
-			if (to + nr_copy <= vq->size) {
-				rte_memcpy(&vq->used->ring[to],
-						&vq->async_descs_split[from],
-						nr_copy *
-						sizeof(struct vring_used_elem));
-			} else {
-				uint16_t size = vq->size - to;
-
-				rte_memcpy(&vq->used->ring[to],
-						&vq->async_descs_split[from],
-						size *
-						sizeof(struct vring_used_elem));
-				rte_memcpy(vq->used->ring,
-						&vq->async_descs_split[from +
-						size], (nr_copy - size) *
-						sizeof(struct vring_used_elem));
-			}
-
-			vq->last_async_desc_idx += nr_copy;
-			vq->last_used_idx += nr_copy;
-			nr_left -= nr_copy;
-		} while (nr_left > 0);
+		write_back_completed_descs_split(vq, n_descs);
 
 		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
 		vhost_vring_call_split(dev, vq);
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v6 2/4] vhost: add support for packed ring in async vhost
  2021-04-13 14:55 ` [dpdk-dev] [PATCH v6 0/4] add support for packed ring in async vhost Cheng Jiang
  2021-04-13 14:55   ` [dpdk-dev] [PATCH v6 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
@ 2021-04-13 14:55   ` Cheng Jiang
  2021-04-13 14:55   ` [dpdk-dev] [PATCH v6 3/4] vhost: add batch datapath for async vhost packed ring Cheng Jiang
  2021-04-13 14:55   ` [dpdk-dev] [PATCH v6 4/4] doc: add release note for vhost async " Cheng Jiang
  3 siblings, 0 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-13 14:55 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

For now the async vhost data path only supports split ring. This patch
enables packed ring in the async vhost data path to make async vhost
compatible with the virtio 1.1 spec.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 lib/librte_vhost/rte_vhost_async.h |   1 +
 lib/librte_vhost/vhost.c           |  37 ++-
 lib/librte_vhost/vhost.h           |  15 +-
 lib/librte_vhost/virtio_net.c      | 432 +++++++++++++++++++++++++++--
 4 files changed, 449 insertions(+), 36 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost_async.h b/lib/librte_vhost/rte_vhost_async.h
index c855ff875..6faa31f5a 100644
--- a/lib/librte_vhost/rte_vhost_async.h
+++ b/lib/librte_vhost/rte_vhost_async.h
@@ -89,6 +89,7 @@ struct rte_vhost_async_channel_ops {
 struct async_inflight_info {
 	struct rte_mbuf *mbuf;
 	uint16_t descs; /* num of descs inflight */
+	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
 };
 
 /**
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index a70fe01d8..467d1d5a2 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -340,17 +340,18 @@ cleanup_device(struct virtio_net *dev, int destroy)
 static void
 vhost_free_async_mem(struct vhost_virtqueue *vq)
 {
-	if (vq->async_pkts_info)
-		rte_free(vq->async_pkts_info);
-	if (vq->async_descs_split)
-		rte_free(vq->async_descs_split);
-	if (vq->it_pool)
-		rte_free(vq->it_pool);
-	if (vq->vec_pool)
-		rte_free(vq->vec_pool);
+	rte_free(vq->async_pkts_info);
 
-	vq->async_pkts_info = NULL;
+	rte_free(vq->async_buffers_packed);
+	rte_free(vq->async_descs_split);
+
+	rte_free(vq->it_pool);
+	rte_free(vq->vec_pool);
+
+	vq->async_buffers_packed = NULL;
 	vq->async_descs_split = NULL;
+
+	vq->async_pkts_info = NULL;
 	vq->it_pool = NULL;
 	vq->vec_pool = NULL;
 }
@@ -1626,10 +1627,9 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	if (unlikely(vq == NULL || !dev->async_copy))
 		return -1;
 
-	/* packed queue is not supported */
-	if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
+	if (unlikely(!f.async_inorder)) {
 		VHOST_LOG_CONFIG(ERR,
-			"async copy is not supported on packed queue or non-inorder mode "
+			"async copy is not supported on non-inorder mode "
 			"(vid %d, qid: %d)\n", vid, queue_id);
 		return -1;
 	}
@@ -1667,11 +1667,18 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	vq->vec_pool = rte_malloc_socket(NULL,
 			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
 			RTE_CACHE_LINE_SIZE, node);
-	vq->async_descs_split = rte_malloc_socket(NULL,
+	if (vq_is_packed(dev)) {
+		vq->async_buffers_packed = rte_malloc_socket(NULL,
+			vq->size * sizeof(struct vring_used_elem_packed),
+			RTE_CACHE_LINE_SIZE, node);
+	} else {
+		vq->async_descs_split = rte_malloc_socket(NULL,
 			vq->size * sizeof(struct vring_used_elem),
 			RTE_CACHE_LINE_SIZE, node);
-	if (!vq->async_descs_split || !vq->async_pkts_info ||
-		!vq->it_pool || !vq->vec_pool) {
+	}
+
+	if (!vq->async_buffers_packed || !vq->async_descs_split ||
+		!vq->async_pkts_info || !vq->it_pool || !vq->vec_pool) {
 		vhost_free_async_mem(vq);
 		VHOST_LOG_CONFIG(ERR,
 				"async register failed: cannot allocate memory for vq data "
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index f628714c2..673335217 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -201,9 +201,18 @@ struct vhost_virtqueue {
 	uint16_t	async_pkts_idx;
 	uint16_t	async_pkts_inflight_n;
 	uint16_t	async_last_pkts_n;
-	struct vring_used_elem  *async_descs_split;
-	uint16_t async_desc_idx;
-	uint16_t last_async_desc_idx;
+	union {
+		struct vring_used_elem  *async_descs_split;
+		struct vring_used_elem_packed *async_buffers_packed;
+	};
+	union {
+		uint16_t async_desc_idx;
+		uint16_t async_packed_buffer_idx;
+	};
+	union {
+		uint16_t last_async_desc_idx;
+		uint16_t last_async_buffer_idx;
+	};
 
 	/* vq async features */
 	bool		async_inorder;
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 438bdafd1..e2b35a319 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -363,14 +363,14 @@ vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
 }
 
 static __rte_always_inline void
-vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
-				   struct vhost_virtqueue *vq,
-				   uint32_t len[],
-				   uint16_t id[],
-				   uint16_t count[],
+vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
+				   uint32_t *len,
+				   uint16_t *id,
+				   uint16_t *count,
 				   uint16_t num_buffers)
 {
 	uint16_t i;
+
 	for (i = 0; i < num_buffers; i++) {
 		/* enqueue shadow flush action aligned with batch num */
 		if (!vq->shadow_used_idx)
@@ -382,6 +382,17 @@ vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
 		vq->shadow_aligned_idx += count[i];
 		vq->shadow_used_idx++;
 	}
+}
+
+static __rte_always_inline void
+vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
+				   struct vhost_virtqueue *vq,
+				   uint32_t *len,
+				   uint16_t *id,
+				   uint16_t *count,
+				   uint16_t num_buffers)
+{
+	vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
 
 	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
 		do_data_copy_enqueue(dev, vq);
@@ -1474,6 +1485,23 @@ store_dma_desc_info_split(struct vring_used_elem *s_ring, struct vring_used_elem
 	}
 }
 
+static __rte_always_inline void
+store_dma_desc_info_packed(struct vring_used_elem_packed *s_ring,
+		struct vring_used_elem_packed *d_ring,
+		uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
+{
+	uint16_t elem_size = sizeof(struct vring_used_elem_packed);
+
+	if (d_idx + count <= ring_size) {
+		rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
+	} else {
+		uint16_t size = ring_size - d_idx;
+
+		rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
+		rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
+	}
+}
+
 static __rte_noinline uint32_t
 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct vhost_virtqueue *vq, uint16_t queue_id,
@@ -1641,6 +1669,330 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	return pkt_idx;
 }
 
+static __rte_always_inline void
+vhost_update_used_packed(struct vhost_virtqueue *vq,
+			struct vring_used_elem_packed *shadow_ring,
+			uint16_t count)
+{
+	int i;
+	uint16_t used_idx = vq->last_used_idx;
+	uint16_t head_idx = vq->last_used_idx;
+	uint16_t head_flags = 0;
+
+	if (count == 0)
+		return;
+
+	/* Split loop in two to save memory barriers */
+	for (i = 0; i < count; i++) {
+		vq->desc_packed[used_idx].id = shadow_ring[i].id;
+		vq->desc_packed[used_idx].len = shadow_ring[i].len;
+
+		used_idx += shadow_ring[i].count;
+		if (used_idx >= vq->size)
+			used_idx -= vq->size;
+	}
+
+	/* The ordering for storing desc flags needs to be enforced. */
+	rte_atomic_thread_fence(__ATOMIC_RELEASE);
+
+	for (i = 0; i < count; i++) {
+		uint16_t flags;
+
+		if (vq->shadow_used_packed[i].len)
+			flags = VRING_DESC_F_WRITE;
+		else
+			flags = 0;
+
+		if (vq->used_wrap_counter) {
+			flags |= VRING_DESC_F_USED;
+			flags |= VRING_DESC_F_AVAIL;
+		} else {
+			flags &= ~VRING_DESC_F_USED;
+			flags &= ~VRING_DESC_F_AVAIL;
+		}
+
+		if (i > 0) {
+			vq->desc_packed[vq->last_used_idx].flags = flags;
+
+		} else {
+			head_idx = vq->last_used_idx;
+			head_flags = flags;
+		}
+
+		vq_inc_last_used_packed(vq, shadow_ring[i].count);
+	}
+
+	vq->desc_packed[head_idx].flags = head_flags;
+}
+
+static __rte_always_inline int
+vhost_enqueue_async_single_packed(struct virtio_net *dev,
+			    struct vhost_virtqueue *vq,
+			    struct rte_mbuf *pkt,
+			    struct buf_vector *buf_vec,
+			    uint16_t *nr_descs,
+			    uint16_t *nr_buffers,
+			    struct vring_packed_desc *async_descs,
+			    struct iovec *src_iovec, struct iovec *dst_iovec,
+			    struct rte_vhost_iov_iter *src_it,
+			    struct rte_vhost_iov_iter *dst_it)
+{
+	uint16_t nr_vec = 0;
+	uint16_t avail_idx = vq->last_avail_idx;
+	uint16_t max_tries, tries = 0;
+	uint16_t buf_id = 0;
+	uint32_t len = 0;
+	uint16_t desc_count = 0;
+	uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	uint32_t buffer_len[vq->size];
+	uint16_t buffer_buf_id[vq->size];
+	uint16_t buffer_desc_count[vq->size];
+	*nr_buffers = 0;
+
+	if (rxvq_is_mergeable(dev))
+		max_tries = vq->size - 1;
+	else
+		max_tries = 1;
+
+	while (size > 0) {
+		/*
+		 * if we tried all available ring items, and still
+		 * can't get enough buf, it means something abnormal
+		 * happened.
+		 */
+		if (unlikely(++tries > max_tries))
+			return -1;
+
+		if (unlikely(fill_vec_buf_packed(dev, vq, avail_idx, &desc_count, buf_vec, &nr_vec,
+						&buf_id, &len, VHOST_ACCESS_RW) < 0))
+			return -1;
+
+		len = RTE_MIN(len, size);
+		size -= len;
+
+		buffer_len[*nr_buffers] = len;
+		buffer_buf_id[*nr_buffers] = buf_id;
+		buffer_desc_count[*nr_buffers] = desc_count;
+		*nr_buffers += 1;
+
+		*nr_descs += desc_count;
+		avail_idx += desc_count;
+		if (avail_idx >= vq->size)
+			avail_idx -= vq->size;
+	}
+
+	if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers, src_iovec, dst_iovec,
+			src_it, dst_it) < 0)
+		return -1;
+	/* store descriptors for DMA */
+	if (avail_idx >= *nr_descs) {
+		rte_memcpy(async_descs, &vq->desc_packed[vq->last_avail_idx],
+			*nr_descs * sizeof(struct vring_packed_desc));
+	} else {
+		uint16_t nr_copy = vq->size - vq->last_avail_idx;
+		rte_memcpy(async_descs, &vq->desc_packed[vq->last_avail_idx],
+			nr_copy * sizeof(struct vring_packed_desc));
+		rte_memcpy(async_descs + nr_copy, vq->desc_packed,
+			(*nr_descs - nr_copy) * sizeof(struct vring_packed_desc));
+	}
+
+	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id, buffer_desc_count, *nr_buffers);
+
+	return 0;
+}
+
+static __rte_always_inline int16_t
+virtio_dev_rx_async_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
+			    struct rte_mbuf *pkt, uint16_t *nr_descs, uint16_t *nr_buffers,
+			    struct vring_packed_desc *async_descs,
+			    struct iovec *src_iovec, struct iovec *dst_iovec,
+			    struct rte_vhost_iov_iter *src_it, struct rte_vhost_iov_iter *dst_it)
+{
+	struct buf_vector buf_vec[BUF_VECTOR_MAX];
+	*nr_descs = 0;
+	*nr_buffers = 0;
+
+	if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt, buf_vec, nr_descs, nr_buffers,
+						 async_descs, src_iovec, dst_iovec,
+						 src_it, dst_it) < 0)) {
+		VHOST_LOG_DATA(DEBUG, "(%d) failed to get enough desc from vring\n", dev->vid);
+		return -1;
+	}
+
+	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
+			dev->vid, vq->last_avail_idx, vq->last_avail_idx + *nr_descs);
+
+	return 0;
+}
+
+static __rte_always_inline void
+dma_error_handler_packed(struct vhost_virtqueue *vq, struct vring_packed_desc *async_descs,
+			uint16_t async_descs_idx, uint16_t slot_idx, uint32_t nr_err,
+			uint32_t *pkt_idx, uint32_t *num_async_pkts, uint32_t *num_done_pkts)
+{
+	uint16_t descs_err = 0;
+	uint16_t buffers_err = 0;
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
+
+	*num_async_pkts -= nr_err;
+	*pkt_idx -= nr_err;
+	/* calculate the sum of buffers and descs of DMA-error packets. */
+	while (nr_err-- > 0) {
+		descs_err += pkts_info[slot_idx % vq->size].descs;
+		buffers_err += pkts_info[slot_idx % vq->size].nr_buffers;
+		slot_idx--;
+	}
+
+	vq->async_packed_buffer_idx -= buffers_err;
+
+	if (vq->last_avail_idx >= descs_err) {
+		vq->last_avail_idx -= descs_err;
+
+		rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
+			&async_descs[async_descs_idx - descs_err],
+			descs_err * sizeof(struct vring_packed_desc));
+	} else {
+		uint16_t nr_copy;
+
+		vq->last_avail_idx = vq->last_avail_idx + vq->size - descs_err;
+		nr_copy = vq->size - vq->last_avail_idx;
+		rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
+			&async_descs[async_descs_idx - descs_err],
+			nr_copy * sizeof(struct vring_packed_desc));
+		descs_err -= nr_copy;
+		rte_memcpy(&vq->desc_packed[0], &async_descs[async_descs_idx - descs_err],
+			descs_err * sizeof(struct vring_packed_desc));
+		vq->avail_wrap_counter ^= 1;
+	}
+
+	*num_done_pkts = *pkt_idx - *num_async_pkts;
+}
+
+static __rte_noinline uint32_t
+virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
+	struct vhost_virtqueue *vq, uint16_t queue_id,
+	struct rte_mbuf **pkts, uint32_t count,
+	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
+{
+	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
+	uint16_t async_descs_idx = 0;
+	uint16_t num_buffers;
+	uint16_t num_desc;
+
+	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
+	struct iovec *vec_pool = vq->vec_pool;
+	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
+	struct iovec *src_iovec = vec_pool;
+	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+	uint16_t slot_idx = 0;
+	uint16_t segs_await = 0;
+	uint16_t iovec_idx = 0, it_idx = 0;
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
+	uint32_t n_pkts = 0, pkt_err = 0;
+	uint32_t num_async_pkts = 0, num_done_pkts = 0;
+	struct vring_packed_desc async_descs[vq->size];
+
+	rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
+
+	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq, pkts[pkt_idx],
+						&num_desc, &num_buffers,
+						&async_descs[async_descs_idx],
+						&src_iovec[iovec_idx], &dst_iovec[iovec_idx],
+						&it_pool[it_idx], &it_pool[it_idx + 1]) < 0))
+			break;
+
+		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
+			dev->vid, vq->last_avail_idx,
+			vq->last_avail_idx + num_desc);
+
+		slot_idx = (vq->async_pkts_idx + num_async_pkts) % vq->size;
+		if (it_pool[it_idx].count) {
+			uint16_t from, to;
+
+			async_descs_idx += num_desc;
+			async_fill_desc(&tdes[pkt_burst_idx++],
+				&it_pool[it_idx], &it_pool[it_idx + 1]);
+			pkts_info[slot_idx].descs = num_desc;
+			pkts_info[slot_idx].nr_buffers = num_buffers;
+			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
+			num_async_pkts++;
+			iovec_idx += it_pool[it_idx].nr_segs;
+			it_idx += 2;
+
+			segs_await += it_pool[it_idx].nr_segs;
+
+			/**
+			 * recover shadow used ring and keep DMA-occupied
+			 * descriptors.
+			 */
+			from = vq->shadow_used_idx - num_buffers;
+			to = vq->async_packed_buffer_idx % vq->size;
+			store_dma_desc_info_packed(vq->shadow_used_packed,
+					vq->async_buffers_packed, vq->size, from, to, num_buffers);
+
+			vq->async_packed_buffer_idx += num_buffers;
+			vq->shadow_used_idx -= num_buffers;
+		} else {
+			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
+		}
+
+		vq_inc_last_avail_packed(vq, num_desc);
+
+		/*
+		 * conditions to trigger async device transfer:
+		 * - buffered packet number reaches transfer threshold
+		 * - unused async iov number is less than max vhost vector
+		 */
+		if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
+			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await < BUF_VECTOR_MAX))) {
+			n_pkts = vq->async_ops.transfer_data(dev->vid, queue_id,
+				tdes, 0, pkt_burst_idx);
+			iovec_idx = 0;
+			it_idx = 0;
+			segs_await = 0;
+			vq->async_pkts_inflight_n += n_pkts;
+
+			if (unlikely(n_pkts < pkt_burst_idx)) {
+				/*
+				 * log error packets number here and do actual
+				 * error processing when applications poll
+				 * completion
+				 */
+				pkt_err = pkt_burst_idx - n_pkts;
+				pkt_burst_idx = 0;
+				pkt_idx++;
+				break;
+			}
+
+			pkt_burst_idx = 0;
+		}
+	}
+
+	if (pkt_burst_idx) {
+		n_pkts = vq->async_ops.transfer_data(dev->vid, queue_id, tdes, 0, pkt_burst_idx);
+		vq->async_pkts_inflight_n += n_pkts;
+
+		if (unlikely(n_pkts < pkt_burst_idx))
+			pkt_err = pkt_burst_idx - n_pkts;
+	}
+
+	do_data_copy_enqueue(dev, vq);
+
+	if (unlikely(pkt_err))
+		dma_error_handler_packed(vq, async_descs, async_descs_idx, slot_idx, pkt_err,
+					&pkt_idx, &num_async_pkts, &num_done_pkts);
+	vq->async_pkts_idx += num_async_pkts;
+	*comp_count = num_done_pkts;
+
+	if (likely(vq->shadow_used_idx)) {
+		vhost_flush_enqueue_shadow_packed(dev, vq);
+		vhost_vring_call_packed(dev, vq);
+	}
+
+	return pkt_idx;
+}
+
 static __rte_always_inline void
 write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
 {
@@ -1671,12 +2023,35 @@ write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
 	} while (nr_left > 0);
 }
 
+static __rte_always_inline void
+write_back_completed_descs_packed(struct vhost_virtqueue *vq,
+				uint16_t n_buffers)
+{
+	uint16_t nr_left = n_buffers;
+	uint16_t from, to;
+
+	do {
+		from = vq->last_async_buffer_idx % vq->size;
+		to = (from + nr_left) % vq->size;
+		if (to > from) {
+			vhost_update_used_packed(vq, vq->async_buffers_packed + from, to - from);
+			vq->last_async_buffer_idx += nr_left;
+			nr_left = 0;
+		} else {
+			vhost_update_used_packed(vq, vq->async_buffers_packed + from,
+				vq->size - from);
+			vq->last_async_buffer_idx += vq->size - from;
+			nr_left -= vq->size - from;
+		}
+	} while (nr_left > 0);
+}
+
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count)
 {
 	struct virtio_net *dev = get_device(vid);
 	struct vhost_virtqueue *vq;
-	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
+	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
 	uint16_t start_idx, pkts_idx, vq_size;
 	struct async_inflight_info *pkts_info;
 	uint16_t from, i;
@@ -1701,7 +2076,7 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 
 	rte_spinlock_lock(&vq->access_lock);
 
-	pkts_idx = vq->async_pkts_idx & (vq->size - 1);
+	pkts_idx = vq->async_pkts_idx % vq->size;
 	pkts_info = vq->async_pkts_info;
 	vq_size = vq->size;
 	start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
@@ -1718,21 +2093,41 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		goto done;
 	}
 
-	for (i = 0; i < n_pkts_put; i++) {
-		from = (start_idx + i) & (vq_size - 1);
-		n_descs += pkts_info[from].descs;
-		pkts[i] = pkts_info[from].mbuf;
+	if (vq_is_packed(dev)) {
+		for (i = 0; i < n_pkts_put; i++) {
+			from = (start_idx + i) & (vq_size - 1);
+			n_buffers += pkts_info[from].nr_buffers;
+			pkts[i] = pkts_info[from].mbuf;
+		}
+	} else {
+		for (i = 0; i < n_pkts_put; i++) {
+			from = (start_idx + i) & (vq_size - 1);
+			n_descs += pkts_info[from].descs;
+			pkts[i] = pkts_info[from].mbuf;
+		}
 	}
+
 	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
 	vq->async_pkts_inflight_n -= n_pkts_put;
 
 	if (likely(vq->enabled && vq->access_ok)) {
-		write_back_completed_descs_split(vq, n_descs);
+		if (vq_is_packed(dev)) {
+			write_back_completed_descs_packed(vq, n_buffers);
 
-		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
-		vhost_vring_call_split(dev, vq);
-	} else
-		vq->last_async_desc_idx += n_descs;
+			vhost_vring_call_packed(dev, vq);
+		} else {
+			write_back_completed_descs_split(vq, n_descs);
+
+			__atomic_add_fetch(&vq->used->idx, n_descs,
+					__ATOMIC_RELEASE);
+			vhost_vring_call_split(dev, vq);
+		}
+	} else {
+		if (vq_is_packed(dev))
+			vq->last_async_buffer_idx += n_buffers;
+		else
+			vq->last_async_desc_idx += n_descs;
+	}
 
 done:
 	rte_spinlock_unlock(&vq->access_lock);
@@ -1773,9 +2168,10 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
 	if (count == 0)
 		goto out;
 
-	/* TODO: packed queue not implemented */
 	if (vq_is_packed(dev))
-		nb_tx = 0;
+		nb_tx = virtio_dev_rx_async_submit_packed(dev,
+				vq, queue_id, pkts, count, comp_pkts,
+				comp_count);
 	else
 		nb_tx = virtio_dev_rx_async_submit_split(dev,
 				vq, queue_id, pkts, count, comp_pkts,
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v6 3/4] vhost: add batch datapath for async vhost packed ring
  2021-04-13 14:55 ` [dpdk-dev] [PATCH v6 0/4] add support for packed ring in async vhost Cheng Jiang
  2021-04-13 14:55   ` [dpdk-dev] [PATCH v6 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
  2021-04-13 14:55   ` [dpdk-dev] [PATCH v6 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
@ 2021-04-13 14:55   ` Cheng Jiang
  2021-04-13 14:55   ` [dpdk-dev] [PATCH v6 4/4] doc: add release note for vhost async " Cheng Jiang
  3 siblings, 0 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-13 14:55 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

Add batch datapath for async vhost packed ring to improve the
performance of small packet processing.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 lib/librte_vhost/virtio_net.c | 41 +++++++++++++++++++++++++++++++----
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index e2b35a319..42439a86d 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1725,6 +1725,29 @@ vhost_update_used_packed(struct vhost_virtqueue *vq,
 	vq->desc_packed[head_idx].flags = head_flags;
 }
 
+static __rte_always_inline int
+virtio_dev_rx_async_batch_packed(struct virtio_net *dev,
+			   struct vhost_virtqueue *vq,
+			   struct rte_mbuf **pkts,
+			   struct rte_mbuf **comp_pkts, uint32_t *pkt_done)
+{
+	uint16_t i;
+	uint32_t cpy_threshold = vq->async_threshold;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (unlikely(pkts[i]->pkt_len >= cpy_threshold))
+			return -1;
+	}
+	if (!virtio_dev_rx_batch_packed(dev, vq, pkts)) {
+		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+			comp_pkts[(*pkt_done)++] = pkts[i];
+
+		return 0;
+	}
+
+	return -1;
+}
+
 static __rte_always_inline int
 vhost_enqueue_async_single_packed(struct virtio_net *dev,
 			    struct vhost_virtqueue *vq,
@@ -1875,6 +1898,7 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
 {
 	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
+	uint32_t remained = count;
 	uint16_t async_descs_idx = 0;
 	uint16_t num_buffers;
 	uint16_t num_desc;
@@ -1892,9 +1916,17 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 	uint32_t num_async_pkts = 0, num_done_pkts = 0;
 	struct vring_packed_desc async_descs[vq->size];
 
-	rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
+	do {
+		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
+		if (remained >= PACKED_BATCH_SIZE) {
+			if (!virtio_dev_rx_async_batch_packed(dev, vq,
+				&pkts[pkt_idx], comp_pkts, &num_done_pkts)) {
+				pkt_idx += PACKED_BATCH_SIZE;
+				remained -= PACKED_BATCH_SIZE;
+				continue;
+			}
+		}
 
-	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
 		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq, pkts[pkt_idx],
 						&num_desc, &num_buffers,
 						&async_descs[async_descs_idx],
@@ -1937,6 +1969,8 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
 		}
 
+		pkt_idx++;
+		remained--;
 		vq_inc_last_avail_packed(vq, num_desc);
 
 		/*
@@ -1961,13 +1995,12 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 				 */
 				pkt_err = pkt_burst_idx - n_pkts;
 				pkt_burst_idx = 0;
-				pkt_idx++;
 				break;
 			}
 
 			pkt_burst_idx = 0;
 		}
-	}
+	} while (pkt_idx < count);
 
 	if (pkt_burst_idx) {
 		n_pkts = vq->async_ops.transfer_data(dev->vid, queue_id, tdes, 0, pkt_burst_idx);
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v6 4/4] doc: add release note for vhost async packed ring
  2021-04-13 14:55 ` [dpdk-dev] [PATCH v6 0/4] add support for packed ring in async vhost Cheng Jiang
                     ` (2 preceding siblings ...)
  2021-04-13 14:55   ` [dpdk-dev] [PATCH v6 3/4] vhost: add batch datapath for async vhost packed ring Cheng Jiang
@ 2021-04-13 14:55   ` Cheng Jiang
  3 siblings, 0 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-13 14:55 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

Add release note for the support of vhost async packed ring.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 doc/guides/rel_notes/release_21_05.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/guides/rel_notes/release_21_05.rst b/doc/guides/rel_notes/release_21_05.rst
index 374d6d98e..eb5200669 100644
--- a/doc/guides/rel_notes/release_21_05.rst
+++ b/doc/guides/rel_notes/release_21_05.rst
@@ -131,6 +131,10 @@ New Features
   * Added command to display Rx queue used descriptor count.
     ``show port (port_id) rxq (queue_id) desc used count``
 
+* **Added support for vhost async packed ring data path.**
+
+  Added packed ring support for async vhost.
+
 
 Removed Items
 -------------
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v7 0/4] add support for packed ring in async vhost
  2021-03-17  8:54 [dpdk-dev] [PATCH] vhost: add support for packed ring in async vhost Cheng Jiang
                   ` (4 preceding siblings ...)
  2021-04-13 14:55 ` [dpdk-dev] [PATCH v6 0/4] add support for packed ring in async vhost Cheng Jiang
@ 2021-04-14  6:13 ` Cheng Jiang
  2021-04-14  6:13   ` [dpdk-dev] [PATCH v7 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
                     ` (3 more replies)
  2021-04-19  8:51 ` [dpdk-dev] [PATCH v8 0/4] add support for packed ring in async vhost Cheng Jiang
  2021-04-27  8:03 ` [dpdk-dev] [PATCH v9 0/4] add support for packed ring in async vhost Cheng Jiang
  7 siblings, 4 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-14  6:13 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

For now the async vhost data path only supports the split ring
structure. In order to make async vhost compatible with the virtio 1.1
spec, this patch set cleans up the async split ring code and enables
packed ring in the async vhost data path. The batch datapath is also
enabled for the async vhost packed ring.

v7:
 * fix compile issues
 * add argument *dev in vhost_free_async_mem() for ring type decision
v6:
 * fix some typos in commit log
 * improve index usage
 * remove shadow_ring_store()
 * add store_dma_desc_info_split() store_dma_desc_info_packed()
 * remove some checks in vhost_free_async_mem()
 * change index calculation since the size isn't necessarily a power of 2
 * move error handling in a dedicated function
 * clean codes
v5:
 * clean some codes for packed ring datapath
 * fix an index error in shadow_ring_store()
v4:
  * change the patch structure
  * clean code for async split ring
  * reuse some code from split ring
  * change the error handler for DMA-copy packet
  * add check for malloc
  * remove useless code
  * add doc update
v3:
  * fix error handler for DMA-copy packet
v2:
  * fix wrong buffer index in rte_vhost_poll_enqueue_completed()
  * add async_buffers_packed memory free in vhost_free_async_mem()

Cheng Jiang (4):
  vhost: abstract and reorganize async split ring code
  vhost: add support for packed ring in async vhost
  vhost: add batch datapath for async vhost packed ring
  doc: add release note for vhost async packed ring

 doc/guides/rel_notes/release_21_05.rst |   4 +
 lib/librte_vhost/rte_vhost_async.h     |   1 +
 lib/librte_vhost/vhost.c               |  49 +-
 lib/librte_vhost/vhost.h               |  15 +-
 lib/librte_vhost/virtio_net.c          | 593 +++++++++++++++++++++----
 5 files changed, 557 insertions(+), 105 deletions(-)

--
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v7 1/4] vhost: abstract and reorganize async split ring code
  2021-04-14  6:13 ` [dpdk-dev] [PATCH v7 0/4] add support for packed ring in async vhost Cheng Jiang
@ 2021-04-14  6:13   ` Cheng Jiang
  2021-04-14 12:24     ` Maxime Coquelin
  2021-04-14  6:13   ` [dpdk-dev] [PATCH v7 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 60+ messages in thread
From: Cheng Jiang @ 2021-04-14  6:13 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

This patch puts some code of the async vhost split ring into inline
functions to improve the readability of the code. It also changes the
iterators from pointer style to index style to make the code more concise.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 lib/librte_vhost/virtio_net.c | 132 +++++++++++++++++-----------------
 1 file changed, 66 insertions(+), 66 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index ff3987860..438bdafd1 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1458,6 +1458,22 @@ virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
 		(vq_size - n_inflight + pkts_idx) & (vq_size - 1);
 }
 
+static __rte_always_inline void
+store_dma_desc_info_split(struct vring_used_elem *s_ring, struct vring_used_elem *d_ring,
+		uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
+{
+	uint16_t elem_size = sizeof(struct vring_used_elem);
+
+	if (d_idx + count <= ring_size) {
+		rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
+	} else {
+		uint16_t size = ring_size - d_idx;
+
+		rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
+		rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
+	}
+}
+
 static __rte_noinline uint32_t
 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct vhost_virtqueue *vq, uint16_t queue_id,
@@ -1474,10 +1490,9 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
 	struct iovec *src_iovec = vec_pool;
 	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
-	struct rte_vhost_iov_iter *src_it = it_pool;
-	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
 	uint16_t slot_idx = 0;
 	uint16_t segs_await = 0;
+	uint16_t iovec_idx = 0, it_idx = 0;
 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
 	uint32_t n_pkts = 0, pkt_err = 0;
 	uint32_t num_async_pkts = 0, num_done_pkts = 0;
@@ -1511,29 +1526,30 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			dev->vid, vq->last_avail_idx,
 			vq->last_avail_idx + num_buffers);
 
-		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx],
-				buf_vec, nr_vec, num_buffers,
-				src_iovec, dst_iovec, src_it, dst_it) < 0) {
+		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx], buf_vec, nr_vec, num_buffers,
+				&src_iovec[iovec_idx], &dst_iovec[iovec_idx],
+				&it_pool[it_idx], &it_pool[it_idx + 1]) < 0) {
 			vq->shadow_used_idx -= num_buffers;
 			break;
 		}
 
 		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
 			(vq->size - 1);
-		if (src_it->count) {
+		if (it_pool[it_idx].count) {
 			uint16_t from, to;
 
-			async_fill_desc(&tdes[pkt_burst_idx++], src_it, dst_it);
+			async_fill_desc(&tdes[pkt_burst_idx++],
+				&it_pool[it_idx], &it_pool[it_idx + 1]);
 			pkts_info[slot_idx].descs = num_buffers;
 			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
 			async_pkts_log[num_async_pkts].pkt_idx = pkt_idx;
 			async_pkts_log[num_async_pkts++].last_avail_idx =
 				vq->last_avail_idx;
-			src_iovec += src_it->nr_segs;
-			dst_iovec += dst_it->nr_segs;
-			src_it += 2;
-			dst_it += 2;
-			segs_await += src_it->nr_segs;
+
+			iovec_idx += it_pool[it_idx].nr_segs;
+			it_idx += 2;
+
+			segs_await += it_pool[it_idx].nr_segs;
 
 			/**
 			 * recover shadow used ring and keep DMA-occupied
@@ -1541,23 +1557,10 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			 */
 			from = vq->shadow_used_idx - num_buffers;
 			to = vq->async_desc_idx & (vq->size - 1);
-			if (num_buffers + to <= vq->size) {
-				rte_memcpy(&vq->async_descs_split[to],
-						&vq->shadow_used_split[from],
-						num_buffers *
-						sizeof(struct vring_used_elem));
-			} else {
-				int size = vq->size - to;
-
-				rte_memcpy(&vq->async_descs_split[to],
-						&vq->shadow_used_split[from],
-						size *
-						sizeof(struct vring_used_elem));
-				rte_memcpy(vq->async_descs_split,
-						&vq->shadow_used_split[from +
-						size], (num_buffers - size) *
-					   sizeof(struct vring_used_elem));
-			}
+
+			store_dma_desc_info_split(vq->shadow_used_split,
+					vq->async_descs_split, vq->size, from, to, num_buffers);
+
 			vq->async_desc_idx += num_buffers;
 			vq->shadow_used_idx -= num_buffers;
 		} else
@@ -1575,10 +1578,9 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			BUF_VECTOR_MAX))) {
 			n_pkts = vq->async_ops.transfer_data(dev->vid,
 					queue_id, tdes, 0, pkt_burst_idx);
-			src_iovec = vec_pool;
-			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
-			src_it = it_pool;
-			dst_it = it_pool + 1;
+			iovec_idx = 0;
+			it_idx = 0;
+
 			segs_await = 0;
 			vq->async_pkts_inflight_n += n_pkts;
 
@@ -1639,6 +1641,36 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	return pkt_idx;
 }
 
+static __rte_always_inline void
+write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
+{
+	uint16_t nr_left = n_descs;
+	uint16_t nr_copy;
+	uint16_t to, from;
+
+	do {
+		from = vq->last_async_desc_idx & (vq->size - 1);
+		nr_copy = nr_left + from <= vq->size ? nr_left : vq->size - from;
+		to = vq->last_used_idx & (vq->size - 1);
+
+		if (to + nr_copy <= vq->size) {
+			rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
+					nr_copy * sizeof(struct vring_used_elem));
+		} else {
+			uint16_t size = vq->size - to;
+
+			rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
+					size * sizeof(struct vring_used_elem));
+			rte_memcpy(&vq->used->ring[0], &vq->async_descs_split[from + size],
+					(nr_copy - size) * sizeof(struct vring_used_elem));
+		}
+
+		vq->last_async_desc_idx += nr_copy;
+		vq->last_used_idx += nr_copy;
+		nr_left -= nr_copy;
+	} while (nr_left > 0);
+}
+
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count)
 {
@@ -1695,39 +1727,7 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 	vq->async_pkts_inflight_n -= n_pkts_put;
 
 	if (likely(vq->enabled && vq->access_ok)) {
-		uint16_t nr_left = n_descs;
-		uint16_t nr_copy;
-		uint16_t to;
-
-		/* write back completed descriptors to used ring */
-		do {
-			from = vq->last_async_desc_idx & (vq->size - 1);
-			nr_copy = nr_left + from <= vq->size ? nr_left :
-				vq->size - from;
-			to = vq->last_used_idx & (vq->size - 1);
-
-			if (to + nr_copy <= vq->size) {
-				rte_memcpy(&vq->used->ring[to],
-						&vq->async_descs_split[from],
-						nr_copy *
-						sizeof(struct vring_used_elem));
-			} else {
-				uint16_t size = vq->size - to;
-
-				rte_memcpy(&vq->used->ring[to],
-						&vq->async_descs_split[from],
-						size *
-						sizeof(struct vring_used_elem));
-				rte_memcpy(vq->used->ring,
-						&vq->async_descs_split[from +
-						size], (nr_copy - size) *
-						sizeof(struct vring_used_elem));
-			}
-
-			vq->last_async_desc_idx += nr_copy;
-			vq->last_used_idx += nr_copy;
-			nr_left -= nr_copy;
-		} while (nr_left > 0);
+		write_back_completed_descs_split(vq, n_descs);
 
 		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
 		vhost_vring_call_split(dev, vq);
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v7 2/4] vhost: add support for packed ring in async vhost
  2021-04-14  6:13 ` [dpdk-dev] [PATCH v7 0/4] add support for packed ring in async vhost Cheng Jiang
  2021-04-14  6:13   ` [dpdk-dev] [PATCH v7 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
@ 2021-04-14  6:13   ` Cheng Jiang
  2021-04-14 13:40     ` Maxime Coquelin
  2021-04-15  2:02     ` Hu, Jiayu
  2021-04-14  6:13   ` [dpdk-dev] [PATCH v7 3/4] vhost: add batch datapath for async vhost packed ring Cheng Jiang
  2021-04-14  6:13   ` [dpdk-dev] [PATCH v7 4/4] doc: add release note for vhost async " Cheng Jiang
  3 siblings, 2 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-14  6:13 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

For now async vhost data path only supports split ring. This patch
enables packed ring in async vhost data path to make async vhost
compatible with virtio 1.1 spec.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 lib/librte_vhost/rte_vhost_async.h |   1 +
 lib/librte_vhost/vhost.c           |  49 ++--
 lib/librte_vhost/vhost.h           |  15 +-
 lib/librte_vhost/virtio_net.c      | 432 +++++++++++++++++++++++++++--
 4 files changed, 456 insertions(+), 41 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost_async.h b/lib/librte_vhost/rte_vhost_async.h
index c855ff875..6faa31f5a 100644
--- a/lib/librte_vhost/rte_vhost_async.h
+++ b/lib/librte_vhost/rte_vhost_async.h
@@ -89,6 +89,7 @@ struct rte_vhost_async_channel_ops {
 struct async_inflight_info {
 	struct rte_mbuf *mbuf;
 	uint16_t descs; /* num of descs inflight */
+	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
 };
 
 /**
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index a70fe01d8..f509186c6 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -338,19 +338,22 @@ cleanup_device(struct virtio_net *dev, int destroy)
 }
 
 static void
-vhost_free_async_mem(struct vhost_virtqueue *vq)
+vhost_free_async_mem(struct virtio_net *dev, struct vhost_virtqueue *vq)
 {
-	if (vq->async_pkts_info)
-		rte_free(vq->async_pkts_info);
-	if (vq->async_descs_split)
+	rte_free(vq->async_pkts_info);
+
+	if (vq_is_packed(dev)) {
+		rte_free(vq->async_buffers_packed);
+		vq->async_buffers_packed = NULL;
+	} else {
 		rte_free(vq->async_descs_split);
-	if (vq->it_pool)
-		rte_free(vq->it_pool);
-	if (vq->vec_pool)
-		rte_free(vq->vec_pool);
+		vq->async_descs_split = NULL;
+	}
+
+	rte_free(vq->it_pool);
+	rte_free(vq->vec_pool);
 
 	vq->async_pkts_info = NULL;
-	vq->async_descs_split = NULL;
 	vq->it_pool = NULL;
 	vq->vec_pool = NULL;
 }
@@ -360,10 +363,10 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
 {
 	if (vq_is_packed(dev))
 		rte_free(vq->shadow_used_packed);
-	else {
+	else
 		rte_free(vq->shadow_used_split);
-		vhost_free_async_mem(vq);
-	}
+
+	vhost_free_async_mem(dev, vq);
 	rte_free(vq->batch_copy_elems);
 	if (vq->iotlb_pool)
 		rte_mempool_free(vq->iotlb_pool);
@@ -1626,10 +1629,9 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	if (unlikely(vq == NULL || !dev->async_copy))
 		return -1;
 
-	/* packed queue is not supported */
-	if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
+	if (unlikely(!f.async_inorder)) {
 		VHOST_LOG_CONFIG(ERR,
-			"async copy is not supported on packed queue or non-inorder mode "
+			"async copy is not supported on non-inorder mode "
 			"(vid %d, qid: %d)\n", vid, queue_id);
 		return -1;
 	}
@@ -1667,12 +1669,19 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	vq->vec_pool = rte_malloc_socket(NULL,
 			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
 			RTE_CACHE_LINE_SIZE, node);
-	vq->async_descs_split = rte_malloc_socket(NULL,
+	if (vq_is_packed(dev)) {
+		vq->async_buffers_packed = rte_malloc_socket(NULL,
+			vq->size * sizeof(struct vring_used_elem_packed),
+			RTE_CACHE_LINE_SIZE, node);
+	} else {
+		vq->async_descs_split = rte_malloc_socket(NULL,
 			vq->size * sizeof(struct vring_used_elem),
 			RTE_CACHE_LINE_SIZE, node);
-	if (!vq->async_descs_split || !vq->async_pkts_info ||
-		!vq->it_pool || !vq->vec_pool) {
-		vhost_free_async_mem(vq);
+	}
+
+	if (!vq->async_buffers_packed || !vq->async_descs_split ||
+		!vq->async_pkts_info || !vq->it_pool || !vq->vec_pool) {
+		vhost_free_async_mem(dev, vq);
 		VHOST_LOG_CONFIG(ERR,
 				"async register failed: cannot allocate memory for vq data "
 				"(vid %d, qid: %d)\n", vid, queue_id);
@@ -1728,7 +1737,7 @@ int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
 		goto out;
 	}
 
-	vhost_free_async_mem(vq);
+	vhost_free_async_mem(dev, vq);
 
 	vq->async_ops.transfer_data = NULL;
 	vq->async_ops.check_completed_copies = NULL;
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index f628714c2..673335217 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -201,9 +201,18 @@ struct vhost_virtqueue {
 	uint16_t	async_pkts_idx;
 	uint16_t	async_pkts_inflight_n;
 	uint16_t	async_last_pkts_n;
-	struct vring_used_elem  *async_descs_split;
-	uint16_t async_desc_idx;
-	uint16_t last_async_desc_idx;
+	union {
+		struct vring_used_elem  *async_descs_split;
+		struct vring_used_elem_packed *async_buffers_packed;
+	};
+	union {
+		uint16_t async_desc_idx;
+		uint16_t async_packed_buffer_idx;
+	};
+	union {
+		uint16_t last_async_desc_idx;
+		uint16_t last_async_buffer_idx;
+	};
 
 	/* vq async features */
 	bool		async_inorder;
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 438bdafd1..54e11e3a5 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -363,14 +363,14 @@ vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
 }
 
 static __rte_always_inline void
-vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
-				   struct vhost_virtqueue *vq,
-				   uint32_t len[],
-				   uint16_t id[],
-				   uint16_t count[],
+vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
+				   uint32_t *len,
+				   uint16_t *id,
+				   uint16_t *count,
 				   uint16_t num_buffers)
 {
 	uint16_t i;
+
 	for (i = 0; i < num_buffers; i++) {
 		/* enqueue shadow flush action aligned with batch num */
 		if (!vq->shadow_used_idx)
@@ -382,6 +382,17 @@ vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
 		vq->shadow_aligned_idx += count[i];
 		vq->shadow_used_idx++;
 	}
+}
+
+static __rte_always_inline void
+vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
+				   struct vhost_virtqueue *vq,
+				   uint32_t *len,
+				   uint16_t *id,
+				   uint16_t *count,
+				   uint16_t num_buffers)
+{
+	vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
 
 	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
 		do_data_copy_enqueue(dev, vq);
@@ -1474,6 +1485,23 @@ store_dma_desc_info_split(struct vring_used_elem *s_ring, struct vring_used_elem
 	}
 }
 
+static __rte_always_inline void
+store_dma_desc_info_packed(struct vring_used_elem_packed *s_ring,
+		struct vring_used_elem_packed *d_ring,
+		uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
+{
+	uint16_t elem_size = sizeof(struct vring_used_elem_packed);
+
+	if (d_idx + count <= ring_size) {
+		rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
+	} else {
+		uint16_t size = ring_size - d_idx;
+
+		rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
+		rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
+	}
+}
+
 static __rte_noinline uint32_t
 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct vhost_virtqueue *vq, uint16_t queue_id,
@@ -1641,6 +1669,330 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	return pkt_idx;
 }
 
+static __rte_always_inline void
+vhost_update_used_packed(struct vhost_virtqueue *vq,
+			struct vring_used_elem_packed *shadow_ring,
+			uint16_t count)
+{
+	int i;
+	uint16_t used_idx = vq->last_used_idx;
+	uint16_t head_idx = vq->last_used_idx;
+	uint16_t head_flags = 0;
+
+	if (count == 0)
+		return;
+
+	/* Split loop in two to save memory barriers */
+	for (i = 0; i < count; i++) {
+		vq->desc_packed[used_idx].id = shadow_ring[i].id;
+		vq->desc_packed[used_idx].len = shadow_ring[i].len;
+
+		used_idx += shadow_ring[i].count;
+		if (used_idx >= vq->size)
+			used_idx -= vq->size;
+	}
+
+	/* The ordering for storing desc flags needs to be enforced. */
+	rte_atomic_thread_fence(__ATOMIC_RELEASE);
+
+	for (i = 0; i < count; i++) {
+		uint16_t flags;
+
+		if (vq->shadow_used_packed[i].len)
+			flags = VRING_DESC_F_WRITE;
+		else
+			flags = 0;
+
+		if (vq->used_wrap_counter) {
+			flags |= VRING_DESC_F_USED;
+			flags |= VRING_DESC_F_AVAIL;
+		} else {
+			flags &= ~VRING_DESC_F_USED;
+			flags &= ~VRING_DESC_F_AVAIL;
+		}
+
+		if (i > 0) {
+			vq->desc_packed[vq->last_used_idx].flags = flags;
+
+		} else {
+			head_idx = vq->last_used_idx;
+			head_flags = flags;
+		}
+
+		vq_inc_last_used_packed(vq, shadow_ring[i].count);
+	}
+
+	vq->desc_packed[head_idx].flags = head_flags;
+}
+
+static __rte_always_inline int
+vhost_enqueue_async_single_packed(struct virtio_net *dev,
+			    struct vhost_virtqueue *vq,
+			    struct rte_mbuf *pkt,
+			    struct buf_vector *buf_vec,
+			    uint16_t *nr_descs,
+			    uint16_t *nr_buffers,
+			    struct vring_packed_desc *async_descs,
+			    struct iovec *src_iovec, struct iovec *dst_iovec,
+			    struct rte_vhost_iov_iter *src_it,
+			    struct rte_vhost_iov_iter *dst_it)
+{
+	uint16_t nr_vec = 0;
+	uint16_t avail_idx = vq->last_avail_idx;
+	uint16_t max_tries, tries = 0;
+	uint16_t buf_id = 0;
+	uint32_t len = 0;
+	uint16_t desc_count = 0;
+	uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	uint32_t buffer_len[vq->size];
+	uint16_t buffer_buf_id[vq->size];
+	uint16_t buffer_desc_count[vq->size];
+	*nr_buffers = 0;
+
+	if (rxvq_is_mergeable(dev))
+		max_tries = vq->size - 1;
+	else
+		max_tries = 1;
+
+	while (size > 0) {
+		/*
+		 * if we tried all available ring items, and still
+		 * can't get enough buf, it means something abnormal
+		 * happened.
+		 */
+		if (unlikely(++tries > max_tries))
+			return -1;
+
+		if (unlikely(fill_vec_buf_packed(dev, vq, avail_idx, &desc_count, buf_vec, &nr_vec,
+						&buf_id, &len, VHOST_ACCESS_RW) < 0))
+			return -1;
+
+		len = RTE_MIN(len, size);
+		size -= len;
+
+		buffer_len[*nr_buffers] = len;
+		buffer_buf_id[*nr_buffers] = buf_id;
+		buffer_desc_count[*nr_buffers] = desc_count;
+		*nr_buffers += 1;
+
+		*nr_descs += desc_count;
+		avail_idx += desc_count;
+		if (avail_idx >= vq->size)
+			avail_idx -= vq->size;
+	}
+
+	if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers, src_iovec, dst_iovec,
+			src_it, dst_it) < 0)
+		return -1;
+	/* store descriptors for DMA */
+	if (avail_idx >= *nr_descs) {
+		rte_memcpy(async_descs, &vq->desc_packed[vq->last_avail_idx],
+			*nr_descs * sizeof(struct vring_packed_desc));
+	} else {
+		uint16_t nr_copy = vq->size - vq->last_avail_idx;
+		rte_memcpy(async_descs, &vq->desc_packed[vq->last_avail_idx],
+			nr_copy * sizeof(struct vring_packed_desc));
+		rte_memcpy(async_descs + nr_copy, vq->desc_packed,
+			(*nr_descs - nr_copy) * sizeof(struct vring_packed_desc));
+	}
+
+	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id, buffer_desc_count, *nr_buffers);
+
+	return 0;
+}
+
+static __rte_always_inline int16_t
+virtio_dev_rx_async_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
+			    struct rte_mbuf *pkt, uint16_t *nr_descs, uint16_t *nr_buffers,
+			    struct vring_packed_desc *async_descs,
+			    struct iovec *src_iovec, struct iovec *dst_iovec,
+			    struct rte_vhost_iov_iter *src_it, struct rte_vhost_iov_iter *dst_it)
+{
+	struct buf_vector buf_vec[BUF_VECTOR_MAX];
+	*nr_descs = 0;
+	*nr_buffers = 0;
+
+	if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt, buf_vec, nr_descs, nr_buffers,
+						 async_descs, src_iovec, dst_iovec,
+						 src_it, dst_it) < 0)) {
+		VHOST_LOG_DATA(DEBUG, "(%d) failed to get enough desc from vring\n", dev->vid);
+		return -1;
+	}
+
+	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
+			dev->vid, vq->last_avail_idx, vq->last_avail_idx + *nr_descs);
+
+	return 0;
+}
+
+static __rte_always_inline void
+dma_error_handler_packed(struct vhost_virtqueue *vq, struct vring_packed_desc *async_descs,
+			uint16_t async_descs_idx, uint16_t slot_idx, uint32_t nr_err,
+			uint32_t *pkt_idx, uint32_t *num_async_pkts, uint32_t *num_done_pkts)
+{
+	uint16_t descs_err = 0;
+	uint16_t buffers_err = 0;
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
+
+	*num_async_pkts -= nr_err;
+	*pkt_idx -= nr_err;
+	/* calculate the sum of buffers and descs of DMA-error packets. */
+	while (nr_err-- > 0) {
+		descs_err += pkts_info[slot_idx % vq->size].descs;
+		buffers_err += pkts_info[slot_idx % vq->size].nr_buffers;
+		slot_idx--;
+	}
+
+	vq->async_packed_buffer_idx -= buffers_err;
+
+	if (vq->last_avail_idx >= descs_err) {
+		vq->last_avail_idx -= descs_err;
+
+		rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
+			&async_descs[async_descs_idx - descs_err],
+			descs_err * sizeof(struct vring_packed_desc));
+	} else {
+		uint16_t nr_copy;
+
+		vq->last_avail_idx = vq->last_avail_idx + vq->size - descs_err;
+		nr_copy = vq->size - vq->last_avail_idx;
+		rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
+			&async_descs[async_descs_idx - descs_err],
+			nr_copy * sizeof(struct vring_packed_desc));
+		descs_err -= nr_copy;
+		rte_memcpy(&vq->desc_packed[0], &async_descs[async_descs_idx - descs_err],
+			descs_err * sizeof(struct vring_packed_desc));
+		vq->avail_wrap_counter ^= 1;
+	}
+
+	*num_done_pkts = *pkt_idx - *num_async_pkts;
+}
+
+static __rte_noinline uint32_t
+virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
+	struct vhost_virtqueue *vq, uint16_t queue_id,
+	struct rte_mbuf **pkts, uint32_t count,
+	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
+{
+	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
+	uint16_t async_descs_idx = 0;
+	uint16_t num_buffers;
+	uint16_t num_desc;
+
+	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
+	struct iovec *vec_pool = vq->vec_pool;
+	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
+	struct iovec *src_iovec = vec_pool;
+	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+	uint16_t slot_idx = 0;
+	uint16_t segs_await = 0;
+	uint16_t iovec_idx = 0, it_idx = 0;
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
+	uint32_t n_pkts = 0, pkt_err = 0;
+	uint32_t num_async_pkts = 0, num_done_pkts = 0;
+	struct vring_packed_desc async_descs[vq->size];
+
+	rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
+
+	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq, pkts[pkt_idx],
+						&num_desc, &num_buffers,
+						&async_descs[async_descs_idx],
+						&src_iovec[iovec_idx], &dst_iovec[iovec_idx],
+						&it_pool[it_idx], &it_pool[it_idx + 1]) < 0))
+			break;
+
+		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
+			dev->vid, vq->last_avail_idx,
+			vq->last_avail_idx + num_desc);
+
+		slot_idx = (vq->async_pkts_idx + num_async_pkts) % vq->size;
+		if (it_pool[it_idx].count) {
+			uint16_t from, to;
+
+			async_descs_idx += num_desc;
+			async_fill_desc(&tdes[pkt_burst_idx++],
+				&it_pool[it_idx], &it_pool[it_idx + 1]);
+			pkts_info[slot_idx].descs = num_desc;
+			pkts_info[slot_idx].nr_buffers = num_buffers;
+			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
+			num_async_pkts++;
+			iovec_idx += it_pool[it_idx].nr_segs;
+			it_idx += 2;
+
+			segs_await += it_pool[it_idx].nr_segs;
+
+			/**
+			 * recover shadow used ring and keep DMA-occupied
+			 * descriptors.
+			 */
+			from = vq->shadow_used_idx - num_buffers;
+			to = vq->async_packed_buffer_idx % vq->size;
+			store_dma_desc_info_packed(vq->shadow_used_packed,
+					vq->async_buffers_packed, vq->size, from, to, num_buffers);
+
+			vq->async_packed_buffer_idx += num_buffers;
+			vq->shadow_used_idx -= num_buffers;
+		} else {
+			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
+		}
+
+		vq_inc_last_avail_packed(vq, num_desc);
+
+		/*
+		 * conditions to trigger async device transfer:
+		 * - buffered packet number reaches transfer threshold
+		 * - unused async iov number is less than max vhost vector
+		 */
+		if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
+			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await < BUF_VECTOR_MAX))) {
+			n_pkts = vq->async_ops.transfer_data(dev->vid, queue_id,
+				tdes, 0, pkt_burst_idx);
+			iovec_idx = 0;
+			it_idx = 0;
+			segs_await = 0;
+			vq->async_pkts_inflight_n += n_pkts;
+
+			if (unlikely(n_pkts < pkt_burst_idx)) {
+				/*
+				 * log error packets number here and do actual
+				 * error processing when applications poll
+				 * completion
+				 */
+				pkt_err = pkt_burst_idx - n_pkts;
+				pkt_burst_idx = 0;
+				pkt_idx++;
+				break;
+			}
+
+			pkt_burst_idx = 0;
+		}
+	}
+
+	if (pkt_burst_idx) {
+		n_pkts = vq->async_ops.transfer_data(dev->vid, queue_id, tdes, 0, pkt_burst_idx);
+		vq->async_pkts_inflight_n += n_pkts;
+
+		if (unlikely(n_pkts < pkt_burst_idx))
+			pkt_err = pkt_burst_idx - n_pkts;
+	}
+
+	do_data_copy_enqueue(dev, vq);
+
+	if (unlikely(pkt_err))
+		dma_error_handler_packed(vq, async_descs, async_descs_idx, slot_idx, pkt_err,
+					&pkt_idx, &num_async_pkts, &num_done_pkts);
+	vq->async_pkts_idx += num_async_pkts;
+	*comp_count = num_done_pkts;
+
+	if (likely(vq->shadow_used_idx)) {
+		vhost_flush_enqueue_shadow_packed(dev, vq);
+		vhost_vring_call_packed(dev, vq);
+	}
+
+	return pkt_idx;
+}
+
 static __rte_always_inline void
 write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
 {
@@ -1671,12 +2023,35 @@ write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
 	} while (nr_left > 0);
 }
 
+static __rte_always_inline void
+write_back_completed_descs_packed(struct vhost_virtqueue *vq,
+				uint16_t n_buffers)
+{
+	uint16_t nr_left = n_buffers;
+	uint16_t from, to;
+
+	do {
+		from = vq->last_async_buffer_idx % vq->size;
+		to = (from + nr_left) % vq->size;
+		if (to > from) {
+			vhost_update_used_packed(vq, vq->async_buffers_packed + from, to - from);
+			vq->last_async_buffer_idx += nr_left;
+			nr_left = 0;
+		} else {
+			vhost_update_used_packed(vq, vq->async_buffers_packed + from,
+				vq->size - from);
+			vq->last_async_buffer_idx += vq->size - from;
+			nr_left -= vq->size - from;
+		}
+	} while (nr_left > 0);
+}
+
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count)
 {
 	struct virtio_net *dev = get_device(vid);
 	struct vhost_virtqueue *vq;
-	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
+	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
 	uint16_t start_idx, pkts_idx, vq_size;
 	struct async_inflight_info *pkts_info;
 	uint16_t from, i;
@@ -1701,7 +2076,7 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 
 	rte_spinlock_lock(&vq->access_lock);
 
-	pkts_idx = vq->async_pkts_idx & (vq->size - 1);
+	pkts_idx = vq->async_pkts_idx % vq->size;
 	pkts_info = vq->async_pkts_info;
 	vq_size = vq->size;
 	start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
@@ -1718,21 +2093,41 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		goto done;
 	}
 
-	for (i = 0; i < n_pkts_put; i++) {
-		from = (start_idx + i) & (vq_size - 1);
-		n_descs += pkts_info[from].descs;
-		pkts[i] = pkts_info[from].mbuf;
+	if (vq_is_packed(dev)) {
+		for (i = 0; i < n_pkts_put; i++) {
+			from = (start_idx + i) & (vq_size - 1);
+			n_buffers += pkts_info[from].nr_buffers;
+			pkts[i] = pkts_info[from].mbuf;
+		}
+	} else {
+		for (i = 0; i < n_pkts_put; i++) {
+			from = (start_idx + i) & (vq_size - 1);
+			n_descs += pkts_info[from].descs;
+			pkts[i] = pkts_info[from].mbuf;
+		}
 	}
+
 	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
 	vq->async_pkts_inflight_n -= n_pkts_put;
 
 	if (likely(vq->enabled && vq->access_ok)) {
-		write_back_completed_descs_split(vq, n_descs);
+		if (vq_is_packed(dev)) {
+			write_back_completed_descs_packed(vq, n_buffers);
 
-		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
-		vhost_vring_call_split(dev, vq);
-	} else
-		vq->last_async_desc_idx += n_descs;
+			vhost_vring_call_packed(dev, vq);
+		} else {
+			write_back_completed_descs_split(vq, n_descs);
+
+			__atomic_add_fetch(&vq->used->idx, n_descs,
+					__ATOMIC_RELEASE);
+			vhost_vring_call_split(dev, vq);
+		}
+	} else {
+		if (vq_is_packed(dev))
+			vq->last_async_buffer_idx += n_buffers;
+		else
+			vq->last_async_desc_idx += n_descs;
+	}
 
 done:
 	rte_spinlock_unlock(&vq->access_lock);
@@ -1773,9 +2168,10 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
 	if (count == 0)
 		goto out;
 
-	/* TODO: packed queue not implemented */
 	if (vq_is_packed(dev))
-		nb_tx = 0;
+		nb_tx = virtio_dev_rx_async_submit_packed(dev,
+				vq, queue_id, pkts, count, comp_pkts,
+				comp_count);
 	else
 		nb_tx = virtio_dev_rx_async_submit_split(dev,
 				vq, queue_id, pkts, count, comp_pkts,
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v7 3/4] vhost: add batch datapath for async vhost packed ring
  2021-04-14  6:13 ` [dpdk-dev] [PATCH v7 0/4] add support for packed ring in async vhost Cheng Jiang
  2021-04-14  6:13   ` [dpdk-dev] [PATCH v7 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
  2021-04-14  6:13   ` [dpdk-dev] [PATCH v7 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
@ 2021-04-14  6:13   ` Cheng Jiang
  2021-04-14  6:13   ` [dpdk-dev] [PATCH v7 4/4] doc: add release note for vhost async " Cheng Jiang
  3 siblings, 0 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-14  6:13 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

Add batch datapath for async vhost packed ring to improve the
performance of small packet processing.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 lib/librte_vhost/virtio_net.c | 41 +++++++++++++++++++++++++++++++----
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 54e11e3a5..7ba186585 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1725,6 +1725,29 @@ vhost_update_used_packed(struct vhost_virtqueue *vq,
 	vq->desc_packed[head_idx].flags = head_flags;
 }
 
+static __rte_always_inline int
+virtio_dev_rx_async_batch_packed(struct virtio_net *dev,
+			   struct vhost_virtqueue *vq,
+			   struct rte_mbuf **pkts,
+			   struct rte_mbuf **comp_pkts, uint32_t *pkt_done)
+{
+	uint16_t i;
+	uint32_t cpy_threshold = vq->async_threshold;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (unlikely(pkts[i]->pkt_len >= cpy_threshold))
+			return -1;
+	}
+	if (!virtio_dev_rx_batch_packed(dev, vq, pkts)) {
+		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+			comp_pkts[(*pkt_done)++] = pkts[i];
+
+		return 0;
+	}
+
+	return -1;
+}
+
 static __rte_always_inline int
 vhost_enqueue_async_single_packed(struct virtio_net *dev,
 			    struct vhost_virtqueue *vq,
@@ -1875,6 +1898,7 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
 {
 	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
+	uint32_t remained = count;
 	uint16_t async_descs_idx = 0;
 	uint16_t num_buffers;
 	uint16_t num_desc;
@@ -1892,9 +1916,17 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 	uint32_t num_async_pkts = 0, num_done_pkts = 0;
 	struct vring_packed_desc async_descs[vq->size];
 
-	rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
+	do {
+		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
+		if (remained >= PACKED_BATCH_SIZE) {
+			if (!virtio_dev_rx_async_batch_packed(dev, vq,
+				&pkts[pkt_idx], comp_pkts, &num_done_pkts)) {
+				pkt_idx += PACKED_BATCH_SIZE;
+				remained -= PACKED_BATCH_SIZE;
+				continue;
+			}
+		}
 
-	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
 		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq, pkts[pkt_idx],
 						&num_desc, &num_buffers,
 						&async_descs[async_descs_idx],
@@ -1937,6 +1969,8 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
 		}
 
+		pkt_idx++;
+		remained--;
 		vq_inc_last_avail_packed(vq, num_desc);
 
 		/*
@@ -1961,13 +1995,12 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 				 */
 				pkt_err = pkt_burst_idx - n_pkts;
 				pkt_burst_idx = 0;
-				pkt_idx++;
 				break;
 			}
 
 			pkt_burst_idx = 0;
 		}
-	}
+	} while (pkt_idx < count);
 
 	if (pkt_burst_idx) {
 		n_pkts = vq->async_ops.transfer_data(dev->vid, queue_id, tdes, 0, pkt_burst_idx);
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v7 4/4] doc: add release note for vhost async packed ring
  2021-04-14  6:13 ` [dpdk-dev] [PATCH v7 0/4] add support for packed ring in async vhost Cheng Jiang
                     ` (2 preceding siblings ...)
  2021-04-14  6:13   ` [dpdk-dev] [PATCH v7 3/4] vhost: add batch datapath for async vhost packed ring Cheng Jiang
@ 2021-04-14  6:13   ` Cheng Jiang
  3 siblings, 0 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-14  6:13 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

Add release note for the support of vhost async packed ring.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 doc/guides/rel_notes/release_21_05.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/guides/rel_notes/release_21_05.rst b/doc/guides/rel_notes/release_21_05.rst
index 374d6d98e..eb5200669 100644
--- a/doc/guides/rel_notes/release_21_05.rst
+++ b/doc/guides/rel_notes/release_21_05.rst
@@ -131,6 +131,10 @@ New Features
   * Added command to display Rx queue used descriptor count.
     ``show port (port_id) rxq (queue_id) desc used count``
 
+* **Added support for vhost async packed ring data path.**
+
+  Added packed ring support for async vhost.
+
 
 Removed Items
 -------------
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v7 1/4] vhost: abstract and reorganize async split ring code
  2021-04-14  6:13   ` [dpdk-dev] [PATCH v7 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
@ 2021-04-14 12:24     ` Maxime Coquelin
  0 siblings, 0 replies; 60+ messages in thread
From: Maxime Coquelin @ 2021-04-14 12:24 UTC (permalink / raw)
  To: Cheng Jiang, chenbo.xia; +Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu



On 4/14/21 8:13 AM, Cheng Jiang wrote:
> This patch puts some codes of async vhost split ring into inline
> functions to improve the readability of the code. And, it changes
> the pointer index style of iterator to make the code more concise.
> 
> Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> ---
>  lib/librte_vhost/virtio_net.c | 132 +++++++++++++++++-----------------
>  1 file changed, 66 insertions(+), 66 deletions(-)
> 

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v7 2/4] vhost: add support for packed ring in async vhost
  2021-04-14  6:13   ` [dpdk-dev] [PATCH v7 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
@ 2021-04-14 13:40     ` Maxime Coquelin
  2021-04-15  5:42       ` Jiang, Cheng1
  2021-04-15  2:02     ` Hu, Jiayu
  1 sibling, 1 reply; 60+ messages in thread
From: Maxime Coquelin @ 2021-04-14 13:40 UTC (permalink / raw)
  To: Cheng Jiang, chenbo.xia; +Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu



On 4/14/21 8:13 AM, Cheng Jiang wrote:
> For now async vhost data path only supports split ring. This patch
> enables packed ring in async vhost data path to make async vhost
> compatible with virtio 1.1 spec.
> 
> Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> ---
>  lib/librte_vhost/rte_vhost_async.h |   1 +
>  lib/librte_vhost/vhost.c           |  49 ++--
>  lib/librte_vhost/vhost.h           |  15 +-
>  lib/librte_vhost/virtio_net.c      | 432 +++++++++++++++++++++++++++--
>  4 files changed, 456 insertions(+), 41 deletions(-)
> 
> diff --git a/lib/librte_vhost/rte_vhost_async.h b/lib/librte_vhost/rte_vhost_async.h
> index c855ff875..6faa31f5a 100644
> --- a/lib/librte_vhost/rte_vhost_async.h
> +++ b/lib/librte_vhost/rte_vhost_async.h
> @@ -89,6 +89,7 @@ struct rte_vhost_async_channel_ops {
>  struct async_inflight_info {
>  	struct rte_mbuf *mbuf;
>  	uint16_t descs; /* num of descs inflight */
> +	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
>  };
>  
>  /**
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index a70fe01d8..f509186c6 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -338,19 +338,22 @@ cleanup_device(struct virtio_net *dev, int destroy)
>  }
>  
>  static void
> -vhost_free_async_mem(struct vhost_virtqueue *vq)
> +vhost_free_async_mem(struct virtio_net *dev, struct vhost_virtqueue *vq)
>  {
> -	if (vq->async_pkts_info)
> -		rte_free(vq->async_pkts_info);
> -	if (vq->async_descs_split)
> +	rte_free(vq->async_pkts_info);
> +
> +	if (vq_is_packed(dev)) {
> +		rte_free(vq->async_buffers_packed);
> +		vq->async_buffers_packed = NULL;
> +	} else {

Doing this is not necessary:

	rte_free(vq->async_buffers_packed);
	vq->async_buffers_packed = NULL;
	rte_free(vq->async_descs_split);
	vq->async_descs_split = NULL;

The above will just work and will avoid adding the dev parameter.
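
For reference, a minimal sketch of how the helper could then look (field names
taken from this patch; since the two pointers alias each other in the union and
rte_free(NULL) is a no-op, freeing both unconditionally is safe):

	static void
	vhost_free_async_mem(struct vhost_virtqueue *vq)
	{
		rte_free(vq->async_pkts_info);
		vq->async_pkts_info = NULL;

		/* union members alias, so the second free just sees NULL */
		rte_free(vq->async_buffers_packed);
		vq->async_buffers_packed = NULL;
		rte_free(vq->async_descs_split);
		vq->async_descs_split = NULL;

		rte_free(vq->it_pool);
		rte_free(vq->vec_pool);
		vq->it_pool = NULL;
		vq->vec_pool = NULL;
	}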


>  		rte_free(vq->async_descs_split);
> -	if (vq->it_pool)
> -		rte_free(vq->it_pool);
> -	if (vq->vec_pool)
> -		rte_free(vq->vec_pool);
> +		vq->async_descs_split = NULL;
> +	}
> +
> +	rte_free(vq->it_pool);
> +	rte_free(vq->vec_pool);
>  
>  	vq->async_pkts_info = NULL;
> -	vq->async_descs_split = NULL;
>  	vq->it_pool = NULL;
>  	vq->vec_pool = NULL;
>  }
> @@ -360,10 +363,10 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
>  {
>  	if (vq_is_packed(dev))
>  		rte_free(vq->shadow_used_packed);
> -	else {
> +	else
>  		rte_free(vq->shadow_used_split);
> -		vhost_free_async_mem(vq);
> -	}
> +
> +	vhost_free_async_mem(dev, vq);
>  	rte_free(vq->batch_copy_elems);
>  	if (vq->iotlb_pool)
>  		rte_mempool_free(vq->iotlb_pool);
> @@ -1626,10 +1629,9 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
>  	if (unlikely(vq == NULL || !dev->async_copy))
>  		return -1;
>  
> -	/* packed queue is not supported */
> -	if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
> +	if (unlikely(!f.async_inorder)) {
>  		VHOST_LOG_CONFIG(ERR,
> -			"async copy is not supported on packed queue or non-inorder mode "
> +			"async copy is not supported on non-inorder mode "
>  			"(vid %d, qid: %d)\n", vid, queue_id);
>  		return -1;
>  	}
> @@ -1667,12 +1669,19 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
>  	vq->vec_pool = rte_malloc_socket(NULL,
>  			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
>  			RTE_CACHE_LINE_SIZE, node);
> -	vq->async_descs_split = rte_malloc_socket(NULL,
> +	if (vq_is_packed(dev)) {
> +		vq->async_buffers_packed = rte_malloc_socket(NULL,
> +			vq->size * sizeof(struct vring_used_elem_packed),
> +			RTE_CACHE_LINE_SIZE, node);
> +	} else {
> +		vq->async_descs_split = rte_malloc_socket(NULL,
>  			vq->size * sizeof(struct vring_used_elem),
>  			RTE_CACHE_LINE_SIZE, node);
> -	if (!vq->async_descs_split || !vq->async_pkts_info ||
> -		!vq->it_pool || !vq->vec_pool) {
> -		vhost_free_async_mem(vq);
> +	}
> +
> +	if (!vq->async_buffers_packed || !vq->async_descs_split ||
> +		!vq->async_pkts_info || !vq->it_pool || !vq->vec_pool) {

Not really a fan of this error handling. Checking after every malloc whether
it succeeded would be cleaner.
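
One possible shape, as a sketch only (reusing the vec_pool allocation from this
patch; the log text is illustrative):

	vq->vec_pool = rte_malloc_socket(NULL,
			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
			RTE_CACHE_LINE_SIZE, node);
	if (!vq->vec_pool) {
		VHOST_LOG_CONFIG(ERR,
			"async register failed: cannot allocate vec_pool "
			"(vid %d, qid: %d)\n", vid, queue_id);
		vhost_free_async_mem(dev, vq);
		return -1;	/* or jump to the function's existing error path */
	}
	/* repeat the same check right after each of the other allocations,
	 * so the error log can name the exact allocation that failed.
	 */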

> +		vhost_free_async_mem(dev, vq);
>  		VHOST_LOG_CONFIG(ERR,
>  				"async register failed: cannot allocate memory for vq data "
>  				"(vid %d, qid: %d)\n", vid, queue_id);
> @@ -1728,7 +1737,7 @@ int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
>  		goto out;
>  	}
>  
> -	vhost_free_async_mem(vq);
> +	vhost_free_async_mem(dev, vq);
>  
>  	vq->async_ops.transfer_data = NULL;
>  	vq->async_ops.check_completed_copies = NULL;
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index f628714c2..673335217 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -201,9 +201,18 @@ struct vhost_virtqueue {
>  	uint16_t	async_pkts_idx;
>  	uint16_t	async_pkts_inflight_n;
>  	uint16_t	async_last_pkts_n;
> -	struct vring_used_elem  *async_descs_split;
> -	uint16_t async_desc_idx;
> -	uint16_t last_async_desc_idx;
> +	union {
> +		struct vring_used_elem  *async_descs_split;
> +		struct vring_used_elem_packed *async_buffers_packed;
> +	};
> +	union {
> +		uint16_t async_desc_idx;
> +		uint16_t async_packed_buffer_idx;
> +	};
> +	union {
> +		uint16_t last_async_desc_idx;
> +		uint16_t last_async_buffer_idx;
> +	};

Looks almost good to me now, thanks for making the change.
The only minor issue is the naming, which is not consistent across the
different fields: sometimes it contains split or packed, sometimes not.
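
For instance (names only illustrative), making the ring type explicit in every
field of the three unions:

	union {
		struct vring_used_elem  *async_descs_split;
		struct vring_used_elem_packed *async_buffers_packed;
	};
	union {
		uint16_t async_desc_idx_split;
		uint16_t async_buffer_idx_packed;
	};
	union {
		uint16_t last_async_desc_idx_split;
		uint16_t last_async_buffer_idx_packed;
	};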

Maxime


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v7 2/4] vhost: add support for packed ring in async vhost
  2021-04-14  6:13   ` [dpdk-dev] [PATCH v7 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
  2021-04-14 13:40     ` Maxime Coquelin
@ 2021-04-15  2:02     ` Hu, Jiayu
  2021-04-15  5:54       ` Jiang, Cheng1
  1 sibling, 1 reply; 60+ messages in thread
From: Hu, Jiayu @ 2021-04-15  2:02 UTC (permalink / raw)
  To: Jiang, Cheng1, maxime.coquelin, Xia, Chenbo
  Cc: dev, Yang, YvonneX, Wang, Yinan, Liu, Yong

Hi Cheng,

> -----Original Message-----
> From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Sent: Wednesday, April 14, 2021 2:14 PM
> To: maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Yang, YvonneX
> <yvonnex.yang@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Liu,
> Yong <yong.liu@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>
> Subject: [PATCH v7 2/4] vhost: add support for packed ring in async vhost
> 
> For now async vhost data path only supports split ring. This patch
> enables packed ring in async vhost data path to make async vhost
> compatible with virtio 1.1 spec.
> 
> Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> ---
>  lib/librte_vhost/rte_vhost_async.h |   1 +
>  lib/librte_vhost/vhost.c           |  49 ++--
>  lib/librte_vhost/vhost.h           |  15 +-
>  lib/librte_vhost/virtio_net.c      | 432 +++++++++++++++++++++++++++--
>  4 files changed, 456 insertions(+), 41 deletions(-)
> 
> diff --git a/lib/librte_vhost/rte_vhost_async.h
> b/lib/librte_vhost/rte_vhost_async.h
> index c855ff875..6faa31f5a 100644
> --- a/lib/librte_vhost/rte_vhost_async.h
> +++ b/lib/librte_vhost/rte_vhost_async.h
> @@ -89,6 +89,7 @@ struct rte_vhost_async_channel_ops {
>  struct async_inflight_info {
>  	struct rte_mbuf *mbuf;
>  	uint16_t descs; /* num of descs inflight */
> +	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
>  };
> 
>  /**
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index a70fe01d8..f509186c6 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -338,19 +338,22 @@ cleanup_device(struct virtio_net *dev, int destroy)
>  }
> 
>  static void
> -vhost_free_async_mem(struct vhost_virtqueue *vq)
> +vhost_free_async_mem(struct virtio_net *dev, struct vhost_virtqueue *vq)
>  {
> -	if (vq->async_pkts_info)
> -		rte_free(vq->async_pkts_info);
> -	if (vq->async_descs_split)
> +	rte_free(vq->async_pkts_info);
> +
> +	if (vq_is_packed(dev)) {
> +		rte_free(vq->async_buffers_packed);
> +		vq->async_buffers_packed = NULL;
> +	} else {
>  		rte_free(vq->async_descs_split);
> -	if (vq->it_pool)
> -		rte_free(vq->it_pool);
> -	if (vq->vec_pool)
> -		rte_free(vq->vec_pool);
> +		vq->async_descs_split = NULL;
> +	}
> +
> +	rte_free(vq->it_pool);
> +	rte_free(vq->vec_pool);
> 
>  	vq->async_pkts_info = NULL;
> -	vq->async_descs_split = NULL;
>  	vq->it_pool = NULL;
>  	vq->vec_pool = NULL;
>  }
> @@ -360,10 +363,10 @@ free_vq(struct virtio_net *dev, struct
> vhost_virtqueue *vq)
>  {
>  	if (vq_is_packed(dev))
>  		rte_free(vq->shadow_used_packed);
> -	else {
> +	else
>  		rte_free(vq->shadow_used_split);
> -		vhost_free_async_mem(vq);
> -	}
> +
> +	vhost_free_async_mem(dev, vq);
>  	rte_free(vq->batch_copy_elems);
>  	if (vq->iotlb_pool)
>  		rte_mempool_free(vq->iotlb_pool);
> @@ -1626,10 +1629,9 @@ int rte_vhost_async_channel_register(int vid,
> uint16_t queue_id,
>  	if (unlikely(vq == NULL || !dev->async_copy))
>  		return -1;
> 
> -	/* packed queue is not supported */
> -	if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
> +	if (unlikely(!f.async_inorder)) {
>  		VHOST_LOG_CONFIG(ERR,
> -			"async copy is not supported on packed queue or
> non-inorder mode "
> +			"async copy is not supported on non-inorder mode "
>  			"(vid %d, qid: %d)\n", vid, queue_id);
>  		return -1;
>  	}
> @@ -1667,12 +1669,19 @@ int rte_vhost_async_channel_register(int vid,
> uint16_t queue_id,
>  	vq->vec_pool = rte_malloc_socket(NULL,
>  			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
>  			RTE_CACHE_LINE_SIZE, node);
> -	vq->async_descs_split = rte_malloc_socket(NULL,
> +	if (vq_is_packed(dev)) {
> +		vq->async_buffers_packed = rte_malloc_socket(NULL,
> +			vq->size * sizeof(struct vring_used_elem_packed),
> +			RTE_CACHE_LINE_SIZE, node);
> +	} else {
> +		vq->async_descs_split = rte_malloc_socket(NULL,
>  			vq->size * sizeof(struct vring_used_elem),
>  			RTE_CACHE_LINE_SIZE, node);
> -	if (!vq->async_descs_split || !vq->async_pkts_info ||
> -		!vq->it_pool || !vq->vec_pool) {
> -		vhost_free_async_mem(vq);
> +	}
> +
> +	if (!vq->async_buffers_packed || !vq->async_descs_split ||
async_buffers_packed and async_descs_split are two members of a union.
Like the way it is handled in vhost_free_async_mem(), do you think it's better
to check each of them for NULL in the respective if-else branch?
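
Something like this, as a sketch only (log message reused from this patch):

	if ((vq_is_packed(dev) && !vq->async_buffers_packed) ||
		(!vq_is_packed(dev) && !vq->async_descs_split) ||
		!vq->async_pkts_info || !vq->it_pool || !vq->vec_pool) {
		vhost_free_async_mem(dev, vq);
		VHOST_LOG_CONFIG(ERR,
				"async register failed: cannot allocate memory for vq data "
				"(vid %d, qid: %d)\n", vid, queue_id);
		return -1;	/* or the function's existing error path */
	}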

> +		!vq->async_pkts_info || !vq->it_pool || !vq->vec_pool) {
> +		vhost_free_async_mem(dev, vq);
>  		VHOST_LOG_CONFIG(ERR,
>  				"async register failed: cannot allocate
> memory for vq data "
>  				"(vid %d, qid: %d)\n", vid, queue_id);
> @@ -1728,7 +1737,7 @@ int rte_vhost_async_channel_unregister(int vid,
> uint16_t queue_id)
>  		goto out;
>  	}
> 
> -	vhost_free_async_mem(vq);
> +	vhost_free_async_mem(dev, vq);
> 
>  	vq->async_ops.transfer_data = NULL;
>  	vq->async_ops.check_completed_copies = NULL;
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index f628714c2..673335217 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -201,9 +201,18 @@ struct vhost_virtqueue {
>  	uint16_t	async_pkts_idx;
>  	uint16_t	async_pkts_inflight_n;
>  	uint16_t	async_last_pkts_n;
> -	struct vring_used_elem  *async_descs_split;
> -	uint16_t async_desc_idx;
> -	uint16_t last_async_desc_idx;
> +	union {
> +		struct vring_used_elem  *async_descs_split;
> +		struct vring_used_elem_packed *async_buffers_packed;
> +	};
> +	union {
> +		uint16_t async_desc_idx;
> +		uint16_t async_packed_buffer_idx;
> +	};
> +	union {
> +		uint16_t last_async_desc_idx;
> +		uint16_t last_async_buffer_idx;
> +	};
> 
>  	/* vq async features */
>  	bool		async_inorder;
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 438bdafd1..54e11e3a5 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -363,14 +363,14 @@
> vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
>  }
> 
>  static __rte_always_inline void
> -vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
> -				   struct vhost_virtqueue *vq,
> -				   uint32_t len[],
> -				   uint16_t id[],
> -				   uint16_t count[],
> +vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
> +				   uint32_t *len,
> +				   uint16_t *id,
> +				   uint16_t *count,
>  				   uint16_t num_buffers)
>  {
>  	uint16_t i;
> +
>  	for (i = 0; i < num_buffers; i++) {
>  		/* enqueue shadow flush action aligned with batch num */
>  		if (!vq->shadow_used_idx)
> @@ -382,6 +382,17 @@ vhost_shadow_enqueue_single_packed(struct
> virtio_net *dev,
>  		vq->shadow_aligned_idx += count[i];
>  		vq->shadow_used_idx++;
>  	}
> +}
> +
> +static __rte_always_inline void
> +vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
> +				   struct vhost_virtqueue *vq,
> +				   uint32_t *len,
> +				   uint16_t *id,
> +				   uint16_t *count,
> +				   uint16_t num_buffers)
> +{
> +	vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
> 
>  	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
>  		do_data_copy_enqueue(dev, vq);
> @@ -1474,6 +1485,23 @@ store_dma_desc_info_split(struct
> vring_used_elem *s_ring, struct vring_used_elem
>  	}
>  }
> 
> +static __rte_always_inline void
> +store_dma_desc_info_packed(struct vring_used_elem_packed *s_ring,
> +		struct vring_used_elem_packed *d_ring,
> +		uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t
> count)
> +{
> +	uint16_t elem_size = sizeof(struct vring_used_elem_packed);
> +
> +	if (d_idx + count <= ring_size) {
> +		rte_memcpy(d_ring + d_idx, s_ring + s_idx, count *
> elem_size);
> +	} else {
> +		uint16_t size = ring_size - d_idx;
> +
> +		rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
> +		rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) *
> elem_size);
> +	}
> +}
> +
>  static __rte_noinline uint32_t
>  virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  	struct vhost_virtqueue *vq, uint16_t queue_id,
> @@ -1641,6 +1669,330 @@ virtio_dev_rx_async_submit_split(struct
> virtio_net *dev,
>  	return pkt_idx;
>  }
> 
> +static __rte_always_inline void
> +vhost_update_used_packed(struct vhost_virtqueue *vq,
> +			struct vring_used_elem_packed *shadow_ring,
> +			uint16_t count)
> +{
> +	int i;
> +	uint16_t used_idx = vq->last_used_idx;
> +	uint16_t head_idx = vq->last_used_idx;
> +	uint16_t head_flags = 0;
> +
> +	if (count == 0)
> +		return;
> +
> +	/* Split loop in two to save memory barriers */
> +	for (i = 0; i < count; i++) {
> +		vq->desc_packed[used_idx].id = shadow_ring[i].id;
> +		vq->desc_packed[used_idx].len = shadow_ring[i].len;
> +
> +		used_idx += shadow_ring[i].count;
> +		if (used_idx >= vq->size)
> +			used_idx -= vq->size;
> +	}
> +
> +	/* The ordering for storing desc flags needs to be enforced. */
> +	rte_atomic_thread_fence(__ATOMIC_RELEASE);
> +
> +	for (i = 0; i < count; i++) {
> +		uint16_t flags;
> +
> +		if (vq->shadow_used_packed[i].len)
> +			flags = VRING_DESC_F_WRITE;
> +		else
> +			flags = 0;
> +
> +		if (vq->used_wrap_counter) {
> +			flags |= VRING_DESC_F_USED;
> +			flags |= VRING_DESC_F_AVAIL;
> +		} else {
> +			flags &= ~VRING_DESC_F_USED;
> +			flags &= ~VRING_DESC_F_AVAIL;
> +		}
> +
> +		if (i > 0) {
> +			vq->desc_packed[vq->last_used_idx].flags = flags;
> +
No need for a blank line above.

> +		} else {
> +			head_idx = vq->last_used_idx;
> +			head_flags = flags;
> +		}
> +
> +		vq_inc_last_used_packed(vq, shadow_ring[i].count);
> +	}
> +
> +	vq->desc_packed[head_idx].flags = head_flags;
> +}
> +
> +static __rte_always_inline int
> +vhost_enqueue_async_single_packed(struct virtio_net *dev,
> +			    struct vhost_virtqueue *vq,
> +			    struct rte_mbuf *pkt,
> +			    struct buf_vector *buf_vec,
> +			    uint16_t *nr_descs,
> +			    uint16_t *nr_buffers,
> +			    struct vring_packed_desc *async_descs,
> +			    struct iovec *src_iovec, struct iovec *dst_iovec,
> +			    struct rte_vhost_iov_iter *src_it,
> +			    struct rte_vhost_iov_iter *dst_it)
> +{
> +	uint16_t nr_vec = 0;
> +	uint16_t avail_idx = vq->last_avail_idx;
> +	uint16_t max_tries, tries = 0;
> +	uint16_t buf_id = 0;
> +	uint32_t len = 0;
> +	uint16_t desc_count = 0;
> +	uint32_t size = pkt->pkt_len + sizeof(struct
> virtio_net_hdr_mrg_rxbuf);
> +	uint32_t buffer_len[vq->size];
> +	uint16_t buffer_buf_id[vq->size];
> +	uint16_t buffer_desc_count[vq->size];
> +	*nr_buffers = 0;
> +
> +	if (rxvq_is_mergeable(dev))
> +		max_tries = vq->size - 1;
> +	else
> +		max_tries = 1;
> +
> +	while (size > 0) {
> +		/*
> +		 * if we tried all available ring items, and still
> +		 * can't get enough buf, it means something abnormal
> +		 * happened.
> +		 */
> +		if (unlikely(++tries > max_tries))
> +			return -1;
> +
> +		if (unlikely(fill_vec_buf_packed(dev, vq, avail_idx,
> &desc_count, buf_vec, &nr_vec,
> +						&buf_id, &len,
> VHOST_ACCESS_RW) < 0))
> +			return -1;
> +
> +		len = RTE_MIN(len, size);
> +		size -= len;
> +
> +		buffer_len[*nr_buffers] = len;
> +		buffer_buf_id[*nr_buffers] = buf_id;
> +		buffer_desc_count[*nr_buffers] = desc_count;
> +		*nr_buffers += 1;
> +
> +		*nr_descs += desc_count;
> +		avail_idx += desc_count;
> +		if (avail_idx >= vq->size)
> +			avail_idx -= vq->size;
> +	}
> +
> +	if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers,
> src_iovec, dst_iovec,
> +			src_it, dst_it) < 0)
> +		return -1;
> +	/* store descriptors for DMA */
> +	if (avail_idx >= *nr_descs) {
> +		rte_memcpy(async_descs, &vq->desc_packed[vq-
> >last_avail_idx],
> +			*nr_descs * sizeof(struct vring_packed_desc));
> +	} else {
> +		uint16_t nr_copy = vq->size - vq->last_avail_idx;
> +		rte_memcpy(async_descs, &vq->desc_packed[vq-
> >last_avail_idx],
> +			nr_copy * sizeof(struct vring_packed_desc));
> +		rte_memcpy(async_descs + nr_copy, vq->desc_packed,
> +			(*nr_descs - nr_copy) * sizeof(struct
> vring_packed_desc));
> +	}
> +
> +	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id,
> buffer_desc_count, *nr_buffers);
> +
> +	return 0;
> +}
> +
> +static __rte_always_inline int16_t
> +virtio_dev_rx_async_single_packed(struct virtio_net *dev, struct
> vhost_virtqueue *vq,
> +			    struct rte_mbuf *pkt, uint16_t *nr_descs, uint16_t
> *nr_buffers,
> +			    struct vring_packed_desc *async_descs,
> +			    struct iovec *src_iovec, struct iovec *dst_iovec,
> +			    struct rte_vhost_iov_iter *src_it, struct
> rte_vhost_iov_iter *dst_it)
> +{
> +	struct buf_vector buf_vec[BUF_VECTOR_MAX];
> +	*nr_descs = 0;
> +	*nr_buffers = 0;
> +
> +	if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt,
> buf_vec, nr_descs, nr_buffers,
> +						 async_descs, src_iovec,
> dst_iovec,
> +						 src_it, dst_it) < 0)) {
> +		VHOST_LOG_DATA(DEBUG, "(%d) failed to get enough desc
> from vring\n", dev->vid);
> +		return -1;
> +	}
> +
> +	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end
> index %d\n",
> +			dev->vid, vq->last_avail_idx, vq->last_avail_idx +
> *nr_descs);
> +
> +	return 0;
> +}
> +
> +static __rte_always_inline void
> +dma_error_handler_packed(struct vhost_virtqueue *vq, struct
> vring_packed_desc *async_descs,
> +			uint16_t async_descs_idx, uint16_t slot_idx, uint32_t
> nr_err,
> +			uint32_t *pkt_idx, uint32_t *num_async_pkts,
> uint32_t *num_done_pkts)
> +{
> +	uint16_t descs_err = 0;
> +	uint16_t buffers_err = 0;
> +	struct async_inflight_info *pkts_info = vq->async_pkts_info;
> +
> +	*num_async_pkts -= nr_err;
> +	*pkt_idx -= nr_err;
> +	/* calculate the sum of buffers and descs of DMA-error packets. */
> +	while (nr_err-- > 0) {
> +		descs_err += pkts_info[slot_idx % vq->size].descs;
I notice there are several places using "%" to wrap around the index, but the
existing code uses "& (vq->size - 1)" instead. I think it's better to keep
it consistent.
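
For reference, the two forms only behave the same when the ring size is a
power of two:

	from = idx & (vq->size - 1);	/* assumes vq->size is a power of two */
	from = idx % vq->size;		/* works for any ring size */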

Thanks,
Jiayu

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v7 2/4] vhost: add support for packed ring in async vhost
  2021-04-14 13:40     ` Maxime Coquelin
@ 2021-04-15  5:42       ` Jiang, Cheng1
  0 siblings, 0 replies; 60+ messages in thread
From: Jiang, Cheng1 @ 2021-04-15  5:42 UTC (permalink / raw)
  To: Maxime Coquelin, Xia, Chenbo
  Cc: dev, Hu, Jiayu, Yang, YvonneX, Wang, Yinan, Liu, Yong

Hi Maxime,

> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Wednesday, April 14, 2021 9:41 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>; Xia, Chenbo
> <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Yang, YvonneX
> <yvonnex.yang@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Liu,
> Yong <yong.liu@intel.com>
> Subject: Re: [PATCH v7 2/4] vhost: add support for packed ring in async vhost
> 
> 
> 
> On 4/14/21 8:13 AM, Cheng Jiang wrote:
> > For now async vhost data path only supports split ring. This patch
> > enables packed ring in async vhost data path to make async vhost
> > compatible with virtio 1.1 spec.
> >
> > Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> > ---
> >  lib/librte_vhost/rte_vhost_async.h |   1 +
> >  lib/librte_vhost/vhost.c           |  49 ++--
> >  lib/librte_vhost/vhost.h           |  15 +-
> >  lib/librte_vhost/virtio_net.c      | 432 +++++++++++++++++++++++++++--
> >  4 files changed, 456 insertions(+), 41 deletions(-)
> >
> > diff --git a/lib/librte_vhost/rte_vhost_async.h
> > b/lib/librte_vhost/rte_vhost_async.h
> > index c855ff875..6faa31f5a 100644
> > --- a/lib/librte_vhost/rte_vhost_async.h
> > +++ b/lib/librte_vhost/rte_vhost_async.h
> > @@ -89,6 +89,7 @@ struct rte_vhost_async_channel_ops {  struct
> > async_inflight_info {
> >  	struct rte_mbuf *mbuf;
> >  	uint16_t descs; /* num of descs inflight */
> > +	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
> >  };
> >
> >  /**
> > diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c index
> > a70fe01d8..f509186c6 100644
> > --- a/lib/librte_vhost/vhost.c
> > +++ b/lib/librte_vhost/vhost.c
> > @@ -338,19 +338,22 @@ cleanup_device(struct virtio_net *dev, int
> > destroy)  }
> >
> >  static void
> > -vhost_free_async_mem(struct vhost_virtqueue *vq)
> > +vhost_free_async_mem(struct virtio_net *dev, struct vhost_virtqueue
> > +*vq)
> >  {
> > -	if (vq->async_pkts_info)
> > -		rte_free(vq->async_pkts_info);
> > -	if (vq->async_descs_split)
> > +	rte_free(vq->async_pkts_info);
> > +
> > +	if (vq_is_packed(dev)) {
> > +		rte_free(vq->async_buffers_packed);
> > +		vq->async_buffers_packed = NULL;
> > +	} else {
> 
> Doing this is not necessary:
> 
> 	rte_free(vq->async_buffers_packed);
> 	vq->async_buffers_packed = NULL;
> 	rte_free(vq->async_descs_split);
> 	vq->async_descs_split = NULL;
> 
> The above will just work and will avoid adding the dev parameter.

async_buffers_packed and async_descs_split are two members of a union.
If I rte_free(vq->async_buffers_packed), then there is no need to rte_free(vq->async_descs_split).
So I use dev to determine which one I need to free.
Sure, I can free them both unconditionally (I could also free just one of them), but I think it's a little strange, right?

> 
> 
> >  		rte_free(vq->async_descs_split);
> > -	if (vq->it_pool)
> > -		rte_free(vq->it_pool);
> > -	if (vq->vec_pool)
> > -		rte_free(vq->vec_pool);
> > +		vq->async_descs_split = NULL;
> > +	}
> > +
> > +	rte_free(vq->it_pool);
> > +	rte_free(vq->vec_pool);
> >
> >  	vq->async_pkts_info = NULL;
> > -	vq->async_descs_split = NULL;
> >  	vq->it_pool = NULL;
> >  	vq->vec_pool = NULL;
> >  }
> > @@ -360,10 +363,10 @@ free_vq(struct virtio_net *dev, struct
> > vhost_virtqueue *vq)  {
> >  	if (vq_is_packed(dev))
> >  		rte_free(vq->shadow_used_packed);
> > -	else {
> > +	else
> >  		rte_free(vq->shadow_used_split);
> > -		vhost_free_async_mem(vq);
> > -	}
> > +
> > +	vhost_free_async_mem(dev, vq);
> >  	rte_free(vq->batch_copy_elems);
> >  	if (vq->iotlb_pool)
> >  		rte_mempool_free(vq->iotlb_pool);
> > @@ -1626,10 +1629,9 @@ int rte_vhost_async_channel_register(int vid,
> uint16_t queue_id,
> >  	if (unlikely(vq == NULL || !dev->async_copy))
> >  		return -1;
> >
> > -	/* packed queue is not supported */
> > -	if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
> > +	if (unlikely(!f.async_inorder)) {
> >  		VHOST_LOG_CONFIG(ERR,
> > -			"async copy is not supported on packed queue or
> non-inorder mode "
> > +			"async copy is not supported on non-inorder mode "
> >  			"(vid %d, qid: %d)\n", vid, queue_id);
> >  		return -1;
> >  	}
> > @@ -1667,12 +1669,19 @@ int rte_vhost_async_channel_register(int vid,
> uint16_t queue_id,
> >  	vq->vec_pool = rte_malloc_socket(NULL,
> >  			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
> >  			RTE_CACHE_LINE_SIZE, node);
> > -	vq->async_descs_split = rte_malloc_socket(NULL,
> > +	if (vq_is_packed(dev)) {
> > +		vq->async_buffers_packed = rte_malloc_socket(NULL,
> > +			vq->size * sizeof(struct vring_used_elem_packed),
> > +			RTE_CACHE_LINE_SIZE, node);
> > +	} else {
> > +		vq->async_descs_split = rte_malloc_socket(NULL,
> >  			vq->size * sizeof(struct vring_used_elem),
> >  			RTE_CACHE_LINE_SIZE, node);
> > -	if (!vq->async_descs_split || !vq->async_pkts_info ||
> > -		!vq->it_pool || !vq->vec_pool) {
> > -		vhost_free_async_mem(vq);
> > +	}
> > +
> > +	if (!vq->async_buffers_packed || !vq->async_descs_split ||
> > +		!vq->async_pkts_info || !vq->it_pool || !vq->vec_pool) {
> 
> Not really a fan of this error handling. Checking after every malloc whether it
> succeeded would be cleaner.

OK, makes sense to me. But that means I need to add a log message (which can be more specific) after every check, do you think that's OK?
If so, I'll fix it in the next version.

> 
> > +		vhost_free_async_mem(dev, vq);
> >  		VHOST_LOG_CONFIG(ERR,
> >  				"async register failed: cannot allocate
> memory for vq data "
> >  				"(vid %d, qid: %d)\n", vid, queue_id); @@ -
> 1728,7 +1737,7 @@ int
> > rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
> >  		goto out;
> >  	}
> >
> > -	vhost_free_async_mem(vq);
> > +	vhost_free_async_mem(dev, vq);
> >
> >  	vq->async_ops.transfer_data = NULL;
> >  	vq->async_ops.check_completed_copies = NULL; diff --git
> > a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index
> > f628714c2..673335217 100644
> > --- a/lib/librte_vhost/vhost.h
> > +++ b/lib/librte_vhost/vhost.h
> > @@ -201,9 +201,18 @@ struct vhost_virtqueue {
> >  	uint16_t	async_pkts_idx;
> >  	uint16_t	async_pkts_inflight_n;
> >  	uint16_t	async_last_pkts_n;
> > -	struct vring_used_elem  *async_descs_split;
> > -	uint16_t async_desc_idx;
> > -	uint16_t last_async_desc_idx;
> > +	union {
> > +		struct vring_used_elem  *async_descs_split;
> > +		struct vring_used_elem_packed *async_buffers_packed;
> > +	};
> > +	union {
> > +		uint16_t async_desc_idx;
> > +		uint16_t async_packed_buffer_idx;
> > +	};
> > +	union {
> > +		uint16_t last_async_desc_idx;
> > +		uint16_t last_async_buffer_idx;
> > +	};
> 
> Looks almost good to me now, thanks for making the change.
> The only minor issue is the naming, which is not consistent across the different
> fields: sometimes it contains split or packed, sometimes not.

OK, I'll check these names, and fix them.

Thanks a lot.
Cheng

> 
> Maxime


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v7 2/4] vhost: add support for packed ring in async vhost
  2021-04-15  2:02     ` Hu, Jiayu
@ 2021-04-15  5:54       ` Jiang, Cheng1
  0 siblings, 0 replies; 60+ messages in thread
From: Jiang, Cheng1 @ 2021-04-15  5:54 UTC (permalink / raw)
  To: Hu, Jiayu, maxime.coquelin, Xia, Chenbo
  Cc: dev, Yang, YvonneX, Wang, Yinan, Liu, Yong

Hi Jiayu,

> -----Original Message-----
> From: Hu, Jiayu <jiayu.hu@intel.com>
> Sent: Thursday, April 15, 2021 10:03 AM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>; maxime.coquelin@redhat.com;
> Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Yang, YvonneX <yvonnex.yang@intel.com>; Wang, Yinan
> <yinan.wang@intel.com>; Liu, Yong <yong.liu@intel.com>
> Subject: RE: [PATCH v7 2/4] vhost: add support for packed ring in async vhost
> 
> Hi Cheng,
> 
> > -----Original Message-----
> > From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > Sent: Wednesday, April 14, 2021 2:14 PM
> > To: maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Yang, YvonneX
> > <yvonnex.yang@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Liu,
> > Yong <yong.liu@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>
> > Subject: [PATCH v7 2/4] vhost: add support for packed ring in async
> > vhost
> >
> > For now async vhost data path only supports split ring. This patch
> > enables packed ring in async vhost data path to make async vhost
> > compatible with virtio 1.1 spec.
> >
> > Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> > ---
> >  lib/librte_vhost/rte_vhost_async.h |   1 +
> >  lib/librte_vhost/vhost.c           |  49 ++--
> >  lib/librte_vhost/vhost.h           |  15 +-
> >  lib/librte_vhost/virtio_net.c      | 432 +++++++++++++++++++++++++++--
> >  4 files changed, 456 insertions(+), 41 deletions(-)
> >
> > diff --git a/lib/librte_vhost/rte_vhost_async.h
> > b/lib/librte_vhost/rte_vhost_async.h
> > index c855ff875..6faa31f5a 100644
> > --- a/lib/librte_vhost/rte_vhost_async.h
> > +++ b/lib/librte_vhost/rte_vhost_async.h
> > @@ -89,6 +89,7 @@ struct rte_vhost_async_channel_ops {  struct
> > async_inflight_info {  struct rte_mbuf *mbuf;  uint16_t descs; /* num
> > of descs inflight */
> > +uint16_t nr_buffers; /* num of buffers inflight for packed ring */
> >  };
> >
> >  /**
> > diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c index
> > a70fe01d8..f509186c6 100644
> > --- a/lib/librte_vhost/vhost.c
> > +++ b/lib/librte_vhost/vhost.c
> > @@ -338,19 +338,22 @@ cleanup_device(struct virtio_net *dev, int destroy)
> >  }
> >
> >  static void
> > -vhost_free_async_mem(struct vhost_virtqueue *vq)
> > +vhost_free_async_mem(struct virtio_net *dev, struct vhost_virtqueue *vq)
> >  {
> > -	if (vq->async_pkts_info)
> > -		rte_free(vq->async_pkts_info);
> > -	if (vq->async_descs_split)
> > +	rte_free(vq->async_pkts_info);
> > +
> > +	if (vq_is_packed(dev)) {
> > +		rte_free(vq->async_buffers_packed);
> > +		vq->async_buffers_packed = NULL;
> > +	} else {
> >  		rte_free(vq->async_descs_split);
> > -	if (vq->it_pool)
> > -		rte_free(vq->it_pool);
> > -	if (vq->vec_pool)
> > -		rte_free(vq->vec_pool);
> > +		vq->async_descs_split = NULL;
> > +	}
> > +
> > +	rte_free(vq->it_pool);
> > +	rte_free(vq->vec_pool);
> >
> >  	vq->async_pkts_info = NULL;
> > -	vq->async_descs_split = NULL;
> >  	vq->it_pool = NULL;
> >  	vq->vec_pool = NULL;
> >  }
> > @@ -360,10 +363,10 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
> >  {
> >  	if (vq_is_packed(dev))
> >  		rte_free(vq->shadow_used_packed);
> > -	else {
> > +	else
> >  		rte_free(vq->shadow_used_split);
> > -		vhost_free_async_mem(vq);
> > -	}
> > +
> > +	vhost_free_async_mem(dev, vq);
> >  	rte_free(vq->batch_copy_elems);
> >  	if (vq->iotlb_pool)
> >  		rte_mempool_free(vq->iotlb_pool);
> > @@ -1626,10 +1629,9 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
> >  	if (unlikely(vq == NULL || !dev->async_copy))
> >  		return -1;
> >
> > -	/* packed queue is not supported */
> > -	if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
> > +	if (unlikely(!f.async_inorder)) {
> >  		VHOST_LOG_CONFIG(ERR,
> > -			"async copy is not supported on packed queue or non-inorder mode "
> > +			"async copy is not supported on non-inorder mode "
> >  			"(vid %d, qid: %d)\n", vid, queue_id);
> >  		return -1;
> >  	}
> > @@ -1667,12 +1669,19 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
> >  	vq->vec_pool = rte_malloc_socket(NULL,
> >  			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
> >  			RTE_CACHE_LINE_SIZE, node);
> > -	vq->async_descs_split = rte_malloc_socket(NULL,
> > +	if (vq_is_packed(dev)) {
> > +		vq->async_buffers_packed = rte_malloc_socket(NULL,
> > +			vq->size * sizeof(struct vring_used_elem_packed),
> > +			RTE_CACHE_LINE_SIZE, node);
> > +	} else {
> > +		vq->async_descs_split = rte_malloc_socket(NULL,
> >  			vq->size * sizeof(struct vring_used_elem),
> >  			RTE_CACHE_LINE_SIZE, node);
> > -	if (!vq->async_descs_split || !vq->async_pkts_info ||
> > -		!vq->it_pool || !vq->vec_pool) {
> > -		vhost_free_async_mem(vq);
> > +	}
> > +
> > +	if (!vq->async_buffers_packed || !vq->async_descs_split ||
> async_buffers_packed and async_descs_split are two members of a union.
> Like what is done in vhost_free_async_mem(), do you think it's better
> to check whether each of them is NULL in its own if/else branch?

Sure, makes sense to me. I'll fix this together with the other async memory checks.
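
Roughly what the per-ring-type check could look like (a sketch only; the
"fail" label and the surrounding cleanup are illustrative, not the final code):

    if (vq_is_packed(dev)) {
            if (!vq->async_buffers_packed)
                    goto fail;      /* packed-ring allocation failed */
    } else {
            if (!vq->async_descs_split)
                    goto fail;      /* split-ring allocation failed */
    }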

> 
> > +!vq->async_pkts_info || !vq->it_pool || !vq->vec_pool) {
> > +vhost_free_async_mem(dev, vq);
> >  VHOST_LOG_CONFIG(ERR,
> >  "async register failed: cannot allocate memory for vq data "
> >  "(vid %d, qid: %d)\n", vid, queue_id); @@ -1728,7 +1737,7 @@ int
> > rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)  goto
> > out;  }
> >
> > -vhost_free_async_mem(vq);
> > +vhost_free_async_mem(dev, vq);
> >
> >  vq->async_ops.transfer_data = NULL;
> >  	vq->async_ops.check_completed_copies = NULL;
> > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> > index f628714c2..673335217 100644
> > --- a/lib/librte_vhost/vhost.h
> > +++ b/lib/librte_vhost/vhost.h
> > @@ -201,9 +201,18 @@ struct vhost_virtqueue {
> >  	uint16_t	async_pkts_idx;
> >  	uint16_t	async_pkts_inflight_n;
> >  	uint16_t	async_last_pkts_n;
> > -	struct vring_used_elem  *async_descs_split;
> > -	uint16_t async_desc_idx;
> > -	uint16_t last_async_desc_idx;
> > +	union {
> > +		struct vring_used_elem  *async_descs_split;
> > +		struct vring_used_elem_packed *async_buffers_packed;
> > +	};
> > +	union {
> > +		uint16_t async_desc_idx;
> > +		uint16_t async_packed_buffer_idx;
> > +	};
> > +	union {
> > +		uint16_t last_async_desc_idx;
> > +		uint16_t last_async_buffer_idx;
> > +	};
> >
> >  	/* vq async features */
> >  	bool		async_inorder;
> > diff --git a/lib/librte_vhost/virtio_net.c
> > b/lib/librte_vhost/virtio_net.c index 438bdafd1..54e11e3a5 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -363,14 +363,14 @@
> > vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue
> *vq,
> > }
> >
> >  static __rte_always_inline void
> > -vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
> > -   struct vhost_virtqueue *vq,
> > -   uint32_t len[],
> > -   uint16_t id[],
> > -   uint16_t count[],
> > +vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
> > +   uint32_t *len,
> > +   uint16_t *id,
> > +   uint16_t *count,
> >     uint16_t num_buffers)
> >  {
> >  uint16_t i;
> > +
> >  for (i = 0; i < num_buffers; i++) {
> >  /* enqueue shadow flush action aligned with batch num */  if
> > (!vq->shadow_used_idx) @@ -382,6 +382,17 @@
> > vhost_shadow_enqueue_single_packed(struct
> > virtio_net *dev,
> >  vq->shadow_aligned_idx += count[i];
> >  vq->shadow_used_idx++;
> >  }
> > +}
> > +
> > +static __rte_always_inline void
> > +vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
> > +   struct vhost_virtqueue *vq,
> > +   uint32_t *len,
> > +   uint16_t *id,
> > +   uint16_t *count,
> > +   uint16_t num_buffers)
> > +{
> > +vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
> >
> >  if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
> > do_data_copy_enqueue(dev, vq); @@ -1474,6 +1485,23 @@
> > store_dma_desc_info_split(struct vring_used_elem *s_ring, struct
> > vring_used_elem  }  }
> >
> > +static __rte_always_inline void
> > +store_dma_desc_info_packed(struct vring_used_elem_packed *s_ring,
> > +struct vring_used_elem_packed *d_ring, uint16_t ring_size, uint16_t
> > +s_idx, uint16_t d_idx, uint16_t
> > count)
> > +{
> > +uint16_t elem_size = sizeof(struct vring_used_elem_packed);
> > +
> > +if (d_idx + count <= ring_size) {
> > +rte_memcpy(d_ring + d_idx, s_ring + s_idx, count *
> > elem_size);
> > +} else {
> > +uint16_t size = ring_size - d_idx;
> > +
> > +rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
> > +rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) *
> > elem_size);
> > +}
> > +}
> > +
> >  static __rte_noinline uint32_t
> >  virtio_dev_rx_async_submit_split(struct virtio_net *dev,  struct
> > vhost_virtqueue *vq, uint16_t queue_id, @@ -1641,6 +1669,330 @@
> > virtio_dev_rx_async_submit_split(struct
> > virtio_net *dev,
> >  return pkt_idx;
> >  }
> >
> > +static __rte_always_inline void
> > +vhost_update_used_packed(struct vhost_virtqueue *vq, struct
> > +vring_used_elem_packed *shadow_ring, uint16_t count) { int i;
> > +uint16_t used_idx = vq->last_used_idx; uint16_t head_idx =
> > +vq->last_used_idx; uint16_t head_flags = 0;
> > +
> > +if (count == 0)
> > +return;
> > +
> > +/* Split loop in two to save memory barriers */ for (i = 0; i <
> > +count; i++) {
> > +vq->desc_packed[used_idx].id = shadow_ring[i].id;
> > +vq->desc_packed[used_idx].len = shadow_ring[i].len;
> > +
> > +used_idx += shadow_ring[i].count;
> > +if (used_idx >= vq->size)
> > +used_idx -= vq->size;
> > +}
> > +
> > +/* The ordering for storing desc flags needs to be enforced. */
> > +rte_atomic_thread_fence(__ATOMIC_RELEASE);
> > +
> > +for (i = 0; i < count; i++) {
> > +uint16_t flags;
> > +
> > +if (vq->shadow_used_packed[i].len)
> > +flags = VRING_DESC_F_WRITE;
> > +else
> > +flags = 0;
> > +
> > +if (vq->used_wrap_counter) {
> > +flags |= VRING_DESC_F_USED;
> > +flags |= VRING_DESC_F_AVAIL;
> > +} else {
> > +flags &= ~VRING_DESC_F_USED;
> > +flags &= ~VRING_DESC_F_AVAIL;
> > +}
> > +
> > +if (i > 0) {
> > +vq->desc_packed[vq->last_used_idx].flags = flags;
> > +
> No need for a blank line above.

Sure, thanks.

> 
> > +} else {
> > +head_idx = vq->last_used_idx;
> > +head_flags = flags;
> > +}
> > +
> > +vq_inc_last_used_packed(vq, shadow_ring[i].count); }
> > +
> > +vq->desc_packed[head_idx].flags = head_flags;
> > +}
> > +
> > +static __rte_always_inline int
> > +vhost_enqueue_async_single_packed(struct virtio_net *dev,
> > +    struct vhost_virtqueue *vq,
> > +    struct rte_mbuf *pkt,
> > +    struct buf_vector *buf_vec,
> > +    uint16_t *nr_descs,
> > +    uint16_t *nr_buffers,
> > +    struct vring_packed_desc *async_descs,
> > +    struct iovec *src_iovec, struct iovec *dst_iovec,
> > +    struct rte_vhost_iov_iter *src_it,
> > +    struct rte_vhost_iov_iter *dst_it) { uint16_t nr_vec = 0;
> > +uint16_t avail_idx = vq->last_avail_idx; uint16_t max_tries, tries =
> > +0; uint16_t buf_id = 0; uint32_t len = 0; uint16_t desc_count = 0;
> > +uint32_t size = pkt->pkt_len + sizeof(struct
> > virtio_net_hdr_mrg_rxbuf);
> > +uint32_t buffer_len[vq->size];
> > +uint16_t buffer_buf_id[vq->size];
> > +uint16_t buffer_desc_count[vq->size]; *nr_buffers = 0;
> > +
> > +if (rxvq_is_mergeable(dev))
> > +max_tries = vq->size - 1;
> > +else
> > +max_tries = 1;
> > +
> > +while (size > 0) {
> > +/*
> > + * if we tried all available ring items, and still
> > + * can't get enough buf, it means something abnormal
> > + * happened.
> > + */
> > +if (unlikely(++tries > max_tries))
> > +return -1;
> > +
> > +if (unlikely(fill_vec_buf_packed(dev, vq, avail_idx,
> > &desc_count, buf_vec, &nr_vec,
> > +&buf_id, &len,
> > VHOST_ACCESS_RW) < 0))
> > +return -1;
> > +
> > +len = RTE_MIN(len, size);
> > +size -= len;
> > +
> > +buffer_len[*nr_buffers] = len;
> > +buffer_buf_id[*nr_buffers] = buf_id;
> > +buffer_desc_count[*nr_buffers] = desc_count; *nr_buffers += 1;
> > +
> > +*nr_descs += desc_count;
> > +avail_idx += desc_count;
> > +if (avail_idx >= vq->size)
> > +avail_idx -= vq->size;
> > +}
> > +
> > +if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers,
> > src_iovec, dst_iovec,
> > +src_it, dst_it) < 0)
> > +return -1;
> > +/* store descriptors for DMA */
> > +if (avail_idx >= *nr_descs) {
> > +rte_memcpy(async_descs, &vq->desc_packed[vq-
> > >last_avail_idx],
> > +*nr_descs * sizeof(struct vring_packed_desc)); } else { uint16_t
> > +nr_copy = vq->size - vq->last_avail_idx; rte_memcpy(async_descs,
> > +&vq->desc_packed[vq-
> > >last_avail_idx],
> > +nr_copy * sizeof(struct vring_packed_desc)); rte_memcpy(async_descs +
> > +nr_copy, vq->desc_packed, (*nr_descs - nr_copy) * sizeof(struct
> > vring_packed_desc));
> > +}
> > +
> > +vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id,
> > buffer_desc_count, *nr_buffers);
> > +
> > +return 0;
> > +}
> > +
> > +static __rte_always_inline int16_t
> > +virtio_dev_rx_async_single_packed(struct virtio_net *dev, struct
> > vhost_virtqueue *vq,
> > +    struct rte_mbuf *pkt, uint16_t *nr_descs, uint16_t
> > *nr_buffers,
> > +    struct vring_packed_desc *async_descs,
> > +    struct iovec *src_iovec, struct iovec *dst_iovec,
> > +    struct rte_vhost_iov_iter *src_it, struct
> > rte_vhost_iov_iter *dst_it)
> > +{
> > +struct buf_vector buf_vec[BUF_VECTOR_MAX]; *nr_descs = 0;
> *nr_buffers
> > += 0;
> > +
> > +if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt,
> > buf_vec, nr_descs, nr_buffers,
> > + async_descs, src_iovec,
> > dst_iovec,
> > + src_it, dst_it) < 0)) {
> > +VHOST_LOG_DATA(DEBUG, "(%d) failed to get enough desc
> > from vring\n", dev->vid);
> > +return -1;
> > +}
> > +
> > +VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end
> > index %d\n",
> > +dev->vid, vq->last_avail_idx, vq->last_avail_idx +
> > *nr_descs);
> > +
> > +return 0;
> > +}
> > +
> > +static __rte_always_inline void
> > +dma_error_handler_packed(struct vhost_virtqueue *vq, struct
> > vring_packed_desc *async_descs,
> > +uint16_t async_descs_idx, uint16_t slot_idx, uint32_t
> > nr_err,
> > +uint32_t *pkt_idx, uint32_t *num_async_pkts,
> > uint32_t *num_done_pkts)
> > +{
> > +uint16_t descs_err = 0;
> > +uint16_t buffers_err = 0;
> > +struct async_inflight_info *pkts_info = vq->async_pkts_info;
> > +
> > +*num_async_pkts -= nr_err;
> > +*pkt_idx -= nr_err;
> > +/* calculate the sum of buffers and descs of DMA-error packets. */
> > +while (nr_err-- > 0) { descs_err += pkts_info[slot_idx %
> > +vq->size].descs;
> I notice there are several parts using "%" to wrap around the index, but the
> existing code uses "& (vq->size - 1)" instead. I think it's better to keep it consistent.

Unlike the split ring, the packed ring size is not necessarily a power of 2, so I can't use "& (vq->size - 1)" for the packed ring.
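
A minimal sketch of the two wrap-around styles (the helper names are
illustrative, not part of the patch):

    #include <stdint.h>

    /* split ring: the size is a power of 2, so a mask can wrap the index */
    static inline uint16_t wrap_mask(uint16_t idx, uint16_t size)
    {
            return idx & (size - 1);
    }

    /* packed ring: the size need not be a power of 2, so use "%" instead */
    static inline uint16_t wrap_mod(uint16_t idx, uint16_t size)
    {
            return idx % size;
    }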

Thanks,
Cheng

> 
> Thanks,
> Jiayu


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v8 0/4] add support for packed ring in async vhost
  2021-03-17  8:54 [dpdk-dev] [PATCH] vhost: add support for packed ring in async vhost Cheng Jiang
                   ` (5 preceding siblings ...)
  2021-04-14  6:13 ` [dpdk-dev] [PATCH v7 0/4] add support for packed ring in async vhost Cheng Jiang
@ 2021-04-19  8:51 ` Cheng Jiang
  2021-04-19  8:51   ` [dpdk-dev] [PATCH v8 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
                     ` (3 more replies)
  2021-04-27  8:03 ` [dpdk-dev] [PATCH v9 0/4] add support for packed ring in async vhost Cheng Jiang
  7 siblings, 4 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-19  8:51 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

For now the async vhost data path only supports the split ring structure.
In order to make async vhost compatible with the virtio 1.1 spec, this
patch set cleans up the async split ring code and enables packed ring in
the async vhost data path. The batch datapath is also enabled in the
async vhost packed ring.

v8:
 * fix some variable names for consistency
 * clean up code
v7:
 * fix compile issues
 * add argument *dev in vhost_free_async_mem() for ring type decision
v6:
 * fix some typos in commit log
 * improve index usage
 * remove shadow_ring_store()
 * add store_dma_desc_info_split() and store_dma_desc_info_packed()
 * remove some checks in vhost_free_async_mem()
 * change index calculation since the size isn't necessarily a power of 2
 * move error handling in a dedicated function
 * clean up code
v5:
 * clean some codes for packed ring datapath
 * fix an index error in shadow_ring_store()
v4:
  * change the patch structure
  * clean code for async split ring
  * reuse some code from split ring
  * change the error handler for DMA-copy packet
  * add check for malloc
  * remove useless code
  * add doc update
v3:
  * fix error handler for DMA-copy packet
v2:
  * fix wrong buffer index in rte_vhost_poll_enqueue_completed()
  * add async_buffers_packed memory free in vhost_free_async_mem()

Cheng Jiang (4):
  vhost: abstract and reorganize async split ring code
  vhost: add support for packed ring in async vhost
  vhost: add batch datapath for async vhost packed ring
  doc: add release note for vhost async packed ring

 doc/guides/rel_notes/release_21_05.rst |   4 +
 lib/librte_vhost/rte_vhost_async.h     |   1 +
 lib/librte_vhost/vhost.c               |  79 +++-
 lib/librte_vhost/vhost.h               |  15 +-
 lib/librte_vhost/virtio_net.c          | 598 +++++++++++++++++++++----
 5 files changed, 587 insertions(+), 110 deletions(-)

--
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v8 1/4] vhost: abstract and reorganize async split ring code
  2021-04-19  8:51 ` [dpdk-dev] [PATCH v8 0/4] add support for packed ring in async vhost Cheng Jiang
@ 2021-04-19  8:51   ` Cheng Jiang
  2021-04-27  1:19     ` Hu, Jiayu
  2021-04-19  8:51   ` [dpdk-dev] [PATCH v8 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 60+ messages in thread
From: Cheng Jiang @ 2021-04-19  8:51 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

This patch moves some of the async vhost split ring code into inline
functions to improve readability. It also changes the iterators from
pointer arithmetic to an index-based style to make the code more concise.
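
In short, the iterator change amounts to the following (excerpted and
simplified from the diff below):

    /* before: bump iterator pointers into the pools */
    src_iovec += src_it->nr_segs;
    dst_iovec += dst_it->nr_segs;
    src_it += 2;
    dst_it += 2;

    /* after: keep plain indices into the same pools */
    iovec_idx += it_pool[it_idx].nr_segs;
    it_idx += 2;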

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
 lib/librte_vhost/virtio_net.c | 132 +++++++++++++++++-----------------
 1 file changed, 66 insertions(+), 66 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index ff3987860..438bdafd1 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1458,6 +1458,22 @@ virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
 		(vq_size - n_inflight + pkts_idx) & (vq_size - 1);
 }

+static __rte_always_inline void
+store_dma_desc_info_split(struct vring_used_elem *s_ring, struct vring_used_elem *d_ring,
+		uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
+{
+	uint16_t elem_size = sizeof(struct vring_used_elem);
+
+	if (d_idx + count <= ring_size) {
+		rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
+	} else {
+		uint16_t size = ring_size - d_idx;
+
+		rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
+		rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
+	}
+}
+
 static __rte_noinline uint32_t
 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct vhost_virtqueue *vq, uint16_t queue_id,
@@ -1474,10 +1490,9 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
 	struct iovec *src_iovec = vec_pool;
 	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
-	struct rte_vhost_iov_iter *src_it = it_pool;
-	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
 	uint16_t slot_idx = 0;
 	uint16_t segs_await = 0;
+	uint16_t iovec_idx = 0, it_idx = 0;
 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
 	uint32_t n_pkts = 0, pkt_err = 0;
 	uint32_t num_async_pkts = 0, num_done_pkts = 0;
@@ -1511,29 +1526,30 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			dev->vid, vq->last_avail_idx,
 			vq->last_avail_idx + num_buffers);

-		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx],
-				buf_vec, nr_vec, num_buffers,
-				src_iovec, dst_iovec, src_it, dst_it) < 0) {
+		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx], buf_vec, nr_vec, num_buffers,
+				&src_iovec[iovec_idx], &dst_iovec[iovec_idx],
+				&it_pool[it_idx], &it_pool[it_idx + 1]) < 0) {
 			vq->shadow_used_idx -= num_buffers;
 			break;
 		}

 		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
 			(vq->size - 1);
-		if (src_it->count) {
+		if (it_pool[it_idx].count) {
 			uint16_t from, to;

-			async_fill_desc(&tdes[pkt_burst_idx++], src_it, dst_it);
+			async_fill_desc(&tdes[pkt_burst_idx++],
+				&it_pool[it_idx], &it_pool[it_idx + 1]);
 			pkts_info[slot_idx].descs = num_buffers;
 			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
 			async_pkts_log[num_async_pkts].pkt_idx = pkt_idx;
 			async_pkts_log[num_async_pkts++].last_avail_idx =
 				vq->last_avail_idx;
-			src_iovec += src_it->nr_segs;
-			dst_iovec += dst_it->nr_segs;
-			src_it += 2;
-			dst_it += 2;
-			segs_await += src_it->nr_segs;
+
+			iovec_idx += it_pool[it_idx].nr_segs;
+			it_idx += 2;
+
+			segs_await += it_pool[it_idx].nr_segs;

 			/**
 			 * recover shadow used ring and keep DMA-occupied
@@ -1541,23 +1557,10 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			 */
 			from = vq->shadow_used_idx - num_buffers;
 			to = vq->async_desc_idx & (vq->size - 1);
-			if (num_buffers + to <= vq->size) {
-				rte_memcpy(&vq->async_descs_split[to],
-						&vq->shadow_used_split[from],
-						num_buffers *
-						sizeof(struct vring_used_elem));
-			} else {
-				int size = vq->size - to;
-
-				rte_memcpy(&vq->async_descs_split[to],
-						&vq->shadow_used_split[from],
-						size *
-						sizeof(struct vring_used_elem));
-				rte_memcpy(vq->async_descs_split,
-						&vq->shadow_used_split[from +
-						size], (num_buffers - size) *
-					   sizeof(struct vring_used_elem));
-			}
+
+			store_dma_desc_info_split(vq->shadow_used_split,
+					vq->async_descs_split, vq->size, from, to, num_buffers);
+
 			vq->async_desc_idx += num_buffers;
 			vq->shadow_used_idx -= num_buffers;
 		} else
@@ -1575,10 +1578,9 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			BUF_VECTOR_MAX))) {
 			n_pkts = vq->async_ops.transfer_data(dev->vid,
 					queue_id, tdes, 0, pkt_burst_idx);
-			src_iovec = vec_pool;
-			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
-			src_it = it_pool;
-			dst_it = it_pool + 1;
+			iovec_idx = 0;
+			it_idx = 0;
+
 			segs_await = 0;
 			vq->async_pkts_inflight_n += n_pkts;

@@ -1639,6 +1641,36 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	return pkt_idx;
 }

+static __rte_always_inline void
+write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
+{
+	uint16_t nr_left = n_descs;
+	uint16_t nr_copy;
+	uint16_t to, from;
+
+	do {
+		from = vq->last_async_desc_idx & (vq->size - 1);
+		nr_copy = nr_left + from <= vq->size ? nr_left : vq->size - from;
+		to = vq->last_used_idx & (vq->size - 1);
+
+		if (to + nr_copy <= vq->size) {
+			rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
+					nr_copy * sizeof(struct vring_used_elem));
+		} else {
+			uint16_t size = vq->size - to;
+
+			rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
+					size * sizeof(struct vring_used_elem));
+			rte_memcpy(&vq->used->ring[0], &vq->async_descs_split[from + size],
+					(nr_copy - size) * sizeof(struct vring_used_elem));
+		}
+
+		vq->last_async_desc_idx += nr_copy;
+		vq->last_used_idx += nr_copy;
+		nr_left -= nr_copy;
+	} while (nr_left > 0);
+}
+
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count)
 {
@@ -1695,39 +1727,7 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 	vq->async_pkts_inflight_n -= n_pkts_put;

 	if (likely(vq->enabled && vq->access_ok)) {
-		uint16_t nr_left = n_descs;
-		uint16_t nr_copy;
-		uint16_t to;
-
-		/* write back completed descriptors to used ring */
-		do {
-			from = vq->last_async_desc_idx & (vq->size - 1);
-			nr_copy = nr_left + from <= vq->size ? nr_left :
-				vq->size - from;
-			to = vq->last_used_idx & (vq->size - 1);
-
-			if (to + nr_copy <= vq->size) {
-				rte_memcpy(&vq->used->ring[to],
-						&vq->async_descs_split[from],
-						nr_copy *
-						sizeof(struct vring_used_elem));
-			} else {
-				uint16_t size = vq->size - to;
-
-				rte_memcpy(&vq->used->ring[to],
-						&vq->async_descs_split[from],
-						size *
-						sizeof(struct vring_used_elem));
-				rte_memcpy(vq->used->ring,
-						&vq->async_descs_split[from +
-						size], (nr_copy - size) *
-						sizeof(struct vring_used_elem));
-			}
-
-			vq->last_async_desc_idx += nr_copy;
-			vq->last_used_idx += nr_copy;
-			nr_left -= nr_copy;
-		} while (nr_left > 0);
+		write_back_completed_descs_split(vq, n_descs);

 		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
 		vhost_vring_call_split(dev, vq);
--
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v8 2/4] vhost: add support for packed ring in async vhost
  2021-04-19  8:51 ` [dpdk-dev] [PATCH v8 0/4] add support for packed ring in async vhost Cheng Jiang
  2021-04-19  8:51   ` [dpdk-dev] [PATCH v8 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
@ 2021-04-19  8:51   ` Cheng Jiang
  2021-04-27  5:16     ` Hu, Jiayu
  2021-04-19  8:51   ` [dpdk-dev] [PATCH v8 3/4] vhost: add batch datapath for async vhost packed ring Cheng Jiang
  2021-04-19  8:51   ` [dpdk-dev] [PATCH v8 4/4] doc: add release note for vhost async " Cheng Jiang
  3 siblings, 1 reply; 60+ messages in thread
From: Cheng Jiang @ 2021-04-19  8:51 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

For now the async vhost data path only supports the split ring. This patch
enables packed ring in the async vhost data path to make async vhost
compatible with the virtio 1.1 spec.
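
The key user-visible change is that rte_vhost_async_channel_register() no
longer rejects packed queues; only non-inorder mode is still refused
(excerpted from the diff below):

    /* before: packed queues were rejected at registration time */
    if (unlikely(vq_is_packed(dev) || !f.async_inorder)) { ... }

    /* after: only non-inorder mode is rejected */
    if (unlikely(!f.async_inorder)) { ... }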

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 lib/librte_vhost/rte_vhost_async.h |   1 +
 lib/librte_vhost/vhost.c           |  79 ++++--
 lib/librte_vhost/vhost.h           |  15 +-
 lib/librte_vhost/virtio_net.c      | 441 +++++++++++++++++++++++++++--
 4 files changed, 488 insertions(+), 48 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost_async.h b/lib/librte_vhost/rte_vhost_async.h
index c855ff875..6faa31f5a 100644
--- a/lib/librte_vhost/rte_vhost_async.h
+++ b/lib/librte_vhost/rte_vhost_async.h
@@ -89,6 +89,7 @@ struct rte_vhost_async_channel_ops {
 struct async_inflight_info {
 	struct rte_mbuf *mbuf;
 	uint16_t descs; /* num of descs inflight */
+	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
 };
 
 /**
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index a70fe01d8..2e3f9eb09 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -340,17 +340,17 @@ cleanup_device(struct virtio_net *dev, int destroy)
 static void
 vhost_free_async_mem(struct vhost_virtqueue *vq)
 {
-	if (vq->async_pkts_info)
-		rte_free(vq->async_pkts_info);
-	if (vq->async_descs_split)
-		rte_free(vq->async_descs_split);
-	if (vq->it_pool)
-		rte_free(vq->it_pool);
-	if (vq->vec_pool)
-		rte_free(vq->vec_pool);
+	rte_free(vq->async_pkts_info);
 
-	vq->async_pkts_info = NULL;
+	rte_free(vq->async_buffers_packed);
+	vq->async_buffers_packed = NULL;
+	rte_free(vq->async_descs_split);
 	vq->async_descs_split = NULL;
+
+	rte_free(vq->it_pool);
+	rte_free(vq->vec_pool);
+
+	vq->async_pkts_info = NULL;
 	vq->it_pool = NULL;
 	vq->vec_pool = NULL;
 }
@@ -360,10 +360,10 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
 {
 	if (vq_is_packed(dev))
 		rte_free(vq->shadow_used_packed);
-	else {
+	else
 		rte_free(vq->shadow_used_split);
-		vhost_free_async_mem(vq);
-	}
+
+	vhost_free_async_mem(vq);
 	rte_free(vq->batch_copy_elems);
 	if (vq->iotlb_pool)
 		rte_mempool_free(vq->iotlb_pool);
@@ -1626,10 +1626,9 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	if (unlikely(vq == NULL || !dev->async_copy))
 		return -1;
 
-	/* packed queue is not supported */
-	if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
+	if (unlikely(!f.async_inorder)) {
 		VHOST_LOG_CONFIG(ERR,
-			"async copy is not supported on packed queue or non-inorder mode "
+			"async copy is not supported on non-inorder mode "
 			"(vid %d, qid: %d)\n", vid, queue_id);
 		return -1;
 	}
@@ -1661,24 +1660,60 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	vq->async_pkts_info = rte_malloc_socket(NULL,
 			vq->size * sizeof(struct async_inflight_info),
 			RTE_CACHE_LINE_SIZE, node);
+	if (!vq->async_pkts_info) {
+		vhost_free_async_mem(vq);
+		VHOST_LOG_CONFIG(ERR,
+			"async register failed: cannot allocate memory for async_pkts_info "
+			"(vid %d, qid: %d)\n", vid, queue_id);
+		goto reg_out;
+	}
+
 	vq->it_pool = rte_malloc_socket(NULL,
 			VHOST_MAX_ASYNC_IT * sizeof(struct rte_vhost_iov_iter),
 			RTE_CACHE_LINE_SIZE, node);
+	if (!vq->it_pool) {
+		vhost_free_async_mem(vq);
+		VHOST_LOG_CONFIG(ERR,
+			"async register failed: cannot allocate memory for it_pool "
+			"(vid %d, qid: %d)\n", vid, queue_id);
+		goto reg_out;
+	}
+
 	vq->vec_pool = rte_malloc_socket(NULL,
 			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
 			RTE_CACHE_LINE_SIZE, node);
-	vq->async_descs_split = rte_malloc_socket(NULL,
-			vq->size * sizeof(struct vring_used_elem),
-			RTE_CACHE_LINE_SIZE, node);
-	if (!vq->async_descs_split || !vq->async_pkts_info ||
-		!vq->it_pool || !vq->vec_pool) {
+	if (!vq->vec_pool) {
 		vhost_free_async_mem(vq);
 		VHOST_LOG_CONFIG(ERR,
-				"async register failed: cannot allocate memory for vq data "
-				"(vid %d, qid: %d)\n", vid, queue_id);
+			"async register failed: cannot allocate memory for vec_pool "
+			"(vid %d, qid: %d)\n", vid, queue_id);
 		goto reg_out;
 	}
 
+	if (vq_is_packed(dev)) {
+		vq->async_buffers_packed = rte_malloc_socket(NULL,
+			vq->size * sizeof(struct vring_used_elem_packed),
+			RTE_CACHE_LINE_SIZE, node);
+		if (!vq->async_buffers_packed) {
+			vhost_free_async_mem(vq);
+			VHOST_LOG_CONFIG(ERR,
+				"async register failed: cannot allocate memory for async buffers "
+				"(vid %d, qid: %d)\n", vid, queue_id);
+			goto reg_out;
+		}
+	} else {
+		vq->async_descs_split = rte_malloc_socket(NULL,
+			vq->size * sizeof(struct vring_used_elem),
+			RTE_CACHE_LINE_SIZE, node);
+		if (!vq->async_descs_split) {
+			vhost_free_async_mem(vq);
+			VHOST_LOG_CONFIG(ERR,
+				"async register failed: cannot allocate memory for async descs "
+				"(vid %d, qid: %d)\n", vid, queue_id);
+			goto reg_out;
+		}
+	}
+
 	vq->async_ops.check_completed_copies = ops->check_completed_copies;
 	vq->async_ops.transfer_data = ops->transfer_data;
 
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index f628714c2..b30363564 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -201,9 +201,18 @@ struct vhost_virtqueue {
 	uint16_t	async_pkts_idx;
 	uint16_t	async_pkts_inflight_n;
 	uint16_t	async_last_pkts_n;
-	struct vring_used_elem  *async_descs_split;
-	uint16_t async_desc_idx;
-	uint16_t last_async_desc_idx;
+	union {
+		struct vring_used_elem  *async_descs_split;
+		struct vring_used_elem_packed *async_buffers_packed;
+	};
+	union {
+		uint16_t async_desc_idx_split;
+		uint16_t async_buffer_idx_packed;
+	};
+	union {
+		uint16_t last_async_desc_idx_split;
+		uint16_t last_async_buffer_idx_packed;
+	};
 
 	/* vq async features */
 	bool		async_inorder;
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 438bdafd1..ce88ad3c0 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -363,14 +363,14 @@ vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
 }
 
 static __rte_always_inline void
-vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
-				   struct vhost_virtqueue *vq,
-				   uint32_t len[],
-				   uint16_t id[],
-				   uint16_t count[],
+vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
+				   uint32_t *len,
+				   uint16_t *id,
+				   uint16_t *count,
 				   uint16_t num_buffers)
 {
 	uint16_t i;
+
 	for (i = 0; i < num_buffers; i++) {
 		/* enqueue shadow flush action aligned with batch num */
 		if (!vq->shadow_used_idx)
@@ -382,6 +382,17 @@ vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
 		vq->shadow_aligned_idx += count[i];
 		vq->shadow_used_idx++;
 	}
+}
+
+static __rte_always_inline void
+vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
+				   struct vhost_virtqueue *vq,
+				   uint32_t *len,
+				   uint16_t *id,
+				   uint16_t *count,
+				   uint16_t num_buffers)
+{
+	vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
 
 	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
 		do_data_copy_enqueue(dev, vq);
@@ -1474,6 +1485,23 @@ store_dma_desc_info_split(struct vring_used_elem *s_ring, struct vring_used_elem
 	}
 }
 
+static __rte_always_inline void
+store_dma_desc_info_packed(struct vring_used_elem_packed *s_ring,
+		struct vring_used_elem_packed *d_ring,
+		uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
+{
+	uint16_t elem_size = sizeof(struct vring_used_elem_packed);
+
+	if (d_idx + count <= ring_size) {
+		rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
+	} else {
+		uint16_t size = ring_size - d_idx;
+
+		rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
+		rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
+	}
+}
+
 static __rte_noinline uint32_t
 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct vhost_virtqueue *vq, uint16_t queue_id,
@@ -1556,12 +1584,12 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			 * descriptors.
 			 */
 			from = vq->shadow_used_idx - num_buffers;
-			to = vq->async_desc_idx & (vq->size - 1);
+			to = vq->async_desc_idx_split & (vq->size - 1);
 
 			store_dma_desc_info_split(vq->shadow_used_split,
 					vq->async_descs_split, vq->size, from, to, num_buffers);
 
-			vq->async_desc_idx += num_buffers;
+			vq->async_desc_idx_split += num_buffers;
 			vq->shadow_used_idx -= num_buffers;
 		} else
 			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
@@ -1619,7 +1647,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			num_descs += pkts_info[slot_idx & (vq->size - 1)].descs;
 			slot_idx--;
 		}
-		vq->async_desc_idx -= num_descs;
+		vq->async_desc_idx_split -= num_descs;
 		/* recover shadow used ring and available ring */
 		vq->shadow_used_idx -= (vq->last_avail_idx -
 				async_pkts_log[num_async_pkts].last_avail_idx -
@@ -1641,6 +1669,329 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	return pkt_idx;
 }
 
+static __rte_always_inline void
+vhost_update_used_packed(struct vhost_virtqueue *vq,
+			struct vring_used_elem_packed *shadow_ring,
+			uint16_t count)
+{
+	int i;
+	uint16_t used_idx = vq->last_used_idx;
+	uint16_t head_idx = vq->last_used_idx;
+	uint16_t head_flags = 0;
+
+	if (count == 0)
+		return;
+
+	/* Split loop in two to save memory barriers */
+	for (i = 0; i < count; i++) {
+		vq->desc_packed[used_idx].id = shadow_ring[i].id;
+		vq->desc_packed[used_idx].len = shadow_ring[i].len;
+
+		used_idx += shadow_ring[i].count;
+		if (used_idx >= vq->size)
+			used_idx -= vq->size;
+	}
+
+	/* The ordering for storing desc flags needs to be enforced. */
+	rte_atomic_thread_fence(__ATOMIC_RELEASE);
+
+	for (i = 0; i < count; i++) {
+		uint16_t flags;
+
+		if (vq->shadow_used_packed[i].len)
+			flags = VRING_DESC_F_WRITE;
+		else
+			flags = 0;
+
+		if (vq->used_wrap_counter) {
+			flags |= VRING_DESC_F_USED;
+			flags |= VRING_DESC_F_AVAIL;
+		} else {
+			flags &= ~VRING_DESC_F_USED;
+			flags &= ~VRING_DESC_F_AVAIL;
+		}
+
+		if (i > 0) {
+			vq->desc_packed[vq->last_used_idx].flags = flags;
+		} else {
+			head_idx = vq->last_used_idx;
+			head_flags = flags;
+		}
+
+		vq_inc_last_used_packed(vq, shadow_ring[i].count);
+	}
+
+	vq->desc_packed[head_idx].flags = head_flags;
+}
+
+static __rte_always_inline int
+vhost_enqueue_async_single_packed(struct virtio_net *dev,
+			    struct vhost_virtqueue *vq,
+			    struct rte_mbuf *pkt,
+			    struct buf_vector *buf_vec,
+			    uint16_t *nr_descs,
+			    uint16_t *nr_buffers,
+			    struct vring_packed_desc *async_descs,
+			    struct iovec *src_iovec, struct iovec *dst_iovec,
+			    struct rte_vhost_iov_iter *src_it,
+			    struct rte_vhost_iov_iter *dst_it)
+{
+	uint16_t nr_vec = 0;
+	uint16_t avail_idx = vq->last_avail_idx;
+	uint16_t max_tries, tries = 0;
+	uint16_t buf_id = 0;
+	uint32_t len = 0;
+	uint16_t desc_count = 0;
+	uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	uint32_t buffer_len[vq->size];
+	uint16_t buffer_buf_id[vq->size];
+	uint16_t buffer_desc_count[vq->size];
+	*nr_buffers = 0;
+
+	if (rxvq_is_mergeable(dev))
+		max_tries = vq->size - 1;
+	else
+		max_tries = 1;
+
+	while (size > 0) {
+		/*
+		 * if we tried all available ring items, and still
+		 * can't get enough buf, it means something abnormal
+		 * happened.
+		 */
+		if (unlikely(++tries > max_tries))
+			return -1;
+
+		if (unlikely(fill_vec_buf_packed(dev, vq, avail_idx, &desc_count, buf_vec, &nr_vec,
+						&buf_id, &len, VHOST_ACCESS_RW) < 0))
+			return -1;
+
+		len = RTE_MIN(len, size);
+		size -= len;
+
+		buffer_len[*nr_buffers] = len;
+		buffer_buf_id[*nr_buffers] = buf_id;
+		buffer_desc_count[*nr_buffers] = desc_count;
+		*nr_buffers += 1;
+
+		*nr_descs += desc_count;
+		avail_idx += desc_count;
+		if (avail_idx >= vq->size)
+			avail_idx -= vq->size;
+	}
+
+	if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers, src_iovec, dst_iovec,
+			src_it, dst_it) < 0)
+		return -1;
+	/* store descriptors for DMA */
+	if (avail_idx >= *nr_descs) {
+		rte_memcpy(async_descs, &vq->desc_packed[vq->last_avail_idx],
+			*nr_descs * sizeof(struct vring_packed_desc));
+	} else {
+		uint16_t nr_copy = vq->size - vq->last_avail_idx;
+		rte_memcpy(async_descs, &vq->desc_packed[vq->last_avail_idx],
+			nr_copy * sizeof(struct vring_packed_desc));
+		rte_memcpy(async_descs + nr_copy, vq->desc_packed,
+			(*nr_descs - nr_copy) * sizeof(struct vring_packed_desc));
+	}
+
+	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id, buffer_desc_count, *nr_buffers);
+
+	return 0;
+}
+
+static __rte_always_inline int16_t
+virtio_dev_rx_async_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
+			    struct rte_mbuf *pkt, uint16_t *nr_descs, uint16_t *nr_buffers,
+			    struct vring_packed_desc *async_descs,
+			    struct iovec *src_iovec, struct iovec *dst_iovec,
+			    struct rte_vhost_iov_iter *src_it, struct rte_vhost_iov_iter *dst_it)
+{
+	struct buf_vector buf_vec[BUF_VECTOR_MAX];
+	*nr_descs = 0;
+	*nr_buffers = 0;
+
+	if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt, buf_vec, nr_descs, nr_buffers,
+						 async_descs, src_iovec, dst_iovec,
+						 src_it, dst_it) < 0)) {
+		VHOST_LOG_DATA(DEBUG, "(%d) failed to get enough desc from vring\n", dev->vid);
+		return -1;
+	}
+
+	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
+			dev->vid, vq->last_avail_idx, vq->last_avail_idx + *nr_descs);
+
+	return 0;
+}
+
+static __rte_always_inline void
+dma_error_handler_packed(struct vhost_virtqueue *vq, struct vring_packed_desc *async_descs,
+			uint16_t async_descs_idx, uint16_t slot_idx, uint32_t nr_err,
+			uint32_t *pkt_idx, uint32_t *num_async_pkts, uint32_t *num_done_pkts)
+{
+	uint16_t descs_err = 0;
+	uint16_t buffers_err = 0;
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
+
+	*num_async_pkts -= nr_err;
+	*pkt_idx -= nr_err;
+	/* calculate the sum of buffers and descs of DMA-error packets. */
+	while (nr_err-- > 0) {
+		descs_err += pkts_info[slot_idx % vq->size].descs;
+		buffers_err += pkts_info[slot_idx % vq->size].nr_buffers;
+		slot_idx--;
+	}
+
+	vq->async_buffer_idx_packed -= buffers_err;
+
+	if (vq->last_avail_idx >= descs_err) {
+		vq->last_avail_idx -= descs_err;
+
+		rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
+			&async_descs[async_descs_idx - descs_err],
+			descs_err * sizeof(struct vring_packed_desc));
+	} else {
+		uint16_t nr_copy;
+
+		vq->last_avail_idx = vq->last_avail_idx + vq->size - descs_err;
+		nr_copy = vq->size - vq->last_avail_idx;
+		rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
+			&async_descs[async_descs_idx - descs_err],
+			nr_copy * sizeof(struct vring_packed_desc));
+		descs_err -= nr_copy;
+		rte_memcpy(&vq->desc_packed[0], &async_descs[async_descs_idx - descs_err],
+			descs_err * sizeof(struct vring_packed_desc));
+		vq->avail_wrap_counter ^= 1;
+	}
+
+	*num_done_pkts = *pkt_idx - *num_async_pkts;
+}
+
+static __rte_noinline uint32_t
+virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
+	struct vhost_virtqueue *vq, uint16_t queue_id,
+	struct rte_mbuf **pkts, uint32_t count,
+	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
+{
+	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
+	uint16_t async_descs_idx = 0;
+	uint16_t num_buffers;
+	uint16_t num_desc;
+
+	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
+	struct iovec *vec_pool = vq->vec_pool;
+	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
+	struct iovec *src_iovec = vec_pool;
+	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+	uint16_t slot_idx = 0;
+	uint16_t segs_await = 0;
+	uint16_t iovec_idx = 0, it_idx = 0;
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
+	uint32_t n_pkts = 0, pkt_err = 0;
+	uint32_t num_async_pkts = 0, num_done_pkts = 0;
+	struct vring_packed_desc async_descs[vq->size];
+
+	rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
+
+	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq, pkts[pkt_idx],
+						&num_desc, &num_buffers,
+						&async_descs[async_descs_idx],
+						&src_iovec[iovec_idx], &dst_iovec[iovec_idx],
+						&it_pool[it_idx], &it_pool[it_idx + 1]) < 0))
+			break;
+
+		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
+			dev->vid, vq->last_avail_idx,
+			vq->last_avail_idx + num_desc);
+
+		slot_idx = (vq->async_pkts_idx + num_async_pkts) % vq->size;
+		if (it_pool[it_idx].count) {
+			uint16_t from, to;
+
+			async_descs_idx += num_desc;
+			async_fill_desc(&tdes[pkt_burst_idx++],
+				&it_pool[it_idx], &it_pool[it_idx + 1]);
+			pkts_info[slot_idx].descs = num_desc;
+			pkts_info[slot_idx].nr_buffers = num_buffers;
+			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
+			num_async_pkts++;
+			iovec_idx += it_pool[it_idx].nr_segs;
+			it_idx += 2;
+
+			segs_await += it_pool[it_idx].nr_segs;
+
+			/**
+			 * recover shadow used ring and keep DMA-occupied
+			 * descriptors.
+			 */
+			from = vq->shadow_used_idx - num_buffers;
+			to = vq->async_buffer_idx_packed % vq->size;
+			store_dma_desc_info_packed(vq->shadow_used_packed,
+					vq->async_buffers_packed, vq->size, from, to, num_buffers);
+
+			vq->async_buffer_idx_packed += num_buffers;
+			vq->shadow_used_idx -= num_buffers;
+		} else {
+			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
+		}
+
+		vq_inc_last_avail_packed(vq, num_desc);
+
+		/*
+		 * conditions to trigger async device transfer:
+		 * - buffered packet number reaches transfer threshold
+		 * - unused async iov number is less than max vhost vector
+		 */
+		if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
+			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await < BUF_VECTOR_MAX))) {
+			n_pkts = vq->async_ops.transfer_data(dev->vid, queue_id,
+				tdes, 0, pkt_burst_idx);
+			iovec_idx = 0;
+			it_idx = 0;
+			segs_await = 0;
+			vq->async_pkts_inflight_n += n_pkts;
+
+			if (unlikely(n_pkts < pkt_burst_idx)) {
+				/*
+				 * log error packets number here and do actual
+				 * error processing when applications poll
+				 * completion
+				 */
+				pkt_err = pkt_burst_idx - n_pkts;
+				pkt_burst_idx = 0;
+				pkt_idx++;
+				break;
+			}
+
+			pkt_burst_idx = 0;
+		}
+	}
+
+	if (pkt_burst_idx) {
+		n_pkts = vq->async_ops.transfer_data(dev->vid, queue_id, tdes, 0, pkt_burst_idx);
+		vq->async_pkts_inflight_n += n_pkts;
+
+		if (unlikely(n_pkts < pkt_burst_idx))
+			pkt_err = pkt_burst_idx - n_pkts;
+	}
+
+	do_data_copy_enqueue(dev, vq);
+
+	if (unlikely(pkt_err))
+		dma_error_handler_packed(vq, async_descs, async_descs_idx, slot_idx, pkt_err,
+					&pkt_idx, &num_async_pkts, &num_done_pkts);
+	vq->async_pkts_idx += num_async_pkts;
+	*comp_count = num_done_pkts;
+
+	if (likely(vq->shadow_used_idx)) {
+		vhost_flush_enqueue_shadow_packed(dev, vq);
+		vhost_vring_call_packed(dev, vq);
+	}
+
+	return pkt_idx;
+}
+
 static __rte_always_inline void
 write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
 {
@@ -1649,7 +2000,7 @@ write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
 	uint16_t to, from;
 
 	do {
-		from = vq->last_async_desc_idx & (vq->size - 1);
+		from = vq->last_async_desc_idx_split & (vq->size - 1);
 		nr_copy = nr_left + from <= vq->size ? nr_left : vq->size - from;
 		to = vq->last_used_idx & (vq->size - 1);
 
@@ -1665,18 +2016,41 @@ write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
 					(nr_copy - size) * sizeof(struct vring_used_elem));
 		}
 
-		vq->last_async_desc_idx += nr_copy;
+		vq->last_async_desc_idx_split += nr_copy;
 		vq->last_used_idx += nr_copy;
 		nr_left -= nr_copy;
 	} while (nr_left > 0);
 }
 
+static __rte_always_inline void
+write_back_completed_descs_packed(struct vhost_virtqueue *vq,
+				uint16_t n_buffers)
+{
+	uint16_t nr_left = n_buffers;
+	uint16_t from, to;
+
+	do {
+		from = vq->last_async_buffer_idx_packed % vq->size;
+		to = (from + nr_left) % vq->size;
+		if (to > from) {
+			vhost_update_used_packed(vq, vq->async_buffers_packed + from, to - from);
+			vq->last_async_buffer_idx_packed += nr_left;
+			nr_left = 0;
+		} else {
+			vhost_update_used_packed(vq, vq->async_buffers_packed + from,
+				vq->size - from);
+			vq->last_async_buffer_idx_packed += vq->size - from;
+			nr_left -= vq->size - from;
+		}
+	} while (nr_left > 0);
+}
+
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count)
 {
 	struct virtio_net *dev = get_device(vid);
 	struct vhost_virtqueue *vq;
-	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
+	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
 	uint16_t start_idx, pkts_idx, vq_size;
 	struct async_inflight_info *pkts_info;
 	uint16_t from, i;
@@ -1701,7 +2075,7 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 
 	rte_spinlock_lock(&vq->access_lock);
 
-	pkts_idx = vq->async_pkts_idx & (vq->size - 1);
+	pkts_idx = vq->async_pkts_idx % vq->size;
 	pkts_info = vq->async_pkts_info;
 	vq_size = vq->size;
 	start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
@@ -1718,21 +2092,41 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		goto done;
 	}
 
-	for (i = 0; i < n_pkts_put; i++) {
-		from = (start_idx + i) & (vq_size - 1);
-		n_descs += pkts_info[from].descs;
-		pkts[i] = pkts_info[from].mbuf;
+	if (vq_is_packed(dev)) {
+		for (i = 0; i < n_pkts_put; i++) {
+			from = (start_idx + i) & (vq_size - 1);
+			n_buffers += pkts_info[from].nr_buffers;
+			pkts[i] = pkts_info[from].mbuf;
+		}
+	} else {
+		for (i = 0; i < n_pkts_put; i++) {
+			from = (start_idx + i) & (vq_size - 1);
+			n_descs += pkts_info[from].descs;
+			pkts[i] = pkts_info[from].mbuf;
+		}
 	}
+
 	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
 	vq->async_pkts_inflight_n -= n_pkts_put;
 
 	if (likely(vq->enabled && vq->access_ok)) {
-		write_back_completed_descs_split(vq, n_descs);
+		if (vq_is_packed(dev)) {
+			write_back_completed_descs_packed(vq, n_buffers);
 
-		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
-		vhost_vring_call_split(dev, vq);
-	} else
-		vq->last_async_desc_idx += n_descs;
+			vhost_vring_call_packed(dev, vq);
+		} else {
+			write_back_completed_descs_split(vq, n_descs);
+
+			__atomic_add_fetch(&vq->used->idx, n_descs,
+					__ATOMIC_RELEASE);
+			vhost_vring_call_split(dev, vq);
+		}
+	} else {
+		if (vq_is_packed(dev))
+			vq->last_async_buffer_idx_packed += n_buffers;
+		else
+			vq->last_async_desc_idx_split += n_descs;
+	}
 
 done:
 	rte_spinlock_unlock(&vq->access_lock);
@@ -1773,9 +2167,10 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
 	if (count == 0)
 		goto out;
 
-	/* TODO: packed queue not implemented */
 	if (vq_is_packed(dev))
-		nb_tx = 0;
+		nb_tx = virtio_dev_rx_async_submit_packed(dev,
+				vq, queue_id, pkts, count, comp_pkts,
+				comp_count);
 	else
 		nb_tx = virtio_dev_rx_async_submit_split(dev,
 				vq, queue_id, pkts, count, comp_pkts,
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v8 3/4] vhost: add batch datapath for async vhost packed ring
  2021-04-19  8:51 ` [dpdk-dev] [PATCH v8 0/4] add support for packed ring in async vhost Cheng Jiang
  2021-04-19  8:51   ` [dpdk-dev] [PATCH v8 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
  2021-04-19  8:51   ` [dpdk-dev] [PATCH v8 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
@ 2021-04-19  8:51   ` Cheng Jiang
  2021-04-19  8:51   ` [dpdk-dev] [PATCH v8 4/4] doc: add release note for vhost async " Cheng Jiang
  3 siblings, 0 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-19  8:51 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

Add batch datapath for async vhost packed ring to improve the
performance of small packet processing.
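
The gist of the batch/fallback decision, excerpted from
virtio_dev_rx_async_batch_packed() in the diff below:

    /* the batch path is only taken when every packet in the batch is small
     * enough for the sync copy path (below the async copy threshold) */
    vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
            if (unlikely(pkts[i]->pkt_len >= cpy_threshold))
                    return -1;      /* fall back to the single-packet path */
    }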

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 lib/librte_vhost/virtio_net.c | 41 +++++++++++++++++++++++++++++++----
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index ce88ad3c0..0ad289e6e 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1724,6 +1724,29 @@ vhost_update_used_packed(struct vhost_virtqueue *vq,
 	vq->desc_packed[head_idx].flags = head_flags;
 }
 
+static __rte_always_inline int
+virtio_dev_rx_async_batch_packed(struct virtio_net *dev,
+			   struct vhost_virtqueue *vq,
+			   struct rte_mbuf **pkts,
+			   struct rte_mbuf **comp_pkts, uint32_t *pkt_done)
+{
+	uint16_t i;
+	uint32_t cpy_threshold = vq->async_threshold;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (unlikely(pkts[i]->pkt_len >= cpy_threshold))
+			return -1;
+	}
+	if (!virtio_dev_rx_batch_packed(dev, vq, pkts)) {
+		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+			comp_pkts[(*pkt_done)++] = pkts[i];
+
+		return 0;
+	}
+
+	return -1;
+}
+
 static __rte_always_inline int
 vhost_enqueue_async_single_packed(struct virtio_net *dev,
 			    struct vhost_virtqueue *vq,
@@ -1874,6 +1897,7 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
 {
 	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
+	uint32_t remained = count;
 	uint16_t async_descs_idx = 0;
 	uint16_t num_buffers;
 	uint16_t num_desc;
@@ -1891,9 +1915,17 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 	uint32_t num_async_pkts = 0, num_done_pkts = 0;
 	struct vring_packed_desc async_descs[vq->size];
 
-	rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
+	do {
+		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
+		if (remained >= PACKED_BATCH_SIZE) {
+			if (!virtio_dev_rx_async_batch_packed(dev, vq,
+				&pkts[pkt_idx], comp_pkts, &num_done_pkts)) {
+				pkt_idx += PACKED_BATCH_SIZE;
+				remained -= PACKED_BATCH_SIZE;
+				continue;
+			}
+		}
 
-	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
 		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq, pkts[pkt_idx],
 						&num_desc, &num_buffers,
 						&async_descs[async_descs_idx],
@@ -1936,6 +1968,8 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
 		}
 
+		pkt_idx++;
+		remained--;
 		vq_inc_last_avail_packed(vq, num_desc);
 
 		/*
@@ -1960,13 +1994,12 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 				 */
 				pkt_err = pkt_burst_idx - n_pkts;
 				pkt_burst_idx = 0;
-				pkt_idx++;
 				break;
 			}
 
 			pkt_burst_idx = 0;
 		}
-	}
+	} while (pkt_idx < count);
 
 	if (pkt_burst_idx) {
 		n_pkts = vq->async_ops.transfer_data(dev->vid, queue_id, tdes, 0, pkt_burst_idx);
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v8 4/4] doc: add release note for vhost async packed ring
  2021-04-19  8:51 ` [dpdk-dev] [PATCH v8 0/4] add support for packed ring in async vhost Cheng Jiang
                     ` (2 preceding siblings ...)
  2021-04-19  8:51   ` [dpdk-dev] [PATCH v8 3/4] vhost: add batch datapath for async vhost packed ring Cheng Jiang
@ 2021-04-19  8:51   ` Cheng Jiang
  3 siblings, 0 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-19  8:51 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

Add release note for the support of vhost async packed ring.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 doc/guides/rel_notes/release_21_05.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/guides/rel_notes/release_21_05.rst b/doc/guides/rel_notes/release_21_05.rst
index 374d6d98e..eb5200669 100644
--- a/doc/guides/rel_notes/release_21_05.rst
+++ b/doc/guides/rel_notes/release_21_05.rst
@@ -131,6 +131,10 @@ New Features
   * Added command to display Rx queue used descriptor count.
     ``show port (port_id) rxq (queue_id) desc used count``
 
+* **Added support for vhost async packed ring data path.**
+
+  Added packed ring support for async vhost.
+
 
 Removed Items
 -------------
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v8 1/4] vhost: abstract and reorganize async split ring code
  2021-04-19  8:51   ` [dpdk-dev] [PATCH v8 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
@ 2021-04-27  1:19     ` Hu, Jiayu
  0 siblings, 0 replies; 60+ messages in thread
From: Hu, Jiayu @ 2021-04-27  1:19 UTC (permalink / raw)
  To: Jiang, Cheng1, maxime.coquelin, Xia, Chenbo
  Cc: dev, Yang, YvonneX, Wang, Yinan, Liu, Yong

Reviewed-by: Jiayu Hu <jiayu.hu@intel.com>

> -----Original Message-----
> From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Sent: Monday, April 19, 2021 4:51 PM
> To: maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Yang, YvonneX
> <yvonnex.yang@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Liu,
> Yong <yong.liu@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>
> Subject: [PATCH v8 1/4] vhost: abstract and reorganize async split ring code
> 
> This patch puts some codes of async vhost split ring into inline
> functions to improve the readability of the code. And, it changes
> the pointer index style of iterator to make the code more concise.
> 
> Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> ---
>  lib/librte_vhost/virtio_net.c | 132 +++++++++++++++++-----------------
>  1 file changed, 66 insertions(+), 66 deletions(-)
> 
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index ff3987860..438bdafd1 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -1458,6 +1458,22 @@ virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
>  		(vq_size - n_inflight + pkts_idx) & (vq_size - 1);
>  }
> 
> +static __rte_always_inline void
> +store_dma_desc_info_split(struct vring_used_elem *s_ring, struct vring_used_elem *d_ring,
> +		uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
> +{
> +	uint16_t elem_size = sizeof(struct vring_used_elem);
> +
> +	if (d_idx + count <= ring_size) {
> +		rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
> +	} else {
> +		uint16_t size = ring_size - d_idx;
> +
> +		rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
> +		rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
> +	}
> +}
> +
>  static __rte_noinline uint32_t
>  virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  	struct vhost_virtqueue *vq, uint16_t queue_id,
> @@ -1474,10 +1490,9 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
>  	struct iovec *src_iovec = vec_pool;
>  	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
> -	struct rte_vhost_iov_iter *src_it = it_pool;
> -	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
>  	uint16_t slot_idx = 0;
>  	uint16_t segs_await = 0;
> +	uint16_t iovec_idx = 0, it_idx = 0;
>  	struct async_inflight_info *pkts_info = vq->async_pkts_info;
>  	uint32_t n_pkts = 0, pkt_err = 0;
>  	uint32_t num_async_pkts = 0, num_done_pkts = 0;
> @@ -1511,29 +1526,30 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  			dev->vid, vq->last_avail_idx,
>  			vq->last_avail_idx + num_buffers);
> 
> -		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx],
> -				buf_vec, nr_vec, num_buffers,
> -				src_iovec, dst_iovec, src_it, dst_it) < 0) {
> +		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx], buf_vec, nr_vec, num_buffers,
> +				&src_iovec[iovec_idx], &dst_iovec[iovec_idx],
> +				&it_pool[it_idx], &it_pool[it_idx + 1]) < 0) {
>  			vq->shadow_used_idx -= num_buffers;
>  			break;
>  		}
> 
>  		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
>  			(vq->size - 1);
> -		if (src_it->count) {
> +		if (it_pool[it_idx].count) {
>  			uint16_t from, to;
> 
> -			async_fill_desc(&tdes[pkt_burst_idx++], src_it, dst_it);
> +			async_fill_desc(&tdes[pkt_burst_idx++],
> +				&it_pool[it_idx], &it_pool[it_idx + 1]);
>  			pkts_info[slot_idx].descs = num_buffers;
>  			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
>  			async_pkts_log[num_async_pkts].pkt_idx = pkt_idx;
>  			async_pkts_log[num_async_pkts++].last_avail_idx =
>  				vq->last_avail_idx;
> -			src_iovec += src_it->nr_segs;
> -			dst_iovec += dst_it->nr_segs;
> -			src_it += 2;
> -			dst_it += 2;
> -			segs_await += src_it->nr_segs;
> +
> +			iovec_idx += it_pool[it_idx].nr_segs;
> +			it_idx += 2;
> +
> +			segs_await += it_pool[it_idx].nr_segs;
> 
>  			/**
>  			 * recover shadow used ring and keep DMA-occupied
> @@ -1541,23 +1557,10 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  			 */
>  			from = vq->shadow_used_idx - num_buffers;
>  			to = vq->async_desc_idx & (vq->size - 1);
> -			if (num_buffers + to <= vq->size) {
> -				rte_memcpy(&vq->async_descs_split[to],
> -						&vq->shadow_used_split[from],
> -						num_buffers *
> -						sizeof(struct vring_used_elem));
> -			} else {
> -				int size = vq->size - to;
> -
> -				rte_memcpy(&vq->async_descs_split[to],
> -						&vq->shadow_used_split[from],
> -						size *
> -						sizeof(struct vring_used_elem));
> -				rte_memcpy(vq->async_descs_split,
> -						&vq->shadow_used_split[from +
> -						size], (num_buffers - size) *
> -					   sizeof(struct vring_used_elem));
> -			}
> +
> +			store_dma_desc_info_split(vq->shadow_used_split,
> +					vq->async_descs_split, vq->size, from, to, num_buffers);
> +
>  			vq->async_desc_idx += num_buffers;
>  			vq->shadow_used_idx -= num_buffers;
>  		} else
> @@ -1575,10 +1578,9 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  			BUF_VECTOR_MAX))) {
>  			n_pkts = vq->async_ops.transfer_data(dev->vid,
>  					queue_id, tdes, 0, pkt_burst_idx);
> -			src_iovec = vec_pool;
> -			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
> -			src_it = it_pool;
> -			dst_it = it_pool + 1;
> +			iovec_idx = 0;
> +			it_idx = 0;
> +
>  			segs_await = 0;
>  			vq->async_pkts_inflight_n += n_pkts;
> 
> @@ -1639,6 +1641,36 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  	return pkt_idx;
>  }
> 
> +static __rte_always_inline void
> +write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
> +{
> +	uint16_t nr_left = n_descs;
> +	uint16_t nr_copy;
> +	uint16_t to, from;
> +
> +	do {
> +		from = vq->last_async_desc_idx & (vq->size - 1);
> +		nr_copy = nr_left + from <= vq->size ? nr_left : vq->size - from;
> +		to = vq->last_used_idx & (vq->size - 1);
> +
> +		if (to + nr_copy <= vq->size) {
> +			rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
> +					nr_copy * sizeof(struct vring_used_elem));
> +		} else {
> +			uint16_t size = vq->size - to;
> +
> +			rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
> +					size * sizeof(struct vring_used_elem));
> +			rte_memcpy(&vq->used->ring[0], &vq->async_descs_split[from + size],
> +					(nr_copy - size) * sizeof(struct vring_used_elem));
> +		}
> +
> +		vq->last_async_desc_idx += nr_copy;
> +		vq->last_used_idx += nr_copy;
> +		nr_left -= nr_copy;
> +	} while (nr_left > 0);
> +}
> +
>  uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
>  		struct rte_mbuf **pkts, uint16_t count)
>  {
> @@ -1695,39 +1727,7 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
>  	vq->async_pkts_inflight_n -= n_pkts_put;
> 
>  	if (likely(vq->enabled && vq->access_ok)) {
> -		uint16_t nr_left = n_descs;
> -		uint16_t nr_copy;
> -		uint16_t to;
> -
> -		/* write back completed descriptors to used ring */
> -		do {
> -			from = vq->last_async_desc_idx & (vq->size - 1);
> -			nr_copy = nr_left + from <= vq->size ? nr_left :
> -				vq->size - from;
> -			to = vq->last_used_idx & (vq->size - 1);
> -
> -			if (to + nr_copy <= vq->size) {
> -				rte_memcpy(&vq->used->ring[to],
> -						&vq->async_descs_split[from],
> -						nr_copy *
> -						sizeof(struct vring_used_elem));
> -			} else {
> -				uint16_t size = vq->size - to;
> -
> -				rte_memcpy(&vq->used->ring[to],
> -						&vq->async_descs_split[from],
> -						size *
> -						sizeof(struct vring_used_elem));
> -				rte_memcpy(vq->used->ring,
> -						&vq->async_descs_split[from +
> -						size], (nr_copy - size) *
> -						sizeof(struct vring_used_elem));
> -			}
> -
> -			vq->last_async_desc_idx += nr_copy;
> -			vq->last_used_idx += nr_copy;
> -			nr_left -= nr_copy;
> -		} while (nr_left > 0);
> +		write_back_completed_descs_split(vq, n_descs);
> 
>  		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
>  		vhost_vring_call_split(dev, vq);
> --
> 2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v8 2/4] vhost: add support for packed ring in async vhost
  2021-04-19  8:51   ` [dpdk-dev] [PATCH v8 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
@ 2021-04-27  5:16     ` Hu, Jiayu
  2021-04-27  6:07       ` Jiang, Cheng1
  0 siblings, 1 reply; 60+ messages in thread
From: Hu, Jiayu @ 2021-04-27  5:16 UTC (permalink / raw)
  To: Jiang, Cheng1, maxime.coquelin, Xia, Chenbo
  Cc: dev, Yang, YvonneX, Wang, Yinan, Liu, Yong

Hi Cheng,

Some comments are inline.

> -----Original Message-----
> From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Sent: Monday, April 19, 2021 4:51 PM
> To: maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Yang, YvonneX
> <yvonnex.yang@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Liu,
> Yong <yong.liu@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>
> Subject: [PATCH v8 2/4] vhost: add support for packed ring in async vhost
> 
> For now async vhost data path only supports split ring. This patch
> enables packed ring in async vhost data path to make async vhost
> compatible with virtio 1.1 spec.
> 
> Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> ---
>  lib/librte_vhost/rte_vhost_async.h |   1 +
>  lib/librte_vhost/vhost.c           |  79 ++++--
>  lib/librte_vhost/vhost.h           |  15 +-
>  lib/librte_vhost/virtio_net.c      | 441 +++++++++++++++++++++++++++--
>  4 files changed, 488 insertions(+), 48 deletions(-)
> 
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 438bdafd1..ce88ad3c0 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -363,14 +363,14 @@
> vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
>  }
> 
>  static __rte_always_inline void
> -vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
> -				   struct vhost_virtqueue *vq,
> -				   uint32_t len[],
> -				   uint16_t id[],
> -				   uint16_t count[],
> +vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
> +				   uint32_t *len,
> +				   uint16_t *id,
> +				   uint16_t *count,
>  				   uint16_t num_buffers)
>  {
>  	uint16_t i;
> +
>  	for (i = 0; i < num_buffers; i++) {
>  		/* enqueue shadow flush action aligned with batch num */
>  		if (!vq->shadow_used_idx)
> @@ -382,6 +382,17 @@ vhost_shadow_enqueue_single_packed(struct
> virtio_net *dev,
>  		vq->shadow_aligned_idx += count[i];
>  		vq->shadow_used_idx++;
>  	}
> +}
> +
> +static __rte_always_inline void
> +vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
> +				   struct vhost_virtqueue *vq,
> +				   uint32_t *len,
> +				   uint16_t *id,
> +				   uint16_t *count,
> +				   uint16_t num_buffers)
> +{
> +	vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
> 
>  	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
>  		do_data_copy_enqueue(dev, vq);
> @@ -1474,6 +1485,23 @@ store_dma_desc_info_split(struct
> vring_used_elem *s_ring, struct vring_used_elem
>  	}
>  }
> 
> +static __rte_always_inline void
> +store_dma_desc_info_packed(struct vring_used_elem_packed *s_ring,
> +		struct vring_used_elem_packed *d_ring,
> +		uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t
> count)
> +{
> +	uint16_t elem_size = sizeof(struct vring_used_elem_packed);
> +
> +	if (d_idx + count <= ring_size) {
> +		rte_memcpy(d_ring + d_idx, s_ring + s_idx, count *
> elem_size);
> +	} else {
> +		uint16_t size = ring_size - d_idx;
> +
> +		rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
> +		rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) *
> elem_size);
> +	}
> +}
> +
>  static __rte_noinline uint32_t
>  virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  	struct vhost_virtqueue *vq, uint16_t queue_id,
> @@ -1556,12 +1584,12 @@ virtio_dev_rx_async_submit_split(struct
> virtio_net *dev,
>  			 * descriptors.
>  			 */
>  			from = vq->shadow_used_idx - num_buffers;
> -			to = vq->async_desc_idx & (vq->size - 1);
> +			to = vq->async_desc_idx_split & (vq->size - 1);
> 
>  			store_dma_desc_info_split(vq->shadow_used_split,
>  					vq->async_descs_split, vq->size, from,
> to, num_buffers);
> 
> -			vq->async_desc_idx += num_buffers;
> +			vq->async_desc_idx_split += num_buffers;
>  			vq->shadow_used_idx -= num_buffers;
>  		} else
>  			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
> @@ -1619,7 +1647,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net
> *dev,
>  			num_descs += pkts_info[slot_idx & (vq->size -
> 1)].descs;
>  			slot_idx--;
>  		}
> -		vq->async_desc_idx -= num_descs;
> +		vq->async_desc_idx_split -= num_descs;
>  		/* recover shadow used ring and available ring */
>  		vq->shadow_used_idx -= (vq->last_avail_idx -
>  				async_pkts_log[num_async_pkts].last_avail_idx -
> @@ -1641,6 +1669,329 @@ virtio_dev_rx_async_submit_split(struct
> virtio_net *dev,
>  	return pkt_idx;
>  }
> 
> +static __rte_always_inline void
> +vhost_update_used_packed(struct vhost_virtqueue *vq,
> +			struct vring_used_elem_packed *shadow_ring,
> +			uint16_t count)
> +{
> +	int i;
> +	uint16_t used_idx = vq->last_used_idx;
> +	uint16_t head_idx = vq->last_used_idx;
> +	uint16_t head_flags = 0;
> +
> +	if (count == 0)
> +		return;
> +
> +	/* Split loop in two to save memory barriers */
> +	for (i = 0; i < count; i++) {
> +		vq->desc_packed[used_idx].id = shadow_ring[i].id;
> +		vq->desc_packed[used_idx].len = shadow_ring[i].len;
> +
> +		used_idx += shadow_ring[i].count;
> +		if (used_idx >= vq->size)
> +			used_idx -= vq->size;
> +	}
> +
> +	/* The ordering for storing desc flags needs to be enforced. */
> +	rte_atomic_thread_fence(__ATOMIC_RELEASE);
> +
> +	for (i = 0; i < count; i++) {
> +		uint16_t flags;
> +
> +		if (vq->shadow_used_packed[i].len)
> +			flags = VRING_DESC_F_WRITE;
> +		else
> +			flags = 0;
> +
> +		if (vq->used_wrap_counter) {
> +			flags |= VRING_DESC_F_USED;
> +			flags |= VRING_DESC_F_AVAIL;
> +		} else {
> +			flags &= ~VRING_DESC_F_USED;
> +			flags &= ~VRING_DESC_F_AVAIL;
> +		}
> +
> +		if (i > 0) {
> +			vq->desc_packed[vq->last_used_idx].flags = flags;
> +		} else {
> +			head_idx = vq->last_used_idx;
> +			head_flags = flags;
> +		}
> +
> +		vq_inc_last_used_packed(vq, shadow_ring[i].count);
> +	}
> +
> +	vq->desc_packed[head_idx].flags = head_flags;
> +}
> +
> +static __rte_always_inline int
> +vhost_enqueue_async_single_packed(struct virtio_net *dev,
> +			    struct vhost_virtqueue *vq,
> +			    struct rte_mbuf *pkt,
> +			    struct buf_vector *buf_vec,
> +			    uint16_t *nr_descs,
> +			    uint16_t *nr_buffers,
> +			    struct vring_packed_desc *async_descs,
> +			    struct iovec *src_iovec, struct iovec *dst_iovec,
> +			    struct rte_vhost_iov_iter *src_it,
> +			    struct rte_vhost_iov_iter *dst_it)
> +{
> +	uint16_t nr_vec = 0;
> +	uint16_t avail_idx = vq->last_avail_idx;
> +	uint16_t max_tries, tries = 0;
> +	uint16_t buf_id = 0;
> +	uint32_t len = 0;
> +	uint16_t desc_count = 0;
> +	uint32_t size = pkt->pkt_len + sizeof(struct
> virtio_net_hdr_mrg_rxbuf);
> +	uint32_t buffer_len[vq->size];
> +	uint16_t buffer_buf_id[vq->size];
> +	uint16_t buffer_desc_count[vq->size];
> +	*nr_buffers = 0;
nr_buffers and nr_descs are pointers to num_buffers and num_desc in
virtio_dev_rx_async_submit_packed(), and num_buffers and num_desc
have no initial values. I think you need to initialize them before passing
their pointers to another function, as that function reads and updates the
values they point to.

In addition, *nr_buffers is set to 0, but *nr_descs is not, while both of them
are set to 0 in virtio_dev_rx_async_single_packed(). It looks inconsistent.

> +
> +	if (rxvq_is_mergeable(dev))
> +		max_tries = vq->size - 1;
> +	else
> +		max_tries = 1;
> +
> +	while (size > 0) {
> +		/*
> +		 * if we tried all available ring items, and still
> +		 * can't get enough buf, it means something abnormal
> +		 * happened.
> +		 */
> +		if (unlikely(++tries > max_tries))
> +			return -1;
> +
> +		if (unlikely(fill_vec_buf_packed(dev, vq, avail_idx,
> &desc_count, buf_vec, &nr_vec,
> +						&buf_id, &len,
> VHOST_ACCESS_RW) < 0))
> +			return -1;
> +
> +		len = RTE_MIN(len, size);
> +		size -= len;
> +
> +		buffer_len[*nr_buffers] = len;
> +		buffer_buf_id[*nr_buffers] = buf_id;
> +		buffer_desc_count[*nr_buffers] = desc_count;
> +		*nr_buffers += 1;
> +
> +		*nr_descs += desc_count;
> +		avail_idx += desc_count;
> +		if (avail_idx >= vq->size)
> +			avail_idx -= vq->size;
> +	}
> +
> +	if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers,
> src_iovec, dst_iovec,
> +			src_it, dst_it) < 0)
> +		return -1;
> +	/* store descriptors for DMA */
> +	if (avail_idx >= *nr_descs) {
> +		rte_memcpy(async_descs, &vq->desc_packed[vq->last_avail_idx],
> +			*nr_descs * sizeof(struct vring_packed_desc));
> +	} else {
> +		uint16_t nr_copy = vq->size - vq->last_avail_idx;

It needs a blank line.

> +		rte_memcpy(async_descs, &vq->desc_packed[vq->last_avail_idx],
> +			nr_copy * sizeof(struct vring_packed_desc));
> +		rte_memcpy(async_descs + nr_copy, vq->desc_packed,
> +			(*nr_descs - nr_copy) * sizeof(struct vring_packed_desc));
> +	}
> +
> +	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id,
> buffer_desc_count, *nr_buffers);
> +
> +	return 0;
> +}
> +
> +static __rte_always_inline int16_t
> +virtio_dev_rx_async_single_packed(struct virtio_net *dev, struct
> vhost_virtqueue *vq,
> +			    struct rte_mbuf *pkt, uint16_t *nr_descs, uint16_t
> *nr_buffers,
> +			    struct vring_packed_desc *async_descs,
> +			    struct iovec *src_iovec, struct iovec *dst_iovec,
> +			    struct rte_vhost_iov_iter *src_it, struct
> rte_vhost_iov_iter *dst_it)
> +{
> +	struct buf_vector buf_vec[BUF_VECTOR_MAX];
> +	*nr_descs = 0;
> +	*nr_buffers = 0;
> +
> +	if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt,
> buf_vec, nr_descs, nr_buffers,
> +						 async_descs, src_iovec,
> dst_iovec,
> +						 src_it, dst_it) < 0)) {
> +		VHOST_LOG_DATA(DEBUG, "(%d) failed to get enough desc
> from vring\n", dev->vid);
> +		return -1;
> +	}
> +
> +	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end
> index %d\n",
> +			dev->vid, vq->last_avail_idx, vq->last_avail_idx +
> *nr_descs);
> +
> +	return 0;
> +}
> +
> +static __rte_always_inline void
> +dma_error_handler_packed(struct vhost_virtqueue *vq, struct
> vring_packed_desc *async_descs,
> +			uint16_t async_descs_idx, uint16_t slot_idx, uint32_t
> nr_err,
> +			uint32_t *pkt_idx, uint32_t *num_async_pkts,
> uint32_t *num_done_pkts)
> +{
> +	uint16_t descs_err = 0;
> +	uint16_t buffers_err = 0;
> +	struct async_inflight_info *pkts_info = vq->async_pkts_info;
> +
> +	*num_async_pkts -= nr_err;
> +	*pkt_idx -= nr_err;
> +	/* calculate the sum of buffers and descs of DMA-error packets. */
> +	while (nr_err-- > 0) {
> +		descs_err += pkts_info[slot_idx % vq->size].descs;
> +		buffers_err += pkts_info[slot_idx % vq->size].nr_buffers;
> +		slot_idx--;
> +	}
> +
> +	vq->async_buffer_idx_packed -= buffers_err;
> +
> +	if (vq->last_avail_idx >= descs_err) {
> +		vq->last_avail_idx -= descs_err;
> +
> +		rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
> +			&async_descs[async_descs_idx - descs_err],
> +			descs_err * sizeof(struct vring_packed_desc));
> +	} else {
> +		uint16_t nr_copy;
> +
> +		vq->last_avail_idx = vq->last_avail_idx + vq->size - descs_err;
> +		nr_copy = vq->size - vq->last_avail_idx;
> +		rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
> +			&async_descs[async_descs_idx - descs_err],
> +			nr_copy * sizeof(struct vring_packed_desc));
> +		descs_err -= nr_copy;
> +		rte_memcpy(&vq->desc_packed[0],
> &async_descs[async_descs_idx - descs_err],
> +			descs_err * sizeof(struct vring_packed_desc));
> +		vq->avail_wrap_counter ^= 1;
> +	}
> +
> +	*num_done_pkts = *pkt_idx - *num_async_pkts;
> +}
> +
> +static __rte_noinline uint32_t
> +virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
> +	struct vhost_virtqueue *vq, uint16_t queue_id,
> +	struct rte_mbuf **pkts, uint32_t count,
> +	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
> +{
> +	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
> +	uint16_t async_descs_idx = 0;
> +	uint16_t num_buffers;
> +	uint16_t num_desc;
How about using "num_descs" to make the naming consistent with "num_buffers"?

Thanks,
Jiayu

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v8 2/4] vhost: add support for packed ring in async vhost
  2021-04-27  5:16     ` Hu, Jiayu
@ 2021-04-27  6:07       ` Jiang, Cheng1
  0 siblings, 0 replies; 60+ messages in thread
From: Jiang, Cheng1 @ 2021-04-27  6:07 UTC (permalink / raw)
  To: Hu, Jiayu, maxime.coquelin, Xia, Chenbo
  Cc: dev, Yang, YvonneX, Wang, Yinan, Liu, Yong

Hi Jiayu,

> -----Original Message-----
> From: Hu, Jiayu <jiayu.hu@intel.com>
> Sent: Tuesday, April 27, 2021 1:16 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>; maxime.coquelin@redhat.com;
> Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Yang, YvonneX <yvonnex.yang@intel.com>; Wang, Yinan
> <yinan.wang@intel.com>; Liu, Yong <yong.liu@intel.com>
> Subject: RE: [PATCH v8 2/4] vhost: add support for packed ring in async vhost
> 
> Hi Cheng,
> 
> Some comments are inline.
> 
> > -----Original Message-----
> > From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > Sent: Monday, April 19, 2021 4:51 PM
> > To: maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Yang, YvonneX
> > <yvonnex.yang@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Liu,
> > Yong <yong.liu@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>
> > Subject: [PATCH v8 2/4] vhost: add support for packed ring in async
> > vhost
> >
> > For now async vhost data path only supports split ring. This patch
> > enables packed ring in async vhost data path to make async vhost
> > compatible with virtio 1.1 spec.
> >
> > Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> > ---
> >  lib/librte_vhost/rte_vhost_async.h |   1 +
> >  lib/librte_vhost/vhost.c           |  79 ++++--
> >  lib/librte_vhost/vhost.h           |  15 +-
> >  lib/librte_vhost/virtio_net.c      | 441 +++++++++++++++++++++++++++--
> >  4 files changed, 488 insertions(+), 48 deletions(-)
> >
> > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> > index 438bdafd1..ce88ad3c0 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > [...]
> > +static __rte_always_inline int
> > +vhost_enqueue_async_single_packed(struct virtio_net *dev,
> > +			    struct vhost_virtqueue *vq,
> > +			    struct rte_mbuf *pkt,
> > +			    struct buf_vector *buf_vec,
> > +			    uint16_t *nr_descs,
> > +			    uint16_t *nr_buffers,
> > +			    struct vring_packed_desc *async_descs,
> > +			    struct iovec *src_iovec, struct iovec *dst_iovec,
> > +			    struct rte_vhost_iov_iter *src_it,
> > +			    struct rte_vhost_iov_iter *dst_it)
> > +{
> > +	uint16_t nr_vec = 0;
> > +	uint16_t avail_idx = vq->last_avail_idx;
> > +	uint16_t max_tries, tries = 0;
> > +	uint16_t buf_id = 0;
> > +	uint32_t len = 0;
> > +	uint16_t desc_count = 0;
> > +	uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
> > +	uint32_t buffer_len[vq->size];
> > +	uint16_t buffer_buf_id[vq->size];
> > +	uint16_t buffer_desc_count[vq->size];
> > +	*nr_buffers = 0;
> nr_buffers and nr_descs are pointers to num_buffers and num_desc in
> virtio_dev_rx_async_submit_packed(), and num_buffers and num_desc
> have no initial values. I think you need to initialize them before passing
> their pointers to another function, as that function reads and updates
> the values they point to.
> 
> In addition, *nr_buffers is set to 0, but *nr_descs is not, while both of
> them are set to 0 in virtio_dev_rx_async_single_packed(). It looks
> inconsistent.

Sure, I'll init them in virtio_dev_rx_async_submit_packed().
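
For illustration only (a self-contained toy, not vhost code), the hazard being
pointed out is that accumulating through a pointer reads whatever value the
variable already holds, so the caller has to zero it first; the v9 revision later
in this thread does exactly that for num_buffers and num_descs at the top of the
per-packet loop in virtio_dev_rx_async_submit_packed(). The helper name below is
hypothetical:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper mirroring the pattern under discussion: it only
 * accumulates into *nr_descs, so the caller must zero the target first.
 */
static void
add_descs(uint16_t *nr_descs, uint16_t desc_count)
{
	*nr_descs += desc_count;	/* reads the previous value */
}

int
main(void)
{
	uint16_t num_descs;	/* indeterminate until initialized */

	num_descs = 0;		/* the initialization requested above */
	add_descs(&num_descs, 4);
	add_descs(&num_descs, 2);
	printf("num_descs = %u\n", num_descs);	/* prints 6 */
	return 0;
}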

> 
> > [...]
> > +	/* store descriptors for DMA */
> > +	if (avail_idx >= *nr_descs) {
> > +		rte_memcpy(async_descs, &vq->desc_packed[vq->last_avail_idx],
> > +			*nr_descs * sizeof(struct vring_packed_desc));
> > +	} else {
> > +		uint16_t nr_copy = vq->size - vq->last_avail_idx;
> 
> It needs a blank line.

OK.

> 
> > [...]
> > +static __rte_noinline uint32_t
> > +virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
> > +	struct vhost_virtqueue *vq, uint16_t queue_id,
> > +	struct rte_mbuf **pkts, uint32_t count,
> > +	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
> > +{
> > +	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
> > +	uint16_t async_descs_idx = 0;
> > +	uint16_t num_buffers;
> > +	uint16_t num_desc;
> How about using "num_descs" to make the naming consistent with "num_buffers"?

Sure, that makes sense, thanks a lot.
Cheng

> 
> Thanks,
> Jiayu


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v9 0/4] add support for packed ring in async vhost
  2021-03-17  8:54 [dpdk-dev] [PATCH] vhost: add support for packed ring in async vhost Cheng Jiang
                   ` (6 preceding siblings ...)
  2021-04-19  8:51 ` [dpdk-dev] [PATCH v8 0/4] add support for packed ring in async vhost Cheng Jiang
@ 2021-04-27  8:03 ` Cheng Jiang
  2021-04-27  8:03   ` [dpdk-dev] [PATCH v9 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
                     ` (4 more replies)
  7 siblings, 5 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-27  8:03 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

For now async vhost data path only supports split ring structure. In
order to make async vhost compatible with virtio 1.1 spec, this patch
set cleans up the async split ring code and enables packed ring in the
async vhost data path. The batch datapath is also enabled in async
vhost packed ring.

v9:
 * improve some variable initialization
 * fix some variable names for consistency
 * rebase on the latest code
v8:
 * fix some variable names for consistency
 * clean code
v7:
 * fix compile issues
 * add argument *dev in vhost_free_async_mem() for ring type decision
v6:
 * fix some typos in commit log
 * improve index usage
 * remove shadow_ring_store()
 * add store_dma_desc_info_split() store_dma_desc_info_packed()
 * remove some checks in vhost_free_async_mem()
 * change index calculation since the size isn't necessarily a power of 2
 * move error handling in a dedicated function
 * clean code
v5:
 * clean some codes for packed ring datapath
 * fix an index error in shadow_ring_store()
v4:
  * change the patch structure
  * clean code for async split ring
  * reuse some code from split ring
  * change the error handler for DMA-copy packet
  * add check for malloc
  * remove useless code
  * add doc update
v3:
  * fix error handler for DMA-copy packet
v2:
  * fix wrong buffer index in rte_vhost_poll_enqueue_completed()
  * add async_buffers_packed memory free in vhost_free_async_mem()

Cheng Jiang (4):
  vhost: abstract and reorganize async split ring code
  vhost: add support for packed ring in async vhost
  vhost: add batch datapath for async vhost packed ring
  doc: add release note for vhost async packed ring

 doc/guides/rel_notes/release_21_05.rst |   4 +
 lib/vhost/rte_vhost_async.h            |   1 +
 lib/vhost/vhost.c                      |  79 +++-
 lib/vhost/vhost.h                      |  15 +-
 lib/vhost/virtio_net.c                 | 598 +++++++++++++++++++++----
 5 files changed, 587 insertions(+), 110 deletions(-)

--
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v9 1/4] vhost: abstract and reorganize async split ring code
  2021-04-27  8:03 ` [dpdk-dev] [PATCH v9 0/4] add support for packed ring in async vhost Cheng Jiang
@ 2021-04-27  8:03   ` Cheng Jiang
  2021-04-27  8:03   ` [dpdk-dev] [PATCH v9 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-27  8:03 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

This patch moves some of the async vhost split ring code into inline
functions to improve the readability of the code. It also changes the
iterators from pointer arithmetic to index-based access to make the
code more concise.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Reviewed-by: Jiayu Hu <jiayu.hu@intel.com>
---
 lib/vhost/virtio_net.c | 132 ++++++++++++++++++++---------------------
 1 file changed, 66 insertions(+), 66 deletions(-)

diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index ff39878609..438bdafd14 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -1458,6 +1458,22 @@ virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
 		(vq_size - n_inflight + pkts_idx) & (vq_size - 1);
 }
 
+static __rte_always_inline void
+store_dma_desc_info_split(struct vring_used_elem *s_ring, struct vring_used_elem *d_ring,
+		uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
+{
+	uint16_t elem_size = sizeof(struct vring_used_elem);
+
+	if (d_idx + count <= ring_size) {
+		rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
+	} else {
+		uint16_t size = ring_size - d_idx;
+
+		rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
+		rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
+	}
+}
+
 static __rte_noinline uint32_t
 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct vhost_virtqueue *vq, uint16_t queue_id,
@@ -1474,10 +1490,9 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
 	struct iovec *src_iovec = vec_pool;
 	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
-	struct rte_vhost_iov_iter *src_it = it_pool;
-	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
 	uint16_t slot_idx = 0;
 	uint16_t segs_await = 0;
+	uint16_t iovec_idx = 0, it_idx = 0;
 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
 	uint32_t n_pkts = 0, pkt_err = 0;
 	uint32_t num_async_pkts = 0, num_done_pkts = 0;
@@ -1511,29 +1526,30 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			dev->vid, vq->last_avail_idx,
 			vq->last_avail_idx + num_buffers);
 
-		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx],
-				buf_vec, nr_vec, num_buffers,
-				src_iovec, dst_iovec, src_it, dst_it) < 0) {
+		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx], buf_vec, nr_vec, num_buffers,
+				&src_iovec[iovec_idx], &dst_iovec[iovec_idx],
+				&it_pool[it_idx], &it_pool[it_idx + 1]) < 0) {
 			vq->shadow_used_idx -= num_buffers;
 			break;
 		}
 
 		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
 			(vq->size - 1);
-		if (src_it->count) {
+		if (it_pool[it_idx].count) {
 			uint16_t from, to;
 
-			async_fill_desc(&tdes[pkt_burst_idx++], src_it, dst_it);
+			async_fill_desc(&tdes[pkt_burst_idx++],
+				&it_pool[it_idx], &it_pool[it_idx + 1]);
 			pkts_info[slot_idx].descs = num_buffers;
 			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
 			async_pkts_log[num_async_pkts].pkt_idx = pkt_idx;
 			async_pkts_log[num_async_pkts++].last_avail_idx =
 				vq->last_avail_idx;
-			src_iovec += src_it->nr_segs;
-			dst_iovec += dst_it->nr_segs;
-			src_it += 2;
-			dst_it += 2;
-			segs_await += src_it->nr_segs;
+
+			iovec_idx += it_pool[it_idx].nr_segs;
+			it_idx += 2;
+
+			segs_await += it_pool[it_idx].nr_segs;
 
 			/**
 			 * recover shadow used ring and keep DMA-occupied
@@ -1541,23 +1557,10 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			 */
 			from = vq->shadow_used_idx - num_buffers;
 			to = vq->async_desc_idx & (vq->size - 1);
-			if (num_buffers + to <= vq->size) {
-				rte_memcpy(&vq->async_descs_split[to],
-						&vq->shadow_used_split[from],
-						num_buffers *
-						sizeof(struct vring_used_elem));
-			} else {
-				int size = vq->size - to;
-
-				rte_memcpy(&vq->async_descs_split[to],
-						&vq->shadow_used_split[from],
-						size *
-						sizeof(struct vring_used_elem));
-				rte_memcpy(vq->async_descs_split,
-						&vq->shadow_used_split[from +
-						size], (num_buffers - size) *
-					   sizeof(struct vring_used_elem));
-			}
+
+			store_dma_desc_info_split(vq->shadow_used_split,
+					vq->async_descs_split, vq->size, from, to, num_buffers);
+
 			vq->async_desc_idx += num_buffers;
 			vq->shadow_used_idx -= num_buffers;
 		} else
@@ -1575,10 +1578,9 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			BUF_VECTOR_MAX))) {
 			n_pkts = vq->async_ops.transfer_data(dev->vid,
 					queue_id, tdes, 0, pkt_burst_idx);
-			src_iovec = vec_pool;
-			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
-			src_it = it_pool;
-			dst_it = it_pool + 1;
+			iovec_idx = 0;
+			it_idx = 0;
+
 			segs_await = 0;
 			vq->async_pkts_inflight_n += n_pkts;
 
@@ -1639,6 +1641,36 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	return pkt_idx;
 }
 
+static __rte_always_inline void
+write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
+{
+	uint16_t nr_left = n_descs;
+	uint16_t nr_copy;
+	uint16_t to, from;
+
+	do {
+		from = vq->last_async_desc_idx & (vq->size - 1);
+		nr_copy = nr_left + from <= vq->size ? nr_left : vq->size - from;
+		to = vq->last_used_idx & (vq->size - 1);
+
+		if (to + nr_copy <= vq->size) {
+			rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
+					nr_copy * sizeof(struct vring_used_elem));
+		} else {
+			uint16_t size = vq->size - to;
+
+			rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
+					size * sizeof(struct vring_used_elem));
+			rte_memcpy(&vq->used->ring[0], &vq->async_descs_split[from + size],
+					(nr_copy - size) * sizeof(struct vring_used_elem));
+		}
+
+		vq->last_async_desc_idx += nr_copy;
+		vq->last_used_idx += nr_copy;
+		nr_left -= nr_copy;
+	} while (nr_left > 0);
+}
+
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count)
 {
@@ -1695,39 +1727,7 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 	vq->async_pkts_inflight_n -= n_pkts_put;
 
 	if (likely(vq->enabled && vq->access_ok)) {
-		uint16_t nr_left = n_descs;
-		uint16_t nr_copy;
-		uint16_t to;
-
-		/* write back completed descriptors to used ring */
-		do {
-			from = vq->last_async_desc_idx & (vq->size - 1);
-			nr_copy = nr_left + from <= vq->size ? nr_left :
-				vq->size - from;
-			to = vq->last_used_idx & (vq->size - 1);
-
-			if (to + nr_copy <= vq->size) {
-				rte_memcpy(&vq->used->ring[to],
-						&vq->async_descs_split[from],
-						nr_copy *
-						sizeof(struct vring_used_elem));
-			} else {
-				uint16_t size = vq->size - to;
-
-				rte_memcpy(&vq->used->ring[to],
-						&vq->async_descs_split[from],
-						size *
-						sizeof(struct vring_used_elem));
-				rte_memcpy(vq->used->ring,
-						&vq->async_descs_split[from +
-						size], (nr_copy - size) *
-						sizeof(struct vring_used_elem));
-			}
-
-			vq->last_async_desc_idx += nr_copy;
-			vq->last_used_idx += nr_copy;
-			nr_left -= nr_copy;
-		} while (nr_left > 0);
+		write_back_completed_descs_split(vq, n_descs);
 
 		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
 		vhost_vring_call_split(dev, vq);
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v9 2/4] vhost: add support for packed ring in async vhost
  2021-04-27  8:03 ` [dpdk-dev] [PATCH v9 0/4] add support for packed ring in async vhost Cheng Jiang
  2021-04-27  8:03   ` [dpdk-dev] [PATCH v9 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
@ 2021-04-27  8:03   ` Cheng Jiang
  2021-04-29  1:48     ` Hu, Jiayu
  2021-04-29  9:50     ` Maxime Coquelin
  2021-04-27  8:03   ` [dpdk-dev] [PATCH v9 3/4] vhost: add batch datapath for async vhost packed ring Cheng Jiang
                     ` (2 subsequent siblings)
  4 siblings, 2 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-27  8:03 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

For now async vhost data path only supports split ring. This patch
enables packed ring in async vhost data path to make async vhost
compatible with virtio 1.1 spec.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 lib/vhost/rte_vhost_async.h |   1 +
 lib/vhost/vhost.c           |  79 +++++--
 lib/vhost/vhost.h           |  15 +-
 lib/vhost/virtio_net.c      | 442 ++++++++++++++++++++++++++++++++++--
 4 files changed, 489 insertions(+), 48 deletions(-)

diff --git a/lib/vhost/rte_vhost_async.h b/lib/vhost/rte_vhost_async.h
index c855ff875e..6faa31f5ad 100644
--- a/lib/vhost/rte_vhost_async.h
+++ b/lib/vhost/rte_vhost_async.h
@@ -89,6 +89,7 @@ struct rte_vhost_async_channel_ops {
 struct async_inflight_info {
 	struct rte_mbuf *mbuf;
 	uint16_t descs; /* num of descs inflight */
+	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
 };
 
 /**
diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
index a70fe01d8f..2e3f9eb095 100644
--- a/lib/vhost/vhost.c
+++ b/lib/vhost/vhost.c
@@ -340,17 +340,17 @@ cleanup_device(struct virtio_net *dev, int destroy)
 static void
 vhost_free_async_mem(struct vhost_virtqueue *vq)
 {
-	if (vq->async_pkts_info)
-		rte_free(vq->async_pkts_info);
-	if (vq->async_descs_split)
-		rte_free(vq->async_descs_split);
-	if (vq->it_pool)
-		rte_free(vq->it_pool);
-	if (vq->vec_pool)
-		rte_free(vq->vec_pool);
+	rte_free(vq->async_pkts_info);
 
-	vq->async_pkts_info = NULL;
+	rte_free(vq->async_buffers_packed);
+	vq->async_buffers_packed = NULL;
+	rte_free(vq->async_descs_split);
 	vq->async_descs_split = NULL;
+
+	rte_free(vq->it_pool);
+	rte_free(vq->vec_pool);
+
+	vq->async_pkts_info = NULL;
 	vq->it_pool = NULL;
 	vq->vec_pool = NULL;
 }
@@ -360,10 +360,10 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
 {
 	if (vq_is_packed(dev))
 		rte_free(vq->shadow_used_packed);
-	else {
+	else
 		rte_free(vq->shadow_used_split);
-		vhost_free_async_mem(vq);
-	}
+
+	vhost_free_async_mem(vq);
 	rte_free(vq->batch_copy_elems);
 	if (vq->iotlb_pool)
 		rte_mempool_free(vq->iotlb_pool);
@@ -1626,10 +1626,9 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	if (unlikely(vq == NULL || !dev->async_copy))
 		return -1;
 
-	/* packed queue is not supported */
-	if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
+	if (unlikely(!f.async_inorder)) {
 		VHOST_LOG_CONFIG(ERR,
-			"async copy is not supported on packed queue or non-inorder mode "
+			"async copy is not supported on non-inorder mode "
 			"(vid %d, qid: %d)\n", vid, queue_id);
 		return -1;
 	}
@@ -1661,24 +1660,60 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	vq->async_pkts_info = rte_malloc_socket(NULL,
 			vq->size * sizeof(struct async_inflight_info),
 			RTE_CACHE_LINE_SIZE, node);
+	if (!vq->async_pkts_info) {
+		vhost_free_async_mem(vq);
+		VHOST_LOG_CONFIG(ERR,
+			"async register failed: cannot allocate memory for async_pkts_info "
+			"(vid %d, qid: %d)\n", vid, queue_id);
+		goto reg_out;
+	}
+
 	vq->it_pool = rte_malloc_socket(NULL,
 			VHOST_MAX_ASYNC_IT * sizeof(struct rte_vhost_iov_iter),
 			RTE_CACHE_LINE_SIZE, node);
+	if (!vq->it_pool) {
+		vhost_free_async_mem(vq);
+		VHOST_LOG_CONFIG(ERR,
+			"async register failed: cannot allocate memory for it_pool "
+			"(vid %d, qid: %d)\n", vid, queue_id);
+		goto reg_out;
+	}
+
 	vq->vec_pool = rte_malloc_socket(NULL,
 			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
 			RTE_CACHE_LINE_SIZE, node);
-	vq->async_descs_split = rte_malloc_socket(NULL,
-			vq->size * sizeof(struct vring_used_elem),
-			RTE_CACHE_LINE_SIZE, node);
-	if (!vq->async_descs_split || !vq->async_pkts_info ||
-		!vq->it_pool || !vq->vec_pool) {
+	if (!vq->vec_pool) {
 		vhost_free_async_mem(vq);
 		VHOST_LOG_CONFIG(ERR,
-				"async register failed: cannot allocate memory for vq data "
-				"(vid %d, qid: %d)\n", vid, queue_id);
+			"async register failed: cannot allocate memory for vec_pool "
+			"(vid %d, qid: %d)\n", vid, queue_id);
 		goto reg_out;
 	}
 
+	if (vq_is_packed(dev)) {
+		vq->async_buffers_packed = rte_malloc_socket(NULL,
+			vq->size * sizeof(struct vring_used_elem_packed),
+			RTE_CACHE_LINE_SIZE, node);
+		if (!vq->async_buffers_packed) {
+			vhost_free_async_mem(vq);
+			VHOST_LOG_CONFIG(ERR,
+				"async register failed: cannot allocate memory for async buffers "
+				"(vid %d, qid: %d)\n", vid, queue_id);
+			goto reg_out;
+		}
+	} else {
+		vq->async_descs_split = rte_malloc_socket(NULL,
+			vq->size * sizeof(struct vring_used_elem),
+			RTE_CACHE_LINE_SIZE, node);
+		if (!vq->async_descs_split) {
+			vhost_free_async_mem(vq);
+			VHOST_LOG_CONFIG(ERR,
+				"async register failed: cannot allocate memory for async descs "
+				"(vid %d, qid: %d)\n", vid, queue_id);
+			goto reg_out;
+		}
+	}
+
 	vq->async_ops.check_completed_copies = ops->check_completed_copies;
 	vq->async_ops.transfer_data = ops->transfer_data;
 
diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index f628714c24..b303635645 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -201,9 +201,18 @@ struct vhost_virtqueue {
 	uint16_t	async_pkts_idx;
 	uint16_t	async_pkts_inflight_n;
 	uint16_t	async_last_pkts_n;
-	struct vring_used_elem  *async_descs_split;
-	uint16_t async_desc_idx;
-	uint16_t last_async_desc_idx;
+	union {
+		struct vring_used_elem  *async_descs_split;
+		struct vring_used_elem_packed *async_buffers_packed;
+	};
+	union {
+		uint16_t async_desc_idx_split;
+		uint16_t async_buffer_idx_packed;
+	};
+	union {
+		uint16_t last_async_desc_idx_split;
+		uint16_t last_async_buffer_idx_packed;
+	};
 
 	/* vq async features */
 	bool		async_inorder;
diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 438bdafd14..5d540e5599 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -363,14 +363,14 @@ vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
 }
 
 static __rte_always_inline void
-vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
-				   struct vhost_virtqueue *vq,
-				   uint32_t len[],
-				   uint16_t id[],
-				   uint16_t count[],
+vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
+				   uint32_t *len,
+				   uint16_t *id,
+				   uint16_t *count,
 				   uint16_t num_buffers)
 {
 	uint16_t i;
+
 	for (i = 0; i < num_buffers; i++) {
 		/* enqueue shadow flush action aligned with batch num */
 		if (!vq->shadow_used_idx)
@@ -382,6 +382,17 @@ vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
 		vq->shadow_aligned_idx += count[i];
 		vq->shadow_used_idx++;
 	}
+}
+
+static __rte_always_inline void
+vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
+				   struct vhost_virtqueue *vq,
+				   uint32_t *len,
+				   uint16_t *id,
+				   uint16_t *count,
+				   uint16_t num_buffers)
+{
+	vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
 
 	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
 		do_data_copy_enqueue(dev, vq);
@@ -1474,6 +1485,23 @@ store_dma_desc_info_split(struct vring_used_elem *s_ring, struct vring_used_elem
 	}
 }
 
+static __rte_always_inline void
+store_dma_desc_info_packed(struct vring_used_elem_packed *s_ring,
+		struct vring_used_elem_packed *d_ring,
+		uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
+{
+	uint16_t elem_size = sizeof(struct vring_used_elem_packed);
+
+	if (d_idx + count <= ring_size) {
+		rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
+	} else {
+		uint16_t size = ring_size - d_idx;
+
+		rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
+		rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
+	}
+}
+
 static __rte_noinline uint32_t
 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct vhost_virtqueue *vq, uint16_t queue_id,
@@ -1556,12 +1584,12 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			 * descriptors.
 			 */
 			from = vq->shadow_used_idx - num_buffers;
-			to = vq->async_desc_idx & (vq->size - 1);
+			to = vq->async_desc_idx_split & (vq->size - 1);
 
 			store_dma_desc_info_split(vq->shadow_used_split,
 					vq->async_descs_split, vq->size, from, to, num_buffers);
 
-			vq->async_desc_idx += num_buffers;
+			vq->async_desc_idx_split += num_buffers;
 			vq->shadow_used_idx -= num_buffers;
 		} else
 			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
@@ -1619,7 +1647,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			num_descs += pkts_info[slot_idx & (vq->size - 1)].descs;
 			slot_idx--;
 		}
-		vq->async_desc_idx -= num_descs;
+		vq->async_desc_idx_split -= num_descs;
 		/* recover shadow used ring and available ring */
 		vq->shadow_used_idx -= (vq->last_avail_idx -
 				async_pkts_log[num_async_pkts].last_avail_idx -
@@ -1641,6 +1669,330 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	return pkt_idx;
 }
 
+static __rte_always_inline void
+vhost_update_used_packed(struct vhost_virtqueue *vq,
+			struct vring_used_elem_packed *shadow_ring,
+			uint16_t count)
+{
+	int i;
+	uint16_t used_idx = vq->last_used_idx;
+	uint16_t head_idx = vq->last_used_idx;
+	uint16_t head_flags = 0;
+
+	if (count == 0)
+		return;
+
+	/* Split loop in two to save memory barriers */
+	for (i = 0; i < count; i++) {
+		vq->desc_packed[used_idx].id = shadow_ring[i].id;
+		vq->desc_packed[used_idx].len = shadow_ring[i].len;
+
+		used_idx += shadow_ring[i].count;
+		if (used_idx >= vq->size)
+			used_idx -= vq->size;
+	}
+
+	/* The ordering for storing desc flags needs to be enforced. */
+	rte_atomic_thread_fence(__ATOMIC_RELEASE);
+
+	for (i = 0; i < count; i++) {
+		uint16_t flags;
+
+		if (vq->shadow_used_packed[i].len)
+			flags = VRING_DESC_F_WRITE;
+		else
+			flags = 0;
+
+		if (vq->used_wrap_counter) {
+			flags |= VRING_DESC_F_USED;
+			flags |= VRING_DESC_F_AVAIL;
+		} else {
+			flags &= ~VRING_DESC_F_USED;
+			flags &= ~VRING_DESC_F_AVAIL;
+		}
+
+		if (i > 0) {
+			vq->desc_packed[vq->last_used_idx].flags = flags;
+		} else {
+			head_idx = vq->last_used_idx;
+			head_flags = flags;
+		}
+
+		vq_inc_last_used_packed(vq, shadow_ring[i].count);
+	}
+
+	vq->desc_packed[head_idx].flags = head_flags;
+}
+
+static __rte_always_inline int
+vhost_enqueue_async_single_packed(struct virtio_net *dev,
+			    struct vhost_virtqueue *vq,
+			    struct rte_mbuf *pkt,
+			    struct buf_vector *buf_vec,
+			    uint16_t *nr_descs,
+			    uint16_t *nr_buffers,
+			    struct vring_packed_desc *async_descs,
+			    struct iovec *src_iovec, struct iovec *dst_iovec,
+			    struct rte_vhost_iov_iter *src_it,
+			    struct rte_vhost_iov_iter *dst_it)
+{
+	uint16_t nr_vec = 0;
+	uint16_t avail_idx = vq->last_avail_idx;
+	uint16_t max_tries, tries = 0;
+	uint16_t buf_id = 0;
+	uint32_t len = 0;
+	uint16_t desc_count = 0;
+	uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	uint32_t buffer_len[vq->size];
+	uint16_t buffer_buf_id[vq->size];
+	uint16_t buffer_desc_count[vq->size];
+
+	if (rxvq_is_mergeable(dev))
+		max_tries = vq->size - 1;
+	else
+		max_tries = 1;
+
+	while (size > 0) {
+		/*
+		 * if we tried all available ring items, and still
+		 * can't get enough buf, it means something abnormal
+		 * happened.
+		 */
+		if (unlikely(++tries > max_tries))
+			return -1;
+
+		if (unlikely(fill_vec_buf_packed(dev, vq, avail_idx, &desc_count, buf_vec, &nr_vec,
+						&buf_id, &len, VHOST_ACCESS_RW) < 0))
+			return -1;
+
+		len = RTE_MIN(len, size);
+		size -= len;
+
+		buffer_len[*nr_buffers] = len;
+		buffer_buf_id[*nr_buffers] = buf_id;
+		buffer_desc_count[*nr_buffers] = desc_count;
+		*nr_buffers += 1;
+
+		*nr_descs += desc_count;
+		avail_idx += desc_count;
+		if (avail_idx >= vq->size)
+			avail_idx -= vq->size;
+	}
+
+	if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers, src_iovec, dst_iovec,
+			src_it, dst_it) < 0)
+		return -1;
+	/* store descriptors for DMA */
+	if (avail_idx >= *nr_descs) {
+		rte_memcpy(async_descs, &vq->desc_packed[vq->last_avail_idx],
+			*nr_descs * sizeof(struct vring_packed_desc));
+	} else {
+		uint16_t nr_copy = vq->size - vq->last_avail_idx;
+
+		rte_memcpy(async_descs, &vq->desc_packed[vq->last_avail_idx],
+			nr_copy * sizeof(struct vring_packed_desc));
+		rte_memcpy(async_descs + nr_copy, vq->desc_packed,
+			(*nr_descs - nr_copy) * sizeof(struct vring_packed_desc));
+	}
+
+	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id, buffer_desc_count, *nr_buffers);
+
+	return 0;
+}
+
+static __rte_always_inline int16_t
+virtio_dev_rx_async_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
+			    struct rte_mbuf *pkt, uint16_t *nr_descs, uint16_t *nr_buffers,
+			    struct vring_packed_desc *async_descs,
+			    struct iovec *src_iovec, struct iovec *dst_iovec,
+			    struct rte_vhost_iov_iter *src_it, struct rte_vhost_iov_iter *dst_it)
+{
+	struct buf_vector buf_vec[BUF_VECTOR_MAX];
+
+	if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt, buf_vec, nr_descs, nr_buffers,
+						 async_descs, src_iovec, dst_iovec,
+						 src_it, dst_it) < 0)) {
+		VHOST_LOG_DATA(DEBUG, "(%d) failed to get enough desc from vring\n", dev->vid);
+		return -1;
+	}
+
+	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
+			dev->vid, vq->last_avail_idx, vq->last_avail_idx + *nr_descs);
+
+	return 0;
+}
+
+static __rte_always_inline void
+dma_error_handler_packed(struct vhost_virtqueue *vq, struct vring_packed_desc *async_descs,
+			uint16_t async_descs_idx, uint16_t slot_idx, uint32_t nr_err,
+			uint32_t *pkt_idx, uint32_t *num_async_pkts, uint32_t *num_done_pkts)
+{
+	uint16_t descs_err = 0;
+	uint16_t buffers_err = 0;
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
+
+	*num_async_pkts -= nr_err;
+	*pkt_idx -= nr_err;
+	/* calculate the sum of buffers and descs of DMA-error packets. */
+	while (nr_err-- > 0) {
+		descs_err += pkts_info[slot_idx % vq->size].descs;
+		buffers_err += pkts_info[slot_idx % vq->size].nr_buffers;
+		slot_idx--;
+	}
+
+	vq->async_buffer_idx_packed -= buffers_err;
+
+	if (vq->last_avail_idx >= descs_err) {
+		vq->last_avail_idx -= descs_err;
+
+		rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
+			&async_descs[async_descs_idx - descs_err],
+			descs_err * sizeof(struct vring_packed_desc));
+	} else {
+		uint16_t nr_copy;
+
+		vq->last_avail_idx = vq->last_avail_idx + vq->size - descs_err;
+		nr_copy = vq->size - vq->last_avail_idx;
+		rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
+			&async_descs[async_descs_idx - descs_err],
+			nr_copy * sizeof(struct vring_packed_desc));
+		descs_err -= nr_copy;
+		rte_memcpy(&vq->desc_packed[0], &async_descs[async_descs_idx - descs_err],
+			descs_err * sizeof(struct vring_packed_desc));
+		vq->avail_wrap_counter ^= 1;
+	}
+
+	*num_done_pkts = *pkt_idx - *num_async_pkts;
+}
+
+static __rte_noinline uint32_t
+virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
+	struct vhost_virtqueue *vq, uint16_t queue_id,
+	struct rte_mbuf **pkts, uint32_t count,
+	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
+{
+	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
+	uint16_t async_descs_idx = 0;
+	uint16_t num_buffers;
+	uint16_t num_descs;
+
+	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
+	struct iovec *vec_pool = vq->vec_pool;
+	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
+	struct iovec *src_iovec = vec_pool;
+	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+	uint16_t slot_idx = 0;
+	uint16_t segs_await = 0;
+	uint16_t iovec_idx = 0, it_idx = 0;
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
+	uint32_t n_pkts = 0, pkt_err = 0;
+	uint32_t num_async_pkts = 0, num_done_pkts = 0;
+	struct vring_packed_desc async_descs[vq->size];
+
+	rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
+
+	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+		num_buffers = 0;
+		num_descs = 0;
+
+		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq, pkts[pkt_idx],
+						&num_descs, &num_buffers,
+						&async_descs[async_descs_idx],
+						&src_iovec[iovec_idx], &dst_iovec[iovec_idx],
+						&it_pool[it_idx], &it_pool[it_idx + 1]) < 0))
+			break;
+
+		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
+			dev->vid, vq->last_avail_idx,
+			vq->last_avail_idx + num_descs);
+
+		slot_idx = (vq->async_pkts_idx + num_async_pkts) % vq->size;
+		if (it_pool[it_idx].count) {
+			uint16_t from, to;
+
+			async_descs_idx += num_descs;
+			async_fill_desc(&tdes[pkt_burst_idx++],
+				&it_pool[it_idx], &it_pool[it_idx + 1]);
+			pkts_info[slot_idx].descs = num_descs;
+			pkts_info[slot_idx].nr_buffers = num_buffers;
+			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
+			num_async_pkts++;
+			iovec_idx += it_pool[it_idx].nr_segs;
+			it_idx += 2;
+
+			segs_await += it_pool[it_idx].nr_segs;
+
+			/**
+			 * recover shadow used ring and keep DMA-occupied
+			 * descriptors.
+			 */
+			from = vq->shadow_used_idx - num_buffers;
+			to = vq->async_buffer_idx_packed % vq->size;
+			store_dma_desc_info_packed(vq->shadow_used_packed,
+					vq->async_buffers_packed, vq->size, from, to, num_buffers);
+
+			vq->async_buffer_idx_packed += num_buffers;
+			vq->shadow_used_idx -= num_buffers;
+		} else {
+			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
+		}
+
+		vq_inc_last_avail_packed(vq, num_descs);
+
+		/*
+		 * conditions to trigger async device transfer:
+		 * - buffered packet number reaches transfer threshold
+		 * - unused async iov number is less than max vhost vector
+		 */
+		if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
+			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await < BUF_VECTOR_MAX))) {
+			n_pkts = vq->async_ops.transfer_data(dev->vid, queue_id,
+				tdes, 0, pkt_burst_idx);
+			iovec_idx = 0;
+			it_idx = 0;
+			segs_await = 0;
+			vq->async_pkts_inflight_n += n_pkts;
+
+			if (unlikely(n_pkts < pkt_burst_idx)) {
+				/*
+				 * log error packets number here and do actual
+				 * error processing when applications poll
+				 * completion
+				 */
+				pkt_err = pkt_burst_idx - n_pkts;
+				pkt_burst_idx = 0;
+				pkt_idx++;
+				break;
+			}
+
+			pkt_burst_idx = 0;
+		}
+	}
+
+	if (pkt_burst_idx) {
+		n_pkts = vq->async_ops.transfer_data(dev->vid, queue_id, tdes, 0, pkt_burst_idx);
+		vq->async_pkts_inflight_n += n_pkts;
+
+		if (unlikely(n_pkts < pkt_burst_idx))
+			pkt_err = pkt_burst_idx - n_pkts;
+	}
+
+	do_data_copy_enqueue(dev, vq);
+
+	if (unlikely(pkt_err))
+		dma_error_handler_packed(vq, async_descs, async_descs_idx, slot_idx, pkt_err,
+					&pkt_idx, &num_async_pkts, &num_done_pkts);
+	vq->async_pkts_idx += num_async_pkts;
+	*comp_count = num_done_pkts;
+
+	if (likely(vq->shadow_used_idx)) {
+		vhost_flush_enqueue_shadow_packed(dev, vq);
+		vhost_vring_call_packed(dev, vq);
+	}
+
+	return pkt_idx;
+}
+
 static __rte_always_inline void
 write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
 {
@@ -1649,7 +2001,7 @@ write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
 	uint16_t to, from;
 
 	do {
-		from = vq->last_async_desc_idx & (vq->size - 1);
+		from = vq->last_async_desc_idx_split & (vq->size - 1);
 		nr_copy = nr_left + from <= vq->size ? nr_left : vq->size - from;
 		to = vq->last_used_idx & (vq->size - 1);
 
@@ -1665,18 +2017,41 @@ write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
 					(nr_copy - size) * sizeof(struct vring_used_elem));
 		}
 
-		vq->last_async_desc_idx += nr_copy;
+		vq->last_async_desc_idx_split += nr_copy;
 		vq->last_used_idx += nr_copy;
 		nr_left -= nr_copy;
 	} while (nr_left > 0);
 }
 
+static __rte_always_inline void
+write_back_completed_descs_packed(struct vhost_virtqueue *vq,
+				uint16_t n_buffers)
+{
+	uint16_t nr_left = n_buffers;
+	uint16_t from, to;
+
+	do {
+		from = vq->last_async_buffer_idx_packed % vq->size;
+		to = (from + nr_left) % vq->size;
+		if (to > from) {
+			vhost_update_used_packed(vq, vq->async_buffers_packed + from, to - from);
+			vq->last_async_buffer_idx_packed += nr_left;
+			nr_left = 0;
+		} else {
+			vhost_update_used_packed(vq, vq->async_buffers_packed + from,
+				vq->size - from);
+			vq->last_async_buffer_idx_packed += vq->size - from;
+			nr_left -= vq->size - from;
+		}
+	} while (nr_left > 0);
+}
+
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count)
 {
 	struct virtio_net *dev = get_device(vid);
 	struct vhost_virtqueue *vq;
-	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
+	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
 	uint16_t start_idx, pkts_idx, vq_size;
 	struct async_inflight_info *pkts_info;
 	uint16_t from, i;
@@ -1701,7 +2076,7 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 
 	rte_spinlock_lock(&vq->access_lock);
 
-	pkts_idx = vq->async_pkts_idx & (vq->size - 1);
+	pkts_idx = vq->async_pkts_idx % vq->size;
 	pkts_info = vq->async_pkts_info;
 	vq_size = vq->size;
 	start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
@@ -1718,21 +2093,41 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		goto done;
 	}
 
-	for (i = 0; i < n_pkts_put; i++) {
-		from = (start_idx + i) & (vq_size - 1);
-		n_descs += pkts_info[from].descs;
-		pkts[i] = pkts_info[from].mbuf;
+	if (vq_is_packed(dev)) {
+		for (i = 0; i < n_pkts_put; i++) {
+			from = (start_idx + i) & (vq_size - 1);
+			n_buffers += pkts_info[from].nr_buffers;
+			pkts[i] = pkts_info[from].mbuf;
+		}
+	} else {
+		for (i = 0; i < n_pkts_put; i++) {
+			from = (start_idx + i) & (vq_size - 1);
+			n_descs += pkts_info[from].descs;
+			pkts[i] = pkts_info[from].mbuf;
+		}
 	}
+
 	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
 	vq->async_pkts_inflight_n -= n_pkts_put;
 
 	if (likely(vq->enabled && vq->access_ok)) {
-		write_back_completed_descs_split(vq, n_descs);
+		if (vq_is_packed(dev)) {
+			write_back_completed_descs_packed(vq, n_buffers);
 
-		__atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
-		vhost_vring_call_split(dev, vq);
-	} else
-		vq->last_async_desc_idx += n_descs;
+			vhost_vring_call_packed(dev, vq);
+		} else {
+			write_back_completed_descs_split(vq, n_descs);
+
+			__atomic_add_fetch(&vq->used->idx, n_descs,
+					__ATOMIC_RELEASE);
+			vhost_vring_call_split(dev, vq);
+		}
+	} else {
+		if (vq_is_packed(dev))
+			vq->last_async_buffer_idx_packed += n_buffers;
+		else
+			vq->last_async_desc_idx_split += n_descs;
+	}
 
 done:
 	rte_spinlock_unlock(&vq->access_lock);
@@ -1773,9 +2168,10 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
 	if (count == 0)
 		goto out;
 
-	/* TODO: packed queue not implemented */
 	if (vq_is_packed(dev))
-		nb_tx = 0;
+		nb_tx = virtio_dev_rx_async_submit_packed(dev,
+				vq, queue_id, pkts, count, comp_pkts,
+				comp_count);
 	else
 		nb_tx = virtio_dev_rx_async_submit_split(dev,
 				vq, queue_id, pkts, count, comp_pkts,
-- 
2.29.2
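
A note on the index handling above: unlike the split ring path, a packed
ring's size is not required to be a power of two, so the packed completion
path uses "% vq->size" instead of "& (vq->size - 1)" and splits a completed
range into at most two contiguous chunks at the ring wrap point, exactly as
write_back_completed_descs_packed() does. The standalone sketch below mirrors
only that wrap handling; flush_ring_range() and print_chunk() are illustrative
names and are not part of the patch or of the vhost library.

#include <stdint.h>
#include <stdio.h>

/*
 * Split a logically contiguous range of 'count' ring entries starting at
 * 'start' into at most two physically contiguous chunks. Because a packed
 * ring's size need not be a power of two, the wrap point is found with a
 * plain comparison and '%' rather than a power-of-two mask.
 */
static void
flush_ring_range(uint16_t start, uint16_t count, uint16_t size,
		void (*flush)(uint16_t from, uint16_t nr))
{
	uint16_t left = count;
	uint16_t from = start % size;

	while (left > 0) {
		/* entries available before hitting the ring wrap point */
		uint16_t nr = (from + left <= size) ? left : (uint16_t)(size - from);

		flush(from, nr);
		left -= nr;
		from = 0; /* a second pass always restarts at index 0 */
	}
}

static void
print_chunk(uint16_t from, uint16_t nr)
{
	printf("write back %u entries starting at index %u\n",
		(unsigned int)nr, (unsigned int)from);
}

int
main(void)
{
	/* e.g. 5 completed buffers starting at index 254 of a 256-entry ring */
	flush_ring_range(254, 5, 256, print_chunk);
	return 0;
}

Run on the example above it reports one chunk of 2 entries at index 254 and
one chunk of 3 entries at index 0, which is how the write-back above splits
its calls to vhost_update_used_packed().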


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v9 3/4] vhost: add batch datapath for async vhost packed ring
  2021-04-27  8:03 ` [dpdk-dev] [PATCH v9 0/4] add support for packed ring in async vhost Cheng Jiang
  2021-04-27  8:03   ` [dpdk-dev] [PATCH v9 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
  2021-04-27  8:03   ` [dpdk-dev] [PATCH v9 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
@ 2021-04-27  8:03   ` Cheng Jiang
  2021-04-29  9:57     ` Maxime Coquelin
  2021-04-27  8:03   ` [dpdk-dev] [PATCH v9 4/4] doc: add release note for vhost async " Cheng Jiang
  2021-05-04  8:28   ` [dpdk-dev] [PATCH v9 0/4] add support for packed ring in async vhost Maxime Coquelin
  4 siblings, 1 reply; 60+ messages in thread
From: Cheng Jiang @ 2021-04-27  8:03 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

Add batch datapath for async vhost packed ring to improve the
performance of small packet processing.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 lib/vhost/virtio_net.c | 42 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 37 insertions(+), 5 deletions(-)

diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 5d540e5599..f60f97ec72 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -1724,6 +1724,29 @@ vhost_update_used_packed(struct vhost_virtqueue *vq,
 	vq->desc_packed[head_idx].flags = head_flags;
 }
 
+static __rte_always_inline int
+virtio_dev_rx_async_batch_packed(struct virtio_net *dev,
+			   struct vhost_virtqueue *vq,
+			   struct rte_mbuf **pkts,
+			   struct rte_mbuf **comp_pkts, uint32_t *pkt_done)
+{
+	uint16_t i;
+	uint32_t cpy_threshold = vq->async_threshold;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (unlikely(pkts[i]->pkt_len >= cpy_threshold))
+			return -1;
+	}
+	if (!virtio_dev_rx_batch_packed(dev, vq, pkts)) {
+		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+			comp_pkts[(*pkt_done)++] = pkts[i];
+
+		return 0;
+	}
+
+	return -1;
+}
+
 static __rte_always_inline int
 vhost_enqueue_async_single_packed(struct virtio_net *dev,
 			    struct vhost_virtqueue *vq,
@@ -1872,6 +1895,7 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
 {
 	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
+	uint32_t remained = count;
 	uint16_t async_descs_idx = 0;
 	uint16_t num_buffers;
 	uint16_t num_descs;
@@ -1889,12 +1913,19 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 	uint32_t num_async_pkts = 0, num_done_pkts = 0;
 	struct vring_packed_desc async_descs[vq->size];
 
-	rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
+	do {
+		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
+		if (remained >= PACKED_BATCH_SIZE) {
+			if (!virtio_dev_rx_async_batch_packed(dev, vq,
+				&pkts[pkt_idx], comp_pkts, &num_done_pkts)) {
+				pkt_idx += PACKED_BATCH_SIZE;
+				remained -= PACKED_BATCH_SIZE;
+				continue;
+			}
+		}
 
-	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
 		num_buffers = 0;
 		num_descs = 0;
-
 		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq, pkts[pkt_idx],
 						&num_descs, &num_buffers,
 						&async_descs[async_descs_idx],
@@ -1937,6 +1968,8 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
 		}
 
+		pkt_idx++;
+		remained--;
 		vq_inc_last_avail_packed(vq, num_descs);
 
 		/*
@@ -1961,13 +1994,12 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 				 */
 				pkt_err = pkt_burst_idx - n_pkts;
 				pkt_burst_idx = 0;
-				pkt_idx++;
 				break;
 			}
 
 			pkt_burst_idx = 0;
 		}
-	}
+	} while (pkt_idx < count);
 
 	if (pkt_burst_idx) {
 		n_pkts = vq->async_ops.transfer_data(dev->vid, queue_id, tdes, 0, pkt_burst_idx);
-- 
2.29.2
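
The batch path above is only taken when every packet in a window of
PACKED_BATCH_SIZE mbufs is smaller than the async copy threshold
(vq->async_threshold) and virtio_dev_rx_batch_packed() accepts the window;
otherwise the loop falls through to the per-packet async path. The sketch
below reproduces just that eligibility check in isolation; BATCH_SIZE,
struct pkt and batch_is_sync_eligible() are illustrative stand-ins, not
vhost code.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BATCH_SIZE 4 /* stands in for PACKED_BATCH_SIZE */

struct pkt {
	uint32_t pkt_len;
};

/*
 * A window of BATCH_SIZE packets may use the synchronous batched copy
 * only if every packet is below the copy threshold; one oversized packet
 * sends the whole window to the per-packet path, which may offload the
 * copy to a DMA engine.
 */
static bool
batch_is_sync_eligible(struct pkt **pkts, uint32_t cpy_threshold)
{
	uint16_t i;

	for (i = 0; i < BATCH_SIZE; i++) {
		if (pkts[i]->pkt_len >= cpy_threshold)
			return false;
	}
	return true;
}

int
main(void)
{
	struct pkt small[BATCH_SIZE] = { {64}, {64}, {128}, {64} };
	struct pkt mixed[BATCH_SIZE] = { {64}, {2048}, {64}, {64} };
	struct pkt *win1[BATCH_SIZE], *win2[BATCH_SIZE];
	uint16_t i;

	for (i = 0; i < BATCH_SIZE; i++) {
		win1[i] = &small[i];
		win2[i] = &mixed[i];
	}

	/* with a hypothetical 256-byte threshold */
	printf("all-small window -> %s\n",
		batch_is_sync_eligible(win1, 256) ? "batched copy" : "per-packet path");
	printf("mixed window     -> %s\n",
		batch_is_sync_eligible(win2, 256) ? "batched copy" : "per-packet path");
	return 0;
}

Note that packets taken through the batched copy are returned immediately via
comp_pkts in the code above, since no DMA completion needs to be polled for
them later.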


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [dpdk-dev] [PATCH v9 4/4] doc: add release note for vhost async packed ring
  2021-04-27  8:03 ` [dpdk-dev] [PATCH v9 0/4] add support for packed ring in async vhost Cheng Jiang
                     ` (2 preceding siblings ...)
  2021-04-27  8:03   ` [dpdk-dev] [PATCH v9 3/4] vhost: add batch datapath for async vhost packed ring Cheng Jiang
@ 2021-04-27  8:03   ` Cheng Jiang
  2021-04-29  9:58     ` Maxime Coquelin
  2021-05-04 18:38     ` Ferruh Yigit
  2021-05-04  8:28   ` [dpdk-dev] [PATCH v9 0/4] add support for packed ring in async vhost Maxime Coquelin
  4 siblings, 2 replies; 60+ messages in thread
From: Cheng Jiang @ 2021-04-27  8:03 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu, Cheng Jiang

Add release note for the support of vhost async packed ring.

Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
---
 doc/guides/rel_notes/release_21_05.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/guides/rel_notes/release_21_05.rst b/doc/guides/rel_notes/release_21_05.rst
index b3224dc332..aec020d558 100644
--- a/doc/guides/rel_notes/release_21_05.rst
+++ b/doc/guides/rel_notes/release_21_05.rst
@@ -271,6 +271,10 @@ New Features
   * Added support for crypto adapter forward mode in octeontx2 event and crypto
     device driver.
 
+* **Added support for vhost async packed ring data path.**
+
+  Added packed ring support for async vhost.
+
 
 Removed Items
 -------------
-- 
2.29.2


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v9 2/4] vhost: add support for packed ring in async vhost
  2021-04-27  8:03   ` [dpdk-dev] [PATCH v9 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
@ 2021-04-29  1:48     ` Hu, Jiayu
  2021-04-29  9:50     ` Maxime Coquelin
  1 sibling, 0 replies; 60+ messages in thread
From: Hu, Jiayu @ 2021-04-29  1:48 UTC (permalink / raw)
  To: Jiang, Cheng1, maxime.coquelin, Xia, Chenbo
  Cc: dev, Yang, YvonneX, Wang, Yinan, Liu, Yong

Reviewed-by: Jiayu Hu <jiayu.hu@intel.com>

> -----Original Message-----
> From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Sent: Tuesday, April 27, 2021 4:04 PM
> To: maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Yang, YvonneX
> <yvonnex.yang@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Liu,
> Yong <yong.liu@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>
> Subject: [PATCH v9 2/4] vhost: add support for packed ring in async vhost
> 
> For now async vhost data path only supports split ring. This patch
> enables packed ring in async vhost data path to make async vhost
> compatible with virtio 1.1 spec.
> 
> Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> ---
>  lib/vhost/rte_vhost_async.h |   1 +
>  lib/vhost/vhost.c           |  79 +++++--
>  lib/vhost/vhost.h           |  15 +-
>  lib/vhost/virtio_net.c      | 442 ++++++++++++++++++++++++++++++++++--
>  4 files changed, 489 insertions(+), 48 deletions(-)
> 


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v9 2/4] vhost: add support for packed ring in async vhost
  2021-04-27  8:03   ` [dpdk-dev] [PATCH v9 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
  2021-04-29  1:48     ` Hu, Jiayu
@ 2021-04-29  9:50     ` Maxime Coquelin
  1 sibling, 0 replies; 60+ messages in thread
From: Maxime Coquelin @ 2021-04-29  9:50 UTC (permalink / raw)
  To: Cheng Jiang, chenbo.xia; +Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu



On 4/27/21 10:03 AM, Cheng Jiang wrote:
> For now async vhost data path only supports split ring. This patch
> enables packed ring in async vhost data path to make async vhost
> compatible with virtio 1.1 spec.
> 
> Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> ---
>  lib/vhost/rte_vhost_async.h |   1 +
>  lib/vhost/vhost.c           |  79 +++++--
>  lib/vhost/vhost.h           |  15 +-
>  lib/vhost/virtio_net.c      | 442 ++++++++++++++++++++++++++++++++++--
>  4 files changed, 489 insertions(+), 48 deletions(-)
> 

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v9 3/4] vhost: add batch datapath for async vhost packed ring
  2021-04-27  8:03   ` [dpdk-dev] [PATCH v9 3/4] vhost: add batch datapath for async vhost packed ring Cheng Jiang
@ 2021-04-29  9:57     ` Maxime Coquelin
  0 siblings, 0 replies; 60+ messages in thread
From: Maxime Coquelin @ 2021-04-29  9:57 UTC (permalink / raw)
  To: Cheng Jiang, chenbo.xia; +Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu



On 4/27/21 10:03 AM, Cheng Jiang wrote:
> Add batch datapath for async vhost packed ring to improve the
> performance of small packet processing.
> 
> Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> ---
>  lib/vhost/virtio_net.c | 42 +++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 37 insertions(+), 5 deletions(-)
> 

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v9 4/4] doc: add release note for vhost async packed ring
  2021-04-27  8:03   ` [dpdk-dev] [PATCH v9 4/4] doc: add release note for vhost async " Cheng Jiang
@ 2021-04-29  9:58     ` Maxime Coquelin
  2021-05-04 18:38     ` Ferruh Yigit
  1 sibling, 0 replies; 60+ messages in thread
From: Maxime Coquelin @ 2021-04-29  9:58 UTC (permalink / raw)
  To: Cheng Jiang, chenbo.xia; +Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu



On 4/27/21 10:03 AM, Cheng Jiang wrote:
> Add release note for the support of vhost async packed ring.
> 
> Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>
> ---
>  doc/guides/rel_notes/release_21_05.rst | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/doc/guides/rel_notes/release_21_05.rst b/doc/guides/rel_notes/release_21_05.rst
> index b3224dc332..aec020d558 100644
> --- a/doc/guides/rel_notes/release_21_05.rst
> +++ b/doc/guides/rel_notes/release_21_05.rst
> @@ -271,6 +271,10 @@ New Features
>    * Added support for crypto adapter forward mode in octeontx2 event and crypto
>      device driver.
>  
> +* **Added support for vhost async packed ring data path.**
> +
> +  Added packed ring support for async vhost.
> +
>  
>  Removed Items
>  -------------
> 

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v9 0/4] add support for packed ring in async vhost
  2021-04-27  8:03 ` [dpdk-dev] [PATCH v9 0/4] add support for packed ring in async vhost Cheng Jiang
                     ` (3 preceding siblings ...)
  2021-04-27  8:03   ` [dpdk-dev] [PATCH v9 4/4] doc: add release note for vhost async " Cheng Jiang
@ 2021-05-04  8:28   ` Maxime Coquelin
  4 siblings, 0 replies; 60+ messages in thread
From: Maxime Coquelin @ 2021-05-04  8:28 UTC (permalink / raw)
  To: Cheng Jiang, chenbo.xia; +Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu



On 4/27/21 10:03 AM, Cheng Jiang wrote:
> For now async vhost data path only supports split ring structure. In
> order to make async vhost compatible with virtio 1.1 spec this patch
> set cleans async split ring codes and enables packed ring in async
> vhost data path. Batch datapath is also enabled in async vhost
> packed ring.
> 
> v9:
>  * improve some variable initiation
>  * fix some variable names for consistency
>  * rebase on the latest code
> v8:
>  * fix some variable names for consistency
>  * clean codes
> v7:
>  * fix compile issues
>  * add argument *dev in vhost_free_async_mem() for ring type decision
> v6:
>  * fix some typos in commit log
>  * improve index usage
>  * remove shadow_ring_store()
>  * add store_dma_desc_info_split() store_dma_desc_info_packed()
>  * remove some checks in vhost_free_async_mem()
>  * change index calculation since the size isn't necessarily a power of 2
>  * move error handling in a dedicated function
>  * clean codes
> v5:
>  * clean some codes for packed ring datapath
>  * fix an index error in shadow_ring_store()
> v4:
>   * change the patch structure
>   * clean code for async split ring
>   * reuse some code from split ring
>   * change the error handler for DMA-copy packet
>   * add check for malloc
>   * remove useless code
>   * add doc update
> v3:
>   * fix error handler for DMA-copy packet
> v2:
>   * fix wrong buffer index in rte_vhost_poll_enqueue_completed()
>   * add async_buffers_packed memory free in vhost_free_async_mem()
> 
> Cheng Jiang (4):
>   vhost: abstract and reorganize async split ring code
>   vhost: add support for packed ring in async vhost
>   vhost: add batch datapath for async vhost packed ring
>   doc: add release note for vhost async packed ring
> 
>  doc/guides/rel_notes/release_21_05.rst |   4 +
>  lib/vhost/rte_vhost_async.h            |   1 +
>  lib/vhost/vhost.c                      |  79 +++-
>  lib/vhost/vhost.h                      |  15 +-
>  lib/vhost/virtio_net.c                 | 598 +++++++++++++++++++++----
>  5 files changed, 587 insertions(+), 110 deletions(-)
> 
> --
> 2.29.2
> 


Applied to dpdk-next-virtio/main.

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [dpdk-dev] [PATCH v9 4/4] doc: add release note for vhost async packed ring
  2021-04-27  8:03   ` [dpdk-dev] [PATCH v9 4/4] doc: add release note for vhost async " Cheng Jiang
  2021-04-29  9:58     ` Maxime Coquelin
@ 2021-05-04 18:38     ` Ferruh Yigit
  1 sibling, 0 replies; 60+ messages in thread
From: Ferruh Yigit @ 2021-05-04 18:38 UTC (permalink / raw)
  To: Cheng Jiang, maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, yvonnex.yang, yinan.wang, yong.liu

On 4/27/2021 9:03 AM, Cheng Jiang wrote:
> Add release note for the support of vhost async packed ring.
> 
> Signed-off-by: Cheng Jiang <Cheng1.jiang@intel.com>

Squashed into relevant commit in next-net, thanks.

^ permalink raw reply	[flat|nested] 60+ messages in thread

end of thread, other threads:[~2021-05-04 18:38 UTC | newest]

Thread overview: 60+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-17  8:54 [dpdk-dev] [PATCH] vhost: add support for packed ring in async vhost Cheng Jiang
2021-03-22  6:15 ` [dpdk-dev] [PATCH v2] " Cheng Jiang
2021-03-24  9:19   ` Liu, Yong
2021-03-29 12:29     ` Jiang, Cheng1
2021-03-31 14:06 ` [dpdk-dev] [PATCH v3] " Cheng Jiang
2021-04-07  6:26   ` Hu, Jiayu
2021-04-08 12:01     ` Jiang, Cheng1
2021-04-10 10:25 ` [dpdk-dev] [PATCH v4 0/4] " Cheng Jiang
2021-04-10 10:25   ` [dpdk-dev] [PATCH v4 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
2021-04-10 10:25   ` Cheng Jiang
2021-04-10 10:25   ` [dpdk-dev] [PATCH v4 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
2021-04-10 10:25   ` [dpdk-dev] [PATCH v4 3/4] vhost: add batch datapath for async vhost packed ring Cheng Jiang
2021-04-10 10:25   ` [dpdk-dev] [PATCH v4 4/4] doc: add release note for vhost async " Cheng Jiang
2021-04-12 11:34 ` [dpdk-dev] [PATCH v5 0/4] add support for packed ring in async vhost Cheng Jiang
2021-04-12 11:34   ` [dpdk-dev] [PATCH v5 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
2021-04-13  2:44     ` Hu, Jiayu
2021-04-13  3:26       ` Jiang, Cheng1
2021-04-13  7:11     ` Maxime Coquelin
2021-04-13  9:06       ` Jiang, Cheng1
2021-04-12 11:34   ` [dpdk-dev] [PATCH v5 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
2021-04-13  8:36     ` Maxime Coquelin
2021-04-13 11:48       ` Jiang, Cheng1
2021-04-13 13:08         ` Maxime Coquelin
2021-04-13 13:50           ` Jiang, Cheng1
2021-04-12 11:34   ` [dpdk-dev] [PATCH v5 3/4] vhost: add batch datapath for async vhost packed ring Cheng Jiang
2021-04-12 11:34   ` [dpdk-dev] [PATCH v5 4/4] doc: add release note for vhost async " Cheng Jiang
2021-04-13 14:55 ` [dpdk-dev] [PATCH v6 0/4] add support for packed ring in async vhost Cheng Jiang
2021-04-13 14:55   ` [dpdk-dev] [PATCH v6 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
2021-04-13 14:55   ` [dpdk-dev] [PATCH v6 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
2021-04-13 14:55   ` [dpdk-dev] [PATCH v6 3/4] vhost: add batch datapath for async vhost packed ring Cheng Jiang
2021-04-13 14:55   ` [dpdk-dev] [PATCH v6 4/4] doc: add release note for vhost async " Cheng Jiang
2021-04-14  6:13 ` [dpdk-dev] [PATCH v7 0/4] add support for packed ring in async vhost Cheng Jiang
2021-04-14  6:13   ` [dpdk-dev] [PATCH v7 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
2021-04-14 12:24     ` Maxime Coquelin
2021-04-14  6:13   ` [dpdk-dev] [PATCH v7 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
2021-04-14 13:40     ` Maxime Coquelin
2021-04-15  5:42       ` Jiang, Cheng1
2021-04-15  2:02     ` Hu, Jiayu
2021-04-15  5:54       ` Jiang, Cheng1
2021-04-14  6:13   ` [dpdk-dev] [PATCH v7 3/4] vhost: add batch datapath for async vhost packed ring Cheng Jiang
2021-04-14  6:13   ` [dpdk-dev] [PATCH v7 4/4] doc: add release note for vhost async " Cheng Jiang
2021-04-19  8:51 ` [dpdk-dev] [PATCH v8 0/4] add support for packed ring in async vhost Cheng Jiang
2021-04-19  8:51   ` [dpdk-dev] [PATCH v8 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
2021-04-27  1:19     ` Hu, Jiayu
2021-04-19  8:51   ` [dpdk-dev] [PATCH v8 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
2021-04-27  5:16     ` Hu, Jiayu
2021-04-27  6:07       ` Jiang, Cheng1
2021-04-19  8:51   ` [dpdk-dev] [PATCH v8 3/4] vhost: add batch datapath for async vhost packed ring Cheng Jiang
2021-04-19  8:51   ` [dpdk-dev] [PATCH v8 4/4] doc: add release note for vhost async " Cheng Jiang
2021-04-27  8:03 ` [dpdk-dev] [PATCH v9 0/4] add support for packed ring in async vhost Cheng Jiang
2021-04-27  8:03   ` [dpdk-dev] [PATCH v9 1/4] vhost: abstract and reorganize async split ring code Cheng Jiang
2021-04-27  8:03   ` [dpdk-dev] [PATCH v9 2/4] vhost: add support for packed ring in async vhost Cheng Jiang
2021-04-29  1:48     ` Hu, Jiayu
2021-04-29  9:50     ` Maxime Coquelin
2021-04-27  8:03   ` [dpdk-dev] [PATCH v9 3/4] vhost: add batch datapath for async vhost packed ring Cheng Jiang
2021-04-29  9:57     ` Maxime Coquelin
2021-04-27  8:03   ` [dpdk-dev] [PATCH v9 4/4] doc: add release note for vhost async " Cheng Jiang
2021-04-29  9:58     ` Maxime Coquelin
2021-05-04 18:38     ` Ferruh Yigit
2021-05-04  8:28   ` [dpdk-dev] [PATCH v9 0/4] add support for packed ring in async vhost Maxime Coquelin
