DPDK patches and discussions
* [dpdk-dev] [PATCH 0/1] lib/vhost: support async dequeue for split ring
@ 2021-06-02  8:31 Yuan Wang
  2021-06-02  8:31 ` [dpdk-dev] [PATCH 1/1] " Yuan Wang
                   ` (6 more replies)
  0 siblings, 7 replies; 50+ messages in thread
From: Yuan Wang @ 2021-06-02  8:31 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, Yuan Wang

This patch implements an asynchronous dequeue data path for the split ring.
A new asynchronous dequeue function is introduced. With this function,
the application can try to receive packets from the guest while
offloading large copies to the DMA engine, thus saving precious CPU
cycles.
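
A minimal usage sketch of the new API (illustrative only; the polling
context is assumed, mbuf_pool/VIRTIO_TXQ/MAX_PKT_BURST are taken from
examples/vhost, and error handling is omitted):

    struct rte_mbuf *pkts[MAX_PKT_BURST];
    int nr_inflight;

    /* Dequeue up to MAX_PKT_BURST packets from the guest; large copies
     * are offloaded to the DMA engine, small ones are done by the CPU. */
    uint16_t n = rte_vhost_try_dequeue_burst(vid, VIRTIO_TXQ, mbuf_pool,
                        pkts, MAX_PKT_BURST, &nr_inflight);

    /* Only transfer-completed packets are returned in pkts[0..n-1];
     * nr_inflight reports how many packets the DMA engine still holds. */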

Yuan Wang (1):
  lib/vhost: support async dequeue for split ring

 doc/guides/prog_guide/vhost_lib.rst |  10 +
 examples/vhost/ioat.c               |  30 +-
 examples/vhost/ioat.h               |   3 +
 examples/vhost/main.c               |  60 +--
 lib/vhost/rte_vhost_async.h         |  44 ++-
 lib/vhost/version.map               |   3 +
 lib/vhost/virtio_net.c              | 549 ++++++++++++++++++++++++++++
 7 files changed, 664 insertions(+), 35 deletions(-)

-- 
2.25.1



* [dpdk-dev] [PATCH 1/1] lib/vhost: support async dequeue for split ring
  2021-06-02  8:31 [dpdk-dev] [PATCH 0/1] lib/vhost: support async dequeue for split ring Yuan Wang
@ 2021-06-02  8:31 ` Yuan Wang
  2021-06-07 16:17   ` Maxime Coquelin
  2021-06-18 20:03 ` [dpdk-dev] [PATCH v2 0/4] vhost: " Wenwu Ma
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 50+ messages in thread
From: Yuan Wang @ 2021-06-02  8:31 UTC (permalink / raw)
  To: dev
  Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, Yuan Wang, Wenwu Ma, Jiayu Hu

This patch implements an asynchronous dequeue data path for the split ring.
A new asynchronous dequeue function is introduced. With this function,
the application can try to receive packets from the guest while
offloading large copies to the DMA engine, thus saving precious CPU
cycles.

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
---
 doc/guides/prog_guide/vhost_lib.rst |  10 +
 examples/vhost/ioat.c               |  30 +-
 examples/vhost/ioat.h               |   3 +
 examples/vhost/main.c               |  60 +--
 lib/vhost/rte_vhost_async.h         |  44 ++-
 lib/vhost/version.map               |   3 +
 lib/vhost/virtio_net.c              | 549 ++++++++++++++++++++++++++++
 7 files changed, 664 insertions(+), 35 deletions(-)

diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
index 6b7206bc1d..785ab0fb34 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -281,6 +281,16 @@ The following is an overview of some key Vhost API functions:
   Poll enqueue completion status from async data path. Completed packets
   are returned to applications through ``pkts``.
 
+* ``rte_vhost_try_dequeue_burst(vid, queue_id, mbuf_pool, pkts, count, nr_inflight)``
+
+  Try to receive packets from the guest while offloading large packets
+  to the DMA engine. Successfully dequeued packets have completed their
+  transfer and are returned in ``pkts``. There may be other packets that
+  were sent by the guest but are still being transferred by the DMA
+  engine; these are called in-flight packets. This function returns
+  in-flight packets only after the DMA engine finishes transferring them.
+  The number of packets currently in flight is returned in ``nr_inflight``.
+
 Vhost-user Implementations
 --------------------------
 
diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c
index 2a2c2d7202..236306c9c7 100644
--- a/examples/vhost/ioat.c
+++ b/examples/vhost/ioat.c
@@ -17,7 +17,6 @@ struct packet_tracker {
 	unsigned short next_read;
 	unsigned short next_write;
 	unsigned short last_remain;
-	unsigned short ioat_space;
 };
 
 struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
@@ -61,18 +60,30 @@ open_ioat(const char *value)
 		goto out;
 	}
 	while (i < args_nr) {
+		char *txd, *rxd;
+		bool is_txd;
 		char *arg_temp = dma_arg[i];
 		uint8_t sub_nr;
+
 		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
 		if (sub_nr != 2) {
 			ret = -1;
 			goto out;
 		}
 
-		start = strstr(ptrs[0], "txd");
-		if (start == NULL) {
+		txd = strstr(ptrs[0], "txd");
+		rxd = strstr(ptrs[0], "rxd");
+		if (txd == NULL && rxd == NULL) {
 			ret = -1;
 			goto out;
+		} else if (txd) {
+			is_txd = true;
+			start = txd;
+			ret |= ASYNC_RX_VHOST;
+		} else {
+			is_txd = false;
+			start = rxd;
+			ret |= ASYNC_TX_VHOST;
 		}
 
 		start += 3;
@@ -82,7 +93,8 @@ open_ioat(const char *value)
 			goto out;
 		}
 
-		vring_id = 0 + VIRTIO_RXQ;
+		vring_id = is_txd ? VIRTIO_RXQ : VIRTIO_TXQ;
+
 		if (rte_pci_addr_parse(ptrs[1],
 				&(dma_info + vid)->dmas[vring_id].addr) < 0) {
 			ret = -1;
@@ -113,7 +125,6 @@ open_ioat(const char *value)
 			goto out;
 		}
 		rte_rawdev_start(dev_id);
-		cb_tracker[dev_id].ioat_space = IOAT_RING_SIZE - 1;
 		dma_info->nr++;
 		i++;
 	}
@@ -128,7 +139,7 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data, uint16_t count)
 {
 	uint32_t i_desc;
-	uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2 + VIRTIO_RXQ].dev_id;
+	uint16_t dev_id = dma_bind[vid].dmas[queue_id].dev_id;
 	struct rte_vhost_iov_iter *src = NULL;
 	struct rte_vhost_iov_iter *dst = NULL;
 	unsigned long i_seg;
@@ -140,7 +151,7 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
 			src = descs[i_desc].src;
 			dst = descs[i_desc].dst;
 			i_seg = 0;
-			if (cb_tracker[dev_id].ioat_space < src->nr_segs)
+			if (rte_ioat_burst_capacity(dev_id) < src->nr_segs)
 				break;
 			while (i_seg < src->nr_segs) {
 				rte_ioat_enqueue_copy(dev_id,
@@ -155,7 +166,6 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
 			}
 			write &= mask;
 			cb_tracker[dev_id].size_track[write] = src->nr_segs;
-			cb_tracker[dev_id].ioat_space -= src->nr_segs;
 			write++;
 		}
 	} else {
@@ -181,8 +191,7 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 		unsigned short mask = MAX_ENQUEUED_SIZE - 1;
 		unsigned short i;
 
-		uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2
-				+ VIRTIO_RXQ].dev_id;
+		uint16_t dev_id = dma_bind[vid].dmas[queue_id].dev_id;
 		n_seg = rte_ioat_completed_ops(dev_id, 255, NULL, NULL, dump, dump);
 		if (n_seg < 0) {
 			RTE_LOG(ERR,
@@ -194,7 +203,6 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 		if (n_seg == 0)
 			return 0;
 
-		cb_tracker[dev_id].ioat_space += n_seg;
 		n_seg += cb_tracker[dev_id].last_remain;
 
 		read = cb_tracker[dev_id].next_read;
diff --git a/examples/vhost/ioat.h b/examples/vhost/ioat.h
index 1aa28ed6a3..db7acefc02 100644
--- a/examples/vhost/ioat.h
+++ b/examples/vhost/ioat.h
@@ -13,6 +13,9 @@
 #define IOAT_RING_SIZE 4096
 #define MAX_ENQUEUED_SIZE 4096
 
+#define ASYNC_RX_VHOST	1
+#define ASYNC_TX_VHOST	2
+
 struct dma_info {
 	struct rte_pci_addr addr;
 	uint16_t dev_id;
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index d2179eadb9..a5662a1a91 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -93,7 +93,8 @@ static int client_mode;
 
 static int builtin_net_driver;
 
-static int async_vhost_driver;
+static int async_rx_vhost_driver;
+static int async_tx_vhost_driver;
 
 static char *dma_type;
 
@@ -671,13 +672,17 @@ us_vhost_parse_args(int argc, char **argv)
 			break;
 
 		case OPT_DMAS_NUM:
-			if (open_dma(optarg) == -1) {
+			ret = open_dma(optarg);
+			if (ret == -1) {
 				RTE_LOG(INFO, VHOST_CONFIG,
 					"Wrong DMA args\n");
 				us_vhost_usage(prgname);
 				return -1;
 			}
-			async_vhost_driver = 1;
+			if (ret & ASYNC_RX_VHOST)
+				async_rx_vhost_driver = 1;
+			if (ret & ASYNC_TX_VHOST)
+				async_tx_vhost_driver = 1;
 			break;
 
 		case OPT_CLIENT_NUM:
@@ -887,7 +892,7 @@ drain_vhost(struct vhost_dev *vdev)
 
 	if (builtin_net_driver) {
 		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
-	} else if (async_vhost_driver) {
+	} else if (async_rx_vhost_driver) {
 		uint32_t cpu_cpl_nr = 0;
 		uint16_t enqueue_fail = 0;
 		struct rte_mbuf *m_cpu_cpl[nr_xmit];
@@ -914,7 +919,7 @@ drain_vhost(struct vhost_dev *vdev)
 				__ATOMIC_SEQ_CST);
 	}
 
-	if (!async_vhost_driver)
+	if (!async_rx_vhost_driver)
 		free_pkts(m, nr_xmit);
 }
 
@@ -1217,7 +1222,7 @@ drain_eth_rx(struct vhost_dev *vdev)
 	if (builtin_net_driver) {
 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
 						pkts, rx_count);
-	} else if (async_vhost_driver) {
+	} else if (async_rx_vhost_driver) {
 		uint32_t cpu_cpl_nr = 0;
 		uint16_t enqueue_fail = 0;
 		struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
@@ -1245,7 +1250,7 @@ drain_eth_rx(struct vhost_dev *vdev)
 				__ATOMIC_SEQ_CST);
 	}
 
-	if (!async_vhost_driver)
+	if (!async_rx_vhost_driver)
 		free_pkts(pkts, rx_count);
 }
 
@@ -1259,6 +1264,12 @@ drain_virtio_tx(struct vhost_dev *vdev)
 	if (builtin_net_driver) {
 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
 					pkts, MAX_PKT_BURST);
+	} else if (async_tx_vhost_driver) {
+		int nr_inflight;
+
+		count = rte_vhost_try_dequeue_burst(vdev->vid, VIRTIO_TXQ,
+				mbuf_pool, pkts, MAX_PKT_BURST, &nr_inflight);
+
 	} else {
 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
 					mbuf_pool, pkts, MAX_PKT_BURST);
@@ -1397,8 +1408,10 @@ destroy_device(int vid)
 		"(%d) device has been removed from data core\n",
 		vdev->vid);
 
-	if (async_vhost_driver)
+	if (async_rx_vhost_driver)
 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
+	if (async_tx_vhost_driver)
+		rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
 
 	rte_free(vdev);
 }
@@ -1467,24 +1480,29 @@ new_device(int vid)
 		"(%d) device has been added to data core %d\n",
 		vid, vdev->coreid);
 
-	if (async_vhost_driver) {
-		struct rte_vhost_async_features f;
-		struct rte_vhost_async_channel_ops channel_ops;
+	int ret = 0;
+	struct rte_vhost_async_features f;
+	struct rte_vhost_async_channel_ops channel_ops;
 
-		if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
-			channel_ops.transfer_data = ioat_transfer_data_cb;
-			channel_ops.check_completed_copies =
-				ioat_check_completed_copies_cb;
+	if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
+		channel_ops.transfer_data = ioat_transfer_data_cb;
+		channel_ops.check_completed_copies =
+			ioat_check_completed_copies_cb;
 
-			f.async_inorder = 1;
-			f.async_threshold = 256;
+		f.async_inorder = 1;
+		f.async_threshold = 0;
 
-			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
-				f.intval, &channel_ops);
+		if (async_rx_vhost_driver) {
+			ret = rte_vhost_async_channel_register(
+				vid, VIRTIO_RXQ, f.intval, &channel_ops);
+		}
+		if (async_tx_vhost_driver && (ret == 0)) {
+			ret = rte_vhost_async_channel_register(
+				vid, VIRTIO_TXQ, f.intval, &channel_ops);
 		}
 	}
 
-	return 0;
+	return ret;
 }
 
 /*
@@ -1725,7 +1743,7 @@ main(int argc, char *argv[])
 	for (i = 0; i < nb_sockets; i++) {
 		char *file = socket_files + i * PATH_MAX;
 
-		if (async_vhost_driver)
+		if (async_rx_vhost_driver || async_tx_vhost_driver)
 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
 
 		ret = rte_vhost_driver_register(file, flags);
diff --git a/lib/vhost/rte_vhost_async.h b/lib/vhost/rte_vhost_async.h
index 6faa31f5ad..0daf3e5576 100644
--- a/lib/vhost/rte_vhost_async.h
+++ b/lib/vhost/rte_vhost_async.h
@@ -84,13 +84,21 @@ struct rte_vhost_async_channel_ops {
 };
 
 /**
- * inflight async packet information
+ * in-flight async packet information
  */
+struct async_nethdr {
+	struct virtio_net_hdr hdr;
+	bool valid;
+};
+
 struct async_inflight_info {
 	struct rte_mbuf *mbuf;
-	uint16_t descs; /* num of descs inflight */
+	union {
+		uint16_t descs; /* num of descs in-flight */
+		struct async_nethdr nethdr;
+	};
 	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
-};
+} __rte_cache_aligned;
 
 /**
  *  dma channel feature bit definition
@@ -193,4 +201,34 @@ __rte_experimental
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count);
 
+/**
+ * This function tries to receive packets from the guest while offloading
+ * large copies to the DMA engine. Successfully dequeued packets have
+ * completed their transfer, either by the CPU or the DMA engine, and are
+ * returned in "pkts". There may be other packets that were sent by the
+ * guest but are still being transferred by the DMA engine; these are
+ * called in-flight packets. The number of packets currently in flight is
+ * returned in "nr_inflight". This function returns in-flight packets
+ * only after the DMA engine finishes transferring them.
+ *
+ * @param vid
+ *  id of vhost device to dequeue data
+ * @param queue_id
+ *  queue id to dequeue data
+ * @param pkts
+ *  blank array to keep successfully dequeued packets
+ * @param count
+ *  size of the packet array
+ * @param nr_inflight
+ *  the number of packets still in flight. If an error occurs, its
+ *  value is set to -1.
+ * @return
+ *  num of successfully dequeued packets
+ */
+__rte_experimental
+uint16_t
+rte_vhost_try_dequeue_burst(int vid, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
+	int *nr_inflight);
+
 #endif /* _RTE_VHOST_ASYNC_H_ */
diff --git a/lib/vhost/version.map b/lib/vhost/version.map
index 9103a23cd4..2f82ab9713 100644
--- a/lib/vhost/version.map
+++ b/lib/vhost/version.map
@@ -79,4 +79,7 @@ EXPERIMENTAL {
 
 	# added in 21.05
 	rte_vhost_get_negotiated_protocol_features;
+
+	# added in 21.08
+	rte_vhost_try_dequeue_burst;
 };
diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 7ed86e4e43..b7994892ad 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -3155,3 +3155,552 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 
 	return count;
 }
+
+static __rte_always_inline int
+async_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		  struct buf_vector *buf_vec, uint16_t nr_vec,
+		  struct rte_mbuf *m, struct rte_mempool *mbuf_pool,
+		  struct iovec *src_iovec, struct iovec *dst_iovec,
+		  struct rte_vhost_iov_iter *src_it,
+		  struct rte_vhost_iov_iter *dst_it,
+		  struct async_nethdr *nethdr,
+		  bool legacy_ol_flags)
+{
+	uint64_t buf_addr;
+	uint32_t tlen = 0;
+	uint32_t buf_avail, buf_offset, buf_len;
+	uint32_t mbuf_avail, mbuf_offset;
+	uint32_t cpy_len, cpy_threshold;
+	/* A counter to avoid desc dead loop chain */
+	uint16_t vec_idx = 0;
+	int tvec_idx = 0;
+	struct rte_mbuf *cur = m, *prev = m;
+	struct virtio_net_hdr tmp_hdr;
+	struct virtio_net_hdr *hdr = NULL;
+	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
+
+	buf_addr = buf_vec[vec_idx].buf_addr;
+	buf_len = buf_vec[vec_idx].buf_len;
+
+	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1))
+		return -1;
+
+	cpy_threshold = vq->async_threshold;
+
+	if (virtio_net_with_host_offload(dev)) {
+		if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
+			/*
+			 * No luck, the virtio-net header doesn't fit
+			 * in a contiguous virtual area.
+			 */
+			copy_vnet_hdr_from_desc(&tmp_hdr, buf_vec);
+			hdr = &tmp_hdr;
+		} else {
+			hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
+		}
+	}
+
+	/*
+	 * A virtio driver normally uses at least 2 desc buffers
+	 * for Tx: the first for storing the header, and others
+	 * for storing the data.
+	 */
+	if (unlikely(buf_len < dev->vhost_hlen)) {
+		buf_offset = dev->vhost_hlen - buf_len;
+		vec_idx++;
+		buf_addr = buf_vec[vec_idx].buf_addr;
+		buf_len = buf_vec[vec_idx].buf_len;
+		buf_avail  = buf_len - buf_offset;
+	} else if (buf_len == dev->vhost_hlen) {
+		if (unlikely(++vec_idx >= nr_vec))
+			return -1;
+		buf_addr = buf_vec[vec_idx].buf_addr;
+		buf_len = buf_vec[vec_idx].buf_len;
+
+		buf_offset = 0;
+		buf_avail = buf_len;
+	} else {
+		buf_offset = dev->vhost_hlen;
+		buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
+	}
+
+	PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
+			(uint32_t)buf_avail, 0);
+
+	mbuf_offset = 0;
+	mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
+	while (1) {
+		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
+
+		if (cpy_len >= cpy_threshold) {
+			async_fill_vec(src_iovec + tvec_idx,
+				(void *)((uintptr_t)(buf_addr + buf_offset)),
+				(size_t)cpy_len);
+			async_fill_vec(dst_iovec + tvec_idx,
+				rte_pktmbuf_mtod_offset(cur,
+					void *, mbuf_offset),
+				(size_t)cpy_len);
+			tvec_idx++;
+			tlen += cpy_len;
+		} else if (likely(cpy_len > MAX_BATCH_LEN ||
+				vq->batch_copy_nb_elems >= vq->size ||
+				(hdr && cur == m))) {
+			rte_memcpy(rte_pktmbuf_mtod_offset(cur,
+					void *, mbuf_offset),
+				(void *)((uintptr_t)(buf_addr + buf_offset)),
+				cpy_len);
+		} else {
+			batch_copy[vq->batch_copy_nb_elems].dst =
+				rte_pktmbuf_mtod_offset(cur,
+					void *, mbuf_offset);
+			batch_copy[vq->batch_copy_nb_elems].src =
+				(void *)((uintptr_t)(buf_addr + buf_offset));
+			batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
+			vq->batch_copy_nb_elems++;
+		}
+
+		mbuf_avail  -= cpy_len;
+		mbuf_offset += cpy_len;
+		buf_avail  -= cpy_len;
+		buf_offset += cpy_len;
+
+		/* This buf reaches to its end, get the next one */
+		if (buf_avail == 0) {
+			if (++vec_idx >= nr_vec)
+				break;
+
+			buf_addr = buf_vec[vec_idx].buf_addr;
+			buf_len = buf_vec[vec_idx].buf_len;
+
+			buf_offset = 0;
+			buf_avail = buf_len;
+
+			PRINT_PACKET(dev, (uintptr_t)buf_addr,
+					(uint32_t)buf_avail, 0);
+		}
+
+		/*
+		 * This mbuf reaches to its end, get a new one
+		 * to hold more data.
+		 */
+		if (mbuf_avail == 0) {
+			cur = rte_pktmbuf_alloc(mbuf_pool);
+			if (unlikely(cur == NULL)) {
+				VHOST_LOG_DATA(ERR, "Failed to "
+					"allocate memory for mbuf.\n");
+				return -1;
+			}
+
+			prev->next = cur;
+			prev->data_len = mbuf_offset;
+			m->nb_segs += 1;
+			m->pkt_len += mbuf_offset;
+			prev = cur;
+
+			mbuf_offset = 0;
+			mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
+		}
+	}
+
+	prev->data_len = mbuf_offset;
+	m->pkt_len += mbuf_offset;
+
+	if (hdr && tlen) {
+		nethdr->valid = true;
+		nethdr->hdr = *hdr;
+	} else if (hdr)
+		vhost_dequeue_offload(hdr, m, legacy_ol_flags);
+
+	if (tlen) {
+		async_fill_iter(src_it, tlen, src_iovec, tvec_idx);
+		async_fill_iter(dst_it, tlen, dst_iovec, tvec_idx);
+	} else
+		src_it->count = 0;
+
+	return 0;
+}
+
+static __rte_always_inline uint16_t
+async_poll_dequeue_completed_split(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint16_t count, bool legacy_ol_flags)
+{
+	uint16_t n_pkts_cpl = 0, n_pkts_put = 0;
+	uint16_t start_idx, pkt_idx, from;
+	struct async_inflight_info *pkts_info;
+
+	pkt_idx = vq->async_pkts_idx & (vq->size - 1);
+	pkts_info = vq->async_pkts_info;
+	start_idx = virtio_dev_rx_async_get_info_idx(pkt_idx, vq->size,
+			vq->async_pkts_inflight_n);
+
+	if (count > vq->async_last_pkts_n) {
+		n_pkts_cpl = vq->async_ops.check_completed_copies(dev->vid,
+			queue_id, 0, count - vq->async_last_pkts_n);
+	}
+
+	n_pkts_cpl += vq->async_last_pkts_n;
+	if (unlikely(n_pkts_cpl == 0))
+		return 0;
+
+	n_pkts_put = RTE_MIN(count, n_pkts_cpl);
+
+	for (pkt_idx = 0; pkt_idx < n_pkts_put; pkt_idx++) {
+		from = (start_idx + pkt_idx) & (vq->size - 1);
+		pkts[pkt_idx] = pkts_info[from].mbuf;
+
+		if (pkts_info[from].nethdr.valid) {
+			vhost_dequeue_offload(&pkts_info[from].nethdr.hdr,
+					pkts[pkt_idx], legacy_ol_flags);
+		}
+
+		from = vq->last_async_desc_idx_split & (vq->size - 1);
+		update_shadow_used_ring_split(vq,
+				vq->async_descs_split[from].id, 0);
+		vq->last_async_desc_idx_split++;
+	}
+	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
+
+	if (n_pkts_put)
+		vq->async_pkts_inflight_n -= n_pkts_put;
+
+	return n_pkts_put;
+}
+
+static __rte_always_inline uint16_t
+virtio_dev_tx_async_split(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t count, bool legacy_ol_flags)
+{
+	static bool allocerr_warned;
+	uint16_t pkt_idx;
+	uint16_t free_entries;
+	uint16_t slot_idx = 0;
+	uint16_t segs_await = 0;
+	uint16_t nr_done_pkts = 0, nr_async_pkts = 0;
+	uint16_t nr_async_burst = 0;
+	uint16_t pkt_err = 0;
+	uint16_t iovec_idx = 0, it_idx = 0;
+
+	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
+	struct iovec *vec_pool = vq->vec_pool;
+	struct iovec *src_iovec = vec_pool;
+	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
+
+	struct async_pkt_index {
+		uint16_t last_avail_idx;
+	} async_pkts_log[MAX_PKT_BURST];
+
+	nr_done_pkts = async_poll_dequeue_completed_split(dev, vq, queue_id,
+						pkts, count, legacy_ol_flags);
+	if (unlikely(nr_done_pkts == count))
+		goto out;
+
+	/**
+	 * The ordering between avail index and
+	 * desc reads needs to be enforced.
+	 */
+	free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
+			vq->last_avail_idx;
+	if (free_entries == 0)
+		goto out;
+
+	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
+
+	count = RTE_MIN(count - nr_done_pkts, MAX_PKT_BURST);
+	count = RTE_MIN(count, free_entries);
+	VHOST_LOG_DATA(DEBUG, "(%d) about to dequeue %u buffers\n",
+			dev->vid, count);
+
+	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+		uint16_t head_idx = 0;
+		uint16_t nr_vec = 0;
+		uint32_t buf_len;
+		int err;
+		struct buf_vector buf_vec[BUF_VECTOR_MAX];
+		struct rte_mbuf *pkt;
+
+		if (unlikely(fill_vec_buf_split(dev, vq, vq->last_avail_idx,
+						&nr_vec, buf_vec,
+						&head_idx, &buf_len,
+						VHOST_ACCESS_RO) < 0))
+			break;
+
+		pkt = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
+		if (unlikely(pkt == NULL)) {
+			/**
+			 * mbuf allocation fails for jumbo packets when external
+			 * buffer allocation is not allowed and linear buffer
+			 * is required. Drop this packet.
+			 */
+			if (!allocerr_warned) {
+				VHOST_LOG_DATA(ERR,
+					"Failed mbuf alloc of size %d from %s on %s.\n",
+					buf_len, mbuf_pool->name, dev->ifname);
+				allocerr_warned = true;
+			}
+			break;
+		}
+
+		slot_idx = (vq->async_pkts_idx + nr_async_pkts) &
+				(vq->size - 1);
+		err = async_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkt,
+				mbuf_pool, &src_iovec[iovec_idx],
+				&dst_iovec[iovec_idx], &it_pool[it_idx],
+				&it_pool[it_idx + 1],
+				&pkts_info[slot_idx].nethdr, legacy_ol_flags);
+		if (unlikely(err)) {
+			rte_pktmbuf_free(pkt);
+			if (!allocerr_warned) {
+				VHOST_LOG_DATA(ERR,
+					"Failed to copy desc to mbuf on %s.\n",
+					dev->ifname);
+				allocerr_warned = true;
+			}
+			break;
+		}
+
+		if (it_pool[it_idx].count) {
+			uint16_t to = vq->async_desc_idx_split & (vq->size - 1);
+
+			async_fill_desc(&tdes[nr_async_burst], &it_pool[it_idx],
+				&it_pool[it_idx + 1]);
+			pkts_info[slot_idx].mbuf = pkt;
+			async_pkts_log[nr_async_pkts++].last_avail_idx =
+				vq->last_avail_idx;
+			nr_async_burst++;
+			iovec_idx += it_pool[it_idx].nr_segs;
+			it_idx += 2;
+			segs_await += it_pool[it_idx].nr_segs;
+
+			/* keep used desc */
+			vq->async_descs_split[to].id = head_idx;
+			vq->async_descs_split[to].len = 0;
+			vq->async_desc_idx_split++;
+		} else {
+			update_shadow_used_ring_split(vq, head_idx, 0);
+			pkts[nr_done_pkts++] = pkt;
+		}
+
+		vq->last_avail_idx++;
+
+		if (unlikely((nr_async_burst >= VHOST_ASYNC_BATCH_THRESHOLD) ||
+					((VHOST_MAX_ASYNC_VEC >> 1) -
+					 segs_await < BUF_VECTOR_MAX))) {
+			uint16_t nr_pkts;
+
+			nr_pkts = vq->async_ops.transfer_data(dev->vid,
+					queue_id, tdes, 0, nr_async_burst);
+			src_iovec = vec_pool;
+			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+			it_idx = 0;
+			segs_await = 0;
+			vq->async_pkts_inflight_n += nr_pkts;
+
+			if (unlikely(nr_pkts < nr_async_burst)) {
+				pkt_err = nr_async_burst - nr_pkts;
+				nr_async_burst = 0;
+				break;
+			}
+			nr_async_burst = 0;
+		}
+	}
+
+	if (nr_async_burst) {
+		uint32_t nr_pkts;
+
+		nr_pkts = vq->async_ops.transfer_data(dev->vid, queue_id,
+				tdes, 0, nr_async_burst);
+		vq->async_pkts_inflight_n += nr_pkts;
+
+		if (unlikely(nr_pkts < nr_async_burst))
+			pkt_err = nr_async_burst - nr_pkts;
+	}
+
+	do_data_copy_dequeue(vq);
+
+	if (unlikely(pkt_err)) {
+		uint16_t nr_err_dma = pkt_err;
+		uint16_t nr_err_sw;
+
+		nr_async_pkts -= nr_err_dma;
+
+		/**
+		 * revert shadow used ring and free pktmbufs for
+		 * CPU-copied pkts after the first DMA-error pkt.
+		 */
+		nr_err_sw = vq->last_avail_idx -
+			async_pkts_log[nr_async_pkts].last_avail_idx -
+			nr_err_dma;
+		vq->shadow_used_idx -= nr_err_sw;
+		while (nr_err_sw-- > 0)
+			rte_pktmbuf_free(pkts[--nr_done_pkts]);
+
+		/**
+		 * recover DMA-copy related structures and free pktmbufs
+		 * for DMA-error pkts.
+		 */
+		vq->async_desc_idx_split -= nr_err_dma;
+		while (nr_err_dma-- > 0) {
+			rte_pktmbuf_free(
+				pkts_info[slot_idx & (vq->size - 1)].mbuf);
+			slot_idx--;
+		}
+
+		/* recover available ring */
+		vq->last_avail_idx =
+			async_pkts_log[nr_async_pkts].last_avail_idx;
+	}
+
+	vq->async_pkts_idx += nr_async_pkts;
+
+out:
+	if (likely(vq->shadow_used_idx)) {
+		flush_shadow_used_ring_split(dev, vq);
+		vhost_vring_call_split(dev, vq);
+	}
+
+	return nr_done_pkts;
+}
+
+__rte_noinline
+static uint16_t
+virtio_dev_tx_async_split_legacy(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t count)
+{
+	return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool,
+				pkts, count, true);
+}
+
+__rte_noinline
+static uint16_t
+virtio_dev_tx_async_split_compliant(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t count)
+{
+	return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool,
+				pkts, count, false);
+}
+
+uint16_t
+rte_vhost_try_dequeue_burst(int vid, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
+	int *nr_inflight)
+{
+	struct virtio_net *dev;
+	struct rte_mbuf *rarp_mbuf = NULL;
+	struct vhost_virtqueue *vq;
+	int16_t success = 1;
+
+	*nr_inflight = -1;
+
+	dev = get_device(vid);
+	if (!dev)
+		return 0;
+
+	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
+		VHOST_LOG_DATA(ERR,
+			"(%d) %s: built-in vhost net backend is disabled.\n",
+			dev->vid, __func__);
+		return 0;
+	}
+
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
+		VHOST_LOG_DATA(ERR,
+			"(%d) %s: invalid virtqueue idx %d.\n",
+			dev->vid, __func__, queue_id);
+		return 0;
+	}
+
+	vq = dev->virtqueue[queue_id];
+
+	if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
+		return 0;
+
+	if (unlikely(vq->enabled == 0)) {
+		count = 0;
+		goto out_access_unlock;
+	}
+
+	if (unlikely(!vq->async_registered)) {
+		VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
+			dev->vid, __func__, queue_id);
+		count = 0;
+		goto out_access_unlock;
+	}
+
+	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+		vhost_user_iotlb_rd_lock(vq);
+
+	if (unlikely(vq->access_ok == 0))
+		if (unlikely(vring_translate(dev, vq) < 0)) {
+			count = 0;
+			goto out_access_unlock;
+		}
+
+	/*
+	 * Construct a RARP broadcast packet, and inject it to the "pkts"
+	 * array, to make it look like the guest actually sent such a packet.
+	 *
+	 * Check user_send_rarp() for more information.
+	 *
+	 * broadcast_rarp shares a cacheline in the virtio_net structure
+	 * with some fields that are accessed during enqueue and
+	 * __atomic_compare_exchange_n causes a write if performed compare
+	 * and exchange. This could result in false sharing between enqueue
+	 * and dequeue.
+	 *
+	 * Prevent unnecessary false sharing by reading broadcast_rarp first
+	 * and only performing compare and exchange if the read indicates it
+	 * is likely to be set.
+	 */
+	if (unlikely(__atomic_load_n(&dev->broadcast_rarp, __ATOMIC_ACQUIRE) &&
+			__atomic_compare_exchange_n(&dev->broadcast_rarp,
+			&success, 0, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED))) {
+
+		rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
+		if (rarp_mbuf == NULL) {
+			VHOST_LOG_DATA(ERR, "Failed to make RARP packet.\n");
+			count = 0;
+			goto out;
+		}
+		count -= 1;
+	}
+
+	if (unlikely(vq_is_packed(dev)))
+		return 0;
+
+	if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
+		count = virtio_dev_tx_async_split_legacy(dev, vq, queue_id,
+				mbuf_pool, pkts, count);
+	else
+		count = virtio_dev_tx_async_split_compliant(dev, vq, queue_id,
+				mbuf_pool, pkts, count);
+
+out:
+	*nr_inflight = vq->async_pkts_inflight_n;
+
+	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+		vhost_user_iotlb_rd_unlock(vq);
+
+out_access_unlock:
+	rte_spinlock_unlock(&vq->access_lock);
+
+	if (unlikely(rarp_mbuf != NULL)) {
+		/*
+		 * Inject it to the head of "pkts" array, so that switch's mac
+		 * learning table will get updated first.
+		 */
+		memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *));
+		pkts[0] = rarp_mbuf;
+		count += 1;
+	}
+
+	return count;
+}
-- 
2.25.1



* Re: [dpdk-dev] [PATCH 1/1] lib/vhost: support async dequeue for split ring
  2021-06-02  8:31 ` [dpdk-dev] [PATCH 1/1] " Yuan Wang
@ 2021-06-07 16:17   ` Maxime Coquelin
  2021-06-09  1:21     ` Hu, Jiayu
  0 siblings, 1 reply; 50+ messages in thread
From: Maxime Coquelin @ 2021-06-07 16:17 UTC (permalink / raw)
  To: Yuan Wang, dev
  Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, Wenwu Ma, Jiayu Hu

Hi Yuan,

This is a first review, I will certainly have more comments later.

On 6/2/21 10:31 AM, Yuan Wang wrote:
> This patch implements an asynchronous dequeue data path for the split ring.
> A new asynchronous dequeue function is introduced. With this function,
> the application can try to receive packets from the guest while
> offloading large copies to the DMA engine, thus saving precious CPU
> cycles.

Do you have any number to share?

> Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
> Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> ---
>  doc/guides/prog_guide/vhost_lib.rst |  10 +
>  examples/vhost/ioat.c               |  30 +-
>  examples/vhost/ioat.h               |   3 +
>  examples/vhost/main.c               |  60 +--
>  lib/vhost/rte_vhost_async.h         |  44 ++-
>  lib/vhost/version.map               |   3 +
>  lib/vhost/virtio_net.c              | 549 ++++++++++++++++++++++++++++
>  7 files changed, 664 insertions(+), 35 deletions(-)

Please split the patch into multiple parts.
At least don't mix example and lib changes in the same patch.

> diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
> index 6b7206bc1d..785ab0fb34 100644
> --- a/doc/guides/prog_guide/vhost_lib.rst
> +++ b/doc/guides/prog_guide/vhost_lib.rst
> @@ -281,6 +281,16 @@ The following is an overview of some key Vhost API functions:
>    Poll enqueue completion status from async data path. Completed packets
>    are returned to applications through ``pkts``.
>  
> +* ``rte_vhost_try_dequeue_burst(vid, queue_id, mbuf_pool, pkts, count, nr_inflight)``

The function should contain async in its name.

BTW, I think we should also rename below APIs while they are
experimental to highlight it is async related:

rte_vhost_submit_enqueue_burst
rte_vhost_poll_enqueue_completed

> +
> +  Try to receive packets from the guest while offloading large packets
> +  to the DMA engine. Successfully dequeued packets have completed their
> +  transfer and are returned in ``pkts``. There may be other packets that
> +  were sent by the guest but are still being transferred by the DMA
> +  engine; these are called in-flight packets. This function returns
> +  in-flight packets only after the DMA engine finishes transferring them.
> +  The number of packets currently in flight is returned in ``nr_inflight``.
> +
>  Vhost-user Implementations
>  --------------------------
>  
> diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c
> index 2a2c2d7202..236306c9c7 100644
> --- a/examples/vhost/ioat.c
> +++ b/examples/vhost/ioat.c
> @@ -17,7 +17,6 @@ struct packet_tracker {
>  	unsigned short next_read;
>  	unsigned short next_write;
>  	unsigned short last_remain;
> -	unsigned short ioat_space;
>  };
>  
>  struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
> @@ -61,18 +60,30 @@ open_ioat(const char *value)
>  		goto out;
>  	}
>  	while (i < args_nr) {
> +		char *txd, *rxd;
> +		bool is_txd;
>  		char *arg_temp = dma_arg[i];
>  		uint8_t sub_nr;
> +
>  		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
>  		if (sub_nr != 2) {
>  			ret = -1;
>  			goto out;
>  		}
>  
> -		start = strstr(ptrs[0], "txd");
> -		if (start == NULL) {
> +		txd = strstr(ptrs[0], "txd");
> +		rxd = strstr(ptrs[0], "rxd");
> +		if (txd == NULL && rxd == NULL) {
>  			ret = -1;
>  			goto out;
> +		} else if (txd) {
> +			is_txd = true;
> +			start = txd;
> +			ret |= ASYNC_RX_VHOST;
> +		} else {
> +			is_txd = false;
> +			start = rxd;
> +			ret |= ASYNC_TX_VHOST;
>  		}
>  
>  		start += 3;
> @@ -82,7 +93,8 @@ open_ioat(const char *value)
>  			goto out;
>  		}
>  
> -		vring_id = 0 + VIRTIO_RXQ;
> +		vring_id = is_txd ? VIRTIO_RXQ : VIRTIO_TXQ;
> +
>  		if (rte_pci_addr_parse(ptrs[1],
>  				&(dma_info + vid)->dmas[vring_id].addr) < 0) {
>  			ret = -1;
> @@ -113,7 +125,6 @@ open_ioat(const char *value)
>  			goto out;
>  		}
>  		rte_rawdev_start(dev_id);
> -		cb_tracker[dev_id].ioat_space = IOAT_RING_SIZE - 1;
>  		dma_info->nr++;
>  		i++;
>  	}
> @@ -128,7 +139,7 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
>  		struct rte_vhost_async_status *opaque_data, uint16_t count)
>  {
>  	uint32_t i_desc;
> -	uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2 + VIRTIO_RXQ].dev_id;
> +	uint16_t dev_id = dma_bind[vid].dmas[queue_id].dev_id;

It looks broken with regard to multiqueue (it already was before this patch).

In open_ioat(), only dma_bind[vid].dmas[VIRTIO_RXQ] and
dma_bind[vid].dmas[VIRTIO_TXQ] are set.

As it seems that the application does not support multiqueue, it may be
a good idea to check queue_id value before using it.
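
For instance, a guard at the top of the callbacks could look like this
(just a sketch; using VIRTIO_QNUM as the bound and returning 0 are
assumptions, since the sample app only binds DMA channels to VIRTIO_RXQ
and VIRTIO_TXQ):

    /* Reject queue ids the sample app never bound a DMA channel to. */
    if (queue_id >= VIRTIO_QNUM)
        return 0;   /* report nothing transferred */

    uint16_t dev_id = dma_bind[vid].dmas[queue_id].dev_id;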

>  	struct rte_vhost_iov_iter *src = NULL;
>  	struct rte_vhost_iov_iter *dst = NULL;
>  	unsigned long i_seg;
> @@ -140,7 +151,7 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
>  			src = descs[i_desc].src;
>  			dst = descs[i_desc].dst;
>  			i_seg = 0;
> -			if (cb_tracker[dev_id].ioat_space < src->nr_segs)
> +			if (rte_ioat_burst_capacity(dev_id) < src->nr_segs)

This change should be in a dedicated patch, it is not related to dequeue
support.

>  				break;
>  			while (i_seg < src->nr_segs) {
>  				rte_ioat_enqueue_copy(dev_id,
> @@ -155,7 +166,6 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
>  			}
>  			write &= mask;
>  			cb_tracker[dev_id].size_track[write] = src->nr_segs;
> -			cb_tracker[dev_id].ioat_space -= src->nr_segs;
>  			write++;
>  		}
>  	} else {
> @@ -181,8 +191,7 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
>  		unsigned short mask = MAX_ENQUEUED_SIZE - 1;
>  		unsigned short i;
>  
> -		uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2
> -				+ VIRTIO_RXQ].dev_id;
> +		uint16_t dev_id = dma_bind[vid].dmas[queue_id].dev_id;
>  		n_seg = rte_ioat_completed_ops(dev_id, 255, NULL, NULL, dump, dump);
>  		if (n_seg < 0) {
>  			RTE_LOG(ERR,
> @@ -194,7 +203,6 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
>  		if (n_seg == 0)
>  			return 0;
>  
> -		cb_tracker[dev_id].ioat_space += n_seg;
>  		n_seg += cb_tracker[dev_id].last_remain;
>  
>  		read = cb_tracker[dev_id].next_read;
> diff --git a/examples/vhost/ioat.h b/examples/vhost/ioat.h
> index 1aa28ed6a3..db7acefc02 100644
> --- a/examples/vhost/ioat.h
> +++ b/examples/vhost/ioat.h
> @@ -13,6 +13,9 @@
>  #define IOAT_RING_SIZE 4096
>  #define MAX_ENQUEUED_SIZE 4096
>  
> +#define ASYNC_RX_VHOST	1
> +#define ASYNC_TX_VHOST	2
> +
>  struct dma_info {
>  	struct rte_pci_addr addr;
>  	uint16_t dev_id;
> diff --git a/examples/vhost/main.c b/examples/vhost/main.c
> index d2179eadb9..a5662a1a91 100644
> --- a/examples/vhost/main.c
> +++ b/examples/vhost/main.c
> @@ -93,7 +93,8 @@ static int client_mode;
>  
>  static int builtin_net_driver;
>  
> -static int async_vhost_driver;
> +static int async_rx_vhost_driver;
> +static int async_tx_vhost_driver;
>  
>  static char *dma_type;
>  
> @@ -671,13 +672,17 @@ us_vhost_parse_args(int argc, char **argv)
>  			break;
>  
>  		case OPT_DMAS_NUM:
> -			if (open_dma(optarg) == -1) {
> +			ret = open_dma(optarg);
> +			if (ret == -1) {
>  				RTE_LOG(INFO, VHOST_CONFIG,
>  					"Wrong DMA args\n");
>  				us_vhost_usage(prgname);
>  				return -1;
>  			}
> -			async_vhost_driver = 1;
> +			if (ret & ASYNC_RX_VHOST)
> +				async_rx_vhost_driver = 1;
> +			if (ret & ASYNC_TX_VHOST)
> +				async_tx_vhost_driver = 1;
>  			break;
>  
>  		case OPT_CLIENT_NUM:
> @@ -887,7 +892,7 @@ drain_vhost(struct vhost_dev *vdev)
>  
>  	if (builtin_net_driver) {
>  		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
> -	} else if (async_vhost_driver) {
> +	} else if (async_rx_vhost_driver) {

I think we should consider having ops for async and sync instead of all
these if/else. It could be refactored as a preliminary patch for this
series.
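
Something like the following, for example (a sketch only; see the
vhost_queue_ops structure the v2 series later in this thread adds to
examples/vhost for the shape that was eventually used):

    struct vhost_queue_ops {
        uint16_t (*enqueue_pkt_burst)(struct vhost_dev *dev, uint16_t queue_id,
                                      struct rte_mbuf **pkts, uint32_t count);
        uint16_t (*dequeue_pkt_burst)(struct vhost_dev *dev, uint16_t queue_id,
                                      struct rte_mempool *mbuf_pool,
                                      struct rte_mbuf **pkts, uint16_t count);
    };

    /* Selected once per device in new_device() and then called
     * unconditionally from the data path, e.g.:
     *   vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, VIRTIO_RXQ, m, nr_xmit);
     */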

>  		uint32_t cpu_cpl_nr = 0;
>  		uint16_t enqueue_fail = 0;
>  		struct rte_mbuf *m_cpu_cpl[nr_xmit];
> @@ -914,7 +919,7 @@ drain_vhost(struct vhost_dev *vdev)
>  				__ATOMIC_SEQ_CST);
>  	}
>  
> -	if (!async_vhost_driver)
> +	if (!async_rx_vhost_driver)
>  		free_pkts(m, nr_xmit);
>  }
>  



* Re: [dpdk-dev] [PATCH 1/1] lib/vhost: support async dequeue for split ring
  2021-06-07 16:17   ` Maxime Coquelin
@ 2021-06-09  1:21     ` Hu, Jiayu
  0 siblings, 0 replies; 50+ messages in thread
From: Hu, Jiayu @ 2021-06-09  1:21 UTC (permalink / raw)
  To: Maxime Coquelin, Wang, YuanX, dev
  Cc: maxime.coquelin, Xia, Chenbo, Jiang, Cheng1, Ma, WenwuX

Hi Maxime,

> -----Original Message-----
> From: Maxime Coquelin <mcoqueli@redhat.com>
> Sent: Tuesday, June 8, 2021 12:17 AM
> To: Wang, YuanX <yuanx.wang@intel.com>; dev@dpdk.org
> Cc: maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>;
> Jiang, Cheng1 <cheng1.jiang@intel.com>; Ma, WenwuX
> <wenwux.ma@intel.com>; Hu, Jiayu <jiayu.hu@intel.com>
> Subject: Re: [PATCH 1/1] lib/vhost: support async dequeue for split ring
> 
> Hi Yuan,
> 
> This is a first review, I will certainly have more comments later.
> 
> On 6/2/21 10:31 AM, Yuan Wang wrote:
> > This patch implements an asynchronous dequeue data path for the split ring.
> > A new asynchronous dequeue function is introduced. With this function,
> > the application can try to receive packets from the guest while
> > offloading large copies to the DMA engine, thus saving precious CPU
> > cycles.
> 
> Do you have any number to share?

We cannot share the exact numbers without legal approval.
Here are some relative figures:
in PV cases, testpmd macfwd throughput of one core with async
dequeue enabled is around 1.25x that of SW vhost; when enabling
both async enqueue and dequeue, one-core macfwd throughput
is up to 2x that of SW vhost.

> 
> > Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
> > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > ---
> >  doc/guides/prog_guide/vhost_lib.rst |  10 +
> >  examples/vhost/ioat.c               |  30 +-
> >  examples/vhost/ioat.h               |   3 +
> >  examples/vhost/main.c               |  60 +--
> >  lib/vhost/rte_vhost_async.h         |  44 ++-
> >  lib/vhost/version.map               |   3 +
> >  lib/vhost/virtio_net.c              | 549 ++++++++++++++++++++++++++++
> >  7 files changed, 664 insertions(+), 35 deletions(-)
> 
> Please split the patch into multiple parts.
> At least don't mix example and lib changes in the same patch.
> 
> > diff --git a/doc/guides/prog_guide/vhost_lib.rst
> b/doc/guides/prog_guide/vhost_lib.rst
> > index 6b7206bc1d..785ab0fb34 100644
> > --- a/doc/guides/prog_guide/vhost_lib.rst
> > +++ b/doc/guides/prog_guide/vhost_lib.rst
> > @@ -281,6 +281,16 @@ The following is an overview of some key Vhost
> API functions:
> >    Poll enqueue completion status from async data path. Completed packets
> >    are returned to applications through ``pkts``.
> >
> > +* ``rte_vhost_try_dequeue_burst(vid, queue_id, mbuf_pool, pkts, count,
> nr_inflight)``
> 
> The function should contain async in its name.
> 
> BTW, I think we should also rename below APIs while they are
> experimental to highlight it is async related:
> 
> rte_vhost_submit_enqueue_burst
> rte_vhost_poll_enqueue_completed

Yes, it's better to add "async" to the related function names. How about:
rte_vhost_async_submit_enqueue_burst
rte_vhost_async_poll_enqueue_completed
rte_vhost_async_try_dequeue_burst

Any suggestions?

Thanks,
Jiayu


* Re: [dpdk-dev] [PATCH v2 0/4] vhost: support async dequeue for split ring
  2021-06-18 20:03 ` [dpdk-dev] [PATCH v2 0/4] vhost: " Wenwu Ma
@ 2021-06-18 14:10   ` Maxime Coquelin
  2021-06-18 20:03   ` [dpdk-dev] [PATCH v2 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 50+ messages in thread
From: Maxime Coquelin @ 2021-06-18 14:10 UTC (permalink / raw)
  To: Wenwu Ma, dev; +Cc: chenbo.xia, cheng1.jiang

Hi,

It seems the series does not build:
http://mails.dpdk.org/archives/test-report/2021-June/199414.html

Could you please look at it?

Thanks,
Maxime

On 6/18/21 10:03 PM, Wenwu Ma wrote:
> This patch implements an asynchronous dequeue data path for the split ring.
> A new asynchronous dequeue function is introduced. With this function,
> the application can try to receive packets from the guest while
> offloading large copies to the DMA engine, thus saving precious CPU
> cycles.
> 
> v2:
> - Refactor vhost datapath as preliminary patch for this series.
> - The change of using new API in examples/vhost is put into a
>   dedicated patch.
> - Check queue_id value before using it.
> - Async dequeue performance enhancement. 160% performance improvement
>   for v2 vs. v1.
> - Async dequeue API name change from rte_vhost_try_dequeue_burst to
>   rte_vhost_async_try_dequeue_burst.
> - Completed packets update the used ring directly.
> 
> Wenwu Ma (3):
>   examples/vhost: refactor vhost enqueue and dequeue datapaths.
>   examples/vhost: use a new API to query remaining ring space
>   examples/vhost: support vhost async dequeue data path
> 
> Yuan Wang (1):
>   vhost: support async dequeue for split ring
> 
>  doc/guides/prog_guide/vhost_lib.rst |  10 +
>  doc/guides/sample_app_ug/vhost.rst  |   9 +-
>  examples/vhost/ioat.c               |  40 +-
>  examples/vhost/ioat.h               |   4 +
>  examples/vhost/main.c               | 225 +++++++----
>  examples/vhost/main.h               |  38 +-
>  examples/vhost/virtio_net.c         |  16 +-
>  lib/vhost/rte_vhost_async.h         |  44 ++-
>  lib/vhost/version.map               |   3 +
>  lib/vhost/virtio_net.c              | 553 ++++++++++++++++++++++++++++
>  10 files changed, 839 insertions(+), 103 deletions(-)
> 



* [dpdk-dev] [PATCH v2 0/4] vhost: support async dequeue for split ring
  2021-06-02  8:31 [dpdk-dev] [PATCH 0/1] lib/vhost: support async dequeue for split ring Yuan Wang
  2021-06-02  8:31 ` [dpdk-dev] [PATCH 1/1] " Yuan Wang
@ 2021-06-18 20:03 ` Wenwu Ma
  2021-06-18 14:10   ` Maxime Coquelin
                     ` (4 more replies)
  2021-06-23 15:00 ` [dpdk-dev] [PATCH v3 0/4] vhost: support async dequeue for split ring Wenwu Ma
                   ` (4 subsequent siblings)
  6 siblings, 5 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-06-18 20:03 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, Wenwu Ma

This patch implements an asynchronous dequeue data path for the split ring.
A new asynchronous dequeue function is introduced. With this function,
the application can try to receive packets from the guest while
offloading large copies to the DMA engine, thus saving precious CPU
cycles.

v2:
- Refactor vhost datapath as preliminary patch for this series.
- The change of using new API in examples/vhost is put into a
  dedicated patch.
- Check queue_id value before using it.
- Async dequeue performance enhancement. 160% performance improvement
  for v2 vs. v1.
- Async dequeue API name change from rte_vhost_try_dequeue_burst to
  rte_vhost_async_try_dequeue_burst.
- Completed packets update the used ring directly.

Wenwu Ma (3):
  examples/vhost: refactor vhost enqueue and dequeue datapaths.
  examples/vhost: use a new API to query remaining ring space
  examples/vhost: support vhost async dequeue data path

Yuan Wang (1):
  vhost: support async dequeue for split ring

 doc/guides/prog_guide/vhost_lib.rst |  10 +
 doc/guides/sample_app_ug/vhost.rst  |   9 +-
 examples/vhost/ioat.c               |  40 +-
 examples/vhost/ioat.h               |   4 +
 examples/vhost/main.c               | 225 +++++++----
 examples/vhost/main.h               |  38 +-
 examples/vhost/virtio_net.c         |  16 +-
 lib/vhost/rte_vhost_async.h         |  44 ++-
 lib/vhost/version.map               |   3 +
 lib/vhost/virtio_net.c              | 553 ++++++++++++++++++++++++++++
 10 files changed, 839 insertions(+), 103 deletions(-)

-- 
2.25.1



* [dpdk-dev] [PATCH v2 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths.
  2021-06-18 20:03 ` [dpdk-dev] [PATCH v2 0/4] vhost: " Wenwu Ma
  2021-06-18 14:10   ` Maxime Coquelin
@ 2021-06-18 20:03   ` Wenwu Ma
  2021-06-18 20:03   ` [dpdk-dev] [PATCH v2 2/4] examples/vhost: use a new API to query remaining ring space Wenwu Ma
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-06-18 20:03 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, Wenwu Ma

Previously, the data path checked a flag to decide which
enqueue/dequeue functions to call.

Now, we use an ops table that is initialized when the vhost device is
created, so the data path can call the ops directly without any
further flag checks.

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 examples/vhost/main.c       | 112 ++++++++++++++++++++----------------
 examples/vhost/main.h       |  33 +++++++++--
 examples/vhost/virtio_net.c |  16 +++++-
 3 files changed, 105 insertions(+), 56 deletions(-)

diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index d2179eadb9..aebdc3a566 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -106,6 +106,8 @@ static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
 static char *socket_files;
 static int nb_sockets;
 
+static struct vhost_queue_ops vdev_queue_ops[MAX_VHOST_DEVICE];
+
 /* empty vmdq configuration structure. Filled in programatically */
 static struct rte_eth_conf vmdq_conf_default = {
 	.rxmode = {
@@ -885,27 +887,8 @@ drain_vhost(struct vhost_dev *vdev)
 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
 
-	if (builtin_net_driver) {
-		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
-	} else if (async_vhost_driver) {
-		uint32_t cpu_cpl_nr = 0;
-		uint16_t enqueue_fail = 0;
-		struct rte_mbuf *m_cpu_cpl[nr_xmit];
-
-		complete_async_pkts(vdev);
-		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
-					m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
-
-		if (cpu_cpl_nr)
-			free_pkts(m_cpu_cpl, cpu_cpl_nr);
-
-		enqueue_fail = nr_xmit - ret;
-		if (enqueue_fail)
-			free_pkts(&m[ret], nr_xmit - ret);
-	} else {
-		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
-						m, nr_xmit);
-	}
+	ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
+					VIRTIO_RXQ, m, nr_xmit);
 
 	if (enable_stats) {
 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
@@ -1184,6 +1167,36 @@ drain_mbuf_table(struct mbuf_table *tx_q)
 	}
 }
 
+uint16_t
+async_enqueue_pkts(struct vhost_dev *vdev, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint32_t rx_count)
+{
+	uint16_t enqueue_count;
+	uint32_t cpu_cpl_nr = 0;
+	uint16_t enqueue_fail = 0;
+	struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
+
+	complete_async_pkts(vdev);
+	enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
+				queue_id, pkts, rx_count,
+				m_cpu_cpl, &cpu_cpl_nr);
+	if (cpu_cpl_nr)
+		free_pkts(m_cpu_cpl, cpu_cpl_nr);
+
+	enqueue_fail = rx_count - enqueue_count;
+	if (enqueue_fail)
+		free_pkts(&pkts[enqueue_count], enqueue_fail);
+
+	return enqueue_count;
+}
+
+uint16_t
+sync_enqueue_pkts(struct vhost_dev *vdev, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint32_t rx_count)
+{
+	return rte_vhost_enqueue_burst(vdev->vid, queue_id, pkts, rx_count);
+}
+
 static __rte_always_inline void
 drain_eth_rx(struct vhost_dev *vdev)
 {
@@ -1214,29 +1227,8 @@ drain_eth_rx(struct vhost_dev *vdev)
 		}
 	}
 
-	if (builtin_net_driver) {
-		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
-						pkts, rx_count);
-	} else if (async_vhost_driver) {
-		uint32_t cpu_cpl_nr = 0;
-		uint16_t enqueue_fail = 0;
-		struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
-
-		complete_async_pkts(vdev);
-		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
-					VIRTIO_RXQ, pkts, rx_count,
-					m_cpu_cpl, &cpu_cpl_nr);
-		if (cpu_cpl_nr)
-			free_pkts(m_cpu_cpl, cpu_cpl_nr);
-
-		enqueue_fail = rx_count - enqueue_count;
-		if (enqueue_fail)
-			free_pkts(&pkts[enqueue_count], enqueue_fail);
-
-	} else {
-		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
-						pkts, rx_count);
-	}
+	enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
+						VIRTIO_RXQ, pkts, rx_count);
 
 	if (enable_stats) {
 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
@@ -1249,6 +1241,14 @@ drain_eth_rx(struct vhost_dev *vdev)
 		free_pkts(pkts, rx_count);
 }
 
+uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count)
+{
+	return rte_vhost_dequeue_burst(dev->vid, queue_id,
+					mbuf_pool, pkts, count);
+}
+
 static __rte_always_inline void
 drain_virtio_tx(struct vhost_dev *vdev)
 {
@@ -1256,13 +1256,8 @@ drain_virtio_tx(struct vhost_dev *vdev)
 	uint16_t count;
 	uint16_t i;
 
-	if (builtin_net_driver) {
-		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
-					pkts, MAX_PKT_BURST);
-	} else {
-		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
-					mbuf_pool, pkts, MAX_PKT_BURST);
-	}
+	count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
+				VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);
 
 	/* setup VMDq for the first packet */
 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
@@ -1436,6 +1431,21 @@ new_device(int vid)
 		}
 	}
 
+	if (builtin_net_driver) {
+		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
+		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
+	} else {
+		if (async_vhost_driver) {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+							async_enqueue_pkts;
+		} else {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+							sync_enqueue_pkts;
+		}
+
+		vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
+	}
+
 	if (builtin_net_driver)
 		vs_vhost_net_setup(vdev);
 
diff --git a/examples/vhost/main.h b/examples/vhost/main.h
index 0ccdce4b4a..7cd8a11a45 100644
--- a/examples/vhost/main.h
+++ b/examples/vhost/main.h
@@ -60,6 +60,19 @@ struct vhost_dev {
 	struct vhost_queue queues[MAX_QUEUE_PAIRS * 2];
 } __rte_cache_aligned;
 
+typedef uint16_t (*vhost_enqueue_burst_t)(struct vhost_dev *dev,
+			uint16_t queue_id, struct rte_mbuf **pkts,
+			uint32_t count);
+
+typedef uint16_t (*vhost_dequeue_burst_t)(struct vhost_dev *dev,
+			uint16_t queue_id, struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
+
+struct vhost_queue_ops {
+	vhost_enqueue_burst_t enqueue_pkt_burst;
+	vhost_dequeue_burst_t dequeue_pkt_burst;
+};
+
 TAILQ_HEAD(vhost_dev_tailq_list, vhost_dev);
 
 
@@ -84,9 +97,21 @@ struct lcore_info {
 void vs_vhost_net_setup(struct vhost_dev *dev);
 void vs_vhost_net_remove(struct vhost_dev *dev);
 uint16_t vs_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mbuf **pkts, uint32_t count);
+
+uint16_t builtin_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mbuf **pkts, uint32_t count);
+uint16_t builtin_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
+uint16_t sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 			 struct rte_mbuf **pkts, uint32_t count);
-
-uint16_t vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
-			 struct rte_mempool *mbuf_pool,
-			 struct rte_mbuf **pkts, uint16_t count);
+uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
+uint16_t async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			 struct rte_mbuf **pkts, uint32_t count);
+uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
 #endif /* _MAIN_H_ */
diff --git a/examples/vhost/virtio_net.c b/examples/vhost/virtio_net.c
index 9064fc3a82..2432a96566 100644
--- a/examples/vhost/virtio_net.c
+++ b/examples/vhost/virtio_net.c
@@ -238,6 +238,13 @@ vs_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 	return count;
 }
 
+uint16_t
+builtin_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint32_t count)
+{
+	return vs_enqueue_pkts(dev, queue_id, pkts, count);
+}
+
 static __rte_always_inline int
 dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 	    struct rte_mbuf *m, uint16_t desc_idx,
@@ -363,7 +370,7 @@ dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 	return 0;
 }
 
-uint16_t
+static uint16_t
 vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
 {
@@ -440,3 +447,10 @@ vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 
 	return i;
 }
+
+uint16_t
+builtin_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+{
+	return vs_dequeue_pkts(dev, queue_id, mbuf_pool, pkts, count);
+}
-- 
2.25.1



* [dpdk-dev] [PATCH v2 2/4] examples/vhost: use a new API to query remaining ring space
  2021-06-18 20:03 ` [dpdk-dev] [PATCH v2 0/4] vhost: " Wenwu Ma
  2021-06-18 14:10   ` Maxime Coquelin
  2021-06-18 20:03   ` [dpdk-dev] [PATCH v2 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
@ 2021-06-18 20:03   ` Wenwu Ma
  2021-06-18 20:03   ` [dpdk-dev] [PATCH v2 3/4] vhost: support async dequeue for split ring Wenwu Ma
  2021-06-18 20:03   ` [dpdk-dev] [PATCH v2 4/4] examples/vhost: support vhost async dequeue data path Wenwu Ma
  4 siblings, 0 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-06-18 20:03 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, Wenwu Ma

A new API for querying the remaining descriptor ring capacity
is available, so use it instead of tracking the free space manually.

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 examples/vhost/ioat.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c
index 2a2c2d7202..bf4e033bdb 100644
--- a/examples/vhost/ioat.c
+++ b/examples/vhost/ioat.c
@@ -17,7 +17,6 @@ struct packet_tracker {
 	unsigned short next_read;
 	unsigned short next_write;
 	unsigned short last_remain;
-	unsigned short ioat_space;
 };
 
 struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
@@ -113,7 +112,6 @@ open_ioat(const char *value)
 			goto out;
 		}
 		rte_rawdev_start(dev_id);
-		cb_tracker[dev_id].ioat_space = IOAT_RING_SIZE - 1;
 		dma_info->nr++;
 		i++;
 	}
@@ -140,7 +138,7 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
 			src = descs[i_desc].src;
 			dst = descs[i_desc].dst;
 			i_seg = 0;
-			if (cb_tracker[dev_id].ioat_space < src->nr_segs)
+			if (rte_ioat_burst_capacity(dev_id) < src->nr_segs)
 				break;
 			while (i_seg < src->nr_segs) {
 				rte_ioat_enqueue_copy(dev_id,
@@ -155,7 +153,6 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
 			}
 			write &= mask;
 			cb_tracker[dev_id].size_track[write] = src->nr_segs;
-			cb_tracker[dev_id].ioat_space -= src->nr_segs;
 			write++;
 		}
 	} else {
@@ -194,7 +191,6 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 		if (n_seg == 0)
 			return 0;
 
-		cb_tracker[dev_id].ioat_space += n_seg;
 		n_seg += cb_tracker[dev_id].last_remain;
 
 		read = cb_tracker[dev_id].next_read;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v2 3/4] vhost: support async dequeue for split ring
  2021-06-18 20:03 ` [dpdk-dev] [PATCH v2 0/4] vhost: " Wenwu Ma
                     ` (2 preceding siblings ...)
  2021-06-18 20:03   ` [dpdk-dev] [PATCH v2 2/4] examples/vhost: use a new API to query remaining ring space Wenwu Ma
@ 2021-06-18 20:03   ` Wenwu Ma
  2021-06-18 20:03   ` [dpdk-dev] [PATCH v2 4/4] examples/vhost: support vhost async dequeue data path Wenwu Ma
  4 siblings, 0 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-06-18 20:03 UTC (permalink / raw)
  To: dev
  Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, Yuan Wang, Jiayu Hu, Wenwu Ma

From: Yuan Wang <yuanx.wang@intel.com>

This patch implements asynchronous dequeue data path for split ring.
A new asynchronous dequeue function is introduced. With this function,
the application can try to receive packets from the guest with
offloading large copies to the DMA engine, thus saving precious CPU
cycles.
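
As a rough usage sketch (not part of the patch): the wrapper below is
hypothetical, the burst size and queue index are placeholders, and the
async channel is assumed to be already registered on the guest TX queue.

#include <rte_mbuf.h>
#include <rte_vhost_async.h>

#define PKT_BURST 32	/* placeholder burst size */

/* Dequeue up to PKT_BURST packets from virtqueue 1 (the guest TX queue)
 * of device "vid". Returned mbufs are fully copied, by CPU or DMA; any
 * packets still in flight will be returned by a later call.
 */
static uint16_t
poll_guest_tx(int vid, struct rte_mempool *pool, struct rte_mbuf **pkts)
{
	int nr_inflight;

	return rte_vhost_async_try_dequeue_burst(vid, 1, pool, pkts,
						 PKT_BURST, &nr_inflight);
}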

Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 doc/guides/prog_guide/vhost_lib.rst |  10 +
 lib/vhost/rte_vhost_async.h         |  44 ++-
 lib/vhost/version.map               |   3 +
 lib/vhost/virtio_net.c              | 553 ++++++++++++++++++++++++++++
 4 files changed, 607 insertions(+), 3 deletions(-)

diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
index d18fb98910..05c42c9b11 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -281,6 +281,16 @@ The following is an overview of some key Vhost API functions:
   Poll enqueue completion status from async data path. Completed packets
   are returned to applications through ``pkts``.
 
+* ``rte_vhost_async_try_dequeue_burst(vid, queue_id, mbuf_pool, pkts, count, nr_inflight)``
+
+  Try to receive packets from the guest with offloading large packets
+  to the DMA engine. Successfully dequeued packets are transfer
+  completed and returned in ``pkts``. There may be other packets that
+  the guest has sent but that are still being transferred by the DMA
+  engine; these are called in-flight packets. This function returns
+  in-flight packets only after the DMA engine finishes transferring
+  them. The number of in-flight packets is returned in ``nr_inflight``.
+
 Vhost-user Implementations
 --------------------------
 
diff --git a/lib/vhost/rte_vhost_async.h b/lib/vhost/rte_vhost_async.h
index 6faa31f5ad..58019408f1 100644
--- a/lib/vhost/rte_vhost_async.h
+++ b/lib/vhost/rte_vhost_async.h
@@ -84,13 +84,21 @@ struct rte_vhost_async_channel_ops {
 };
 
 /**
- * inflight async packet information
+ * in-flight async packet information
  */
+struct async_nethdr {
+	struct virtio_net_hdr hdr;
+	bool valid;
+};
+
 struct async_inflight_info {
 	struct rte_mbuf *mbuf;
-	uint16_t descs; /* num of descs inflight */
+	union {
+		uint16_t descs; /* num of descs in-flight */
+		struct async_nethdr nethdr;
+	};
 	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
-};
+} __rte_cache_aligned;
 
 /**
  *  dma channel feature bit definition
@@ -193,4 +201,34 @@ __rte_experimental
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count);
 
+/**
+ * This function tries to receive packets from the guest with offloading
+ * large copies to the DMA engine. Successfully dequeued packets are
+ * transfer completed, either by the CPU or the DMA engine, and they are
+ * returned in "pkts". There may be other packets that the guest has
+ * sent but that are still being transferred by the DMA engine; these
+ * are called in-flight packets. The number of in-flight packets is
+ * returned in "nr_inflight". This function returns in-flight packets
+ * only after the DMA engine finishes transferring them.
+ *
+ * @param vid
+ *  id of vhost device to dequeue data
+ * @param queue_id
+ *  queue id to dequeue data
+ * @param pkts
+ *  blank array to keep successfully dequeued packets
+ * @param count
+ *  size of the packet array
+ * @param nr_inflight
+ *  the number of in-flight packets at the time of return. If an error
+ *  occurred, it is set to -1.
+ * @return
+ *  num of successfully dequeued packets
+ */
+__rte_experimental
+uint16_t
+rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
+	int *nr_inflight);
+
 #endif /* _RTE_VHOST_ASYNC_H_ */
diff --git a/lib/vhost/version.map b/lib/vhost/version.map
index 9103a23cd4..a320f889cd 100644
--- a/lib/vhost/version.map
+++ b/lib/vhost/version.map
@@ -79,4 +79,7 @@ EXPERIMENTAL {
 
 	# added in 21.05
 	rte_vhost_get_negotiated_protocol_features;
+
+	# added in 21.08
+	rte_vhost_async_try_dequeue_burst;
 };
diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 8da8a86a10..28e54df804 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -3165,3 +3165,556 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 
 	return count;
 }
+
+static __rte_always_inline int
+async_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		  struct buf_vector *buf_vec, uint16_t nr_vec,
+		  struct rte_mbuf *m, struct rte_mempool *mbuf_pool,
+		  struct iovec *src_iovec, struct iovec *dst_iovec,
+		  struct rte_vhost_iov_iter *src_it,
+		  struct rte_vhost_iov_iter *dst_it,
+		  struct async_nethdr *nethdr,
+		  bool legacy_ol_flags)
+{
+	uint64_t buf_addr;
+	uint32_t tlen = 0;
+	uint32_t buf_avail, buf_offset, buf_len;
+	uint32_t mbuf_avail, mbuf_offset;
+	uint32_t cpy_len, cpy_threshold;
+	/* A counter to avoid desc dead loop chain */
+	uint16_t vec_idx = 0;
+	int tvec_idx = 0;
+	struct rte_mbuf *cur = m, *prev = m;
+	struct virtio_net_hdr tmp_hdr;
+	struct virtio_net_hdr *hdr = NULL;
+	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
+
+	buf_addr = buf_vec[vec_idx].buf_addr;
+	buf_len = buf_vec[vec_idx].buf_len;
+
+	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1))
+		return -1;
+
+	cpy_threshold = vq->async_threshold;
+
+	if (virtio_net_with_host_offload(dev)) {
+		if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
+			/*
+			 * No luck, the virtio-net header doesn't fit
+			 * in a contiguous virtual area.
+			 */
+			copy_vnet_hdr_from_desc(&tmp_hdr, buf_vec);
+			hdr = &tmp_hdr;
+		} else {
+			hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
+		}
+	}
+
+	/*
+	 * A virtio driver normally uses at least 2 desc buffers
+	 * for Tx: the first for storing the header, and others
+	 * for storing the data.
+	 */
+	if (unlikely(buf_len < dev->vhost_hlen)) {
+		buf_offset = dev->vhost_hlen - buf_len;
+		vec_idx++;
+		buf_addr = buf_vec[vec_idx].buf_addr;
+		buf_len = buf_vec[vec_idx].buf_len;
+		buf_avail  = buf_len - buf_offset;
+	} else if (buf_len == dev->vhost_hlen) {
+		if (unlikely(++vec_idx >= nr_vec))
+			return -1;
+		buf_addr = buf_vec[vec_idx].buf_addr;
+		buf_len = buf_vec[vec_idx].buf_len;
+
+		buf_offset = 0;
+		buf_avail = buf_len;
+	} else {
+		buf_offset = dev->vhost_hlen;
+		buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
+	}
+
+	PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
+			(uint32_t)buf_avail, 0);
+
+	mbuf_offset = 0;
+	mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
+	while (1) {
+		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
+
+		if (cpy_len >= cpy_threshold) {
+			async_fill_vec(src_iovec + tvec_idx,
+				(void *)((uintptr_t)(buf_addr + buf_offset)),
+				(size_t)cpy_len);
+			async_fill_vec(dst_iovec + tvec_idx,
+				rte_pktmbuf_mtod_offset(cur,
+					void *, mbuf_offset),
+				(size_t)cpy_len);
+			tvec_idx++;
+			tlen += cpy_len;
+		} else if (likely(cpy_len > MAX_BATCH_LEN ||
+				vq->batch_copy_nb_elems >= vq->size ||
+				(hdr && cur == m))) {
+			rte_memcpy(rte_pktmbuf_mtod_offset(cur,
+					void *, mbuf_offset),
+				(void *)((uintptr_t)(buf_addr + buf_offset)),
+				cpy_len);
+		} else {
+			batch_copy[vq->batch_copy_nb_elems].dst =
+				rte_pktmbuf_mtod_offset(cur,
+					void *, mbuf_offset);
+			batch_copy[vq->batch_copy_nb_elems].src =
+				(void *)((uintptr_t)(buf_addr + buf_offset));
+			batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
+			vq->batch_copy_nb_elems++;
+		}
+
+		mbuf_avail  -= cpy_len;
+		mbuf_offset += cpy_len;
+		buf_avail  -= cpy_len;
+		buf_offset += cpy_len;
+
+		/* This buf reaches to its end, get the next one */
+		if (buf_avail == 0) {
+			if (++vec_idx >= nr_vec)
+				break;
+
+			buf_addr = buf_vec[vec_idx].buf_addr;
+			buf_len = buf_vec[vec_idx].buf_len;
+
+			buf_offset = 0;
+			buf_avail = buf_len;
+
+			PRINT_PACKET(dev, (uintptr_t)buf_addr,
+					(uint32_t)buf_avail, 0);
+		}
+
+		/*
+		 * This mbuf reaches to its end, get a new one
+		 * to hold more data.
+		 */
+		if (mbuf_avail == 0) {
+			cur = rte_pktmbuf_alloc(mbuf_pool);
+			if (unlikely(cur == NULL)) {
+				VHOST_LOG_DATA(ERR, "Failed to "
+					"allocate memory for mbuf.\n");
+				return -1;
+			}
+
+			prev->next = cur;
+			prev->data_len = mbuf_offset;
+			m->nb_segs += 1;
+			m->pkt_len += mbuf_offset;
+			prev = cur;
+
+			mbuf_offset = 0;
+			mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
+		}
+	}
+
+	prev->data_len = mbuf_offset;
+	m->pkt_len += mbuf_offset;
+
+	if (hdr && tlen) {
+		nethdr->valid = true;
+		nethdr->hdr = *hdr;
+	} else if (hdr)
+		vhost_dequeue_offload(hdr, m, legacy_ol_flags);
+
+	if (tlen) {
+		async_fill_iter(src_it, tlen, src_iovec, tvec_idx);
+		async_fill_iter(dst_it, tlen, dst_iovec, tvec_idx);
+	} else
+		src_it->count = 0;
+
+	return 0;
+}
+
+static __rte_always_inline uint16_t
+async_poll_dequeue_completed_split(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint16_t count, bool legacy_ol_flags)
+{
+	uint16_t n_pkts_cpl = 0, n_pkts_put = 0;
+	uint16_t start_idx, pkt_idx, from;
+	struct async_inflight_info *pkts_info;
+
+	pkt_idx = vq->async_pkts_idx & (vq->size - 1);
+	pkts_info = vq->async_pkts_info;
+	start_idx = virtio_dev_rx_async_get_info_idx(pkt_idx, vq->size,
+			vq->async_pkts_inflight_n);
+
+	if (count > vq->async_last_pkts_n) {
+		n_pkts_cpl = vq->async_ops.check_completed_copies(dev->vid,
+			queue_id, 0, count - vq->async_last_pkts_n);
+	}
+
+	n_pkts_cpl += vq->async_last_pkts_n;
+	if (unlikely(n_pkts_cpl == 0))
+		return 0;
+
+	n_pkts_put = RTE_MIN(count, n_pkts_cpl);
+
+	for (pkt_idx = 0; pkt_idx < n_pkts_put; pkt_idx++) {
+		from = (start_idx + pkt_idx) & (vq->size - 1);
+		pkts[pkt_idx] = pkts_info[from].mbuf;
+
+		if (pkts_info[from].nethdr.valid) {
+			vhost_dequeue_offload(&pkts_info[from].nethdr.hdr,
+					pkts[pkt_idx], legacy_ol_flags);
+		}
+	}
+	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
+
+	if (n_pkts_put) {
+		/* write back completed descs to used ring */
+		write_back_completed_descs_split(vq, n_pkts_put);
+		/* update used ring */
+		__atomic_add_fetch(&vq->used->idx,
+				n_pkts_put, __ATOMIC_RELEASE);
+
+		vq->async_pkts_inflight_n -= n_pkts_put;
+	}
+
+	return n_pkts_put;
+}
+
+static __rte_always_inline uint16_t
+virtio_dev_tx_async_split(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t count, bool legacy_ol_flags)
+{
+	static bool allocerr_warned;
+	uint16_t pkt_idx;
+	uint16_t free_entries;
+	uint16_t slot_idx = 0;
+	uint16_t segs_await = 0;
+	uint16_t nr_done_pkts = 0, nr_async_pkts = 0, nr_async_cmpl_pkts = 0;
+	uint16_t nr_async_burst = 0;
+	uint16_t pkt_err = 0;
+	uint16_t iovec_idx = 0, it_idx = 0;
+
+	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
+	struct iovec *vec_pool = vq->vec_pool;
+	struct iovec *src_iovec = vec_pool;
+	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
+
+	struct async_pkt_index {
+		uint16_t last_avail_idx;
+	} async_pkts_log[MAX_PKT_BURST];
+
+	/**
+	 * The ordering between avail index and
+	 * desc reads needs to be enforced.
+	 */
+	free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
+			vq->last_avail_idx;
+	if (free_entries == 0)
+		goto out;
+
+	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
+
+	count = RTE_MIN(count, MAX_PKT_BURST);
+	count = RTE_MIN(count, free_entries);
+	VHOST_LOG_DATA(DEBUG, "(%d) about to dequeue %u buffers\n",
+			dev->vid, count);
+
+	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+		uint16_t head_idx = 0;
+		uint16_t nr_vec = 0;
+		uint32_t buf_len;
+		int err;
+		struct buf_vector buf_vec[BUF_VECTOR_MAX];
+		struct rte_mbuf *pkt;
+
+		if (unlikely(fill_vec_buf_split(dev, vq, vq->last_avail_idx,
+						&nr_vec, buf_vec,
+						&head_idx, &buf_len,
+						VHOST_ACCESS_RO) < 0))
+			break;
+
+		pkt = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
+		if (unlikely(pkt == NULL)) {
+			/**
+			 * mbuf allocation fails for jumbo packets when external
+			 * buffer allocation is not allowed and linear buffer
+			 * is required. Drop this packet.
+			 */
+			if (!allocerr_warned) {
+				VHOST_LOG_DATA(ERR,
+					"Failed mbuf alloc of size %d from %s on %s.\n",
+					buf_len, mbuf_pool->name, dev->ifname);
+				allocerr_warned = true;
+			}
+			break;
+		}
+
+		slot_idx = (vq->async_pkts_idx + nr_async_pkts) &
+				(vq->size - 1);
+		err = async_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkt,
+				mbuf_pool, &src_iovec[iovec_idx],
+				&dst_iovec[iovec_idx], &it_pool[it_idx],
+				&it_pool[it_idx + 1],
+				&pkts_info[slot_idx].nethdr, legacy_ol_flags);
+		if (unlikely(err)) {
+			rte_pktmbuf_free(pkt);
+			if (!allocerr_warned) {
+				VHOST_LOG_DATA(ERR,
+					"Failed to copy desc to mbuf on %s.\n",
+					dev->ifname);
+				allocerr_warned = true;
+			}
+			break;
+		}
+
+		if (it_pool[it_idx].count) {
+			uint16_t to = vq->async_desc_idx_split & (vq->size - 1);
+
+			async_fill_desc(&tdes[nr_async_burst], &it_pool[it_idx],
+				&it_pool[it_idx + 1]);
+			pkts_info[slot_idx].mbuf = pkt;
+			async_pkts_log[nr_async_pkts++].last_avail_idx =
+				vq->last_avail_idx;
+			nr_async_burst++;
+			iovec_idx += it_pool[it_idx].nr_segs;
+			segs_await += it_pool[it_idx].nr_segs;
+			it_idx += 2;
+
+			/* keep used desc */
+			vq->async_descs_split[to].id = head_idx;
+			vq->async_descs_split[to].len = 0;
+			vq->async_desc_idx_split++;
+		} else {
+			update_shadow_used_ring_split(vq, head_idx, 0);
+			pkts[nr_done_pkts++] = pkt;
+		}
+
+		vq->last_avail_idx++;
+
+		if (unlikely((nr_async_burst >= VHOST_ASYNC_BATCH_THRESHOLD) ||
+					((VHOST_MAX_ASYNC_VEC >> 1) -
+					 segs_await < BUF_VECTOR_MAX))) {
+			uint16_t nr_pkts;
+
+			nr_pkts = vq->async_ops.transfer_data(dev->vid,
+					queue_id, tdes, 0, nr_async_burst);
+			src_iovec = vec_pool;
+			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+			it_idx = 0;
+			segs_await = 0;
+			vq->async_pkts_inflight_n += nr_pkts;
+
+			if (unlikely(nr_pkts < nr_async_burst)) {
+				pkt_err = nr_async_burst - nr_pkts;
+				nr_async_burst = 0;
+				break;
+			}
+			nr_async_burst = 0;
+		}
+	}
+
+	if (nr_async_burst) {
+		uint32_t nr_pkts;
+
+		nr_pkts = vq->async_ops.transfer_data(dev->vid, queue_id,
+				tdes, 0, nr_async_burst);
+		vq->async_pkts_inflight_n += nr_pkts;
+
+		if (unlikely(nr_pkts < nr_async_burst))
+			pkt_err = nr_async_burst - nr_pkts;
+	}
+
+	do_data_copy_dequeue(vq);
+
+	if (unlikely(pkt_err)) {
+		uint16_t nr_err_dma = pkt_err;
+		uint16_t nr_err_sw;
+
+		nr_async_pkts -= nr_err_dma;
+
+		/**
+		 * revert shadow used ring and free pktmbufs for
+		 * CPU-copied pkts after the first DMA-error pkt.
+		 */
+		nr_err_sw = vq->last_avail_idx -
+			async_pkts_log[nr_async_pkts].last_avail_idx -
+			nr_err_dma;
+		vq->shadow_used_idx -= nr_err_sw;
+		while (nr_err_sw-- > 0)
+			rte_pktmbuf_free(pkts[--nr_done_pkts]);
+
+		/**
+		 * recover DMA-copy related structures and free pktmbufs
+		 * for DMA-error pkts.
+		 */
+		vq->async_desc_idx_split -= nr_err_dma;
+		while (nr_err_dma-- > 0) {
+			rte_pktmbuf_free(
+				pkts_info[slot_idx & (vq->size - 1)].mbuf);
+			slot_idx--;
+		}
+
+		/* recover available ring */
+		vq->last_avail_idx =
+			async_pkts_log[nr_async_pkts].last_avail_idx;
+	}
+
+	vq->async_pkts_idx += nr_async_pkts;
+
+	if (likely(vq->shadow_used_idx))
+		flush_shadow_used_ring_split(dev, vq);
+
+out:
+	if (nr_done_pkts < count && vq->async_pkts_inflight_n > 0) {
+		nr_async_cmpl_pkts = async_poll_dequeue_completed_split(dev, vq,
+					queue_id, pkts, count - nr_done_pkts,
+					legacy_ol_flags);
+		nr_done_pkts += nr_async_cmpl_pkts;
+	}
+	if (likely(nr_done_pkts))
+		vhost_vring_call_split(dev, vq);
+
+	return nr_done_pkts;
+}
+
+__rte_noinline
+static uint16_t
+virtio_dev_tx_async_split_legacy(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t count)
+{
+	return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool,
+				pkts, count, true);
+}
+
+__rte_noinline
+static uint16_t
+virtio_dev_tx_async_split_compliant(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t count)
+{
+	return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool,
+				pkts, count, false);
+}
+
+uint16_t
+rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
+	int *nr_inflight)
+{
+	struct virtio_net *dev;
+	struct rte_mbuf *rarp_mbuf = NULL;
+	struct vhost_virtqueue *vq;
+	int16_t success = 1;
+
+	*nr_inflight = -1;
+
+	dev = get_device(vid);
+	if (!dev)
+		return 0;
+
+	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
+		VHOST_LOG_DATA(ERR,
+			"(%d) %s: built-in vhost net backend is disabled.\n",
+			dev->vid, __func__);
+		return 0;
+	}
+
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
+		VHOST_LOG_DATA(ERR,
+			"(%d) %s: invalid virtqueue idx %d.\n",
+			dev->vid, __func__, queue_id);
+		return 0;
+	}
+
+	vq = dev->virtqueue[queue_id];
+
+	if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
+		return 0;
+
+	if (unlikely(vq->enabled == 0)) {
+		count = 0;
+		goto out_access_unlock;
+	}
+
+	if (unlikely(!vq->async_registered)) {
+		VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
+			dev->vid, __func__, queue_id);
+		count = 0;
+		goto out_access_unlock;
+	}
+
+	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+		vhost_user_iotlb_rd_lock(vq);
+
+	if (unlikely(vq->access_ok == 0))
+		if (unlikely(vring_translate(dev, vq) < 0)) {
+			count = 0;
+			goto out;
+		}
+
+	/*
+	 * Construct a RARP broadcast packet, and inject it to the "pkts"
+	 * array, so it looks like the guest actually sent such a packet.
+	 *
+	 * Check user_send_rarp() for more information.
+	 *
+	 * broadcast_rarp shares a cacheline in the virtio_net structure
+	 * with some fields that are accessed during enqueue and
+	 * __atomic_compare_exchange_n causes a write if performed compare
+	 * and exchange. This could result in false sharing between enqueue
+	 * and dequeue.
+	 *
+	 * Prevent unnecessary false sharing by reading broadcast_rarp first
+	 * and only performing compare and exchange if the read indicates it
+	 * is likely to be set.
+	 */
+	if (unlikely(__atomic_load_n(&dev->broadcast_rarp, __ATOMIC_ACQUIRE) &&
+			__atomic_compare_exchange_n(&dev->broadcast_rarp,
+			&success, 0, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED))) {
+
+		rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
+		if (rarp_mbuf == NULL) {
+			VHOST_LOG_DATA(ERR, "Failed to make RARP packet.\n");
+			count = 0;
+			goto out;
+		}
+		count -= 1;
+	}
+
+	if (unlikely(vq_is_packed(dev)))
+		return 0;
+
+	if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
+		count = virtio_dev_tx_async_split_legacy(dev, vq, queue_id,
+				mbuf_pool, pkts, count);
+	else
+		count = virtio_dev_tx_async_split_compliant(dev, vq, queue_id,
+				mbuf_pool, pkts, count);
+
+out:
+	*nr_inflight = vq->async_pkts_inflight_n;
+
+	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+		vhost_user_iotlb_rd_unlock(vq);
+
+out_access_unlock:
+	rte_spinlock_unlock(&vq->access_lock);
+
+	if (unlikely(rarp_mbuf != NULL)) {
+		/*
+		 * Inject it to the head of "pkts" array, so that switch's mac
+		 * learning table will get updated first.
+		 */
+		memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *));
+		pkts[0] = rarp_mbuf;
+		count += 1;
+	}
+
+	return count;
+}
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v2 4/4] examples/vhost: support vhost async dequeue data path
  2021-06-18 20:03 ` [dpdk-dev] [PATCH v2 0/4] vhost: " Wenwu Ma
                     ` (3 preceding siblings ...)
  2021-06-18 20:03   ` [dpdk-dev] [PATCH v2 3/4] vhost: support async dequeue for split ring Wenwu Ma
@ 2021-06-18 20:03   ` Wenwu Ma
  4 siblings, 0 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-06-18 20:03 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, Wenwu Ma

This patch adds the vhost async dequeue data path to the vhost sample.
The vswitch can leverage IOAT to accelerate the vhost async dequeue
data path.
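
For instance (core list, port mask, socket path and DMA PCI addresses
below are only illustrative), a vhost device with both enqueue and
dequeue acceleration could be launched as:

./dpdk-vhost -l 2-4 -n 4 -- -p 0x1 --mergeable 1 --vm2vm 1 \
	--socket-file /tmp/vhost-net0 --client \
	--dmas [txd0@00:04.0,rxd0@00:04.1]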

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 doc/guides/sample_app_ug/vhost.rst |   9 +-
 examples/vhost/ioat.c              |  34 +++++--
 examples/vhost/ioat.h              |   4 +
 examples/vhost/main.c              | 141 ++++++++++++++++++++---------
 examples/vhost/main.h              |   5 +
 5 files changed, 140 insertions(+), 53 deletions(-)

diff --git a/doc/guides/sample_app_ug/vhost.rst b/doc/guides/sample_app_ug/vhost.rst
index 9afde9c7f5..63dcf181e1 100644
--- a/doc/guides/sample_app_ug/vhost.rst
+++ b/doc/guides/sample_app_ug/vhost.rst
@@ -169,9 +169,12 @@ demonstrates how to use the async vhost APIs. It's used in combination with dmas
 **--dmas**
 This parameter is used to specify the assigned DMA device of a vhost device.
 Async vhost-user net driver will be used if --dmas is set. For example
---dmas [txd0@00:04.0,txd1@00:04.1] means use DMA channel 00:04.0 for vhost
-device 0 enqueue operation and use DMA channel 00:04.1 for vhost device 1
-enqueue operation.
+--dmas [txd0@00:04.0,txd1@00:04.1,rxd0@00:04.2,rxd1@00:04.3] means use
+DMA channel 00:04.0/00:04.2 for vhost device 0 enqueue/dequeue operation
+and use DMA channel 00:04.1/00:04.3 for vhost device 1 enqueue/dequeue
+operation. The device index follows the order of the socket files: vhost
+device 0 is created through the first socket file, vhost device 1 through
+the second socket file, and so on.
 
 Common Issues
 -------------
diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c
index bf4e033bdb..179ae87deb 100644
--- a/examples/vhost/ioat.c
+++ b/examples/vhost/ioat.c
@@ -21,6 +21,8 @@ struct packet_tracker {
 
 struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
 
+int vid2txd[MAX_VHOST_DEVICE];
+
 int
 open_ioat(const char *value)
 {
@@ -60,6 +62,8 @@ open_ioat(const char *value)
 		goto out;
 	}
 	while (i < args_nr) {
+		char *txd, *rxd;
+		bool is_txd;
 		char *arg_temp = dma_arg[i];
 		uint8_t sub_nr;
 		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
@@ -68,10 +72,20 @@ open_ioat(const char *value)
 			goto out;
 		}
 
-		start = strstr(ptrs[0], "txd");
-		if (start == NULL) {
+		int async_flag;
+		txd = strstr(ptrs[0], "txd");
+		rxd = strstr(ptrs[0], "rxd");
+		if (txd == NULL && rxd == NULL) {
 			ret = -1;
 			goto out;
+		} else if (txd) {
+			is_txd = true;
+			start = txd;
+			async_flag = ASYNC_RX_VHOST;
+		} else {
+			is_txd = false;
+			start = rxd;
+			async_flag = ASYNC_TX_VHOST;
 		}
 
 		start += 3;
@@ -81,7 +95,8 @@ open_ioat(const char *value)
 			goto out;
 		}
 
-		vring_id = 0 + VIRTIO_RXQ;
+		vring_id = is_txd ? VIRTIO_RXQ : VIRTIO_TXQ;
+
 		if (rte_pci_addr_parse(ptrs[1],
 				&(dma_info + vid)->dmas[vring_id].addr) < 0) {
 			ret = -1;
@@ -105,6 +120,7 @@ open_ioat(const char *value)
 
 		(dma_info + vid)->dmas[vring_id].dev_id = dev_id;
 		(dma_info + vid)->dmas[vring_id].is_valid = true;
+		(dma_info + vid)->async_flag |= async_flag;
 		config.ring_size = IOAT_RING_SIZE;
 		config.hdls_disable = true;
 		if (rte_rawdev_configure(dev_id, &info, sizeof(config)) < 0) {
@@ -126,13 +142,16 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data, uint16_t count)
 {
 	uint32_t i_desc;
-	uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2 + VIRTIO_RXQ].dev_id;
 	struct rte_vhost_iov_iter *src = NULL;
 	struct rte_vhost_iov_iter *dst = NULL;
 	unsigned long i_seg;
 	unsigned short mask = MAX_ENQUEUED_SIZE - 1;
-	unsigned short write = cb_tracker[dev_id].next_write;
 
+	if (queue_id >= MAX_RING_COUNT)
+		return -1;
+
+	uint16_t dev_id = dma_bind[vid2txd[vid]].dmas[queue_id].dev_id;
+	unsigned short write = cb_tracker[dev_id].next_write;
 	if (!opaque_data) {
 		for (i_desc = 0; i_desc < count; i_desc++) {
 			src = descs[i_desc].src;
@@ -170,7 +189,7 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data,
 		uint16_t max_packets)
 {
-	if (!opaque_data) {
+	if (!opaque_data && (queue_id < MAX_RING_COUNT)) {
 		uintptr_t dump[255];
 		int n_seg;
 		unsigned short read, write;
@@ -178,8 +197,7 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 		unsigned short mask = MAX_ENQUEUED_SIZE - 1;
 		unsigned short i;
 
-		uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2
-				+ VIRTIO_RXQ].dev_id;
+		uint16_t dev_id = dma_bind[vid2txd[vid]].dmas[queue_id].dev_id;
 		n_seg = rte_ioat_completed_ops(dev_id, 255, NULL, NULL, dump, dump);
 		if (n_seg < 0) {
 			RTE_LOG(ERR,
diff --git a/examples/vhost/ioat.h b/examples/vhost/ioat.h
index 1aa28ed6a3..c3d5c2344a 100644
--- a/examples/vhost/ioat.h
+++ b/examples/vhost/ioat.h
@@ -12,6 +12,9 @@
 #define MAX_VHOST_DEVICE 1024
 #define IOAT_RING_SIZE 4096
 #define MAX_ENQUEUED_SIZE 4096
+#define MAX_RING_COUNT	2
+#define ASYNC_RX_VHOST	1
+#define ASYNC_TX_VHOST	2
 
 struct dma_info {
 	struct rte_pci_addr addr;
@@ -20,6 +23,7 @@ struct dma_info {
 };
 
 struct dma_for_vhost {
+	int async_flag;
 	struct dma_info dmas[RTE_MAX_QUEUES_PER_PORT * 2];
 	uint16_t nr;
 };
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index aebdc3a566..8a65e525ff 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -25,7 +25,6 @@
 #include <rte_tcp.h>
 #include <rte_pause.h>
 
-#include "ioat.h"
 #include "main.h"
 
 #ifndef MAX_QUEUES
@@ -93,8 +92,6 @@ static int client_mode;
 
 static int builtin_net_driver;
 
-static int async_vhost_driver;
-
 static char *dma_type;
 
 /* Specify timeout (in useconds) between retries on RX. */
@@ -679,7 +676,6 @@ us_vhost_parse_args(int argc, char **argv)
 				us_vhost_usage(prgname);
 				return -1;
 			}
-			async_vhost_driver = 1;
 			break;
 
 		case OPT_CLIENT_NUM:
@@ -897,7 +893,7 @@ drain_vhost(struct vhost_dev *vdev)
 				__ATOMIC_SEQ_CST);
 	}
 
-	if (!async_vhost_driver)
+	if ((dma_bind[vid2txd[vdev->vid]].async_flag & ASYNC_RX_VHOST) == 0)
 		free_pkts(m, nr_xmit);
 }
 
@@ -1237,10 +1233,19 @@ drain_eth_rx(struct vhost_dev *vdev)
 				__ATOMIC_SEQ_CST);
 	}
 
-	if (!async_vhost_driver)
+	if ((dma_bind[vid2txd[vdev->vid]].async_flag & ASYNC_RX_VHOST) == 0)
 		free_pkts(pkts, rx_count);
 }
 
+uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+				struct rte_mempool *mbuf_pool,
+				struct rte_mbuf **pkts, uint16_t count)
+{
+	int nr_inflight;
+	return rte_vhost_async_try_dequeue_burst(dev->vid, queue_id,
+			mbuf_pool, pkts, count, &nr_inflight);
+}
+
 uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 			struct rte_mempool *mbuf_pool,
 			struct rte_mbuf **pkts, uint16_t count)
@@ -1392,12 +1397,90 @@ destroy_device(int vid)
 		"(%d) device has been removed from data core\n",
 		vdev->vid);
 
-	if (async_vhost_driver)
+	if (dma_bind[vid2txd[vid]].async_flag & ASYNC_RX_VHOST)
 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
+	if (dma_bind[vid2txd[vid]].async_flag & ASYNC_TX_VHOST)
+		rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
 
 	rte_free(vdev);
 }
 
+static int
+get_txd_id(int vid)
+{
+	int i;
+	char ifname[PATH_MAX];
+	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
+
+	for (i = 0; i < nb_sockets; i++) {
+		char *file = socket_files + i * PATH_MAX;
+		if (strcmp(file, ifname) == 0)
+			return i;
+	}
+
+	return -1;
+}
+
+static int
+init_vhost_queue_ops(int vid)
+{
+	int i = get_txd_id(vid);
+	if (i == -1)
+		return -1;
+
+	vid2txd[vid] = i;
+	if (builtin_net_driver) {
+		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
+		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
+	} else {
+		if (dma_bind[i].async_flag & ASYNC_RX_VHOST) {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+						async_enqueue_pkts;
+		} else {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+						sync_enqueue_pkts;
+		}
+
+		if (dma_bind[i].async_flag & ASYNC_TX_VHOST) {
+			vdev_queue_ops[vid].dequeue_pkt_burst =
+						async_dequeue_pkts;
+		} else {
+			vdev_queue_ops[vid].dequeue_pkt_burst =
+						sync_dequeue_pkts;
+		}
+	}
+
+	return 0;
+}
+
+static int
+vhost_async_channel_register(int vid)
+{
+	int ret = 0;
+	struct rte_vhost_async_features f;
+	struct rte_vhost_async_channel_ops channel_ops;
+
+	if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
+		channel_ops.transfer_data = ioat_transfer_data_cb;
+		channel_ops.check_completed_copies =
+			ioat_check_completed_copies_cb;
+
+		f.async_inorder = 1;
+		f.async_threshold = 256;
+
+		if (dma_bind[vid2txd[vid]].async_flag & ASYNC_RX_VHOST) {
+			ret |= rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
+					f.intval, &channel_ops);
+		}
+		if (dma_bind[vid2txd[vid]].async_flag & ASYNC_TX_VHOST) {
+			ret |= rte_vhost_async_channel_register(vid, VIRTIO_TXQ,
+					f.intval, &channel_ops);
+		}
+	}
+
+	return ret;
+}
+
 /*
  * A new device is added to a data core. First the device is added to the main linked list
  * and then allocated to a specific data core.
@@ -1431,20 +1514,8 @@ new_device(int vid)
 		}
 	}
 
-	if (builtin_net_driver) {
-		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
-		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
-	} else {
-		if (async_vhost_driver) {
-			vdev_queue_ops[vid].enqueue_pkt_burst =
-							async_enqueue_pkts;
-		} else {
-			vdev_queue_ops[vid].enqueue_pkt_burst =
-							sync_enqueue_pkts;
-		}
-
-		vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
-	}
+	if (init_vhost_queue_ops(vid) != 0)
+		return -1;
 
 	if (builtin_net_driver)
 		vs_vhost_net_setup(vdev);
@@ -1473,28 +1544,13 @@ new_device(int vid)
 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
 
+	int ret = vhost_async_channel_register(vid);
+
 	RTE_LOG(INFO, VHOST_DATA,
 		"(%d) device has been added to data core %d\n",
 		vid, vdev->coreid);
 
-	if (async_vhost_driver) {
-		struct rte_vhost_async_features f;
-		struct rte_vhost_async_channel_ops channel_ops;
-
-		if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
-			channel_ops.transfer_data = ioat_transfer_data_cb;
-			channel_ops.check_completed_copies =
-				ioat_check_completed_copies_cb;
-
-			f.async_inorder = 1;
-			f.async_threshold = 256;
-
-			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
-				f.intval, &channel_ops);
-		}
-	}
-
-	return 0;
+	return ret;
 }
 
 /*
@@ -1735,10 +1791,11 @@ main(int argc, char *argv[])
 	for (i = 0; i < nb_sockets; i++) {
 		char *file = socket_files + i * PATH_MAX;
 
-		if (async_vhost_driver)
-			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
+		uint64_t flag = flags;
+		if (dma_bind[i].async_flag != 0)
+			flag |= RTE_VHOST_USER_ASYNC_COPY;
 
-		ret = rte_vhost_driver_register(file, flags);
+		ret = rte_vhost_driver_register(file, flag);
 		if (ret != 0) {
 			unregister_drivers(i);
 			rte_exit(EXIT_FAILURE,
diff --git a/examples/vhost/main.h b/examples/vhost/main.h
index 7cd8a11a45..5a892ed08d 100644
--- a/examples/vhost/main.h
+++ b/examples/vhost/main.h
@@ -9,6 +9,8 @@
 
 #include <rte_ether.h>
 
+#include "ioat.h"
+
 /* Macros for printing using RTE_LOG */
 #define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
 #define RTE_LOGTYPE_VHOST_DATA   RTE_LOGTYPE_USER2
@@ -18,6 +20,9 @@ enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
 
 #define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
 
+extern struct dma_for_vhost dma_bind[MAX_VHOST_DEVICE];
+extern int vid2txd[MAX_VHOST_DEVICE];
+
 struct device_statistics {
 	uint64_t	tx;
 	uint64_t	tx_total;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v3 0/4] vhost: support async dequeue for split ring
  2021-06-02  8:31 [dpdk-dev] [PATCH 0/1] lib/vhost: support async dequeue for split ring Yuan Wang
  2021-06-02  8:31 ` [dpdk-dev] [PATCH 1/1] " Yuan Wang
  2021-06-18 20:03 ` [dpdk-dev] [PATCH v2 0/4] vhost: " Wenwu Ma
@ 2021-06-23 15:00 ` Wenwu Ma
  2021-06-23 15:00   ` [dpdk-dev] [PATCH v3 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
                     ` (3 more replies)
  2021-06-30 19:27 ` [dpdk-dev] [PATCH v4 0/4] support async dequeue for split ring Wenwu Ma
                   ` (3 subsequent siblings)
  6 siblings, 4 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-06-23 15:00 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, Wenwu Ma

This patch implements asynchronous dequeue data path for split ring.
A new asynchronous dequeue function is introduced. With this function,
the application can try to receive packets from the guest with
offloading large copies to the DMA engine, thus saving precious CPU
cycles.

v3:
- Fix compilation warnings and errors on the Arm platform.
- Restore the removed function virtio_dev_pktmbuf_alloc;
  the async dequeue path allocates packets separately.

v2:
- Refactor vhost datapath as preliminary patch for this series.
- The change to use the new API in examples/vhost is put into a
  dedicated patch.
- Check queue_id value before using it.
- Async dequeue performance enhancement. 160% performance improvement
  for v2 vs. v1.
- Async dequeue API name change from rte_vhost_try_dequeue_burst to
  rte_vhost_async_try_dequeue_burst.
- The completed packets update the used ring directly.

Wenwu Ma (3):
  examples/vhost: refactor vhost enqueue and dequeue datapaths.
  examples/vhost: use a new API to query remaining ring space
  examples/vhost: support vhost async dequeue data path

Yuan Wang (1):
  vhost: support async dequeue for split ring

 doc/guides/prog_guide/vhost_lib.rst |  10 +
 doc/guides/sample_app_ug/vhost.rst  |   9 +-
 examples/vhost/ioat.c               |  67 +++-
 examples/vhost/ioat.h               |  25 ++
 examples/vhost/main.c               | 224 +++++++----
 examples/vhost/main.h               |  33 +-
 examples/vhost/virtio_net.c         |  16 +-
 lib/vhost/rte_vhost_async.h         |  44 ++-
 lib/vhost/version.map               |   3 +
 lib/vhost/virtio_net.c              | 579 ++++++++++++++++++++++++++++
 10 files changed, 902 insertions(+), 108 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v3 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths.
  2021-06-23 15:00 ` [dpdk-dev] [PATCH v3 0/4] vhost: support async dequeue for split ring Wenwu Ma
@ 2021-06-23 15:00   ` Wenwu Ma
  2021-06-23 15:00   ` [dpdk-dev] [PATCH v3 2/4] examples/vhost: use a new API to query remaining ring space Wenwu Ma
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-06-23 15:00 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, Wenwu Ma

Previously, the data path checked a flag to decide which enqueue/dequeue
functions to call.

Now, we use an ops table that is initialized when the vhost device is
created, so the data path can call the ops directly without any further
flag checks.
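
For illustration only, a stripped-down sketch of that dispatch pattern
(simplified signatures; the real typedefs and table live in
examples/vhost/main.h and main.c):

#include <stdint.h>

#define MAX_VHOST_DEVICE 1024

typedef uint16_t (*dequeue_burst_t)(int vid, uint16_t queue_id);

struct queue_ops {
	dequeue_burst_t dequeue_pkt_burst;
};

static struct queue_ops vdev_queue_ops[MAX_VHOST_DEVICE];

/* new_device(): decide once which implementation this vid uses */
static void
setup_ops(int vid, dequeue_burst_t dequeue)
{
	vdev_queue_ops[vid].dequeue_pkt_burst = dequeue;
}

/* data path: a single indirect call, no per-burst flag checks */
static uint16_t
drain_tx(int vid, uint16_t queue_id)
{
	return vdev_queue_ops[vid].dequeue_pkt_burst(vid, queue_id);
}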

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 examples/vhost/main.c       | 112 ++++++++++++++++++++----------------
 examples/vhost/main.h       |  33 +++++++++--
 examples/vhost/virtio_net.c |  16 +++++-
 3 files changed, 105 insertions(+), 56 deletions(-)

diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index d2179eadb9..aebdc3a566 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -106,6 +106,8 @@ static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
 static char *socket_files;
 static int nb_sockets;
 
+static struct vhost_queue_ops vdev_queue_ops[MAX_VHOST_DEVICE];
+
 /* empty vmdq configuration structure. Filled in programatically */
 static struct rte_eth_conf vmdq_conf_default = {
 	.rxmode = {
@@ -885,27 +887,8 @@ drain_vhost(struct vhost_dev *vdev)
 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
 
-	if (builtin_net_driver) {
-		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
-	} else if (async_vhost_driver) {
-		uint32_t cpu_cpl_nr = 0;
-		uint16_t enqueue_fail = 0;
-		struct rte_mbuf *m_cpu_cpl[nr_xmit];
-
-		complete_async_pkts(vdev);
-		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
-					m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
-
-		if (cpu_cpl_nr)
-			free_pkts(m_cpu_cpl, cpu_cpl_nr);
-
-		enqueue_fail = nr_xmit - ret;
-		if (enqueue_fail)
-			free_pkts(&m[ret], nr_xmit - ret);
-	} else {
-		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
-						m, nr_xmit);
-	}
+	ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
+					VIRTIO_RXQ, m, nr_xmit);
 
 	if (enable_stats) {
 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
@@ -1184,6 +1167,36 @@ drain_mbuf_table(struct mbuf_table *tx_q)
 	}
 }
 
+uint16_t
+async_enqueue_pkts(struct vhost_dev *vdev, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint32_t rx_count)
+{
+	uint16_t enqueue_count;
+	uint32_t cpu_cpl_nr = 0;
+	uint16_t enqueue_fail = 0;
+	struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
+
+	complete_async_pkts(vdev);
+	enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
+				queue_id, pkts, rx_count,
+				m_cpu_cpl, &cpu_cpl_nr);
+	if (cpu_cpl_nr)
+		free_pkts(m_cpu_cpl, cpu_cpl_nr);
+
+	enqueue_fail = rx_count - enqueue_count;
+	if (enqueue_fail)
+		free_pkts(&pkts[enqueue_count], enqueue_fail);
+
+	return enqueue_count;
+}
+
+uint16_t
+sync_enqueue_pkts(struct vhost_dev *vdev, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint32_t rx_count)
+{
+	return rte_vhost_enqueue_burst(vdev->vid, queue_id, pkts, rx_count);
+}
+
 static __rte_always_inline void
 drain_eth_rx(struct vhost_dev *vdev)
 {
@@ -1214,29 +1227,8 @@ drain_eth_rx(struct vhost_dev *vdev)
 		}
 	}
 
-	if (builtin_net_driver) {
-		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
-						pkts, rx_count);
-	} else if (async_vhost_driver) {
-		uint32_t cpu_cpl_nr = 0;
-		uint16_t enqueue_fail = 0;
-		struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
-
-		complete_async_pkts(vdev);
-		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
-					VIRTIO_RXQ, pkts, rx_count,
-					m_cpu_cpl, &cpu_cpl_nr);
-		if (cpu_cpl_nr)
-			free_pkts(m_cpu_cpl, cpu_cpl_nr);
-
-		enqueue_fail = rx_count - enqueue_count;
-		if (enqueue_fail)
-			free_pkts(&pkts[enqueue_count], enqueue_fail);
-
-	} else {
-		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
-						pkts, rx_count);
-	}
+	enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
+						VIRTIO_RXQ, pkts, rx_count);
 
 	if (enable_stats) {
 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
@@ -1249,6 +1241,14 @@ drain_eth_rx(struct vhost_dev *vdev)
 		free_pkts(pkts, rx_count);
 }
 
+uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count)
+{
+	return rte_vhost_dequeue_burst(dev->vid, queue_id,
+					mbuf_pool, pkts, count);
+}
+
 static __rte_always_inline void
 drain_virtio_tx(struct vhost_dev *vdev)
 {
@@ -1256,13 +1256,8 @@ drain_virtio_tx(struct vhost_dev *vdev)
 	uint16_t count;
 	uint16_t i;
 
-	if (builtin_net_driver) {
-		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
-					pkts, MAX_PKT_BURST);
-	} else {
-		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
-					mbuf_pool, pkts, MAX_PKT_BURST);
-	}
+	count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
+				VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);
 
 	/* setup VMDq for the first packet */
 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
@@ -1436,6 +1431,21 @@ new_device(int vid)
 		}
 	}
 
+	if (builtin_net_driver) {
+		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
+		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
+	} else {
+		if (async_vhost_driver) {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+							async_enqueue_pkts;
+		} else {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+							sync_enqueue_pkts;
+		}
+
+		vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
+	}
+
 	if (builtin_net_driver)
 		vs_vhost_net_setup(vdev);
 
diff --git a/examples/vhost/main.h b/examples/vhost/main.h
index 0ccdce4b4a..7cd8a11a45 100644
--- a/examples/vhost/main.h
+++ b/examples/vhost/main.h
@@ -60,6 +60,19 @@ struct vhost_dev {
 	struct vhost_queue queues[MAX_QUEUE_PAIRS * 2];
 } __rte_cache_aligned;
 
+typedef uint16_t (*vhost_enqueue_burst_t)(struct vhost_dev *dev,
+			uint16_t queue_id, struct rte_mbuf **pkts,
+			uint32_t count);
+
+typedef uint16_t (*vhost_dequeue_burst_t)(struct vhost_dev *dev,
+			uint16_t queue_id, struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
+
+struct vhost_queue_ops {
+	vhost_enqueue_burst_t enqueue_pkt_burst;
+	vhost_dequeue_burst_t dequeue_pkt_burst;
+};
+
 TAILQ_HEAD(vhost_dev_tailq_list, vhost_dev);
 
 
@@ -84,9 +97,21 @@ struct lcore_info {
 void vs_vhost_net_setup(struct vhost_dev *dev);
 void vs_vhost_net_remove(struct vhost_dev *dev);
 uint16_t vs_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mbuf **pkts, uint32_t count);
+
+uint16_t builtin_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mbuf **pkts, uint32_t count);
+uint16_t builtin_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
+uint16_t sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 			 struct rte_mbuf **pkts, uint32_t count);
-
-uint16_t vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
-			 struct rte_mempool *mbuf_pool,
-			 struct rte_mbuf **pkts, uint16_t count);
+uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
+uint16_t async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			 struct rte_mbuf **pkts, uint32_t count);
+uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
 #endif /* _MAIN_H_ */
diff --git a/examples/vhost/virtio_net.c b/examples/vhost/virtio_net.c
index 9064fc3a82..2432a96566 100644
--- a/examples/vhost/virtio_net.c
+++ b/examples/vhost/virtio_net.c
@@ -238,6 +238,13 @@ vs_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 	return count;
 }
 
+uint16_t
+builtin_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint32_t count)
+{
+	return vs_enqueue_pkts(dev, queue_id, pkts, count);
+}
+
 static __rte_always_inline int
 dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 	    struct rte_mbuf *m, uint16_t desc_idx,
@@ -363,7 +370,7 @@ dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 	return 0;
 }
 
-uint16_t
+static uint16_t
 vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
 {
@@ -440,3 +447,10 @@ vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 
 	return i;
 }
+
+uint16_t
+builtin_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+{
+	return vs_dequeue_pkts(dev, queue_id, mbuf_pool, pkts, count);
+}
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v3 2/4] examples/vhost: use a new API to query remaining ring space
  2021-06-23 15:00 ` [dpdk-dev] [PATCH v3 0/4] vhost: support async dequeue for split ring Wenwu Ma
  2021-06-23 15:00   ` [dpdk-dev] [PATCH v3 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
@ 2021-06-23 15:00   ` Wenwu Ma
  2021-06-23 15:00   ` [dpdk-dev] [PATCH v3 3/4] vhost: support async dequeue for split ring Wenwu Ma
  2021-06-23 15:00   ` [dpdk-dev] [PATCH v3 4/4] examples/vhost: support vhost async dequeue data path Wenwu Ma
  3 siblings, 0 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-06-23 15:00 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, Wenwu Ma

A new API, rte_ioat_burst_capacity(), is available for querying the
remaining descriptor ring capacity, so use it instead of tracking the
free space locally with the ioat_space counter.

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 examples/vhost/ioat.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c
index 2a2c2d7202..bf4e033bdb 100644
--- a/examples/vhost/ioat.c
+++ b/examples/vhost/ioat.c
@@ -17,7 +17,6 @@ struct packet_tracker {
 	unsigned short next_read;
 	unsigned short next_write;
 	unsigned short last_remain;
-	unsigned short ioat_space;
 };
 
 struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
@@ -113,7 +112,6 @@ open_ioat(const char *value)
 			goto out;
 		}
 		rte_rawdev_start(dev_id);
-		cb_tracker[dev_id].ioat_space = IOAT_RING_SIZE - 1;
 		dma_info->nr++;
 		i++;
 	}
@@ -140,7 +138,7 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
 			src = descs[i_desc].src;
 			dst = descs[i_desc].dst;
 			i_seg = 0;
-			if (cb_tracker[dev_id].ioat_space < src->nr_segs)
+			if (rte_ioat_burst_capacity(dev_id) < src->nr_segs)
 				break;
 			while (i_seg < src->nr_segs) {
 				rte_ioat_enqueue_copy(dev_id,
@@ -155,7 +153,6 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
 			}
 			write &= mask;
 			cb_tracker[dev_id].size_track[write] = src->nr_segs;
-			cb_tracker[dev_id].ioat_space -= src->nr_segs;
 			write++;
 		}
 	} else {
@@ -194,7 +191,6 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 		if (n_seg == 0)
 			return 0;
 
-		cb_tracker[dev_id].ioat_space += n_seg;
 		n_seg += cb_tracker[dev_id].last_remain;
 
 		read = cb_tracker[dev_id].next_read;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v3 3/4] vhost: support async dequeue for split ring
  2021-06-23 15:00 ` [dpdk-dev] [PATCH v3 0/4] vhost: support async dequeue for split ring Wenwu Ma
  2021-06-23 15:00   ` [dpdk-dev] [PATCH v3 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
  2021-06-23 15:00   ` [dpdk-dev] [PATCH v3 2/4] examples/vhost: use a new API to query remaining ring space Wenwu Ma
@ 2021-06-23 15:00   ` Wenwu Ma
  2021-06-23 15:00   ` [dpdk-dev] [PATCH v3 4/4] examples/vhost: support vhost async dequeue data path Wenwu Ma
  3 siblings, 0 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-06-23 15:00 UTC (permalink / raw)
  To: dev
  Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, Yuan Wang, Jiayu Hu, Wenwu Ma

From: Yuan Wang <yuanx.wang@intel.com>

This patch implements asynchronous dequeue data path for split ring.
A new asynchronous dequeue function is introduced. With this function,
the application can try to receive packets from the guest with
offloading large copies to the DMA engine, thus saving precious CPU
cycles.

Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 doc/guides/prog_guide/vhost_lib.rst |  10 +
 lib/vhost/rte_vhost_async.h         |  44 ++-
 lib/vhost/version.map               |   3 +
 lib/vhost/virtio_net.c              | 579 ++++++++++++++++++++++++++++
 4 files changed, 633 insertions(+), 3 deletions(-)

diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
index d18fb98910..05c42c9b11 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -281,6 +281,16 @@ The following is an overview of some key Vhost API functions:
   Poll enqueue completion status from async data path. Completed packets
   are returned to applications through ``pkts``.
 
+* ``rte_vhost_async_try_dequeue_burst(vid, queue_id, mbuf_pool, pkts, count, nr_inflight)``
+
+  Try to receive packets from the guest with offloading large packets
+  to the DMA engine. Successfully dequeued packets are transfer
+  completed and returned in ``pkts``. There may be other packets that
+  the guest has sent but that are still being transferred by the DMA
+  engine; these are called in-flight packets. This function returns
+  in-flight packets only after the DMA engine finishes transferring
+  them. The number of in-flight packets is returned in ``nr_inflight``.
+
 Vhost-user Implementations
 --------------------------
 
diff --git a/lib/vhost/rte_vhost_async.h b/lib/vhost/rte_vhost_async.h
index 6faa31f5ad..58019408f1 100644
--- a/lib/vhost/rte_vhost_async.h
+++ b/lib/vhost/rte_vhost_async.h
@@ -84,13 +84,21 @@ struct rte_vhost_async_channel_ops {
 };
 
 /**
- * inflight async packet information
+ * in-flight async packet information
  */
+struct async_nethdr {
+	struct virtio_net_hdr hdr;
+	bool valid;
+};
+
 struct async_inflight_info {
 	struct rte_mbuf *mbuf;
-	uint16_t descs; /* num of descs inflight */
+	union {
+		uint16_t descs; /* num of descs in-flight */
+		struct async_nethdr nethdr;
+	};
 	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
-};
+} __rte_cache_aligned;
 
 /**
  *  dma channel feature bit definition
@@ -193,4 +201,34 @@ __rte_experimental
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count);
 
+/**
+ * This function tries to receive packets from the guest with offloading
+ * large copies to the DMA engine. Successfully dequeued packets are
+ * transfer completed, either by the CPU or the DMA engine, and they are
+ * returned in "pkts". There may be other packets that the guest has
+ * sent but that are still being transferred by the DMA engine; these
+ * are called in-flight packets. The number of in-flight packets is
+ * returned in "nr_inflight". This function returns in-flight packets
+ * only after the DMA engine finishes transferring them.
+ *
+ * @param vid
+ *  id of vhost device to dequeue data
+ * @param queue_id
+ *  queue id to dequeue data
+ * @param pkts
+ *  blank array to keep successfully dequeued packets
+ * @param count
+ *  size of the packet array
+ * @param nr_inflight
+ *  the number of in-flight packets at the time of return. If an error
+ *  occurred, it is set to -1.
+ * @return
+ *  num of successfully dequeued packets
+ */
+__rte_experimental
+uint16_t
+rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
+	int *nr_inflight);
+
 #endif /* _RTE_VHOST_ASYNC_H_ */
diff --git a/lib/vhost/version.map b/lib/vhost/version.map
index 9103a23cd4..a320f889cd 100644
--- a/lib/vhost/version.map
+++ b/lib/vhost/version.map
@@ -79,4 +79,7 @@ EXPERIMENTAL {
 
 	# added in 21.05
 	rte_vhost_get_negotiated_protocol_features;
+
+	# added in 21.08
+	rte_vhost_async_try_dequeue_burst;
 };
diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index b93482587c..89a6715e7a 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -2673,6 +2673,32 @@ virtio_dev_pktmbuf_prep(struct virtio_net *dev, struct rte_mbuf *pkt,
 	return -1;
 }
 
+/*
+ * Allocate a host supported pktmbuf.
+ */
+static __rte_always_inline struct rte_mbuf *
+virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
+			 uint32_t data_len)
+{
+	struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
+
+	if (unlikely(pkt == NULL)) {
+		VHOST_LOG_DATA(ERR,
+			"Failed to allocate memory for mbuf.\n");
+		return NULL;
+	}
+
+	if (virtio_dev_pktmbuf_prep(dev, pkt, data_len)) {
+		/* Data doesn't fit into the buffer and the host supports
+		 * only linear buffers
+		 */
+		rte_pktmbuf_free(pkt);
+		return NULL;
+	}
+
+	return pkt;
+}
+
 __rte_always_inline
 static uint16_t
 virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
@@ -3147,3 +3173,556 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 
 	return count;
 }
+
+static __rte_always_inline int
+async_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		  struct buf_vector *buf_vec, uint16_t nr_vec,
+		  struct rte_mbuf *m, struct rte_mempool *mbuf_pool,
+		  struct iovec *src_iovec, struct iovec *dst_iovec,
+		  struct rte_vhost_iov_iter *src_it,
+		  struct rte_vhost_iov_iter *dst_it,
+		  struct async_nethdr *nethdr,
+		  bool legacy_ol_flags)
+{
+	uint64_t buf_addr;
+	uint32_t tlen = 0;
+	uint32_t buf_avail, buf_offset, buf_len;
+	uint32_t mbuf_avail, mbuf_offset;
+	uint32_t cpy_len, cpy_threshold;
+	/* A counter to avoid desc dead loop chain */
+	uint16_t vec_idx = 0;
+	int tvec_idx = 0;
+	struct rte_mbuf *cur = m, *prev = m;
+	struct virtio_net_hdr tmp_hdr;
+	struct virtio_net_hdr *hdr = NULL;
+	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
+
+	buf_addr = buf_vec[vec_idx].buf_addr;
+	buf_len = buf_vec[vec_idx].buf_len;
+
+	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1))
+		return -1;
+
+	cpy_threshold = vq->async_threshold;
+
+	if (virtio_net_with_host_offload(dev)) {
+		if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
+			/*
+			 * No luck, the virtio-net header doesn't fit
+			 * in a contiguous virtual area.
+			 */
+			copy_vnet_hdr_from_desc(&tmp_hdr, buf_vec);
+			hdr = &tmp_hdr;
+		} else {
+			hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
+		}
+	}
+
+	/*
+	 * A virtio driver normally uses at least 2 desc buffers
+	 * for Tx: the first for storing the header, and others
+	 * for storing the data.
+	 */
+	if (unlikely(buf_len < dev->vhost_hlen)) {
+		buf_offset = dev->vhost_hlen - buf_len;
+		vec_idx++;
+		buf_addr = buf_vec[vec_idx].buf_addr;
+		buf_len = buf_vec[vec_idx].buf_len;
+		buf_avail  = buf_len - buf_offset;
+	} else if (buf_len == dev->vhost_hlen) {
+		if (unlikely(++vec_idx >= nr_vec))
+			return -1;
+		buf_addr = buf_vec[vec_idx].buf_addr;
+		buf_len = buf_vec[vec_idx].buf_len;
+
+		buf_offset = 0;
+		buf_avail = buf_len;
+	} else {
+		buf_offset = dev->vhost_hlen;
+		buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
+	}
+
+	PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
+			(uint32_t)buf_avail, 0);
+
+	mbuf_offset = 0;
+	mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
+	while (1) {
+		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
+
+		if (cpy_len >= cpy_threshold) {
+			async_fill_vec(src_iovec + tvec_idx,
+				(void *)((uintptr_t)(buf_addr + buf_offset)),
+				(size_t)cpy_len);
+			async_fill_vec(dst_iovec + tvec_idx,
+				rte_pktmbuf_mtod_offset(cur,
+					void *, mbuf_offset),
+				(size_t)cpy_len);
+			tvec_idx++;
+			tlen += cpy_len;
+		} else if (likely(cpy_len > MAX_BATCH_LEN ||
+				vq->batch_copy_nb_elems >= vq->size ||
+				(hdr && cur == m))) {
+			rte_memcpy(rte_pktmbuf_mtod_offset(cur,
+					void *, mbuf_offset),
+				(void *)((uintptr_t)(buf_addr + buf_offset)),
+				cpy_len);
+		} else {
+			batch_copy[vq->batch_copy_nb_elems].dst =
+				rte_pktmbuf_mtod_offset(cur,
+					void *, mbuf_offset);
+			batch_copy[vq->batch_copy_nb_elems].src =
+				(void *)((uintptr_t)(buf_addr + buf_offset));
+			batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
+			vq->batch_copy_nb_elems++;
+		}
+
+		mbuf_avail  -= cpy_len;
+		mbuf_offset += cpy_len;
+		buf_avail  -= cpy_len;
+		buf_offset += cpy_len;
+
+		/* This buf reaches to its end, get the next one */
+		if (buf_avail == 0) {
+			if (++vec_idx >= nr_vec)
+				break;
+
+			buf_addr = buf_vec[vec_idx].buf_addr;
+			buf_len = buf_vec[vec_idx].buf_len;
+
+			buf_offset = 0;
+			buf_avail = buf_len;
+
+			PRINT_PACKET(dev, (uintptr_t)buf_addr,
+					(uint32_t)buf_avail, 0);
+		}
+
+		/*
+		 * This mbuf reaches to its end, get a new one
+		 * to hold more data.
+		 */
+		if (mbuf_avail == 0) {
+			cur = rte_pktmbuf_alloc(mbuf_pool);
+			if (unlikely(cur == NULL)) {
+				VHOST_LOG_DATA(ERR, "Failed to "
+					"allocate memory for mbuf.\n");
+				return -1;
+			}
+
+			prev->next = cur;
+			prev->data_len = mbuf_offset;
+			m->nb_segs += 1;
+			m->pkt_len += mbuf_offset;
+			prev = cur;
+
+			mbuf_offset = 0;
+			mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
+		}
+	}
+
+	prev->data_len = mbuf_offset;
+	m->pkt_len += mbuf_offset;
+
+	if (hdr && tlen) {
+		nethdr->valid = true;
+		nethdr->hdr = *hdr;
+	} else if (hdr)
+		vhost_dequeue_offload(hdr, m, legacy_ol_flags);
+
+	if (tlen) {
+		async_fill_iter(src_it, tlen, src_iovec, tvec_idx);
+		async_fill_iter(dst_it, tlen, dst_iovec, tvec_idx);
+	} else
+		src_it->count = 0;
+
+	return 0;
+}
+
+static __rte_always_inline uint16_t
+async_poll_dequeue_completed_split(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint16_t count, bool legacy_ol_flags)
+{
+	uint16_t n_pkts_cpl = 0, n_pkts_put = 0;
+	uint16_t start_idx, pkt_idx, from;
+	struct async_inflight_info *pkts_info;
+
+	pkt_idx = vq->async_pkts_idx & (vq->size - 1);
+	pkts_info = vq->async_pkts_info;
+	start_idx = virtio_dev_rx_async_get_info_idx(pkt_idx, vq->size,
+			vq->async_pkts_inflight_n);
+
+	if (count > vq->async_last_pkts_n) {
+		n_pkts_cpl = vq->async_ops.check_completed_copies(dev->vid,
+			queue_id, 0, count - vq->async_last_pkts_n);
+	}
+
+	n_pkts_cpl += vq->async_last_pkts_n;
+	if (unlikely(n_pkts_cpl == 0))
+		return 0;
+
+	n_pkts_put = RTE_MIN(count, n_pkts_cpl);
+
+	for (pkt_idx = 0; pkt_idx < n_pkts_put; pkt_idx++) {
+		from = (start_idx + pkt_idx) & (vq->size - 1);
+		pkts[pkt_idx] = pkts_info[from].mbuf;
+
+		if (pkts_info[from].nethdr.valid) {
+			vhost_dequeue_offload(&pkts_info[from].nethdr.hdr,
+					pkts[pkt_idx], legacy_ol_flags);
+		}
+	}
+	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
+
+	if (n_pkts_put) {
+		/* write back completed descs to used ring */
+		write_back_completed_descs_split(vq, n_pkts_put);
+		/* update used ring */
+		__atomic_add_fetch(&vq->used->idx,
+				n_pkts_put, __ATOMIC_RELEASE);
+
+		vq->async_pkts_inflight_n -= n_pkts_put;
+	}
+
+	return n_pkts_put;
+}
+
+static __rte_always_inline uint16_t
+virtio_dev_tx_async_split(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t count, bool legacy_ol_flags)
+{
+	static bool allocerr_warned;
+	uint16_t pkt_idx;
+	uint16_t free_entries;
+	uint16_t slot_idx = 0;
+	uint16_t segs_await = 0;
+	uint16_t nr_done_pkts = 0, nr_async_pkts = 0, nr_async_cmpl_pkts = 0;
+	uint16_t nr_async_burst = 0;
+	uint16_t pkt_err = 0;
+	uint16_t iovec_idx = 0, it_idx = 0;
+
+	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
+	struct iovec *vec_pool = vq->vec_pool;
+	struct iovec *src_iovec = vec_pool;
+	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
+
+	struct async_pkt_index {
+		uint16_t last_avail_idx;
+	} async_pkts_log[MAX_PKT_BURST];
+
+	/**
+	 * The ordering between avail index and
+	 * desc reads needs to be enforced.
+	 */
+	free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
+			vq->last_avail_idx;
+	if (free_entries == 0)
+		goto out;
+
+	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
+
+	count = RTE_MIN(count, MAX_PKT_BURST);
+	count = RTE_MIN(count, free_entries);
+	VHOST_LOG_DATA(DEBUG, "(%d) about to dequeue %u buffers\n",
+			dev->vid, count);
+
+	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+		uint16_t head_idx = 0;
+		uint16_t nr_vec = 0;
+		uint32_t buf_len;
+		int err;
+		struct buf_vector buf_vec[BUF_VECTOR_MAX];
+		struct rte_mbuf *pkt;
+
+		if (unlikely(fill_vec_buf_split(dev, vq, vq->last_avail_idx,
+						&nr_vec, buf_vec,
+						&head_idx, &buf_len,
+						VHOST_ACCESS_RO) < 0))
+			break;
+
+		pkt = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
+		if (unlikely(pkt == NULL)) {
+			/**
+			 * mbuf allocation fails for jumbo packets when external
+			 * buffer allocation is not allowed and linear buffer
+			 * is required. Drop this packet.
+			 */
+			if (!allocerr_warned) {
+				VHOST_LOG_DATA(ERR,
+					"Failed mbuf alloc of size %d from %s on %s.\n",
+					buf_len, mbuf_pool->name, dev->ifname);
+				allocerr_warned = true;
+			}
+			break;
+		}
+
+		slot_idx = (vq->async_pkts_idx + nr_async_pkts) &
+				(vq->size - 1);
+		err = async_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkt,
+				mbuf_pool, &src_iovec[iovec_idx],
+				&dst_iovec[iovec_idx], &it_pool[it_idx],
+				&it_pool[it_idx + 1],
+				&pkts_info[slot_idx].nethdr, legacy_ol_flags);
+		if (unlikely(err)) {
+			rte_pktmbuf_free(pkt);
+			if (!allocerr_warned) {
+				VHOST_LOG_DATA(ERR,
+					"Failed to copy desc to mbuf on %s.\n",
+					dev->ifname);
+				allocerr_warned = true;
+			}
+			break;
+		}
+
+		if (it_pool[it_idx].count) {
+			uint16_t to = vq->async_desc_idx_split & (vq->size - 1);
+
+			async_fill_desc(&tdes[nr_async_burst], &it_pool[it_idx],
+				&it_pool[it_idx + 1]);
+			pkts_info[slot_idx].mbuf = pkt;
+			async_pkts_log[nr_async_pkts++].last_avail_idx =
+				vq->last_avail_idx;
+			nr_async_burst++;
+			iovec_idx += it_pool[it_idx].nr_segs;
+			it_idx += 2;
+			segs_await += it_pool[it_idx].nr_segs;
+
+			/* keep used desc */
+			vq->async_descs_split[to].id = head_idx;
+			vq->async_descs_split[to].len = 0;
+			vq->async_desc_idx_split++;
+		} else {
+			update_shadow_used_ring_split(vq, head_idx, 0);
+			pkts[nr_done_pkts++] = pkt;
+		}
+
+		vq->last_avail_idx++;
+
+		if (unlikely((nr_async_burst >= VHOST_ASYNC_BATCH_THRESHOLD) ||
+					((VHOST_MAX_ASYNC_VEC >> 1) -
+					 segs_await < BUF_VECTOR_MAX))) {
+			uint16_t nr_pkts;
+
+			nr_pkts = vq->async_ops.transfer_data(dev->vid,
+					queue_id, tdes, 0, nr_async_burst);
+			src_iovec = vec_pool;
+			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+			it_idx = 0;
+			segs_await = 0;
+			vq->async_pkts_inflight_n += nr_pkts;
+
+			if (unlikely(nr_pkts < nr_async_burst)) {
+				pkt_err = nr_async_burst - nr_pkts;
+				nr_async_burst = 0;
+				break;
+			}
+			nr_async_burst = 0;
+		}
+	}
+
+	if (nr_async_burst) {
+		uint32_t nr_pkts;
+
+		nr_pkts = vq->async_ops.transfer_data(dev->vid, queue_id,
+				tdes, 0, nr_async_burst);
+		vq->async_pkts_inflight_n += nr_pkts;
+
+		if (unlikely(nr_pkts < nr_async_burst))
+			pkt_err = nr_async_burst - nr_pkts;
+	}
+
+	do_data_copy_dequeue(vq);
+
+	if (unlikely(pkt_err)) {
+		uint16_t nr_err_dma = pkt_err;
+		uint16_t nr_err_sw;
+
+		nr_async_pkts -= nr_err_dma;
+
+		/**
+		 * revert shadow used ring and free pktmbufs for
+		 * CPU-copied pkts after the first DMA-error pkt.
+		 */
+		nr_err_sw = vq->last_avail_idx -
+			async_pkts_log[nr_async_pkts].last_avail_idx -
+			nr_err_dma;
+		vq->shadow_used_idx -= nr_err_sw;
+		while (nr_err_sw-- > 0)
+			rte_pktmbuf_free(pkts[--nr_done_pkts]);
+
+		/**
+		 * recover DMA-copy related structures and free pktmbufs
+		 * for DMA-error pkts.
+		 */
+		vq->async_desc_idx_split -= nr_err_dma;
+		while (nr_err_dma-- > 0) {
+			rte_pktmbuf_free(
+				pkts_info[slot_idx & (vq->size - 1)].mbuf);
+			slot_idx--;
+		}
+
+		/* recover available ring */
+		vq->last_avail_idx =
+			async_pkts_log[nr_async_pkts].last_avail_idx;
+	}
+
+	vq->async_pkts_idx += nr_async_pkts;
+
+	if (likely(vq->shadow_used_idx))
+		flush_shadow_used_ring_split(dev, vq);
+
+out:
+	if (nr_done_pkts < count && vq->async_pkts_inflight_n > 0) {
+		nr_async_cmpl_pkts = async_poll_dequeue_completed_split(dev, vq,
+					queue_id, pkts, count - nr_done_pkts,
+					legacy_ol_flags);
+		nr_done_pkts += nr_async_cmpl_pkts;
+	}
+	if (likely(nr_done_pkts))
+		vhost_vring_call_split(dev, vq);
+
+	return nr_done_pkts;
+}
+
+__rte_noinline
+static uint16_t
+virtio_dev_tx_async_split_legacy(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t count)
+{
+	return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool,
+				pkts, count, true);
+}
+
+__rte_noinline
+static uint16_t
+virtio_dev_tx_async_split_compliant(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t count)
+{
+	return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool,
+				pkts, count, false);
+}
+
+uint16_t
+rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
+	int *nr_inflight)
+{
+	struct virtio_net *dev;
+	struct rte_mbuf *rarp_mbuf = NULL;
+	struct vhost_virtqueue *vq;
+	int16_t success = 1;
+
+	*nr_inflight = -1;
+
+	dev = get_device(vid);
+	if (!dev)
+		return 0;
+
+	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
+		VHOST_LOG_DATA(ERR,
+			"(%d) %s: built-in vhost net backend is disabled.\n",
+			dev->vid, __func__);
+		return 0;
+	}
+
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
+		VHOST_LOG_DATA(ERR,
+			"(%d) %s: invalid virtqueue idx %d.\n",
+			dev->vid, __func__, queue_id);
+		return 0;
+	}
+
+	vq = dev->virtqueue[queue_id];
+
+	if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
+		return 0;
+
+	if (unlikely(vq->enabled == 0)) {
+		count = 0;
+		goto out_access_unlock;
+	}
+
+	if (unlikely(!vq->async_registered)) {
+		VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
+			dev->vid, __func__, queue_id);
+		count = 0;
+		goto out_access_unlock;
+	}
+
+	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+		vhost_user_iotlb_rd_lock(vq);
+
+	if (unlikely(vq->access_ok == 0))
+		if (unlikely(vring_translate(dev, vq) < 0)) {
+			count = 0;
+			goto out_access_unlock;
+		}
+
+	/*
+	 * Construct a RARP broadcast packet, and inject it into the "pkts"
+	 * array, so that it looks like the guest actually sent such a packet.
+	 *
+	 * Check user_send_rarp() for more information.
+	 *
+	 * broadcast_rarp shares a cacheline in the virtio_net structure
+	 * with some fields that are accessed during enqueue and
+	 * __atomic_compare_exchange_n causes a write if performed compare
+	 * and exchange. This could result in false sharing between enqueue
+	 * and dequeue.
+	 *
+	 * Prevent unnecessary false sharing by reading broadcast_rarp first
+	 * and only performing compare and exchange if the read indicates it
+	 * is likely to be set.
+	 */
+	if (unlikely(__atomic_load_n(&dev->broadcast_rarp, __ATOMIC_ACQUIRE) &&
+			__atomic_compare_exchange_n(&dev->broadcast_rarp,
+			&success, 0, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED))) {
+
+		rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
+		if (rarp_mbuf == NULL) {
+			VHOST_LOG_DATA(ERR, "Failed to make RARP packet.\n");
+			count = 0;
+			goto out;
+		}
+		count -= 1;
+	}
+
+	if (unlikely(vq_is_packed(dev)))
+		return 0;
+
+	if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
+		count = virtio_dev_tx_async_split_legacy(dev, vq, queue_id,
+				mbuf_pool, pkts, count);
+	else
+		count = virtio_dev_tx_async_split_compliant(dev, vq, queue_id,
+				mbuf_pool, pkts, count);
+
+out:
+	*nr_inflight = vq->async_pkts_inflight_n;
+
+	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+		vhost_user_iotlb_rd_unlock(vq);
+
+out_access_unlock:
+	rte_spinlock_unlock(&vq->access_lock);
+
+	if (unlikely(rarp_mbuf != NULL)) {
+		/*
+		 * Inject it to the head of "pkts" array, so that switch's mac
+		 * learning table will get updated first.
+		 */
+		memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *));
+		pkts[0] = rarp_mbuf;
+		count += 1;
+	}
+
+	return count;
+}
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread
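
A note on the completion semantics implemented above: copies finished by the
DMA engine are harvested in async_poll_dequeue_completed_split() and written
back to the used ring, while the caller learns how many copies are still
outstanding through "nr_inflight". The following is a minimal, hypothetical
teardown sketch (not part of the patch; the helper name and burst size are
illustrative) that keeps polling until nothing remains in flight:

#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_vhost_async.h>

#define DRAIN_BURST 32

/* Hypothetical helper: poll the async dequeue path until the DMA engine
 * has finished every outstanding copy for this virtqueue.
 */
static void
drain_vhost_async_dequeue(int vid, uint16_t queue_id,
		struct rte_mempool *mbuf_pool)
{
	struct rte_mbuf *pkts[DRAIN_BURST];
	int nr_inflight = 0;

	do {
		uint16_t n = rte_vhost_async_try_dequeue_burst(vid, queue_id,
				mbuf_pool, pkts, DRAIN_BURST, &nr_inflight);

		/* Free (or forward) whatever completed in this poll. */
		rte_pktmbuf_free_bulk(pkts, n);
	} while (nr_inflight > 0);
}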

* [dpdk-dev] [PATCH v3 4/4] examples/vhost: support vhost async dequeue data path
  2021-06-23 15:00 ` [dpdk-dev] [PATCH v3 0/4] vhost: support async dequeue for split ring Wenwu Ma
                     ` (2 preceding siblings ...)
  2021-06-23 15:00   ` [dpdk-dev] [PATCH v3 3/4] vhost: support async dequeue for split ring Wenwu Ma
@ 2021-06-23 15:00   ` Wenwu Ma
  3 siblings, 0 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-06-23 15:00 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, Wenwu Ma

This patch adds the vhost async dequeue data path to the vhost sample.
The vswitch can leverage IOAT to accelerate the vhost async dequeue data path.

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 doc/guides/sample_app_ug/vhost.rst |   9 +-
 examples/vhost/ioat.c              |  61 ++++++++++---
 examples/vhost/ioat.h              |  25 ++++++
 examples/vhost/main.c              | 140 ++++++++++++++++++++---------
 4 files changed, 177 insertions(+), 58 deletions(-)

diff --git a/doc/guides/sample_app_ug/vhost.rst b/doc/guides/sample_app_ug/vhost.rst
index 9afde9c7f5..63dcf181e1 100644
--- a/doc/guides/sample_app_ug/vhost.rst
+++ b/doc/guides/sample_app_ug/vhost.rst
@@ -169,9 +169,12 @@ demonstrates how to use the async vhost APIs. It's used in combination with dmas
 **--dmas**
 This parameter is used to specify the assigned DMA device of a vhost device.
 Async vhost-user net driver will be used if --dmas is set. For example
---dmas [txd0@00:04.0,txd1@00:04.1] means use DMA channel 00:04.0 for vhost
-device 0 enqueue operation and use DMA channel 00:04.1 for vhost device 1
-enqueue operation.
+--dmas [txd0@00:04.0,txd1@00:04.1,rxd0@00:04.2,rxd1@00:04.3] means use
+DMA channel 00:04.0/00:04.2 for vhost device 0 enqueue/dequeue operation
+and use DMA channel 00:04.1/00:04.3 for vhost device 1 enqueue/dequeue
+operation. The device index corresponds to the order of the socket files:
+vhost device 0 is created through the first socket file, vhost device 1
+through the second socket file, and so on.
 
 Common Issues
 -------------
diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c
index bf4e033bdb..a305100b47 100644
--- a/examples/vhost/ioat.c
+++ b/examples/vhost/ioat.c
@@ -21,6 +21,8 @@ struct packet_tracker {
 
 struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
 
+int vid2socketid[MAX_VHOST_DEVICE];
+
 int
 open_ioat(const char *value)
 {
@@ -29,7 +31,7 @@ open_ioat(const char *value)
 	char *addrs = input;
 	char *ptrs[2];
 	char *start, *end, *substr;
-	int64_t vid, vring_id;
+	int64_t socketid, vring_id;
 	struct rte_ioat_rawdev_config config;
 	struct rte_rawdev_info info = { .dev_private = &config };
 	char name[32];
@@ -60,6 +62,8 @@ open_ioat(const char *value)
 		goto out;
 	}
 	while (i < args_nr) {
+		char *txd, *rxd;
+		bool is_txd;
 		char *arg_temp = dma_arg[i];
 		uint8_t sub_nr;
 		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
@@ -68,27 +72,38 @@ open_ioat(const char *value)
 			goto out;
 		}
 
-		start = strstr(ptrs[0], "txd");
-		if (start == NULL) {
+		int async_flag;
+		txd = strstr(ptrs[0], "txd");
+		rxd = strstr(ptrs[0], "rxd");
+		if (txd == NULL && rxd == NULL) {
 			ret = -1;
 			goto out;
+		} else if (txd) {
+			is_txd = true;
+			start = txd;
+			async_flag = ASYNC_RX_VHOST;
+		} else {
+			is_txd = false;
+			start = rxd;
+			async_flag = ASYNC_TX_VHOST;
 		}
 
 		start += 3;
-		vid = strtol(start, &end, 0);
+		socketid = strtol(start, &end, 0);
 		if (end == start) {
 			ret = -1;
 			goto out;
 		}
 
-		vring_id = 0 + VIRTIO_RXQ;
+		vring_id = is_txd ? VIRTIO_RXQ : VIRTIO_TXQ;
+
 		if (rte_pci_addr_parse(ptrs[1],
-				&(dma_info + vid)->dmas[vring_id].addr) < 0) {
+			&(dma_info + socketid)->dmas[vring_id].addr) < 0) {
 			ret = -1;
 			goto out;
 		}
 
-		rte_pci_device_name(&(dma_info + vid)->dmas[vring_id].addr,
+		rte_pci_device_name(&(dma_info + socketid)->dmas[vring_id].addr,
 				name, sizeof(name));
 		dev_id = rte_rawdev_get_dev_id(name);
 		if (dev_id == (uint16_t)(-ENODEV) ||
@@ -103,8 +118,9 @@ open_ioat(const char *value)
 			goto out;
 		}
 
-		(dma_info + vid)->dmas[vring_id].dev_id = dev_id;
-		(dma_info + vid)->dmas[vring_id].is_valid = true;
+		(dma_info + socketid)->dmas[vring_id].dev_id = dev_id;
+		(dma_info + socketid)->dmas[vring_id].is_valid = true;
+		(dma_info + socketid)->async_flag |= async_flag;
 		config.ring_size = IOAT_RING_SIZE;
 		config.hdls_disable = true;
 		if (rte_rawdev_configure(dev_id, &info, sizeof(config)) < 0) {
@@ -126,13 +142,16 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data, uint16_t count)
 {
 	uint32_t i_desc;
-	uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2 + VIRTIO_RXQ].dev_id;
 	struct rte_vhost_iov_iter *src = NULL;
 	struct rte_vhost_iov_iter *dst = NULL;
 	unsigned long i_seg;
 	unsigned short mask = MAX_ENQUEUED_SIZE - 1;
-	unsigned short write = cb_tracker[dev_id].next_write;
 
+	if (queue_id >= MAX_RING_COUNT)
+		return -1;
+
+	uint16_t dev_id = dma_bind[vid2socketid[vid]].dmas[queue_id].dev_id;
+	unsigned short write = cb_tracker[dev_id].next_write;
 	if (!opaque_data) {
 		for (i_desc = 0; i_desc < count; i_desc++) {
 			src = descs[i_desc].src;
@@ -170,16 +189,16 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data,
 		uint16_t max_packets)
 {
-	if (!opaque_data) {
+	if (!opaque_data && (queue_id < MAX_RING_COUNT)) {
 		uintptr_t dump[255];
 		int n_seg;
 		unsigned short read, write;
 		unsigned short nb_packet = 0;
 		unsigned short mask = MAX_ENQUEUED_SIZE - 1;
 		unsigned short i;
+		uint16_t dev_id;
 
-		uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2
-				+ VIRTIO_RXQ].dev_id;
+		dev_id = dma_bind[vid2socketid[vid]].dmas[queue_id].dev_id;
 		n_seg = rte_ioat_completed_ops(dev_id, 255, NULL, NULL, dump, dump);
 		if (n_seg < 0) {
 			RTE_LOG(ERR,
@@ -215,4 +234,18 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 	return -1;
 }
 
+uint32_t get_async_flag_by_vid(int vid)
+{
+	return dma_bind[vid2socketid[vid]].async_flag;
+}
+
+uint32_t get_async_flag_by_socketid(int socketid)
+{
+	return dma_bind[socketid].async_flag;
+}
+
+void init_vid2socketid_array(int vid, int socketid)
+{
+	vid2socketid[vid] = socketid;
+}
 #endif /* RTE_RAW_IOAT */
diff --git a/examples/vhost/ioat.h b/examples/vhost/ioat.h
index 1aa28ed6a3..51111d65af 100644
--- a/examples/vhost/ioat.h
+++ b/examples/vhost/ioat.h
@@ -12,6 +12,9 @@
 #define MAX_VHOST_DEVICE 1024
 #define IOAT_RING_SIZE 4096
 #define MAX_ENQUEUED_SIZE 4096
+#define MAX_RING_COUNT	2
+#define ASYNC_RX_VHOST	1
+#define ASYNC_TX_VHOST	2
 
 struct dma_info {
 	struct rte_pci_addr addr;
@@ -20,6 +23,7 @@ struct dma_info {
 };
 
 struct dma_for_vhost {
+	int async_flag;
 	struct dma_info dmas[RTE_MAX_QUEUES_PER_PORT * 2];
 	uint16_t nr;
 };
@@ -36,6 +40,10 @@ uint32_t
 ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data,
 		uint16_t max_packets);
+
+uint32_t get_async_flag_by_vid(int vid);
+uint32_t get_async_flag_by_socketid(int socketid);
+void init_vid2socketid_array(int vid, int socketid);
 #else
 static int open_ioat(const char *value __rte_unused)
 {
@@ -59,5 +67,22 @@ ioat_check_completed_copies_cb(int vid __rte_unused,
 {
 	return -1;
 }
+
+static uint32_t
+get_async_flag_by_vid(int vid __rte_unused)
+{
+	return 0;
+}
+
+static uint32_t
+get_async_flag_by_socketid(int socketid __rte_unused)
+{
+	return 0;
+}
+
+static void
+init_vid2socketid_array(int vid __rte_unused, int socketid __rte_unused)
+{
+}
 #endif
 #endif /* _IOAT_H_ */
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index aebdc3a566..81d7e4cbd3 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -93,8 +93,6 @@ static int client_mode;
 
 static int builtin_net_driver;
 
-static int async_vhost_driver;
-
 static char *dma_type;
 
 /* Specify timeout (in useconds) between retries on RX. */
@@ -679,7 +677,6 @@ us_vhost_parse_args(int argc, char **argv)
 				us_vhost_usage(prgname);
 				return -1;
 			}
-			async_vhost_driver = 1;
 			break;
 
 		case OPT_CLIENT_NUM:
@@ -897,7 +894,7 @@ drain_vhost(struct vhost_dev *vdev)
 				__ATOMIC_SEQ_CST);
 	}
 
-	if (!async_vhost_driver)
+	if ((get_async_flag_by_vid(vdev->vid) & ASYNC_RX_VHOST) == 0)
 		free_pkts(m, nr_xmit);
 }
 
@@ -1237,10 +1234,19 @@ drain_eth_rx(struct vhost_dev *vdev)
 				__ATOMIC_SEQ_CST);
 	}
 
-	if (!async_vhost_driver)
+	if ((get_async_flag_by_vid(vdev->vid) & ASYNC_RX_VHOST) == 0)
 		free_pkts(pkts, rx_count);
 }
 
+uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+				struct rte_mempool *mbuf_pool,
+				struct rte_mbuf **pkts, uint16_t count)
+{
+	int nr_inflight;
+	return rte_vhost_async_try_dequeue_burst(dev->vid, queue_id,
+			mbuf_pool, pkts, count, &nr_inflight);
+}
+
 uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 			struct rte_mempool *mbuf_pool,
 			struct rte_mbuf **pkts, uint16_t count)
@@ -1392,12 +1398,90 @@ destroy_device(int vid)
 		"(%d) device has been removed from data core\n",
 		vdev->vid);
 
-	if (async_vhost_driver)
+	if (get_async_flag_by_vid(vid) & ASYNC_RX_VHOST)
 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
+	if (get_async_flag_by_vid(vid) & ASYNC_TX_VHOST)
+		rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
 
 	rte_free(vdev);
 }
 
+static int
+get_socketid_by_vid(int vid)
+{
+	int i;
+	char ifname[PATH_MAX];
+	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
+
+	for (i = 0; i < nb_sockets; i++) {
+		char *file = socket_files + i * PATH_MAX;
+		if (strcmp(file, ifname) == 0)
+			return i;
+	}
+
+	return -1;
+}
+
+static int
+init_vhost_queue_ops(int vid)
+{
+	int socketid = get_socketid_by_vid(vid);
+	if (socketid == -1)
+		return -1;
+
+	init_vid2socketid_array(vid, socketid);
+	if (builtin_net_driver) {
+		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
+		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
+	} else {
+		if (get_async_flag_by_vid(vid) & ASYNC_RX_VHOST) {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+						async_enqueue_pkts;
+		} else {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+						sync_enqueue_pkts;
+		}
+
+		if (get_async_flag_by_vid(vid) & ASYNC_TX_VHOST) {
+			vdev_queue_ops[vid].dequeue_pkt_burst =
+						async_dequeue_pkts;
+		} else {
+			vdev_queue_ops[vid].dequeue_pkt_burst =
+						sync_dequeue_pkts;
+		}
+	}
+
+	return 0;
+}
+
+static int
+vhost_async_channel_register(int vid)
+{
+	int ret = 0;
+	struct rte_vhost_async_features f;
+	struct rte_vhost_async_channel_ops channel_ops;
+
+	if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
+		channel_ops.transfer_data = ioat_transfer_data_cb;
+		channel_ops.check_completed_copies =
+			ioat_check_completed_copies_cb;
+
+		f.async_inorder = 1;
+		f.async_threshold = 256;
+
+		if (get_async_flag_by_vid(vid) & ASYNC_RX_VHOST) {
+			ret |= rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
+					f.intval, &channel_ops);
+		}
+		if (get_async_flag_by_vid(vid) & ASYNC_TX_VHOST) {
+			ret |= rte_vhost_async_channel_register(vid, VIRTIO_TXQ,
+					f.intval, &channel_ops);
+		}
+	}
+
+	return ret;
+}
+
 /*
  * A new device is added to a data core. First the device is added to the main linked list
  * and then allocated to a specific data core.
@@ -1431,20 +1515,8 @@ new_device(int vid)
 		}
 	}
 
-	if (builtin_net_driver) {
-		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
-		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
-	} else {
-		if (async_vhost_driver) {
-			vdev_queue_ops[vid].enqueue_pkt_burst =
-							async_enqueue_pkts;
-		} else {
-			vdev_queue_ops[vid].enqueue_pkt_burst =
-							sync_enqueue_pkts;
-		}
-
-		vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
-	}
+	if (init_vhost_queue_ops(vid) != 0)
+		return -1;
 
 	if (builtin_net_driver)
 		vs_vhost_net_setup(vdev);
@@ -1473,28 +1545,13 @@ new_device(int vid)
 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
 
+	int ret = vhost_async_channel_register(vid);
+
 	RTE_LOG(INFO, VHOST_DATA,
 		"(%d) device has been added to data core %d\n",
 		vid, vdev->coreid);
 
-	if (async_vhost_driver) {
-		struct rte_vhost_async_features f;
-		struct rte_vhost_async_channel_ops channel_ops;
-
-		if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
-			channel_ops.transfer_data = ioat_transfer_data_cb;
-			channel_ops.check_completed_copies =
-				ioat_check_completed_copies_cb;
-
-			f.async_inorder = 1;
-			f.async_threshold = 256;
-
-			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
-				f.intval, &channel_ops);
-		}
-	}
-
-	return 0;
+	return ret;
 }
 
 /*
@@ -1735,10 +1792,11 @@ main(int argc, char *argv[])
 	for (i = 0; i < nb_sockets; i++) {
 		char *file = socket_files + i * PATH_MAX;
 
-		if (async_vhost_driver)
-			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
+		uint64_t flag = flags;
+		if (get_async_flag_by_socketid(i) != 0)
+			flag |= RTE_VHOST_USER_ASYNC_COPY;
 
-		ret = rte_vhost_driver_register(file, flags);
+		ret = rte_vhost_driver_register(file, flag);
 		if (ret != 0) {
 			unregister_drivers(i);
 			rte_exit(EXIT_FAILURE,
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v4 0/4] support async dequeue for split ring
  2021-06-02  8:31 [dpdk-dev] [PATCH 0/1] lib/vhost: support async dequeue for split ring Yuan Wang
                   ` (2 preceding siblings ...)
  2021-06-23 15:00 ` [dpdk-dev] [PATCH v3 0/4] vhost: support async dequeue for split ring Wenwu Ma
@ 2021-06-30 19:27 ` Wenwu Ma
  2021-06-30 19:27   ` [dpdk-dev] [PATCH v4 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
                     ` (3 more replies)
  2021-07-05 18:11 ` [dpdk-dev] [PATCH v5 0/4] support async dequeue for split ring Wenwu Ma
                   ` (2 subsequent siblings)
  6 siblings, 4 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-06-30 19:27 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, jiayu.hu, Wenwu Ma

This patch implements asynchronous dequeue data path for split ring.
A new asynchronous dequeue function is introduced. With this function,
the application can try to receive packets from the guest with
offloading large copies to the DMA engine, thus saving precious CPU
cycles.
note: PATCH v4 3/4 depends on IOMMU patch from Ding,Xuan
(http://patches.dpdk.org/project/dpdk/patch/20210603173023.10487-1-xuan.ding@intel.com/)
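
As a rough usage sketch (illustrative only, not part of the series; the
polling helper and names below are assumptions), an application calls the
new API in place of rte_vhost_dequeue_burst() and only receives packets
whose copies have already completed:

#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_vhost_async.h>

#define BURST_SZ 32

static uint16_t
poll_guest_tx(int vid, uint16_t queue_id, struct rte_mempool *mbuf_pool,
		struct rte_mbuf **pkts)
{
	int nr_inflight;

	/* Packets still being copied by the DMA engine are reported via
	 * nr_inflight and will be returned by a later call.
	 */
	return rte_vhost_async_try_dequeue_burst(vid, queue_id, mbuf_pool,
			pkts, BURST_SZ, &nr_inflight);
}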

v4:
- Fix wrong packet index issue in async dequeue to improve
  the performance of small packet copies.

v3:
- Fix compilation warnings and errors on the Arm platform.
- Restore the removed function virtio_dev_pktmbuf_alloc;
  async dequeue allocates packets separately.

v2:
- Refactor the vhost data path as a preliminary patch for this series.
- The change to use the new API in examples/vhost is put into a
  dedicated patch.
- Check queue_id value before using it.
- Async dequeue performance enhancement: 160% improvement
  for v2 vs. v1.
- Async dequeue API name change from rte_vhost_try_dequeue_burst to
  rte_vhost_async_try_dequeue_burst.
- Completed packets update the used ring directly.

Wenwu Ma (3):
  examples/vhost: refactor vhost enqueue and dequeue datapaths.
  examples/vhost: use a new API to query remaining ring space
  examples/vhost: support vhost async dequeue data path

Yuan Wang (1):
  vhost: support async dequeue for split ring

 doc/guides/prog_guide/vhost_lib.rst |  10 +
 doc/guides/sample_app_ug/vhost.rst  |   9 +-
 examples/vhost/ioat.c               |  67 +++-
 examples/vhost/ioat.h               |  25 ++
 examples/vhost/main.c               | 224 +++++++----
 examples/vhost/main.h               |  33 +-
 examples/vhost/virtio_net.c         |  16 +-
 lib/vhost/rte_vhost_async.h         |  44 ++-
 lib/vhost/version.map               |   3 +
 lib/vhost/virtio_net.c              | 579 ++++++++++++++++++++++++++++
 10 files changed, 902 insertions(+), 108 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v4 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths.
  2021-06-30 19:27 ` [dpdk-dev] [PATCH v4 0/4] support async dequeue for split ring Wenwu Ma
@ 2021-06-30 19:27   ` Wenwu Ma
  2021-06-30 19:27   ` [dpdk-dev] [PATCH v4 2/4] examples/vhost: use a new API to query remaining ring space Wenwu Ma
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-06-30 19:27 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, jiayu.hu, Wenwu Ma

Previously, the data path checked a flag to decide which enqueue/dequeue
functions to call.

Now, we use an ops table that is initialized when the vhost device is
created, so the data path can call the ops directly without any further
flag checks.
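
A rough sketch of the resulting dispatch is below (simplified; it relies on
the vhost_queue_ops and typedef definitions this patch adds to main.h and on
the existing MAX_VHOST_DEVICE and VIRTIO_RXQ macros; the wrapper itself is
illustrative only):

/* One ops entry per vhost device, filled in once in new_device(). */
static struct vhost_queue_ops vdev_queue_ops[MAX_VHOST_DEVICE];

/* Data path: no flag checks, just an indirect call through the table. */
static inline uint16_t
vhost_enqueue(struct vhost_dev *vdev, struct rte_mbuf **pkts, uint32_t count)
{
	return vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
			VIRTIO_RXQ, pkts, count);
}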

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 examples/vhost/main.c       | 112 ++++++++++++++++++++----------------
 examples/vhost/main.h       |  33 +++++++++--
 examples/vhost/virtio_net.c |  16 +++++-
 3 files changed, 105 insertions(+), 56 deletions(-)

diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index d2179eadb9..aebdc3a566 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -106,6 +106,8 @@ static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
 static char *socket_files;
 static int nb_sockets;
 
+static struct vhost_queue_ops vdev_queue_ops[MAX_VHOST_DEVICE];
+
 /* empty vmdq configuration structure. Filled in programatically */
 static struct rte_eth_conf vmdq_conf_default = {
 	.rxmode = {
@@ -885,27 +887,8 @@ drain_vhost(struct vhost_dev *vdev)
 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
 
-	if (builtin_net_driver) {
-		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
-	} else if (async_vhost_driver) {
-		uint32_t cpu_cpl_nr = 0;
-		uint16_t enqueue_fail = 0;
-		struct rte_mbuf *m_cpu_cpl[nr_xmit];
-
-		complete_async_pkts(vdev);
-		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
-					m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
-
-		if (cpu_cpl_nr)
-			free_pkts(m_cpu_cpl, cpu_cpl_nr);
-
-		enqueue_fail = nr_xmit - ret;
-		if (enqueue_fail)
-			free_pkts(&m[ret], nr_xmit - ret);
-	} else {
-		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
-						m, nr_xmit);
-	}
+	ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
+					VIRTIO_RXQ, m, nr_xmit);
 
 	if (enable_stats) {
 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
@@ -1184,6 +1167,36 @@ drain_mbuf_table(struct mbuf_table *tx_q)
 	}
 }
 
+uint16_t
+async_enqueue_pkts(struct vhost_dev *vdev, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint32_t rx_count)
+{
+	uint16_t enqueue_count;
+	uint32_t cpu_cpl_nr = 0;
+	uint16_t enqueue_fail = 0;
+	struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
+
+	complete_async_pkts(vdev);
+	enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
+				queue_id, pkts, rx_count,
+				m_cpu_cpl, &cpu_cpl_nr);
+	if (cpu_cpl_nr)
+		free_pkts(m_cpu_cpl, cpu_cpl_nr);
+
+	enqueue_fail = rx_count - enqueue_count;
+	if (enqueue_fail)
+		free_pkts(&pkts[enqueue_count], enqueue_fail);
+
+	return enqueue_count;
+}
+
+uint16_t
+sync_enqueue_pkts(struct vhost_dev *vdev, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint32_t rx_count)
+{
+	return rte_vhost_enqueue_burst(vdev->vid, queue_id, pkts, rx_count);
+}
+
 static __rte_always_inline void
 drain_eth_rx(struct vhost_dev *vdev)
 {
@@ -1214,29 +1227,8 @@ drain_eth_rx(struct vhost_dev *vdev)
 		}
 	}
 
-	if (builtin_net_driver) {
-		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
-						pkts, rx_count);
-	} else if (async_vhost_driver) {
-		uint32_t cpu_cpl_nr = 0;
-		uint16_t enqueue_fail = 0;
-		struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
-
-		complete_async_pkts(vdev);
-		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
-					VIRTIO_RXQ, pkts, rx_count,
-					m_cpu_cpl, &cpu_cpl_nr);
-		if (cpu_cpl_nr)
-			free_pkts(m_cpu_cpl, cpu_cpl_nr);
-
-		enqueue_fail = rx_count - enqueue_count;
-		if (enqueue_fail)
-			free_pkts(&pkts[enqueue_count], enqueue_fail);
-
-	} else {
-		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
-						pkts, rx_count);
-	}
+	enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
+						VIRTIO_RXQ, pkts, rx_count);
 
 	if (enable_stats) {
 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
@@ -1249,6 +1241,14 @@ drain_eth_rx(struct vhost_dev *vdev)
 		free_pkts(pkts, rx_count);
 }
 
+uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count)
+{
+	return rte_vhost_dequeue_burst(dev->vid, queue_id,
+					mbuf_pool, pkts, count);
+}
+
 static __rte_always_inline void
 drain_virtio_tx(struct vhost_dev *vdev)
 {
@@ -1256,13 +1256,8 @@ drain_virtio_tx(struct vhost_dev *vdev)
 	uint16_t count;
 	uint16_t i;
 
-	if (builtin_net_driver) {
-		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
-					pkts, MAX_PKT_BURST);
-	} else {
-		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
-					mbuf_pool, pkts, MAX_PKT_BURST);
-	}
+	count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
+				VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);
 
 	/* setup VMDq for the first packet */
 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
@@ -1436,6 +1431,21 @@ new_device(int vid)
 		}
 	}
 
+	if (builtin_net_driver) {
+		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
+		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
+	} else {
+		if (async_vhost_driver) {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+							async_enqueue_pkts;
+		} else {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+							sync_enqueue_pkts;
+		}
+
+		vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
+	}
+
 	if (builtin_net_driver)
 		vs_vhost_net_setup(vdev);
 
diff --git a/examples/vhost/main.h b/examples/vhost/main.h
index 0ccdce4b4a..7cd8a11a45 100644
--- a/examples/vhost/main.h
+++ b/examples/vhost/main.h
@@ -60,6 +60,19 @@ struct vhost_dev {
 	struct vhost_queue queues[MAX_QUEUE_PAIRS * 2];
 } __rte_cache_aligned;
 
+typedef uint16_t (*vhost_enqueue_burst_t)(struct vhost_dev *dev,
+			uint16_t queue_id, struct rte_mbuf **pkts,
+			uint32_t count);
+
+typedef uint16_t (*vhost_dequeue_burst_t)(struct vhost_dev *dev,
+			uint16_t queue_id, struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
+
+struct vhost_queue_ops {
+	vhost_enqueue_burst_t enqueue_pkt_burst;
+	vhost_dequeue_burst_t dequeue_pkt_burst;
+};
+
 TAILQ_HEAD(vhost_dev_tailq_list, vhost_dev);
 
 
@@ -84,9 +97,21 @@ struct lcore_info {
 void vs_vhost_net_setup(struct vhost_dev *dev);
 void vs_vhost_net_remove(struct vhost_dev *dev);
 uint16_t vs_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mbuf **pkts, uint32_t count);
+
+uint16_t builtin_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mbuf **pkts, uint32_t count);
+uint16_t builtin_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
+uint16_t sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 			 struct rte_mbuf **pkts, uint32_t count);
-
-uint16_t vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
-			 struct rte_mempool *mbuf_pool,
-			 struct rte_mbuf **pkts, uint16_t count);
+uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
+uint16_t async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			 struct rte_mbuf **pkts, uint32_t count);
+uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
 #endif /* _MAIN_H_ */
diff --git a/examples/vhost/virtio_net.c b/examples/vhost/virtio_net.c
index 9064fc3a82..2432a96566 100644
--- a/examples/vhost/virtio_net.c
+++ b/examples/vhost/virtio_net.c
@@ -238,6 +238,13 @@ vs_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 	return count;
 }
 
+uint16_t
+builtin_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint32_t count)
+{
+	return vs_enqueue_pkts(dev, queue_id, pkts, count);
+}
+
 static __rte_always_inline int
 dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 	    struct rte_mbuf *m, uint16_t desc_idx,
@@ -363,7 +370,7 @@ dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 	return 0;
 }
 
-uint16_t
+static uint16_t
 vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
 {
@@ -440,3 +447,10 @@ vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 
 	return i;
 }
+
+uint16_t
+builtin_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+{
+	return vs_dequeue_pkts(dev, queue_id, mbuf_pool, pkts, count);
+}
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v4 2/4] examples/vhost: use a new API to query remaining ring space
  2021-06-30 19:27 ` [dpdk-dev] [PATCH v4 0/4] support async dequeue for split ring Wenwu Ma
  2021-06-30 19:27   ` [dpdk-dev] [PATCH v4 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
@ 2021-06-30 19:27   ` Wenwu Ma
  2021-06-30 19:27   ` [dpdk-dev] [PATCH v4 3/4] vhost: support async dequeue for split ring Wenwu Ma
  2021-06-30 19:27   ` [dpdk-dev] [PATCH v4 4/4] examples/vhost: support vhost async dequeue data path Wenwu Ma
  3 siblings, 0 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-06-30 19:27 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, jiayu.hu, Wenwu Ma

A new API, rte_ioat_burst_capacity(), is available for querying the
remaining descriptor ring capacity, so use it instead of tracking the
free space locally in the sample application.
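
As a minimal sketch of the pattern this switches to (simplified from
ioat_transfer_data_cb(); it assumes a started IOAT rawdev "dev_id" and omits
the completion tracking kept in cb_tracker):

#include <rte_ioat_rawdev.h>
#include <rte_vhost_async.h>

/* Submit one iov_iter copy only if the ring can take all its segments. */
static int
submit_copy(uint16_t dev_id, struct rte_vhost_iov_iter *src,
		struct rte_vhost_iov_iter *dst)
{
	unsigned long i_seg;

	if (rte_ioat_burst_capacity(dev_id) < src->nr_segs)
		return 0;	/* caller retries after polling completions */

	for (i_seg = 0; i_seg < src->nr_segs; i_seg++)
		rte_ioat_enqueue_copy(dev_id,
			(uintptr_t)(src->iov[i_seg].iov_base) + src->offset,
			(uintptr_t)(dst->iov[i_seg].iov_base) + dst->offset,
			src->iov[i_seg].iov_len, 0, 0);

	rte_ioat_perform_ops(dev_id);	/* ring the doorbell */
	return 1;
}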

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 examples/vhost/ioat.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c
index 2a2c2d7202..bf4e033bdb 100644
--- a/examples/vhost/ioat.c
+++ b/examples/vhost/ioat.c
@@ -17,7 +17,6 @@ struct packet_tracker {
 	unsigned short next_read;
 	unsigned short next_write;
 	unsigned short last_remain;
-	unsigned short ioat_space;
 };
 
 struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
@@ -113,7 +112,6 @@ open_ioat(const char *value)
 			goto out;
 		}
 		rte_rawdev_start(dev_id);
-		cb_tracker[dev_id].ioat_space = IOAT_RING_SIZE - 1;
 		dma_info->nr++;
 		i++;
 	}
@@ -140,7 +138,7 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
 			src = descs[i_desc].src;
 			dst = descs[i_desc].dst;
 			i_seg = 0;
-			if (cb_tracker[dev_id].ioat_space < src->nr_segs)
+			if (rte_ioat_burst_capacity(dev_id) < src->nr_segs)
 				break;
 			while (i_seg < src->nr_segs) {
 				rte_ioat_enqueue_copy(dev_id,
@@ -155,7 +153,6 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
 			}
 			write &= mask;
 			cb_tracker[dev_id].size_track[write] = src->nr_segs;
-			cb_tracker[dev_id].ioat_space -= src->nr_segs;
 			write++;
 		}
 	} else {
@@ -194,7 +191,6 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 		if (n_seg == 0)
 			return 0;
 
-		cb_tracker[dev_id].ioat_space += n_seg;
 		n_seg += cb_tracker[dev_id].last_remain;
 
 		read = cb_tracker[dev_id].next_read;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v4 3/4] vhost: support async dequeue for split ring
  2021-06-30 19:27 ` [dpdk-dev] [PATCH v4 0/4] support async dequeue for split ring Wenwu Ma
  2021-06-30 19:27   ` [dpdk-dev] [PATCH v4 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
  2021-06-30 19:27   ` [dpdk-dev] [PATCH v4 2/4] examples/vhost: use a new API to query remaining ring space Wenwu Ma
@ 2021-06-30 19:27   ` Wenwu Ma
  2021-06-30 19:27   ` [dpdk-dev] [PATCH v4 4/4] examples/vhost: support vhost async dequeue data path Wenwu Ma
  3 siblings, 0 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-06-30 19:27 UTC (permalink / raw)
  To: dev
  Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, jiayu.hu, Yuan Wang, Wenwu Ma

From: Yuan Wang <yuanx.wang@intel.com>

This patch implements asynchronous dequeue data path for split ring.
A new asynchronous dequeue function is introduced. With this function,
the application can try to receive packets from the guest with
offloading large copies to the DMA engine, thus saving precious CPU
cycles.

Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 doc/guides/prog_guide/vhost_lib.rst |  10 +
 lib/vhost/rte_vhost_async.h         |  44 ++-
 lib/vhost/version.map               |   3 +
 lib/vhost/virtio_net.c              | 579 ++++++++++++++++++++++++++++
 4 files changed, 633 insertions(+), 3 deletions(-)

diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
index d18fb98910..05c42c9b11 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -281,6 +281,16 @@ The following is an overview of some key Vhost API functions:
   Poll enqueue completion status from async data path. Completed packets
   are returned to applications through ``pkts``.
 
+* ``rte_vhost_async_try_dequeue_burst(vid, queue_id, mbuf_pool, pkts, count, nr_inflight)``
+
+  Try to receive packets from the guest with offloading large packets
+  to the DMA engine. Successfully dequeued packets are transfer
+  completed and returned in ``pkts``. But there may be other packets
+  that are sent from the guest but being transferred by the DMA engine,
+  called in-flight packets. This function will return in-flight packets
+  only after the DMA engine finishes transferring. The amount of
+  in-flight packets by now is returned in ``nr_inflight``.
+
 Vhost-user Implementations
 --------------------------
 
diff --git a/lib/vhost/rte_vhost_async.h b/lib/vhost/rte_vhost_async.h
index 6faa31f5ad..58019408f1 100644
--- a/lib/vhost/rte_vhost_async.h
+++ b/lib/vhost/rte_vhost_async.h
@@ -84,13 +84,21 @@ struct rte_vhost_async_channel_ops {
 };
 
 /**
- * inflight async packet information
+ * in-flight async packet information
  */
+struct async_nethdr {
+	struct virtio_net_hdr hdr;
+	bool valid;
+};
+
 struct async_inflight_info {
 	struct rte_mbuf *mbuf;
-	uint16_t descs; /* num of descs inflight */
+	union {
+		uint16_t descs; /* num of descs in-flight */
+		struct async_nethdr nethdr;
+	};
 	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
-};
+} __rte_cache_aligned;
 
 /**
  *  dma channel feature bit definition
@@ -193,4 +201,34 @@ __rte_experimental
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count);
 
+/**
+ * This function tries to receive packets from the guest with offloading
+ * large copies to the DMA engine. Successfully dequeued packets are
+ * transfer completed, either by the CPU or the DMA engine, and they are
+ * returned in "pkts". There may be other packets that are sent from
+ * the guest but being transferred by the DMA engine, called in-flight
+ * packets. The amount of in-flight packets by now is returned in
+ * "nr_inflight". This function will return in-flight packets only after
+ * the DMA engine finishes transferring.
+ *
+ * @param vid
+ *  id of vhost device to dequeue data
+ * @param queue_id
+ *  queue id to dequeue data
+ * @param pkts
+ *  blank array to keep successfully dequeued packets
+ * @param count
+ *  size of the packet array
+ * @param nr_inflight
+ *  the amount of in-flight packets by now. If error occurred, its
+ *  value is set to -1.
+ * @return
+ *  num of successfully dequeued packets
+ */
+__rte_experimental
+uint16_t
+rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
+	int *nr_inflight);
+
 #endif /* _RTE_VHOST_ASYNC_H_ */
diff --git a/lib/vhost/version.map b/lib/vhost/version.map
index 9103a23cd4..a320f889cd 100644
--- a/lib/vhost/version.map
+++ b/lib/vhost/version.map
@@ -79,4 +79,7 @@ EXPERIMENTAL {
 
 	# added in 21.05
 	rte_vhost_get_negotiated_protocol_features;
+
+	# added in 21.08
+	rte_vhost_async_try_dequeue_burst;
 };
diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index b93482587c..71ab1cef69 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -2673,6 +2673,32 @@ virtio_dev_pktmbuf_prep(struct virtio_net *dev, struct rte_mbuf *pkt,
 	return -1;
 }
 
+/*
+ * Allocate a host supported pktmbuf.
+ */
+static __rte_always_inline struct rte_mbuf *
+virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
+			 uint32_t data_len)
+{
+	struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
+
+	if (unlikely(pkt == NULL)) {
+		VHOST_LOG_DATA(ERR,
+			"Failed to allocate memory for mbuf.\n");
+		return NULL;
+	}
+
+	if (virtio_dev_pktmbuf_prep(dev, pkt, data_len)) {
+		/* Data doesn't fit into the buffer and the host supports
+		 * only linear buffers
+		 */
+		rte_pktmbuf_free(pkt);
+		return NULL;
+	}
+
+	return pkt;
+}
+
 __rte_always_inline
 static uint16_t
 virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
@@ -3147,3 +3173,556 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 
 	return count;
 }
+
+static __rte_always_inline int
+async_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		  struct buf_vector *buf_vec, uint16_t nr_vec,
+		  struct rte_mbuf *m, struct rte_mempool *mbuf_pool,
+		  struct iovec *src_iovec, struct iovec *dst_iovec,
+		  struct rte_vhost_iov_iter *src_it,
+		  struct rte_vhost_iov_iter *dst_it,
+		  struct async_nethdr *nethdr,
+		  bool legacy_ol_flags)
+{
+	uint64_t buf_addr;
+	uint32_t tlen = 0;
+	uint32_t buf_avail, buf_offset, buf_len;
+	uint32_t mbuf_avail, mbuf_offset;
+	uint32_t cpy_len, cpy_threshold;
+	/* A counter to avoid desc dead loop chain */
+	uint16_t vec_idx = 0;
+	int tvec_idx = 0;
+	struct rte_mbuf *cur = m, *prev = m;
+	struct virtio_net_hdr tmp_hdr;
+	struct virtio_net_hdr *hdr = NULL;
+	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
+
+	buf_addr = buf_vec[vec_idx].buf_addr;
+	buf_len = buf_vec[vec_idx].buf_len;
+
+	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1))
+		return -1;
+
+	cpy_threshold = vq->async_threshold;
+
+	if (virtio_net_with_host_offload(dev)) {
+		if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
+			/*
+			 * No luck, the virtio-net header doesn't fit
+			 * in a contiguous virtual area.
+			 */
+			copy_vnet_hdr_from_desc(&tmp_hdr, buf_vec);
+			hdr = &tmp_hdr;
+		} else {
+			hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
+		}
+	}
+
+	/*
+	 * A virtio driver normally uses at least 2 desc buffers
+	 * for Tx: the first for storing the header, and others
+	 * for storing the data.
+	 */
+	if (unlikely(buf_len < dev->vhost_hlen)) {
+		buf_offset = dev->vhost_hlen - buf_len;
+		vec_idx++;
+		buf_addr = buf_vec[vec_idx].buf_addr;
+		buf_len = buf_vec[vec_idx].buf_len;
+		buf_avail  = buf_len - buf_offset;
+	} else if (buf_len == dev->vhost_hlen) {
+		if (unlikely(++vec_idx >= nr_vec))
+			return -1;
+		buf_addr = buf_vec[vec_idx].buf_addr;
+		buf_len = buf_vec[vec_idx].buf_len;
+
+		buf_offset = 0;
+		buf_avail = buf_len;
+	} else {
+		buf_offset = dev->vhost_hlen;
+		buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
+	}
+
+	PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
+			(uint32_t)buf_avail, 0);
+
+	mbuf_offset = 0;
+	mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
+	while (1) {
+		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
+
+		if (cpy_len >= cpy_threshold) {
+			async_fill_vec(src_iovec + tvec_idx,
+				(void *)((uintptr_t)(buf_addr + buf_offset)),
+				(size_t)cpy_len);
+			async_fill_vec(dst_iovec + tvec_idx,
+				rte_pktmbuf_mtod_offset(cur,
+					void *, mbuf_offset),
+				(size_t)cpy_len);
+			tvec_idx++;
+			tlen += cpy_len;
+		} else if (vq->batch_copy_nb_elems >= vq->size ||
+				(hdr && cur == m)) {
+			rte_memcpy(rte_pktmbuf_mtod_offset(cur,
+					void *, mbuf_offset),
+				(void *)((uintptr_t)(buf_addr + buf_offset)),
+				cpy_len);
+		} else {
+			batch_copy[vq->batch_copy_nb_elems].dst =
+				rte_pktmbuf_mtod_offset(cur,
+					void *, mbuf_offset);
+			batch_copy[vq->batch_copy_nb_elems].src =
+				(void *)((uintptr_t)(buf_addr + buf_offset));
+			batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
+			vq->batch_copy_nb_elems++;
+		}
+
+		mbuf_avail  -= cpy_len;
+		mbuf_offset += cpy_len;
+		buf_avail  -= cpy_len;
+		buf_offset += cpy_len;
+
+		/* This buf reaches to its end, get the next one */
+		if (buf_avail == 0) {
+			if (++vec_idx >= nr_vec)
+				break;
+
+			buf_addr = buf_vec[vec_idx].buf_addr;
+			buf_len = buf_vec[vec_idx].buf_len;
+
+			buf_offset = 0;
+			buf_avail = buf_len;
+
+			PRINT_PACKET(dev, (uintptr_t)buf_addr,
+					(uint32_t)buf_avail, 0);
+		}
+
+		/*
+		 * This mbuf reaches to its end, get a new one
+		 * to hold more data.
+		 */
+		if (mbuf_avail == 0) {
+			cur = rte_pktmbuf_alloc(mbuf_pool);
+			if (unlikely(cur == NULL)) {
+				VHOST_LOG_DATA(ERR, "Failed to "
+					"allocate memory for mbuf.\n");
+				return -1;
+			}
+
+			prev->next = cur;
+			prev->data_len = mbuf_offset;
+			m->nb_segs += 1;
+			m->pkt_len += mbuf_offset;
+			prev = cur;
+
+			mbuf_offset = 0;
+			mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
+		}
+	}
+
+	prev->data_len = mbuf_offset;
+	m->pkt_len += mbuf_offset;
+
+	if (hdr && tlen) {
+		nethdr->valid = true;
+		nethdr->hdr = *hdr;
+	} else if (hdr)
+		vhost_dequeue_offload(hdr, m, legacy_ol_flags);
+
+	if (tlen) {
+		async_fill_iter(src_it, tlen, src_iovec, tvec_idx);
+		async_fill_iter(dst_it, tlen, dst_iovec, tvec_idx);
+	} else
+		src_it->count = 0;
+
+	return 0;
+}
+
+static __rte_always_inline uint16_t
+async_poll_dequeue_completed_split(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint16_t count, bool legacy_ol_flags)
+{
+	uint16_t n_pkts_cpl = 0, n_pkts_put = 0;
+	uint16_t start_idx, pkt_idx, from;
+	struct async_inflight_info *pkts_info;
+
+	pkt_idx = vq->async_pkts_idx & (vq->size - 1);
+	pkts_info = vq->async_pkts_info;
+	start_idx = virtio_dev_rx_async_get_info_idx(pkt_idx, vq->size,
+			vq->async_pkts_inflight_n);
+
+	if (count > vq->async_last_pkts_n) {
+		n_pkts_cpl = vq->async_ops.check_completed_copies(dev->vid,
+			queue_id, 0, count - vq->async_last_pkts_n);
+	}
+
+	n_pkts_cpl += vq->async_last_pkts_n;
+	if (unlikely(n_pkts_cpl == 0))
+		return 0;
+
+	n_pkts_put = RTE_MIN(count, n_pkts_cpl);
+
+	for (pkt_idx = 0; pkt_idx < n_pkts_put; pkt_idx++) {
+		from = (start_idx + pkt_idx) & (vq->size - 1);
+		pkts[pkt_idx] = pkts_info[from].mbuf;
+
+		if (pkts_info[from].nethdr.valid) {
+			vhost_dequeue_offload(&pkts_info[from].nethdr.hdr,
+					pkts[pkt_idx], legacy_ol_flags);
+		}
+	}
+	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
+
+	if (n_pkts_put) {
+		/* write back completed descs to used ring */
+		write_back_completed_descs_split(vq, n_pkts_put);
+		/* update used ring */
+		__atomic_add_fetch(&vq->used->idx,
+				n_pkts_put, __ATOMIC_RELEASE);
+
+		vq->async_pkts_inflight_n -= n_pkts_put;
+	}
+
+	return n_pkts_put;
+}
+
+static __rte_always_inline uint16_t
+virtio_dev_tx_async_split(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t count, bool legacy_ol_flags)
+{
+	static bool allocerr_warned;
+	uint16_t pkt_idx;
+	uint16_t free_entries;
+	uint16_t slot_idx = 0;
+	uint16_t segs_await = 0;
+	uint16_t nr_done_pkts = 0, nr_async_pkts = 0, nr_async_cmpl_pkts = 0;
+	uint16_t nr_async_burst = 0;
+	uint16_t pkt_err = 0;
+	uint16_t iovec_idx = 0, it_idx = 0;
+
+	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
+	struct iovec *vec_pool = vq->vec_pool;
+	struct iovec *src_iovec = vec_pool;
+	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
+
+	struct async_pkt_index {
+		uint16_t last_avail_idx;
+	} async_pkts_log[MAX_PKT_BURST];
+
+	/**
+	 * The ordering between avail index and
+	 * desc reads needs to be enforced.
+	 */
+	free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
+			vq->last_avail_idx;
+	if (free_entries == 0)
+		goto out;
+
+	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
+
+	count = RTE_MIN(count, MAX_PKT_BURST);
+	count = RTE_MIN(count, free_entries);
+	VHOST_LOG_DATA(DEBUG, "(%d) about to dequeue %u buffers\n",
+			dev->vid, count);
+
+	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+		uint16_t head_idx = 0;
+		uint16_t nr_vec = 0;
+		uint32_t buf_len;
+		int err;
+		struct buf_vector buf_vec[BUF_VECTOR_MAX];
+		struct rte_mbuf *pkt;
+
+		if (unlikely(fill_vec_buf_split(dev, vq, vq->last_avail_idx,
+						&nr_vec, buf_vec,
+						&head_idx, &buf_len,
+						VHOST_ACCESS_RO) < 0))
+			break;
+
+		pkt = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
+		if (unlikely(pkt == NULL)) {
+			/**
+			 * mbuf allocation fails for jumbo packets when external
+			 * buffer allocation is not allowed and linear buffer
+			 * is required. Drop this packet.
+			 */
+			if (!allocerr_warned) {
+				VHOST_LOG_DATA(ERR,
+					"Failed mbuf alloc of size %d from %s on %s.\n",
+					buf_len, mbuf_pool->name, dev->ifname);
+				allocerr_warned = true;
+			}
+			break;
+		}
+
+		slot_idx = (vq->async_pkts_idx + nr_async_pkts) &
+				(vq->size - 1);
+		err = async_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkt,
+				mbuf_pool, &src_iovec[iovec_idx],
+				&dst_iovec[iovec_idx], &it_pool[it_idx],
+				&it_pool[it_idx + 1],
+				&pkts_info[slot_idx].nethdr, legacy_ol_flags);
+		if (unlikely(err)) {
+			rte_pktmbuf_free(pkt);
+			if (!allocerr_warned) {
+				VHOST_LOG_DATA(ERR,
+					"Failed to copy desc to mbuf on %s.\n",
+					dev->ifname);
+				allocerr_warned = true;
+			}
+			break;
+		}
+
+		if (it_pool[it_idx].count) {
+			uint16_t to = vq->async_desc_idx_split & (vq->size - 1);
+
+			async_fill_desc(&tdes[nr_async_burst], &it_pool[it_idx],
+				&it_pool[it_idx + 1]);
+			pkts_info[slot_idx].mbuf = pkt;
+			async_pkts_log[nr_async_pkts++].last_avail_idx =
+				vq->last_avail_idx;
+			nr_async_burst++;
+			iovec_idx += it_pool[it_idx].nr_segs;
+			it_idx += 2;
+			segs_await += it_pool[it_idx].nr_segs;
+
+			/* keep used desc */
+			vq->async_descs_split[to].id = head_idx;
+			vq->async_descs_split[to].len = 0;
+			vq->async_desc_idx_split++;
+		} else {
+			update_shadow_used_ring_split(vq, head_idx, 0);
+			pkts[nr_done_pkts++] = pkt;
+		}
+
+		vq->last_avail_idx++;
+
+		if (unlikely((nr_async_burst >= VHOST_ASYNC_BATCH_THRESHOLD) ||
+					((VHOST_MAX_ASYNC_VEC >> 1) -
+					 segs_await < BUF_VECTOR_MAX))) {
+			uint16_t nr_pkts;
+
+			nr_pkts = vq->async_ops.transfer_data(dev->vid,
+					queue_id, tdes, 0, nr_async_burst);
+			src_iovec = vec_pool;
+			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+			it_idx = 0;
+			segs_await = 0;
+			vq->async_pkts_inflight_n += nr_pkts;
+
+			if (unlikely(nr_pkts < nr_async_burst)) {
+				pkt_err = nr_async_burst - nr_pkts;
+				nr_async_burst = 0;
+				break;
+			}
+			nr_async_burst = 0;
+		}
+	}
+
+	if (nr_async_burst) {
+		uint32_t nr_pkts;
+
+		nr_pkts = vq->async_ops.transfer_data(dev->vid, queue_id,
+				tdes, 0, nr_async_burst);
+		vq->async_pkts_inflight_n += nr_pkts;
+
+		if (unlikely(nr_pkts < nr_async_burst))
+			pkt_err = nr_async_burst - nr_pkts;
+	}
+
+	do_data_copy_dequeue(vq);
+
+	if (unlikely(pkt_err)) {
+		uint16_t nr_err_dma = pkt_err;
+		uint16_t nr_err_sw;
+
+		nr_async_pkts -= nr_err_dma;
+
+		/**
+		 * revert shadow used ring and free pktmbufs for
+		 * CPU-copied pkts after the first DMA-error pkt.
+		 */
+		nr_err_sw = vq->last_avail_idx -
+			async_pkts_log[nr_async_pkts].last_avail_idx -
+			nr_err_dma;
+		vq->shadow_used_idx -= nr_err_sw;
+		while (nr_err_sw-- > 0)
+			rte_pktmbuf_free(pkts[--nr_done_pkts]);
+
+		/**
+		 * recover DMA-copy related structures and free pktmbufs
+		 * for DMA-error pkts.
+		 */
+		vq->async_desc_idx_split -= nr_err_dma;
+		while (nr_err_dma-- > 0) {
+			rte_pktmbuf_free(
+				pkts_info[slot_idx & (vq->size - 1)].mbuf);
+			slot_idx--;
+		}
+
+		/* recover available ring */
+		vq->last_avail_idx =
+			async_pkts_log[nr_async_pkts].last_avail_idx;
+	}
+
+	vq->async_pkts_idx += nr_async_pkts;
+
+	if (likely(vq->shadow_used_idx))
+		flush_shadow_used_ring_split(dev, vq);
+
+out:
+	if (nr_done_pkts < count && vq->async_pkts_inflight_n > 0) {
+		nr_async_cmpl_pkts = async_poll_dequeue_completed_split(dev, vq,
+					queue_id, &pkts[nr_done_pkts],
+					count - nr_done_pkts,
+					legacy_ol_flags);
+		nr_done_pkts += nr_async_cmpl_pkts;
+	}
+	if (likely(nr_done_pkts))
+		vhost_vring_call_split(dev, vq);
+
+	return nr_done_pkts;
+}
+
+__rte_noinline
+static uint16_t
+virtio_dev_tx_async_split_legacy(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t count)
+{
+	return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool,
+				pkts, count, true);
+}
+
+__rte_noinline
+static uint16_t
+virtio_dev_tx_async_split_compliant(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t count)
+{
+	return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool,
+				pkts, count, false);
+}
+
+uint16_t
+rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
+	int *nr_inflight)
+{
+	struct virtio_net *dev;
+	struct rte_mbuf *rarp_mbuf = NULL;
+	struct vhost_virtqueue *vq;
+	int16_t success = 1;
+
+	*nr_inflight = -1;
+
+	dev = get_device(vid);
+	if (!dev)
+		return 0;
+
+	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
+		VHOST_LOG_DATA(ERR,
+			"(%d) %s: built-in vhost net backend is disabled.\n",
+			dev->vid, __func__);
+		return 0;
+	}
+
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
+		VHOST_LOG_DATA(ERR,
+			"(%d) %s: invalid virtqueue idx %d.\n",
+			dev->vid, __func__, queue_id);
+		return 0;
+	}
+
+	vq = dev->virtqueue[queue_id];
+
+	if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
+		return 0;
+
+	if (unlikely(vq->enabled == 0)) {
+		count = 0;
+		goto out_access_unlock;
+	}
+
+	if (unlikely(!vq->async_registered)) {
+		VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
+			dev->vid, __func__, queue_id);
+		count = 0;
+		goto out_access_unlock;
+	}
+
+	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+		vhost_user_iotlb_rd_lock(vq);
+
+	if (unlikely(vq->access_ok == 0))
+		if (unlikely(vring_translate(dev, vq) < 0)) {
+			count = 0;
+			goto out;
+		}
+
+	/*
+	 * Construct a RARP broadcast packet, and inject it to the "pkts"
+	 * array, so that it looks like the guest actually sent such a packet.
+	 *
+	 * Check user_send_rarp() for more information.
+	 *
+	 * broadcast_rarp shares a cacheline in the virtio_net structure
+	 * with some fields that are accessed during enqueue and
+	 * __atomic_compare_exchange_n causes a write if performed compare
+	 * and exchange. This could result in false sharing between enqueue
+	 * and dequeue.
+	 *
+	 * Prevent unnecessary false sharing by reading broadcast_rarp first
+	 * and only performing compare and exchange if the read indicates it
+	 * is likely to be set.
+	 */
+	if (unlikely(__atomic_load_n(&dev->broadcast_rarp, __ATOMIC_ACQUIRE) &&
+			__atomic_compare_exchange_n(&dev->broadcast_rarp,
+			&success, 0, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED))) {
+
+		rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
+		if (rarp_mbuf == NULL) {
+			VHOST_LOG_DATA(ERR, "Failed to make RARP packet.\n");
+			count = 0;
+			goto out;
+		}
+		count -= 1;
+	}
+
+	if (unlikely(vq_is_packed(dev))) {
+		count = 0;
+		goto out;
+	}
+
+	if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
+		count = virtio_dev_tx_async_split_legacy(dev, vq, queue_id,
+				mbuf_pool, pkts, count);
+	else
+		count = virtio_dev_tx_async_split_compliant(dev, vq, queue_id,
+				mbuf_pool, pkts, count);
+
+out:
+	*nr_inflight = vq->async_pkts_inflight_n;
+
+	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+		vhost_user_iotlb_rd_unlock(vq);
+
+out_access_unlock:
+	rte_spinlock_unlock(&vq->access_lock);
+
+	if (unlikely(rarp_mbuf != NULL)) {
+		/*
+		 * Inject it to the head of "pkts" array, so that switch's mac
+		 * learning table will get updated first.
+		 */
+		memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *));
+		pkts[0] = rarp_mbuf;
+		count += 1;
+	}
+
+	return count;
+}
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v4 4/4] examples/vhost: support vhost async dequeue data path
  2021-06-30 19:27 ` [dpdk-dev] [PATCH v4 0/4] support async dequeue for split ring Wenwu Ma
                     ` (2 preceding siblings ...)
  2021-06-30 19:27   ` [dpdk-dev] [PATCH v4 3/4] vhost: support async dequeue for split ring Wenwu Ma
@ 2021-06-30 19:27   ` Wenwu Ma
  3 siblings, 0 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-06-30 19:27 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, jiayu.hu, Wenwu Ma

This patch adds the vhost async dequeue data path to the vhost sample.
The vswitch can leverage IOAT to accelerate the vhost async dequeue path.
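
An illustrative invocation enabling async enqueue and dequeue for two
vhost ports could look as follows (lcore list, port mask, socket paths
and DMA PCI addresses are placeholders, and the built binary name may
differ; the option semantics are described in the vhost.rst change below):

	./dpdk-vhost -l 26-28 -n 4 -- -p 0x1 --mergeable 1 --vm2vm 1 \
		--socket-file /tmp/vhost-net0 --socket-file /tmp/vhost-net1 \
		--dmas [txd0@00:04.0,rxd0@00:04.2,txd1@00:04.1,rxd1@00:04.3] \
		--client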

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 doc/guides/sample_app_ug/vhost.rst |   9 +-
 examples/vhost/ioat.c              |  61 ++++++++++---
 examples/vhost/ioat.h              |  25 ++++++
 examples/vhost/main.c              | 140 ++++++++++++++++++++---------
 4 files changed, 177 insertions(+), 58 deletions(-)

diff --git a/doc/guides/sample_app_ug/vhost.rst b/doc/guides/sample_app_ug/vhost.rst
index 9afde9c7f5..63dcf181e1 100644
--- a/doc/guides/sample_app_ug/vhost.rst
+++ b/doc/guides/sample_app_ug/vhost.rst
@@ -169,9 +169,12 @@ demonstrates how to use the async vhost APIs. It's used in combination with dmas
 **--dmas**
 This parameter is used to specify the assigned DMA device of a vhost device.
 Async vhost-user net driver will be used if --dmas is set. For example
---dmas [txd0@00:04.0,txd1@00:04.1] means use DMA channel 00:04.0 for vhost
-device 0 enqueue operation and use DMA channel 00:04.1 for vhost device 1
-enqueue operation.
+--dmas [txd0@00:04.0,txd1@00:04.1,rxd0@00:04.2,rxd1@00:04.3] means use
+DMA channels 00:04.0/00:04.2 for vhost device 0 enqueue/dequeue operations
+and DMA channels 00:04.1/00:04.3 for vhost device 1 enqueue/dequeue
+operations. The device index follows the order of the socket files: vhost
+device 0 is created through the first socket file, vhost device 1 through
+the second socket file, and so on.
 
 Common Issues
 -------------
diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c
index bf4e033bdb..a305100b47 100644
--- a/examples/vhost/ioat.c
+++ b/examples/vhost/ioat.c
@@ -21,6 +21,8 @@ struct packet_tracker {
 
 struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
 
+int vid2socketid[MAX_VHOST_DEVICE];
+
 int
 open_ioat(const char *value)
 {
@@ -29,7 +31,7 @@ open_ioat(const char *value)
 	char *addrs = input;
 	char *ptrs[2];
 	char *start, *end, *substr;
-	int64_t vid, vring_id;
+	int64_t socketid, vring_id;
 	struct rte_ioat_rawdev_config config;
 	struct rte_rawdev_info info = { .dev_private = &config };
 	char name[32];
@@ -60,6 +62,8 @@ open_ioat(const char *value)
 		goto out;
 	}
 	while (i < args_nr) {
+		char *txd, *rxd;
+		bool is_txd;
 		char *arg_temp = dma_arg[i];
 		uint8_t sub_nr;
 		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
@@ -68,27 +72,38 @@ open_ioat(const char *value)
 			goto out;
 		}
 
-		start = strstr(ptrs[0], "txd");
-		if (start == NULL) {
+		int async_flag;
+		txd = strstr(ptrs[0], "txd");
+		rxd = strstr(ptrs[0], "rxd");
+		if (txd == NULL && rxd == NULL) {
 			ret = -1;
 			goto out;
+		} else if (txd) {
+			is_txd = true;
+			start = txd;
+			async_flag = ASYNC_RX_VHOST;
+		} else {
+			is_txd = false;
+			start = rxd;
+			async_flag = ASYNC_TX_VHOST;
 		}
 
 		start += 3;
-		vid = strtol(start, &end, 0);
+		socketid = strtol(start, &end, 0);
 		if (end == start) {
 			ret = -1;
 			goto out;
 		}
 
-		vring_id = 0 + VIRTIO_RXQ;
+		vring_id = is_txd ? VIRTIO_RXQ : VIRTIO_TXQ;
+
 		if (rte_pci_addr_parse(ptrs[1],
-				&(dma_info + vid)->dmas[vring_id].addr) < 0) {
+			&(dma_info + socketid)->dmas[vring_id].addr) < 0) {
 			ret = -1;
 			goto out;
 		}
 
-		rte_pci_device_name(&(dma_info + vid)->dmas[vring_id].addr,
+		rte_pci_device_name(&(dma_info + socketid)->dmas[vring_id].addr,
 				name, sizeof(name));
 		dev_id = rte_rawdev_get_dev_id(name);
 		if (dev_id == (uint16_t)(-ENODEV) ||
@@ -103,8 +118,9 @@ open_ioat(const char *value)
 			goto out;
 		}
 
-		(dma_info + vid)->dmas[vring_id].dev_id = dev_id;
-		(dma_info + vid)->dmas[vring_id].is_valid = true;
+		(dma_info + socketid)->dmas[vring_id].dev_id = dev_id;
+		(dma_info + socketid)->dmas[vring_id].is_valid = true;
+		(dma_info + socketid)->async_flag |= async_flag;
 		config.ring_size = IOAT_RING_SIZE;
 		config.hdls_disable = true;
 		if (rte_rawdev_configure(dev_id, &info, sizeof(config)) < 0) {
@@ -126,13 +142,16 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data, uint16_t count)
 {
 	uint32_t i_desc;
-	uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2 + VIRTIO_RXQ].dev_id;
 	struct rte_vhost_iov_iter *src = NULL;
 	struct rte_vhost_iov_iter *dst = NULL;
 	unsigned long i_seg;
 	unsigned short mask = MAX_ENQUEUED_SIZE - 1;
-	unsigned short write = cb_tracker[dev_id].next_write;
 
+	if (queue_id >= MAX_RING_COUNT)
+		return -1;
+
+	uint16_t dev_id = dma_bind[vid2socketid[vid]].dmas[queue_id].dev_id;
+	unsigned short write = cb_tracker[dev_id].next_write;
 	if (!opaque_data) {
 		for (i_desc = 0; i_desc < count; i_desc++) {
 			src = descs[i_desc].src;
@@ -170,16 +189,16 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data,
 		uint16_t max_packets)
 {
-	if (!opaque_data) {
+	if (!opaque_data && (queue_id < MAX_RING_COUNT)) {
 		uintptr_t dump[255];
 		int n_seg;
 		unsigned short read, write;
 		unsigned short nb_packet = 0;
 		unsigned short mask = MAX_ENQUEUED_SIZE - 1;
 		unsigned short i;
+		uint16_t dev_id;
 
-		uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2
-				+ VIRTIO_RXQ].dev_id;
+		dev_id = dma_bind[vid2socketid[vid]].dmas[queue_id].dev_id;
 		n_seg = rte_ioat_completed_ops(dev_id, 255, NULL, NULL, dump, dump);
 		if (n_seg < 0) {
 			RTE_LOG(ERR,
@@ -215,4 +234,18 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 	return -1;
 }
 
+uint32_t get_async_flag_by_vid(int vid)
+{
+	return dma_bind[vid2socketid[vid]].async_flag;
+}
+
+uint32_t get_async_flag_by_socketid(int socketid)
+{
+	return dma_bind[socketid].async_flag;
+}
+
+void init_vid2socketid_array(int vid, int socketid)
+{
+	vid2socketid[vid] = socketid;
+}
 #endif /* RTE_RAW_IOAT */
diff --git a/examples/vhost/ioat.h b/examples/vhost/ioat.h
index 1aa28ed6a3..51111d65af 100644
--- a/examples/vhost/ioat.h
+++ b/examples/vhost/ioat.h
@@ -12,6 +12,9 @@
 #define MAX_VHOST_DEVICE 1024
 #define IOAT_RING_SIZE 4096
 #define MAX_ENQUEUED_SIZE 4096
+#define MAX_RING_COUNT	2
+#define ASYNC_RX_VHOST	1
+#define ASYNC_TX_VHOST	2
 
 struct dma_info {
 	struct rte_pci_addr addr;
@@ -20,6 +23,7 @@ struct dma_info {
 };
 
 struct dma_for_vhost {
+	int async_flag;
 	struct dma_info dmas[RTE_MAX_QUEUES_PER_PORT * 2];
 	uint16_t nr;
 };
@@ -36,6 +40,10 @@ uint32_t
 ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data,
 		uint16_t max_packets);
+
+uint32_t get_async_flag_by_vid(int vid);
+uint32_t get_async_flag_by_socketid(int socketid);
+void init_vid2socketid_array(int vid, int socketid);
 #else
 static int open_ioat(const char *value __rte_unused)
 {
@@ -59,5 +67,22 @@ ioat_check_completed_copies_cb(int vid __rte_unused,
 {
 	return -1;
 }
+
+static uint32_t
+get_async_flag_by_vid(int vid __rte_unused)
+{
+	return 0;
+}
+
+static uint32_t
+get_async_flag_by_socketid(int socketid __rte_unused)
+{
+	return 0;
+}
+
+static void
+init_vid2socketid_array(int vid __rte_unused, int socketid __rte_unused)
+{
+}
 #endif
 #endif /* _IOAT_H_ */
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index aebdc3a566..81d7e4cbd3 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -93,8 +93,6 @@ static int client_mode;
 
 static int builtin_net_driver;
 
-static int async_vhost_driver;
-
 static char *dma_type;
 
 /* Specify timeout (in useconds) between retries on RX. */
@@ -679,7 +677,6 @@ us_vhost_parse_args(int argc, char **argv)
 				us_vhost_usage(prgname);
 				return -1;
 			}
-			async_vhost_driver = 1;
 			break;
 
 		case OPT_CLIENT_NUM:
@@ -897,7 +894,7 @@ drain_vhost(struct vhost_dev *vdev)
 				__ATOMIC_SEQ_CST);
 	}
 
-	if (!async_vhost_driver)
+	if ((get_async_flag_by_vid(vdev->vid) & ASYNC_RX_VHOST) == 0)
 		free_pkts(m, nr_xmit);
 }
 
@@ -1237,10 +1234,19 @@ drain_eth_rx(struct vhost_dev *vdev)
 				__ATOMIC_SEQ_CST);
 	}
 
-	if (!async_vhost_driver)
+	if ((get_async_flag_by_vid(vdev->vid) & ASYNC_RX_VHOST) == 0)
 		free_pkts(pkts, rx_count);
 }
 
+uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+				struct rte_mempool *mbuf_pool,
+				struct rte_mbuf **pkts, uint16_t count)
+{
+	int nr_inflight;
+	return rte_vhost_async_try_dequeue_burst(dev->vid, queue_id,
+			mbuf_pool, pkts, count, &nr_inflight);
+}
+
 uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 			struct rte_mempool *mbuf_pool,
 			struct rte_mbuf **pkts, uint16_t count)
@@ -1392,12 +1398,90 @@ destroy_device(int vid)
 		"(%d) device has been removed from data core\n",
 		vdev->vid);
 
-	if (async_vhost_driver)
+	if (get_async_flag_by_vid(vid) & ASYNC_RX_VHOST)
 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
+	if (get_async_flag_by_vid(vid) & ASYNC_TX_VHOST)
+		rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
 
 	rte_free(vdev);
 }
 
+static int
+get_socketid_by_vid(int vid)
+{
+	int i;
+	char ifname[PATH_MAX];
+	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
+
+	for (i = 0; i < nb_sockets; i++) {
+		char *file = socket_files + i * PATH_MAX;
+		if (strcmp(file, ifname) == 0)
+			return i;
+	}
+
+	return -1;
+}
+
+static int
+init_vhost_queue_ops(int vid)
+{
+	int socketid = get_socketid_by_vid(vid);
+	if (socketid == -1)
+		return -1;
+
+	init_vid2socketid_array(vid, socketid);
+	if (builtin_net_driver) {
+		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
+		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
+	} else {
+		if (get_async_flag_by_vid(vid) & ASYNC_RX_VHOST) {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+						async_enqueue_pkts;
+		} else {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+						sync_enqueue_pkts;
+		}
+
+		if (get_async_flag_by_vid(vid) & ASYNC_TX_VHOST) {
+			vdev_queue_ops[vid].dequeue_pkt_burst =
+						async_dequeue_pkts;
+		} else {
+			vdev_queue_ops[vid].dequeue_pkt_burst =
+						sync_dequeue_pkts;
+		}
+	}
+
+	return 0;
+}
+
+static int
+vhost_async_channel_register(int vid)
+{
+	int ret = 0;
+	struct rte_vhost_async_features f;
+	struct rte_vhost_async_channel_ops channel_ops;
+
+	if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
+		channel_ops.transfer_data = ioat_transfer_data_cb;
+		channel_ops.check_completed_copies =
+			ioat_check_completed_copies_cb;
+
+		f.async_inorder = 1;
+		f.async_threshold = 256;
+
+		if (get_async_flag_by_vid(vid) & ASYNC_RX_VHOST) {
+			ret |= rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
+					f.intval, &channel_ops);
+		}
+		if (get_async_flag_by_vid(vid) & ASYNC_TX_VHOST) {
+			ret |= rte_vhost_async_channel_register(vid, VIRTIO_TXQ,
+					f.intval, &channel_ops);
+		}
+	}
+
+	return ret;
+}
+
 /*
  * A new device is added to a data core. First the device is added to the main linked list
  * and then allocated to a specific data core.
@@ -1431,20 +1515,8 @@ new_device(int vid)
 		}
 	}
 
-	if (builtin_net_driver) {
-		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
-		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
-	} else {
-		if (async_vhost_driver) {
-			vdev_queue_ops[vid].enqueue_pkt_burst =
-							async_enqueue_pkts;
-		} else {
-			vdev_queue_ops[vid].enqueue_pkt_burst =
-							sync_enqueue_pkts;
-		}
-
-		vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
-	}
+	if (init_vhost_queue_ops(vid) != 0)
+		return -1;
 
 	if (builtin_net_driver)
 		vs_vhost_net_setup(vdev);
@@ -1473,28 +1545,13 @@ new_device(int vid)
 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
 
+	int ret = vhost_async_channel_register(vid);
+
 	RTE_LOG(INFO, VHOST_DATA,
 		"(%d) device has been added to data core %d\n",
 		vid, vdev->coreid);
 
-	if (async_vhost_driver) {
-		struct rte_vhost_async_features f;
-		struct rte_vhost_async_channel_ops channel_ops;
-
-		if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
-			channel_ops.transfer_data = ioat_transfer_data_cb;
-			channel_ops.check_completed_copies =
-				ioat_check_completed_copies_cb;
-
-			f.async_inorder = 1;
-			f.async_threshold = 256;
-
-			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
-				f.intval, &channel_ops);
-		}
-	}
-
-	return 0;
+	return ret;
 }
 
 /*
@@ -1735,10 +1792,11 @@ main(int argc, char *argv[])
 	for (i = 0; i < nb_sockets; i++) {
 		char *file = socket_files + i * PATH_MAX;
 
-		if (async_vhost_driver)
-			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
+		uint64_t flag = flags;
+		if (get_async_flag_by_socketid(i) != 0)
+			flag |= RTE_VHOST_USER_ASYNC_COPY;
 
-		ret = rte_vhost_driver_register(file, flags);
+		ret = rte_vhost_driver_register(file, flag);
 		if (ret != 0) {
 			unregister_drivers(i);
 			rte_exit(EXIT_FAILURE,
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v5 0/4] support async dequeue for split ring
  2021-06-02  8:31 [dpdk-dev] [PATCH 0/1] lib/vhost: support async dequeue for split ring Yuan Wang
                   ` (3 preceding siblings ...)
  2021-06-30 19:27 ` [dpdk-dev] [PATCH v4 0/4] support async dequeue for split ring Wenwu Ma
@ 2021-07-05 18:11 ` Wenwu Ma
  2021-07-05 18:11   ` [dpdk-dev] [PATCH v5 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
                     ` (3 more replies)
  2021-07-16 19:18 ` [dpdk-dev] [PATCH v6 0/4] support async dequeue for split ring Wenwu Ma
  2021-07-21 14:20 ` [dpdk-dev] [PATCH v7 0/4] support async dequeue for split ring Wenwu Ma
  6 siblings, 4 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-07-05 18:11 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, jiayu.hu, Wenwu Ma

This patch implements asynchronous dequeue data path for split ring.
A new asynchronous dequeue function is introduced. With this function,
the application can try to receive packets from the guest with
offloading large copies to the DMA engine, thus saving precious CPU
cycles.
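
A minimal polling sketch of the new API is shown below. The helper name,
burst size and surrounding application state are illustrative only;
rte_vhost_async_try_dequeue_burst() and rte_pktmbuf_free_bulk() are the
actual entry points:

#include <rte_mbuf.h>
#include <rte_vhost_async.h>

/* Drain one guest TX ring. Only transfer-completed packets are returned;
 * packets still owned by the DMA engine are reported via nr_inflight and
 * will be returned by later calls. */
static uint16_t
drain_guest_tx(int vid, uint16_t queue_id, struct rte_mempool *mbuf_pool)
{
	struct rte_mbuf *pkts[32];
	int nr_inflight;
	uint16_t n;

	n = rte_vhost_async_try_dequeue_burst(vid, queue_id, mbuf_pool,
			pkts, 32, &nr_inflight);
	/* forward or consume pkts[0..n-1]; this sketch simply frees them */
	rte_pktmbuf_free_bulk(pkts, n);
	return n;
}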

v5:
- DMA address use IOVA instead of VA.

v4:
- Fix a wrong packet index issue in async dequeue and improve
  the performance of small packet copies.

v3:
- Fix compilation warning and error in arm platform.
- Restore the removed function virtio_dev_pktmbuf_alloc;
  async dequeue allocates packets separately.

v2:
- Refactor the vhost data path as a preliminary patch for this series.
- The change to use the new API in examples/vhost is put into a
  dedicated patch.
- Check the queue_id value before using it.
- Async dequeue performance enhancement: 160% performance improvement
  for v2 vs. v1.
- Async dequeue API name change from rte_vhost_try_dequeue_burst to
  rte_vhost_async_try_dequeue_burst.
- Completed packets update the used ring directly.

Wenwu Ma (3):
  examples/vhost: refactor vhost enqueue and dequeue datapaths.
  examples/vhost: use a new API to query remaining ring space
  examples/vhost: support vhost async dequeue data path

Yuan Wang (1):
  vhost: support async dequeue for split ring

 doc/guides/prog_guide/vhost_lib.rst |  10 +
 doc/guides/sample_app_ug/vhost.rst  |   9 +-
 examples/vhost/ioat.c               |  67 +++-
 examples/vhost/ioat.h               |  25 ++
 examples/vhost/main.c               | 224 +++++++----
 examples/vhost/main.h               |  33 +-
 examples/vhost/virtio_net.c         |  16 +-
 lib/vhost/rte_vhost_async.h         |  44 +-
 lib/vhost/version.map               |   3 +
 lib/vhost/virtio_net.c              | 601 ++++++++++++++++++++++++++++
 10 files changed, 924 insertions(+), 108 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v5 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths.
  2021-07-05 18:11 ` [dpdk-dev] [PATCH v5 0/4] support async dequeue for split ring Wenwu Ma
@ 2021-07-05 18:11   ` Wenwu Ma
  2021-07-13 13:34     ` Maxime Coquelin
  2021-07-05 18:11   ` [dpdk-dev] [PATCH v5 2/4] examples/vhost: use a new API to query remaining ring space Wenwu Ma
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 50+ messages in thread
From: Wenwu Ma @ 2021-07-05 18:11 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, jiayu.hu, Wenwu Ma

Previously, the data path checked a flag to decide which enqueue/dequeue
functions to call.

Now, we use an ops table that is initialized when the vhost device is
created, so the data path can call the ops directly without any further
flag checks.
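
Condensed, the non-builtin branch of the change looks roughly like the
sketch below (illustrative only; in the patch the selection happens in
new_device() and the flag is the existing async_vhost_driver global):

#include <stdbool.h>
#include "ioat.h"	/* MAX_VHOST_DEVICE */
#include "main.h"	/* struct vhost_queue_ops and the burst helpers */

/* one ops entry per vhost device, filled once at device creation */
static struct vhost_queue_ops vdev_queue_ops[MAX_VHOST_DEVICE];

static void
select_datapath(int vid, bool use_async)
{
	vdev_queue_ops[vid].enqueue_pkt_burst =
		use_async ? async_enqueue_pkts : sync_enqueue_pkts;
	/* dequeue stays synchronous until the async dequeue patch */
	vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
}

/*
 * The data path then dispatches through the table with no flag check:
 *   count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
 *                   VIRTIO_RXQ, pkts, rx_count);
 */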

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 examples/vhost/main.c       | 112 ++++++++++++++++++++----------------
 examples/vhost/main.h       |  33 +++++++++--
 examples/vhost/virtio_net.c |  16 +++++-
 3 files changed, 105 insertions(+), 56 deletions(-)

diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index d2179eadb9..aebdc3a566 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -106,6 +106,8 @@ static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
 static char *socket_files;
 static int nb_sockets;
 
+static struct vhost_queue_ops vdev_queue_ops[MAX_VHOST_DEVICE];
+
 /* empty vmdq configuration structure. Filled in programatically */
 static struct rte_eth_conf vmdq_conf_default = {
 	.rxmode = {
@@ -885,27 +887,8 @@ drain_vhost(struct vhost_dev *vdev)
 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
 
-	if (builtin_net_driver) {
-		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
-	} else if (async_vhost_driver) {
-		uint32_t cpu_cpl_nr = 0;
-		uint16_t enqueue_fail = 0;
-		struct rte_mbuf *m_cpu_cpl[nr_xmit];
-
-		complete_async_pkts(vdev);
-		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
-					m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
-
-		if (cpu_cpl_nr)
-			free_pkts(m_cpu_cpl, cpu_cpl_nr);
-
-		enqueue_fail = nr_xmit - ret;
-		if (enqueue_fail)
-			free_pkts(&m[ret], nr_xmit - ret);
-	} else {
-		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
-						m, nr_xmit);
-	}
+	ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
+					VIRTIO_RXQ, m, nr_xmit);
 
 	if (enable_stats) {
 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
@@ -1184,6 +1167,36 @@ drain_mbuf_table(struct mbuf_table *tx_q)
 	}
 }
 
+uint16_t
+async_enqueue_pkts(struct vhost_dev *vdev, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint32_t rx_count)
+{
+	uint16_t enqueue_count;
+	uint32_t cpu_cpl_nr = 0;
+	uint16_t enqueue_fail = 0;
+	struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
+
+	complete_async_pkts(vdev);
+	enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
+				queue_id, pkts, rx_count,
+				m_cpu_cpl, &cpu_cpl_nr);
+	if (cpu_cpl_nr)
+		free_pkts(m_cpu_cpl, cpu_cpl_nr);
+
+	enqueue_fail = rx_count - enqueue_count;
+	if (enqueue_fail)
+		free_pkts(&pkts[enqueue_count], enqueue_fail);
+
+	return enqueue_count;
+}
+
+uint16_t
+sync_enqueue_pkts(struct vhost_dev *vdev, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint32_t rx_count)
+{
+	return rte_vhost_enqueue_burst(vdev->vid, queue_id, pkts, rx_count);
+}
+
 static __rte_always_inline void
 drain_eth_rx(struct vhost_dev *vdev)
 {
@@ -1214,29 +1227,8 @@ drain_eth_rx(struct vhost_dev *vdev)
 		}
 	}
 
-	if (builtin_net_driver) {
-		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
-						pkts, rx_count);
-	} else if (async_vhost_driver) {
-		uint32_t cpu_cpl_nr = 0;
-		uint16_t enqueue_fail = 0;
-		struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
-
-		complete_async_pkts(vdev);
-		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
-					VIRTIO_RXQ, pkts, rx_count,
-					m_cpu_cpl, &cpu_cpl_nr);
-		if (cpu_cpl_nr)
-			free_pkts(m_cpu_cpl, cpu_cpl_nr);
-
-		enqueue_fail = rx_count - enqueue_count;
-		if (enqueue_fail)
-			free_pkts(&pkts[enqueue_count], enqueue_fail);
-
-	} else {
-		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
-						pkts, rx_count);
-	}
+	enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
+						VIRTIO_RXQ, pkts, rx_count);
 
 	if (enable_stats) {
 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
@@ -1249,6 +1241,14 @@ drain_eth_rx(struct vhost_dev *vdev)
 		free_pkts(pkts, rx_count);
 }
 
+uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count)
+{
+	return rte_vhost_dequeue_burst(dev->vid, queue_id,
+					mbuf_pool, pkts, count);
+}
+
 static __rte_always_inline void
 drain_virtio_tx(struct vhost_dev *vdev)
 {
@@ -1256,13 +1256,8 @@ drain_virtio_tx(struct vhost_dev *vdev)
 	uint16_t count;
 	uint16_t i;
 
-	if (builtin_net_driver) {
-		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
-					pkts, MAX_PKT_BURST);
-	} else {
-		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
-					mbuf_pool, pkts, MAX_PKT_BURST);
-	}
+	count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
+				VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);
 
 	/* setup VMDq for the first packet */
 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
@@ -1436,6 +1431,21 @@ new_device(int vid)
 		}
 	}
 
+	if (builtin_net_driver) {
+		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
+		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
+	} else {
+		if (async_vhost_driver) {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+							async_enqueue_pkts;
+		} else {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+							sync_enqueue_pkts;
+		}
+
+		vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
+	}
+
 	if (builtin_net_driver)
 		vs_vhost_net_setup(vdev);
 
diff --git a/examples/vhost/main.h b/examples/vhost/main.h
index 0ccdce4b4a..7cd8a11a45 100644
--- a/examples/vhost/main.h
+++ b/examples/vhost/main.h
@@ -60,6 +60,19 @@ struct vhost_dev {
 	struct vhost_queue queues[MAX_QUEUE_PAIRS * 2];
 } __rte_cache_aligned;
 
+typedef uint16_t (*vhost_enqueue_burst_t)(struct vhost_dev *dev,
+			uint16_t queue_id, struct rte_mbuf **pkts,
+			uint32_t count);
+
+typedef uint16_t (*vhost_dequeue_burst_t)(struct vhost_dev *dev,
+			uint16_t queue_id, struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
+
+struct vhost_queue_ops {
+	vhost_enqueue_burst_t enqueue_pkt_burst;
+	vhost_dequeue_burst_t dequeue_pkt_burst;
+};
+
 TAILQ_HEAD(vhost_dev_tailq_list, vhost_dev);
 
 
@@ -84,9 +97,21 @@ struct lcore_info {
 void vs_vhost_net_setup(struct vhost_dev *dev);
 void vs_vhost_net_remove(struct vhost_dev *dev);
 uint16_t vs_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mbuf **pkts, uint32_t count);
+
+uint16_t builtin_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mbuf **pkts, uint32_t count);
+uint16_t builtin_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
+uint16_t sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 			 struct rte_mbuf **pkts, uint32_t count);
-
-uint16_t vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
-			 struct rte_mempool *mbuf_pool,
-			 struct rte_mbuf **pkts, uint16_t count);
+uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
+uint16_t async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			 struct rte_mbuf **pkts, uint32_t count);
+uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
 #endif /* _MAIN_H_ */
diff --git a/examples/vhost/virtio_net.c b/examples/vhost/virtio_net.c
index 9064fc3a82..2432a96566 100644
--- a/examples/vhost/virtio_net.c
+++ b/examples/vhost/virtio_net.c
@@ -238,6 +238,13 @@ vs_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 	return count;
 }
 
+uint16_t
+builtin_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint32_t count)
+{
+	return vs_enqueue_pkts(dev, queue_id, pkts, count);
+}
+
 static __rte_always_inline int
 dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 	    struct rte_mbuf *m, uint16_t desc_idx,
@@ -363,7 +370,7 @@ dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 	return 0;
 }
 
-uint16_t
+static uint16_t
 vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
 {
@@ -440,3 +447,10 @@ vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 
 	return i;
 }
+
+uint16_t
+builtin_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+{
+	return vs_dequeue_pkts(dev, queue_id, mbuf_pool, pkts, count);
+}
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v5 2/4] examples/vhost: use a new API to query remaining ring space
  2021-07-05 18:11 ` [dpdk-dev] [PATCH v5 0/4] support async dequeue for split ring Wenwu Ma
  2021-07-05 18:11   ` [dpdk-dev] [PATCH v5 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
@ 2021-07-05 18:11   ` Wenwu Ma
  2021-07-13 13:36     ` Maxime Coquelin
  2021-07-05 18:11   ` [dpdk-dev] [PATCH v5 3/4] vhost: support async dequeue for split ring Wenwu Ma
  2021-07-05 18:11   ` [dpdk-dev] [PATCH v5 4/4] examples/vhost: support vhost async dequeue data path Wenwu Ma
  3 siblings, 1 reply; 50+ messages in thread
From: Wenwu Ma @ 2021-07-05 18:11 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, jiayu.hu, Wenwu Ma

A new API for querying the remaining descriptor ring capacity
is available, so use it instead of tracking the free IOAT ring
space manually in the callbacks.
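
In the IOAT callbacks this reduces to checking the remaining capacity
right before enqueueing the copies of a packet, i.e. the pattern below
(dev_id and src are the local variables of ioat_transfer_data_cb(), as
in the hunk that follows):

	/* skip this packet if the IOAT ring cannot hold all its segments */
	if (rte_ioat_burst_capacity(dev_id) < src->nr_segs)
		break;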

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 examples/vhost/ioat.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c
index 2a2c2d7202..bf4e033bdb 100644
--- a/examples/vhost/ioat.c
+++ b/examples/vhost/ioat.c
@@ -17,7 +17,6 @@ struct packet_tracker {
 	unsigned short next_read;
 	unsigned short next_write;
 	unsigned short last_remain;
-	unsigned short ioat_space;
 };
 
 struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
@@ -113,7 +112,6 @@ open_ioat(const char *value)
 			goto out;
 		}
 		rte_rawdev_start(dev_id);
-		cb_tracker[dev_id].ioat_space = IOAT_RING_SIZE - 1;
 		dma_info->nr++;
 		i++;
 	}
@@ -140,7 +138,7 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
 			src = descs[i_desc].src;
 			dst = descs[i_desc].dst;
 			i_seg = 0;
-			if (cb_tracker[dev_id].ioat_space < src->nr_segs)
+			if (rte_ioat_burst_capacity(dev_id) < src->nr_segs)
 				break;
 			while (i_seg < src->nr_segs) {
 				rte_ioat_enqueue_copy(dev_id,
@@ -155,7 +153,6 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
 			}
 			write &= mask;
 			cb_tracker[dev_id].size_track[write] = src->nr_segs;
-			cb_tracker[dev_id].ioat_space -= src->nr_segs;
 			write++;
 		}
 	} else {
@@ -194,7 +191,6 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 		if (n_seg == 0)
 			return 0;
 
-		cb_tracker[dev_id].ioat_space += n_seg;
 		n_seg += cb_tracker[dev_id].last_remain;
 
 		read = cb_tracker[dev_id].next_read;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v5 3/4] vhost: support async dequeue for split ring
  2021-07-05 18:11 ` [dpdk-dev] [PATCH v5 0/4] support async dequeue for split ring Wenwu Ma
  2021-07-05 18:11   ` [dpdk-dev] [PATCH v5 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
  2021-07-05 18:11   ` [dpdk-dev] [PATCH v5 2/4] examples/vhost: use a new API to query remaining ring space Wenwu Ma
@ 2021-07-05 18:11   ` Wenwu Ma
  2021-07-13 14:30     ` Maxime Coquelin
  2021-07-05 18:11   ` [dpdk-dev] [PATCH v5 4/4] examples/vhost: support vhost async dequeue data path Wenwu Ma
  3 siblings, 1 reply; 50+ messages in thread
From: Wenwu Ma @ 2021-07-05 18:11 UTC (permalink / raw)
  To: dev
  Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, jiayu.hu, Yuan Wang, Wenwu Ma

From: Yuan Wang <yuanx.wang@intel.com>

This patch implements asynchronous dequeue data path for split ring.
A new asynchronous dequeue function is introduced. With this function,
the application can try to receive packets from the guest with
offloading large copies to the DMA engine, thus saving precious CPU
cycles.
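
The nr_inflight output also lets the caller wait until the DMA engine has
finished all outstanding dequeue copies, for instance before a vring state
change. A purely illustrative helper (only the rte_vhost and rte_mbuf
calls are existing API):

#include <rte_mbuf.h>
#include <rte_vhost_async.h>

/* Poll until no dequeue copies are left in flight on this virtqueue. */
static void
wait_dequeue_dma_idle(int vid, uint16_t queue_id,
		struct rte_mempool *mbuf_pool)
{
	struct rte_mbuf *pkts[32];
	int nr_inflight;
	uint16_t n;

	do {
		n = rte_vhost_async_try_dequeue_burst(vid, queue_id,
				mbuf_pool, pkts, 32, &nr_inflight);
		/* a real application would forward these packets */
		rte_pktmbuf_free_bulk(pkts, n);
	} while (nr_inflight > 0);
}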

Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 doc/guides/prog_guide/vhost_lib.rst |  10 +
 lib/vhost/rte_vhost_async.h         |  44 +-
 lib/vhost/version.map               |   3 +
 lib/vhost/virtio_net.c              | 601 ++++++++++++++++++++++++++++
 4 files changed, 655 insertions(+), 3 deletions(-)

diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
index d18fb98910..05c42c9b11 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -281,6 +281,16 @@ The following is an overview of some key Vhost API functions:
   Poll enqueue completion status from async data path. Completed packets
   are returned to applications through ``pkts``.
 
+* ``rte_vhost_async_try_dequeue_burst(vid, queue_id, mbuf_pool, pkts, count, nr_inflight)``
+
+  Try to receive packets from the guest, offloading large copies to
+  the DMA engine. Successfully dequeued packets have completed their
+  transfer and are returned in ``pkts``. There may be other packets
+  that were sent by the guest but are still being transferred by the
+  DMA engine, called in-flight packets. This function returns in-flight
+  packets only after the DMA engine finishes transferring them. The
+  number of packets currently in flight is returned in ``nr_inflight``.
+
 Vhost-user Implementations
 --------------------------
 
diff --git a/lib/vhost/rte_vhost_async.h b/lib/vhost/rte_vhost_async.h
index 6faa31f5ad..58019408f1 100644
--- a/lib/vhost/rte_vhost_async.h
+++ b/lib/vhost/rte_vhost_async.h
@@ -84,13 +84,21 @@ struct rte_vhost_async_channel_ops {
 };
 
 /**
- * inflight async packet information
+ * in-flight async packet information
  */
+struct async_nethdr {
+	struct virtio_net_hdr hdr;
+	bool valid;
+};
+
 struct async_inflight_info {
 	struct rte_mbuf *mbuf;
-	uint16_t descs; /* num of descs inflight */
+	union {
+		uint16_t descs; /* num of descs in-flight */
+		struct async_nethdr nethdr;
+	};
 	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
-};
+} __rte_cache_aligned;
 
 /**
  *  dma channel feature bit definition
@@ -193,4 +201,34 @@ __rte_experimental
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count);
 
+/**
+ * This function tries to receive packets from the guest with offloading
+ * large copies to the DMA engine. Successfully dequeued packets are
+ * transfer completed, either by the CPU or the DMA engine, and they are
+ * returned in "pkts". There may be other packets that were sent by
+ * the guest but are still being transferred by the DMA engine, called
+ * in-flight packets. The current number of in-flight packets is
+ * returned in "nr_inflight". This function returns in-flight packets
+ * only after the DMA engine finishes transferring them.
+ *
+ * @param vid
+ *  id of vhost device to dequeue data
+ * @param queue_id
+ *  queue id to dequeue data
+ * @param pkts
+ *  blank array to keep successfully dequeued packets
+ * @param count
+ *  size of the packet array
+ * @param nr_inflight
+ *  the number of packets still in flight by the time the call returns.
+ *  If an error occurred, its value is set to -1.
+ * @return
+ *  num of successfully dequeued packets
+ */
+__rte_experimental
+uint16_t
+rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
+	int *nr_inflight);
+
 #endif /* _RTE_VHOST_ASYNC_H_ */
diff --git a/lib/vhost/version.map b/lib/vhost/version.map
index 9103a23cd4..a320f889cd 100644
--- a/lib/vhost/version.map
+++ b/lib/vhost/version.map
@@ -79,4 +79,7 @@ EXPERIMENTAL {
 
 	# added in 21.05
 	rte_vhost_get_negotiated_protocol_features;
+
+	# added in 21.08
+	rte_vhost_async_try_dequeue_burst;
 };
diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index b93482587c..52237e8600 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -2673,6 +2673,32 @@ virtio_dev_pktmbuf_prep(struct virtio_net *dev, struct rte_mbuf *pkt,
 	return -1;
 }
 
+/*
+ * Allocate a host supported pktmbuf.
+ */
+static __rte_always_inline struct rte_mbuf *
+virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
+			 uint32_t data_len)
+{
+	struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
+
+	if (unlikely(pkt == NULL)) {
+		VHOST_LOG_DATA(ERR,
+			"Failed to allocate memory for mbuf.\n");
+		return NULL;
+	}
+
+	if (virtio_dev_pktmbuf_prep(dev, pkt, data_len)) {
+		/* Data doesn't fit into the buffer and the host supports
+		 * only linear buffers
+		 */
+		rte_pktmbuf_free(pkt);
+		return NULL;
+	}
+
+	return pkt;
+}
+
 __rte_always_inline
 static uint16_t
 virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
@@ -3147,3 +3173,578 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 
 	return count;
 }
+
+static __rte_always_inline int
+async_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		  struct buf_vector *buf_vec, uint16_t nr_vec,
+		  struct rte_mbuf *m, struct rte_mempool *mbuf_pool,
+		  struct iovec *src_iovec, struct iovec *dst_iovec,
+		  struct rte_vhost_iov_iter *src_it,
+		  struct rte_vhost_iov_iter *dst_it,
+		  struct async_nethdr *nethdr,
+		  bool legacy_ol_flags)
+{
+	uint64_t buf_addr, buf_iova;
+	uint64_t mapped_len;
+	uint32_t tlen = 0;
+	uint32_t buf_avail, buf_offset, buf_len;
+	uint32_t mbuf_avail, mbuf_offset;
+	uint32_t cpy_len, cpy_threshold;
+	/* A counter to avoid desc dead loop chain */
+	uint16_t vec_idx = 0;
+	int tvec_idx = 0;
+	struct rte_mbuf *cur = m, *prev = m;
+	struct virtio_net_hdr tmp_hdr;
+	struct virtio_net_hdr *hdr = NULL;
+	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
+
+	buf_addr = buf_vec[vec_idx].buf_addr;
+	buf_len = buf_vec[vec_idx].buf_len;
+	buf_iova = buf_vec[vec_idx].buf_iova;
+
+	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1))
+		return -1;
+
+	cpy_threshold = vq->async_threshold;
+
+	if (virtio_net_with_host_offload(dev)) {
+		if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
+			/*
+			 * No luck, the virtio-net header doesn't fit
+			 * in a contiguous virtual area.
+			 */
+			copy_vnet_hdr_from_desc(&tmp_hdr, buf_vec);
+			hdr = &tmp_hdr;
+		} else {
+			hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
+		}
+	}
+
+	/*
+	 * A virtio driver normally uses at least 2 desc buffers
+	 * for Tx: the first for storing the header, and others
+	 * for storing the data.
+	 */
+	if (unlikely(buf_len < dev->vhost_hlen)) {
+		buf_offset = dev->vhost_hlen - buf_len;
+		vec_idx++;
+		buf_addr = buf_vec[vec_idx].buf_addr;
+		buf_len = buf_vec[vec_idx].buf_len;
+		buf_avail  = buf_len - buf_offset;
+	} else if (buf_len == dev->vhost_hlen) {
+		if (unlikely(++vec_idx >= nr_vec))
+			return -1;
+		buf_addr = buf_vec[vec_idx].buf_addr;
+		buf_len = buf_vec[vec_idx].buf_len;
+
+		buf_offset = 0;
+		buf_avail = buf_len;
+	} else {
+		buf_offset = dev->vhost_hlen;
+		buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
+	}
+
+	PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
+			(uint32_t)buf_avail, 0);
+
+	mbuf_offset = 0;
+	mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
+	while (1) {
+		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
+
+		while (cpy_len && cpy_len >= cpy_threshold) {
+			void *hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
+						buf_iova + buf_offset, cpy_len,
+						&mapped_len);
+
+			if (unlikely(!hpa || mapped_len < cpy_threshold))
+				break;
+
+			async_fill_vec(src_iovec + tvec_idx, hpa,
+				(size_t)mapped_len);
+			async_fill_vec(dst_iovec + tvec_idx,
+				(void *)(uintptr_t)rte_pktmbuf_iova_offset(cur,
+							mbuf_offset),
+				(size_t)mapped_len);
+
+			tvec_idx++;
+			tlen += (uint32_t)mapped_len;
+			cpy_len -= (uint32_t)mapped_len;
+			mbuf_avail -= (uint32_t)mapped_len;
+			mbuf_offset += (uint32_t)mapped_len;
+			buf_avail -= (uint32_t)mapped_len;
+			buf_offset += (uint32_t)mapped_len;
+		}
+
+		if (cpy_len) {
+			if (vq->batch_copy_nb_elems >= vq->size ||
+				(hdr && cur == m)) {
+				rte_memcpy(
+					rte_pktmbuf_mtod_offset(cur, void *,
+							mbuf_offset),
+					(void *)((uintptr_t)(buf_addr +
+								buf_offset)),
+					cpy_len);
+			} else {
+				batch_copy[vq->batch_copy_nb_elems].dst =
+					rte_pktmbuf_mtod_offset(cur, void *,
+							mbuf_offset);
+				batch_copy[vq->batch_copy_nb_elems].src =
+					(void *)((uintptr_t)(buf_addr +
+								buf_offset));
+				batch_copy[vq->batch_copy_nb_elems].len =
+					cpy_len;
+				vq->batch_copy_nb_elems++;
+			}
+
+			mbuf_avail  -= cpy_len;
+			mbuf_offset += cpy_len;
+			buf_avail -= cpy_len;
+			buf_offset += cpy_len;
+		}
+
+		/* This buf reaches to its end, get the next one */
+		if (buf_avail == 0) {
+			if (++vec_idx >= nr_vec)
+				break;
+
+			buf_addr = buf_vec[vec_idx].buf_addr;
+			buf_len = buf_vec[vec_idx].buf_len;
+
+			buf_offset = 0;
+			buf_avail = buf_len;
+
+			PRINT_PACKET(dev, (uintptr_t)buf_addr,
+					(uint32_t)buf_avail, 0);
+		}
+
+		/*
+		 * This mbuf reaches to its end, get a new one
+		 * to hold more data.
+		 */
+		if (mbuf_avail == 0) {
+			cur = rte_pktmbuf_alloc(mbuf_pool);
+			if (unlikely(cur == NULL)) {
+				VHOST_LOG_DATA(ERR, "Failed to "
+					"allocate memory for mbuf.\n");
+				return -1;
+			}
+
+			prev->next = cur;
+			prev->data_len = mbuf_offset;
+			m->nb_segs += 1;
+			m->pkt_len += mbuf_offset;
+			prev = cur;
+
+			mbuf_offset = 0;
+			mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
+		}
+	}
+
+	prev->data_len = mbuf_offset;
+	m->pkt_len += mbuf_offset;
+
+	if (hdr && tlen) {
+		nethdr->valid = true;
+		nethdr->hdr = *hdr;
+	} else if (hdr)
+		vhost_dequeue_offload(hdr, m, legacy_ol_flags);
+
+	if (tlen) {
+		async_fill_iter(src_it, tlen, src_iovec, tvec_idx);
+		async_fill_iter(dst_it, tlen, dst_iovec, tvec_idx);
+	} else
+		src_it->count = 0;
+
+	return 0;
+}
+
+static __rte_always_inline uint16_t
+async_poll_dequeue_completed_split(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint16_t count, bool legacy_ol_flags)
+{
+	uint16_t n_pkts_cpl = 0, n_pkts_put = 0;
+	uint16_t start_idx, pkt_idx, from;
+	struct async_inflight_info *pkts_info;
+
+	pkt_idx = vq->async_pkts_idx & (vq->size - 1);
+	pkts_info = vq->async_pkts_info;
+	start_idx = virtio_dev_rx_async_get_info_idx(pkt_idx, vq->size,
+			vq->async_pkts_inflight_n);
+
+	if (count > vq->async_last_pkts_n) {
+		n_pkts_cpl = vq->async_ops.check_completed_copies(dev->vid,
+			queue_id, 0, count - vq->async_last_pkts_n);
+	}
+
+	n_pkts_cpl += vq->async_last_pkts_n;
+	if (unlikely(n_pkts_cpl == 0))
+		return 0;
+
+	n_pkts_put = RTE_MIN(count, n_pkts_cpl);
+
+	for (pkt_idx = 0; pkt_idx < n_pkts_put; pkt_idx++) {
+		from = (start_idx + pkt_idx) & (vq->size - 1);
+		pkts[pkt_idx] = pkts_info[from].mbuf;
+
+		if (pkts_info[from].nethdr.valid) {
+			vhost_dequeue_offload(&pkts_info[from].nethdr.hdr,
+					pkts[pkt_idx], legacy_ol_flags);
+		}
+	}
+	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
+
+	if (n_pkts_put) {
+		/* write back completed descs to used ring */
+		write_back_completed_descs_split(vq, n_pkts_put);
+		/* update used ring */
+		__atomic_add_fetch(&vq->used->idx,
+				n_pkts_put, __ATOMIC_RELEASE);
+
+		vq->async_pkts_inflight_n -= n_pkts_put;
+	}
+
+	return n_pkts_put;
+}
+
+static __rte_always_inline uint16_t
+virtio_dev_tx_async_split(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t count, bool legacy_ol_flags)
+{
+	static bool allocerr_warned;
+	uint16_t pkt_idx;
+	uint16_t free_entries;
+	uint16_t slot_idx = 0;
+	uint16_t segs_await = 0;
+	uint16_t nr_done_pkts = 0, nr_async_pkts = 0, nr_async_cmpl_pkts = 0;
+	uint16_t nr_async_burst = 0;
+	uint16_t pkt_err = 0;
+	uint16_t iovec_idx = 0, it_idx = 0;
+
+	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
+	struct iovec *vec_pool = vq->vec_pool;
+	struct iovec *src_iovec = vec_pool;
+	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
+
+	struct async_pkt_index {
+		uint16_t last_avail_idx;
+	} async_pkts_log[MAX_PKT_BURST];
+
+	/**
+	 * The ordering between avail index and
+	 * desc reads needs to be enforced.
+	 */
+	free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
+			vq->last_avail_idx;
+	if (free_entries == 0)
+		goto out;
+
+	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
+
+	count = RTE_MIN(count, MAX_PKT_BURST);
+	count = RTE_MIN(count, free_entries);
+	VHOST_LOG_DATA(DEBUG, "(%d) about to dequeue %u buffers\n",
+			dev->vid, count);
+
+	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+		uint16_t head_idx = 0;
+		uint16_t nr_vec = 0;
+		uint32_t buf_len;
+		int err;
+		struct buf_vector buf_vec[BUF_VECTOR_MAX];
+		struct rte_mbuf *pkt;
+
+		if (unlikely(fill_vec_buf_split(dev, vq, vq->last_avail_idx,
+						&nr_vec, buf_vec,
+						&head_idx, &buf_len,
+						VHOST_ACCESS_RO) < 0))
+			break;
+
+		pkt = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
+		if (unlikely(pkt == NULL)) {
+			/**
+			 * mbuf allocation fails for jumbo packets when external
+			 * buffer allocation is not allowed and linear buffer
+			 * is required. Drop this packet.
+			 */
+			if (!allocerr_warned) {
+				VHOST_LOG_DATA(ERR,
+					"Failed mbuf alloc of size %d from %s on %s.\n",
+					buf_len, mbuf_pool->name, dev->ifname);
+				allocerr_warned = true;
+			}
+			break;
+		}
+
+		slot_idx = (vq->async_pkts_idx + nr_async_pkts) &
+				(vq->size - 1);
+		err = async_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkt,
+				mbuf_pool, &src_iovec[iovec_idx],
+				&dst_iovec[iovec_idx], &it_pool[it_idx],
+				&it_pool[it_idx + 1],
+				&pkts_info[slot_idx].nethdr, legacy_ol_flags);
+		if (unlikely(err)) {
+			rte_pktmbuf_free(pkt);
+			if (!allocerr_warned) {
+				VHOST_LOG_DATA(ERR,
+					"Failed to copy desc to mbuf on %s.\n",
+					dev->ifname);
+				allocerr_warned = true;
+			}
+			break;
+		}
+
+		if (it_pool[it_idx].count) {
+			uint16_t to = vq->async_desc_idx_split & (vq->size - 1);
+
+			async_fill_desc(&tdes[nr_async_burst], &it_pool[it_idx],
+				&it_pool[it_idx + 1]);
+			pkts_info[slot_idx].mbuf = pkt;
+			async_pkts_log[nr_async_pkts++].last_avail_idx =
+				vq->last_avail_idx;
+			nr_async_burst++;
+			iovec_idx += it_pool[it_idx].nr_segs;
+			segs_await += it_pool[it_idx].nr_segs;
+			it_idx += 2;
+
+			/* keep used desc */
+			vq->async_descs_split[to].id = head_idx;
+			vq->async_descs_split[to].len = 0;
+			vq->async_desc_idx_split++;
+		} else {
+			update_shadow_used_ring_split(vq, head_idx, 0);
+			pkts[nr_done_pkts++] = pkt;
+		}
+
+		vq->last_avail_idx++;
+
+		if (unlikely((nr_async_burst >= VHOST_ASYNC_BATCH_THRESHOLD) ||
+					((VHOST_MAX_ASYNC_VEC >> 1) -
+					 segs_await < BUF_VECTOR_MAX))) {
+			uint16_t nr_pkts;
+
+			nr_pkts = vq->async_ops.transfer_data(dev->vid,
+					queue_id, tdes, 0, nr_async_burst);
+			src_iovec = vec_pool;
+			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+			it_idx = 0;
+			segs_await = 0;
+			vq->async_pkts_inflight_n += nr_pkts;
+
+			if (unlikely(nr_pkts < nr_async_burst)) {
+				pkt_err = nr_async_burst - nr_pkts;
+				nr_async_burst = 0;
+				break;
+			}
+			nr_async_burst = 0;
+		}
+	}
+
+	if (nr_async_burst) {
+		uint32_t nr_pkts;
+
+		nr_pkts = vq->async_ops.transfer_data(dev->vid, queue_id,
+				tdes, 0, nr_async_burst);
+		vq->async_pkts_inflight_n += nr_pkts;
+
+		if (unlikely(nr_pkts < nr_async_burst))
+			pkt_err = nr_async_burst - nr_pkts;
+	}
+
+	do_data_copy_dequeue(vq);
+
+	if (unlikely(pkt_err)) {
+		uint16_t nr_err_dma = pkt_err;
+		uint16_t nr_err_sw;
+
+		nr_async_pkts -= nr_err_dma;
+
+		/**
+		 * revert shadow used ring and free pktmbufs for
+		 * CPU-copied pkts after the first DMA-error pkt.
+		 */
+		nr_err_sw = vq->last_avail_idx -
+			async_pkts_log[nr_async_pkts].last_avail_idx -
+			nr_err_dma;
+		vq->shadow_used_idx -= nr_err_sw;
+		while (nr_err_sw-- > 0)
+			rte_pktmbuf_free(pkts[--nr_done_pkts]);
+
+		/**
+		 * recover DMA-copy related structures and free pktmbufs
+		 * for DMA-error pkts.
+		 */
+		vq->async_desc_idx_split -= nr_err_dma;
+		while (nr_err_dma-- > 0) {
+			rte_pktmbuf_free(
+				pkts_info[slot_idx & (vq->size - 1)].mbuf);
+			slot_idx--;
+		}
+
+		/* recover available ring */
+		vq->last_avail_idx =
+			async_pkts_log[nr_async_pkts].last_avail_idx;
+	}
+
+	vq->async_pkts_idx += nr_async_pkts;
+
+	if (likely(vq->shadow_used_idx))
+		flush_shadow_used_ring_split(dev, vq);
+
+out:
+	if (nr_done_pkts < count && vq->async_pkts_inflight_n > 0) {
+		nr_async_cmpl_pkts = async_poll_dequeue_completed_split(dev, vq,
+					queue_id, &pkts[nr_done_pkts],
+					count - nr_done_pkts,
+					legacy_ol_flags);
+		nr_done_pkts += nr_async_cmpl_pkts;
+	}
+	if (likely(nr_done_pkts))
+		vhost_vring_call_split(dev, vq);
+
+	return nr_done_pkts;
+}
+
+__rte_noinline
+static uint16_t
+virtio_dev_tx_async_split_legacy(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t count)
+{
+	return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool,
+				pkts, count, true);
+}
+
+__rte_noinline
+static uint16_t
+virtio_dev_tx_async_split_compliant(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t count)
+{
+	return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool,
+				pkts, count, false);
+}
+
+uint16_t
+rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
+	int *nr_inflight)
+{
+	struct virtio_net *dev;
+	struct rte_mbuf *rarp_mbuf = NULL;
+	struct vhost_virtqueue *vq;
+	int16_t success = 1;
+
+	*nr_inflight = -1;
+
+	dev = get_device(vid);
+	if (!dev)
+		return 0;
+
+	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
+		VHOST_LOG_DATA(ERR,
+			"(%d) %s: built-in vhost net backend is disabled.\n",
+			dev->vid, __func__);
+		return 0;
+	}
+
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
+		VHOST_LOG_DATA(ERR,
+			"(%d) %s: invalid virtqueue idx %d.\n",
+			dev->vid, __func__, queue_id);
+		return 0;
+	}
+
+	vq = dev->virtqueue[queue_id];
+
+	if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
+		return 0;
+
+	if (unlikely(vq->enabled == 0)) {
+		count = 0;
+		goto out_access_unlock;
+	}
+
+	if (unlikely(!vq->async_registered)) {
+		VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
+			dev->vid, __func__, queue_id);
+		count = 0;
+		goto out_access_unlock;
+	}
+
+	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+		vhost_user_iotlb_rd_lock(vq);
+
+	if (unlikely(vq->access_ok == 0))
+		if (unlikely(vring_translate(dev, vq) < 0)) {
+			count = 0;
+			goto out;
+		}
+
+	/*
+	 * Construct a RARP broadcast packet, and inject it to the "pkts"
+	 * array, so that it looks like the guest actually sent such a packet.
+	 *
+	 * Check user_send_rarp() for more information.
+	 *
+	 * broadcast_rarp shares a cacheline in the virtio_net structure
+	 * with some fields that are accessed during enqueue and
+	 * __atomic_compare_exchange_n causes a write if performed compare
+	 * and exchange. This could result in false sharing between enqueue
+	 * and dequeue.
+	 *
+	 * Prevent unnecessary false sharing by reading broadcast_rarp first
+	 * and only performing compare and exchange if the read indicates it
+	 * is likely to be set.
+	 */
+	if (unlikely(__atomic_load_n(&dev->broadcast_rarp, __ATOMIC_ACQUIRE) &&
+			__atomic_compare_exchange_n(&dev->broadcast_rarp,
+			&success, 0, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED))) {
+
+		rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
+		if (rarp_mbuf == NULL) {
+			VHOST_LOG_DATA(ERR, "Failed to make RARP packet.\n");
+			count = 0;
+			goto out;
+		}
+		count -= 1;
+	}
+
+	if (unlikely(vq_is_packed(dev))) {
+		count = 0;
+		goto out;
+	}
+
+	if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
+		count = virtio_dev_tx_async_split_legacy(dev, vq, queue_id,
+				mbuf_pool, pkts, count);
+	else
+		count = virtio_dev_tx_async_split_compliant(dev, vq, queue_id,
+				mbuf_pool, pkts, count);
+
+out:
+	*nr_inflight = vq->async_pkts_inflight_n;
+
+	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+		vhost_user_iotlb_rd_unlock(vq);
+
+out_access_unlock:
+	rte_spinlock_unlock(&vq->access_lock);
+
+	if (unlikely(rarp_mbuf != NULL)) {
+		/*
+		 * Inject it to the head of "pkts" array, so that switch's mac
+		 * learning table will get updated first.
+		 */
+		memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *));
+		pkts[0] = rarp_mbuf;
+		count += 1;
+	}
+
+	return count;
+}
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v5 4/4] examples/vhost: support vhost async dequeue data path
  2021-07-05 18:11 ` [dpdk-dev] [PATCH v5 0/4] support async dequeue for split ring Wenwu Ma
                     ` (2 preceding siblings ...)
  2021-07-05 18:11   ` [dpdk-dev] [PATCH v5 3/4] vhost: support async dequeue for split ring Wenwu Ma
@ 2021-07-05 18:11   ` Wenwu Ma
  2021-07-13 17:01     ` Maxime Coquelin
  3 siblings, 1 reply; 50+ messages in thread
From: Wenwu Ma @ 2021-07-05 18:11 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, jiayu.hu, Wenwu Ma

This patch adds the vhost async dequeue data path to the vhost sample.
The vswitch can leverage IOAT to accelerate the vhost async dequeue path.

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 doc/guides/sample_app_ug/vhost.rst |   9 +-
 examples/vhost/ioat.c              |  61 ++++++++++---
 examples/vhost/ioat.h              |  25 ++++++
 examples/vhost/main.c              | 140 ++++++++++++++++++++---------
 4 files changed, 177 insertions(+), 58 deletions(-)

diff --git a/doc/guides/sample_app_ug/vhost.rst b/doc/guides/sample_app_ug/vhost.rst
index 9afde9c7f5..63dcf181e1 100644
--- a/doc/guides/sample_app_ug/vhost.rst
+++ b/doc/guides/sample_app_ug/vhost.rst
@@ -169,9 +169,12 @@ demonstrates how to use the async vhost APIs. It's used in combination with dmas
 **--dmas**
 This parameter is used to specify the assigned DMA device of a vhost device.
 Async vhost-user net driver will be used if --dmas is set. For example
---dmas [txd0@00:04.0,txd1@00:04.1] means use DMA channel 00:04.0 for vhost
-device 0 enqueue operation and use DMA channel 00:04.1 for vhost device 1
-enqueue operation.
+--dmas [txd0@00:04.0,txd1@00:04.1,rxd0@00:04.2,rxd1@00:04.3] means use
+DMA channels 00:04.0/00:04.2 for vhost device 0 enqueue/dequeue operations
+and DMA channels 00:04.1/00:04.3 for vhost device 1 enqueue/dequeue
+operations. The device index follows the order of the socket files: vhost
+device 0 is created through the first socket file, vhost device 1 through
+the second socket file, and so on.
 
 Common Issues
 -------------
diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c
index bf4e033bdb..a305100b47 100644
--- a/examples/vhost/ioat.c
+++ b/examples/vhost/ioat.c
@@ -21,6 +21,8 @@ struct packet_tracker {
 
 struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
 
+int vid2socketid[MAX_VHOST_DEVICE];
+
 int
 open_ioat(const char *value)
 {
@@ -29,7 +31,7 @@ open_ioat(const char *value)
 	char *addrs = input;
 	char *ptrs[2];
 	char *start, *end, *substr;
-	int64_t vid, vring_id;
+	int64_t socketid, vring_id;
 	struct rte_ioat_rawdev_config config;
 	struct rte_rawdev_info info = { .dev_private = &config };
 	char name[32];
@@ -60,6 +62,8 @@ open_ioat(const char *value)
 		goto out;
 	}
 	while (i < args_nr) {
+		char *txd, *rxd;
+		bool is_txd;
 		char *arg_temp = dma_arg[i];
 		uint8_t sub_nr;
 		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
@@ -68,27 +72,38 @@ open_ioat(const char *value)
 			goto out;
 		}
 
-		start = strstr(ptrs[0], "txd");
-		if (start == NULL) {
+		int async_flag;
+		txd = strstr(ptrs[0], "txd");
+		rxd = strstr(ptrs[0], "rxd");
+		if (txd == NULL && rxd == NULL) {
 			ret = -1;
 			goto out;
+		} else if (txd) {
+			is_txd = true;
+			start = txd;
+			async_flag = ASYNC_RX_VHOST;
+		} else {
+			is_txd = false;
+			start = rxd;
+			async_flag = ASYNC_TX_VHOST;
 		}
 
 		start += 3;
-		vid = strtol(start, &end, 0);
+		socketid = strtol(start, &end, 0);
 		if (end == start) {
 			ret = -1;
 			goto out;
 		}
 
-		vring_id = 0 + VIRTIO_RXQ;
+		vring_id = is_txd ? VIRTIO_RXQ : VIRTIO_TXQ;
+
 		if (rte_pci_addr_parse(ptrs[1],
-				&(dma_info + vid)->dmas[vring_id].addr) < 0) {
+			&(dma_info + socketid)->dmas[vring_id].addr) < 0) {
 			ret = -1;
 			goto out;
 		}
 
-		rte_pci_device_name(&(dma_info + vid)->dmas[vring_id].addr,
+		rte_pci_device_name(&(dma_info + socketid)->dmas[vring_id].addr,
 				name, sizeof(name));
 		dev_id = rte_rawdev_get_dev_id(name);
 		if (dev_id == (uint16_t)(-ENODEV) ||
@@ -103,8 +118,9 @@ open_ioat(const char *value)
 			goto out;
 		}
 
-		(dma_info + vid)->dmas[vring_id].dev_id = dev_id;
-		(dma_info + vid)->dmas[vring_id].is_valid = true;
+		(dma_info + socketid)->dmas[vring_id].dev_id = dev_id;
+		(dma_info + socketid)->dmas[vring_id].is_valid = true;
+		(dma_info + socketid)->async_flag |= async_flag;
 		config.ring_size = IOAT_RING_SIZE;
 		config.hdls_disable = true;
 		if (rte_rawdev_configure(dev_id, &info, sizeof(config)) < 0) {
@@ -126,13 +142,16 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data, uint16_t count)
 {
 	uint32_t i_desc;
-	uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2 + VIRTIO_RXQ].dev_id;
 	struct rte_vhost_iov_iter *src = NULL;
 	struct rte_vhost_iov_iter *dst = NULL;
 	unsigned long i_seg;
 	unsigned short mask = MAX_ENQUEUED_SIZE - 1;
-	unsigned short write = cb_tracker[dev_id].next_write;
 
+	if (queue_id >= MAX_RING_COUNT)
+		return -1;
+
+	uint16_t dev_id = dma_bind[vid2socketid[vid]].dmas[queue_id].dev_id;
+	unsigned short write = cb_tracker[dev_id].next_write;
 	if (!opaque_data) {
 		for (i_desc = 0; i_desc < count; i_desc++) {
 			src = descs[i_desc].src;
@@ -170,16 +189,16 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data,
 		uint16_t max_packets)
 {
-	if (!opaque_data) {
+	if (!opaque_data && (queue_id < MAX_RING_COUNT)) {
 		uintptr_t dump[255];
 		int n_seg;
 		unsigned short read, write;
 		unsigned short nb_packet = 0;
 		unsigned short mask = MAX_ENQUEUED_SIZE - 1;
 		unsigned short i;
+		uint16_t dev_id;
 
-		uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2
-				+ VIRTIO_RXQ].dev_id;
+		dev_id = dma_bind[vid2socketid[vid]].dmas[queue_id].dev_id;
 		n_seg = rte_ioat_completed_ops(dev_id, 255, NULL, NULL, dump, dump);
 		if (n_seg < 0) {
 			RTE_LOG(ERR,
@@ -215,4 +234,18 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 	return -1;
 }
 
+uint32_t get_async_flag_by_vid(int vid)
+{
+	return dma_bind[vid2socketid[vid]].async_flag;
+}
+
+uint32_t get_async_flag_by_socketid(int socketid)
+{
+	return dma_bind[socketid].async_flag;
+}
+
+void init_vid2socketid_array(int vid, int socketid)
+{
+	vid2socketid[vid] = socketid;
+}
 #endif /* RTE_RAW_IOAT */
diff --git a/examples/vhost/ioat.h b/examples/vhost/ioat.h
index 1aa28ed6a3..51111d65af 100644
--- a/examples/vhost/ioat.h
+++ b/examples/vhost/ioat.h
@@ -12,6 +12,9 @@
 #define MAX_VHOST_DEVICE 1024
 #define IOAT_RING_SIZE 4096
 #define MAX_ENQUEUED_SIZE 4096
+#define MAX_RING_COUNT	2
+#define ASYNC_RX_VHOST	1
+#define ASYNC_TX_VHOST	2
 
 struct dma_info {
 	struct rte_pci_addr addr;
@@ -20,6 +23,7 @@ struct dma_info {
 };
 
 struct dma_for_vhost {
+	int async_flag;
 	struct dma_info dmas[RTE_MAX_QUEUES_PER_PORT * 2];
 	uint16_t nr;
 };
@@ -36,6 +40,10 @@ uint32_t
 ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data,
 		uint16_t max_packets);
+
+uint32_t get_async_flag_by_vid(int vid);
+uint32_t get_async_flag_by_socketid(int socketid);
+void init_vid2socketid_array(int vid, int socketid);
 #else
 static int open_ioat(const char *value __rte_unused)
 {
@@ -59,5 +67,22 @@ ioat_check_completed_copies_cb(int vid __rte_unused,
 {
 	return -1;
 }
+
+static uint32_t
+get_async_flag_by_vid(int vid __rte_unused)
+{
+	return 0;
+}
+
+static uint32_t
+get_async_flag_by_socketid(int socketid __rte_unused)
+{
+	return 0;
+}
+
+static void
+init_vid2socketid_array(int vid __rte_unused, int socketid __rte_unused)
+{
+}
 #endif
 #endif /* _IOAT_H_ */
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index aebdc3a566..81d7e4cbd3 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -93,8 +93,6 @@ static int client_mode;
 
 static int builtin_net_driver;
 
-static int async_vhost_driver;
-
 static char *dma_type;
 
 /* Specify timeout (in useconds) between retries on RX. */
@@ -679,7 +677,6 @@ us_vhost_parse_args(int argc, char **argv)
 				us_vhost_usage(prgname);
 				return -1;
 			}
-			async_vhost_driver = 1;
 			break;
 
 		case OPT_CLIENT_NUM:
@@ -897,7 +894,7 @@ drain_vhost(struct vhost_dev *vdev)
 				__ATOMIC_SEQ_CST);
 	}
 
-	if (!async_vhost_driver)
+	if ((get_async_flag_by_vid(vdev->vid) & ASYNC_RX_VHOST) == 0)
 		free_pkts(m, nr_xmit);
 }
 
@@ -1237,10 +1234,19 @@ drain_eth_rx(struct vhost_dev *vdev)
 				__ATOMIC_SEQ_CST);
 	}
 
-	if (!async_vhost_driver)
+	if ((get_async_flag_by_vid(vdev->vid) & ASYNC_RX_VHOST) == 0)
 		free_pkts(pkts, rx_count);
 }
 
+uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+				struct rte_mempool *mbuf_pool,
+				struct rte_mbuf **pkts, uint16_t count)
+{
+	int nr_inflight;
+	return rte_vhost_async_try_dequeue_burst(dev->vid, queue_id,
+			mbuf_pool, pkts, count, &nr_inflight);
+}
+
 uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 			struct rte_mempool *mbuf_pool,
 			struct rte_mbuf **pkts, uint16_t count)
@@ -1392,12 +1398,90 @@ destroy_device(int vid)
 		"(%d) device has been removed from data core\n",
 		vdev->vid);
 
-	if (async_vhost_driver)
+	if (get_async_flag_by_vid(vid) & ASYNC_RX_VHOST)
 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
+	if (get_async_flag_by_vid(vid) & ASYNC_TX_VHOST)
+		rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
 
 	rte_free(vdev);
 }
 
+static int
+get_socketid_by_vid(int vid)
+{
+	int i;
+	char ifname[PATH_MAX];
+	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
+
+	for (i = 0; i < nb_sockets; i++) {
+		char *file = socket_files + i * PATH_MAX;
+		if (strcmp(file, ifname) == 0)
+			return i;
+	}
+
+	return -1;
+}
+
+static int
+init_vhost_queue_ops(int vid)
+{
+	int socketid = get_socketid_by_vid(vid);
+	if (socketid == -1)
+		return -1;
+
+	init_vid2socketid_array(vid, socketid);
+	if (builtin_net_driver) {
+		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
+		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
+	} else {
+		if (get_async_flag_by_vid(vid) & ASYNC_RX_VHOST) {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+						async_enqueue_pkts;
+		} else {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+						sync_enqueue_pkts;
+		}
+
+		if (get_async_flag_by_vid(vid) & ASYNC_TX_VHOST) {
+			vdev_queue_ops[vid].dequeue_pkt_burst =
+						async_dequeue_pkts;
+		} else {
+			vdev_queue_ops[vid].dequeue_pkt_burst =
+						sync_dequeue_pkts;
+		}
+	}
+
+	return 0;
+}
+
+static int
+vhost_async_channel_register(int vid)
+{
+	int ret = 0;
+	struct rte_vhost_async_features f;
+	struct rte_vhost_async_channel_ops channel_ops;
+
+	if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
+		channel_ops.transfer_data = ioat_transfer_data_cb;
+		channel_ops.check_completed_copies =
+			ioat_check_completed_copies_cb;
+
+		f.async_inorder = 1;
+		f.async_threshold = 256;
+
+		if (get_async_flag_by_vid(vid) & ASYNC_RX_VHOST) {
+			ret |= rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
+					f.intval, &channel_ops);
+		}
+		if (get_async_flag_by_vid(vid) & ASYNC_TX_VHOST) {
+			ret |= rte_vhost_async_channel_register(vid, VIRTIO_TXQ,
+					f.intval, &channel_ops);
+		}
+	}
+
+	return ret;
+}
+
 /*
  * A new device is added to a data core. First the device is added to the main linked list
  * and then allocated to a specific data core.
@@ -1431,20 +1515,8 @@ new_device(int vid)
 		}
 	}
 
-	if (builtin_net_driver) {
-		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
-		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
-	} else {
-		if (async_vhost_driver) {
-			vdev_queue_ops[vid].enqueue_pkt_burst =
-							async_enqueue_pkts;
-		} else {
-			vdev_queue_ops[vid].enqueue_pkt_burst =
-							sync_enqueue_pkts;
-		}
-
-		vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
-	}
+	if (init_vhost_queue_ops(vid) != 0)
+		return -1;
 
 	if (builtin_net_driver)
 		vs_vhost_net_setup(vdev);
@@ -1473,28 +1545,13 @@ new_device(int vid)
 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
 
+	int ret = vhost_async_channel_register(vid);
+
 	RTE_LOG(INFO, VHOST_DATA,
 		"(%d) device has been added to data core %d\n",
 		vid, vdev->coreid);
 
-	if (async_vhost_driver) {
-		struct rte_vhost_async_features f;
-		struct rte_vhost_async_channel_ops channel_ops;
-
-		if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
-			channel_ops.transfer_data = ioat_transfer_data_cb;
-			channel_ops.check_completed_copies =
-				ioat_check_completed_copies_cb;
-
-			f.async_inorder = 1;
-			f.async_threshold = 256;
-
-			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
-				f.intval, &channel_ops);
-		}
-	}
-
-	return 0;
+	return ret;
 }
 
 /*
@@ -1735,10 +1792,11 @@ main(int argc, char *argv[])
 	for (i = 0; i < nb_sockets; i++) {
 		char *file = socket_files + i * PATH_MAX;
 
-		if (async_vhost_driver)
-			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
+		uint64_t flag = flags;
+		if (get_async_flag_by_socketid(i) != 0)
+			flag |= RTE_VHOST_USER_ASYNC_COPY;
 
-		ret = rte_vhost_driver_register(file, flags);
+		ret = rte_vhost_driver_register(file, flag);
 		if (ret != 0) {
 			unregister_drivers(i);
 			rte_exit(EXIT_FAILURE,
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [dpdk-dev] [PATCH v5 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths.
  2021-07-05 18:11   ` [dpdk-dev] [PATCH v5 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
@ 2021-07-13 13:34     ` Maxime Coquelin
  0 siblings, 0 replies; 50+ messages in thread
From: Maxime Coquelin @ 2021-07-13 13:34 UTC (permalink / raw)
  To: Wenwu Ma, dev; +Cc: chenbo.xia, cheng1.jiang, jiayu.hu

Hi Wenwu,

Please do not add a dot in the commit title.

On 7/5/21 8:11 PM, Wenwu Ma wrote:
> Previously, by judging the flag, we call different enqueue/dequeue
> functions in data path.
> 
> Now, we use an ops that was initialized when Vhost was created,
> so that we can call ops directly in Vhost data path without any more
> flag judgment.
> 
> Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
> ---
>  examples/vhost/main.c       | 112 ++++++++++++++++++++----------------
>  examples/vhost/main.h       |  33 +++++++++--
>  examples/vhost/virtio_net.c |  16 +++++-
>  3 files changed, 105 insertions(+), 56 deletions(-)
> 

Other than that, it looks good to me.
The typo in the title can be fixed while applying:

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [dpdk-dev] [PATCH v5 2/4] examples/vhost: use a new API to query remaining ring space
  2021-07-05 18:11   ` [dpdk-dev] [PATCH v5 2/4] examples/vhost: use a new API to query remaining ring space Wenwu Ma
@ 2021-07-13 13:36     ` Maxime Coquelin
  0 siblings, 0 replies; 50+ messages in thread
From: Maxime Coquelin @ 2021-07-13 13:36 UTC (permalink / raw)
  To: Wenwu Ma, dev; +Cc: chenbo.xia, cheng1.jiang, jiayu.hu



On 7/5/21 8:11 PM, Wenwu Ma wrote:
> A new API for querying the remaining descriptor ring capacity
> is available, so we use the new one instead of the old one.
> 
> Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
> ---
>  examples/vhost/ioat.c | 6 +-----
>  1 file changed, 1 insertion(+), 5 deletions(-)
> 
> diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c
> index 2a2c2d7202..bf4e033bdb 100644
> --- a/examples/vhost/ioat.c
> +++ b/examples/vhost/ioat.c
> @@ -17,7 +17,6 @@ struct packet_tracker {
>  	unsigned short next_read;
>  	unsigned short next_write;
>  	unsigned short last_remain;
> -	unsigned short ioat_space;
>  };
>  
>  struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
> @@ -113,7 +112,6 @@ open_ioat(const char *value)
>  			goto out;
>  		}
>  		rte_rawdev_start(dev_id);
> -		cb_tracker[dev_id].ioat_space = IOAT_RING_SIZE - 1;
>  		dma_info->nr++;
>  		i++;
>  	}
> @@ -140,7 +138,7 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
>  			src = descs[i_desc].src;
>  			dst = descs[i_desc].dst;
>  			i_seg = 0;
> -			if (cb_tracker[dev_id].ioat_space < src->nr_segs)
> +			if (rte_ioat_burst_capacity(dev_id) < src->nr_segs)
>  				break;
>  			while (i_seg < src->nr_segs) {
>  				rte_ioat_enqueue_copy(dev_id,
> @@ -155,7 +153,6 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
>  			}
>  			write &= mask;
>  			cb_tracker[dev_id].size_track[write] = src->nr_segs;
> -			cb_tracker[dev_id].ioat_space -= src->nr_segs;
>  			write++;
>  		}
>  	} else {
> @@ -194,7 +191,6 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
>  		if (n_seg == 0)
>  			return 0;
>  
> -		cb_tracker[dev_id].ioat_space += n_seg;
>  		n_seg += cb_tracker[dev_id].last_remain;
>  
>  		read = cb_tracker[dev_id].next_read;
> 

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [dpdk-dev] [PATCH v5 3/4] vhost: support async dequeue for split ring
  2021-07-05 18:11   ` [dpdk-dev] [PATCH v5 3/4] vhost: support async dequeue for split ring Wenwu Ma
@ 2021-07-13 14:30     ` Maxime Coquelin
  2021-07-14  6:50       ` Hu, Jiayu
  0 siblings, 1 reply; 50+ messages in thread
From: Maxime Coquelin @ 2021-07-13 14:30 UTC (permalink / raw)
  To: Wenwu Ma, dev; +Cc: chenbo.xia, cheng1.jiang, jiayu.hu, Yuan Wang



On 7/5/21 8:11 PM, Wenwu Ma wrote:
> From: Yuan Wang <yuanx.wang@intel.com>
> 
> This patch implements asynchronous dequeue data path for split ring.
> A new asynchronous dequeue function is introduced. With this function,
> the application can try to receive packets from the guest with
> offloading large copies to the DMA engine, thus saving precious CPU
> cycles.
> 
> Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
> ---
>  doc/guides/prog_guide/vhost_lib.rst |  10 +
>  lib/vhost/rte_vhost_async.h         |  44 +-
>  lib/vhost/version.map               |   3 +
>  lib/vhost/virtio_net.c              | 601 ++++++++++++++++++++++++++++
>  4 files changed, 655 insertions(+), 3 deletions(-)
> 
> diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
> index d18fb98910..05c42c9b11 100644
> --- a/doc/guides/prog_guide/vhost_lib.rst
> +++ b/doc/guides/prog_guide/vhost_lib.rst
> @@ -281,6 +281,16 @@ The following is an overview of some key Vhost API functions:
>    Poll enqueue completion status from async data path. Completed packets
>    are returned to applications through ``pkts``.
>  
> +* ``rte_vhost_async_try_dequeue_burst(vid, queue_id, mbuf_pool, pkts, count, nr_inflight)``
> +
> +  Try to receive packets from the guest with offloading large packets
> +  to the DMA engine. Successfully dequeued packets are transfer
> +  completed and returned in ``pkts``. But there may be other packets
> +  that are sent from the guest but being transferred by the DMA engine,
> +  called in-flight packets. This function will return in-flight packets
> +  only after the DMA engine finishes transferring. The amount of
> +  in-flight packets by now is returned in ``nr_inflight``.
> +
>  Vhost-user Implementations
>  --------------------------
>  
> diff --git a/lib/vhost/rte_vhost_async.h b/lib/vhost/rte_vhost_async.h
> index 6faa31f5ad..58019408f1 100644
> --- a/lib/vhost/rte_vhost_async.h
> +++ b/lib/vhost/rte_vhost_async.h
> @@ -84,13 +84,21 @@ struct rte_vhost_async_channel_ops {
>  };
>  
>  /**
> - * inflight async packet information
> + * in-flight async packet information
>   */
> +struct async_nethdr {

Please prefix with rte_vhost_

> +	struct virtio_net_hdr hdr;
> +	bool valid;
> +};
> +
>  struct async_inflight_info {
>  	struct rte_mbuf *mbuf;
> -	uint16_t descs; /* num of descs inflight */
> +	union {
> +		uint16_t descs; /* num of descs in-flight */
> +		struct async_nethdr nethdr;
> +	};
>  	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
> -};
> +} __rte_cache_aligned;

Does it really need to be cache aligned?

>  
>  /**
>   *  dma channel feature bit definition
> @@ -193,4 +201,34 @@ __rte_experimental
>  uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
>  		struct rte_mbuf **pkts, uint16_t count);
>  
> +/**
> + * This function tries to receive packets from the guest with offloading
> + * large copies to the DMA engine. Successfully dequeued packets are
> + * transfer completed, either by the CPU or the DMA engine, and they are
> + * returned in "pkts". There may be other packets that are sent from
> + * the guest but being transferred by the DMA engine, called in-flight
> + * packets. The amount of in-flight packets by now is returned in
> + * "nr_inflight". This function will return in-flight packets only after
> + * the DMA engine finishes transferring.

I am not sure to understand that comment. Is it still "in-flight" if the
DMA transfer is completed?

Are we ensuring packets are not reordered with this way of working?

> + *
> + * @param vid
> + *  id of vhost device to dequeue data
> + * @param queue_id
> + *  queue id to dequeue data
> + * @param pkts
> + *  blank array to keep successfully dequeued packets
> + * @param count
> + *  size of the packet array
> + * @param nr_inflight
> + *  the amount of in-flight packets by now. If error occurred, its
> + *  value is set to -1.
> + * @return
> + *  num of successfully dequeued packets
> + */
> +__rte_experimental
> +uint16_t
> +rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id,
> +	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
> +	int *nr_inflight);
> +
>  #endif /* _RTE_VHOST_ASYNC_H_ */
> diff --git a/lib/vhost/version.map b/lib/vhost/version.map
> index 9103a23cd4..a320f889cd 100644
> --- a/lib/vhost/version.map
> +++ b/lib/vhost/version.map
> @@ -79,4 +79,7 @@ EXPERIMENTAL {
>  
>  	# added in 21.05
>  	rte_vhost_get_negotiated_protocol_features;
> +
> +	# added in 21.08
> +	rte_vhost_async_try_dequeue_burst;
>  };
> diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
> index b93482587c..52237e8600 100644
> --- a/lib/vhost/virtio_net.c
> +++ b/lib/vhost/virtio_net.c
> @@ -2673,6 +2673,32 @@ virtio_dev_pktmbuf_prep(struct virtio_net *dev, struct rte_mbuf *pkt,
>  	return -1;
>  }
>  
> +/*
> + * Allocate a host supported pktmbuf.
> + */
> +static __rte_always_inline struct rte_mbuf *
> +virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
> +			 uint32_t data_len)
> +{
> +	struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
> +
> +	if (unlikely(pkt == NULL)) {
> +		VHOST_LOG_DATA(ERR,
> +			"Failed to allocate memory for mbuf.\n");
> +		return NULL;
> +	}
> +
> +	if (virtio_dev_pktmbuf_prep(dev, pkt, data_len)) {
> +		/* Data doesn't fit into the buffer and the host supports
> +		 * only linear buffers
> +		 */
> +		rte_pktmbuf_free(pkt);
> +		return NULL;
> +	}
> +
> +	return pkt;
> +}
> +

I think you should be able to use rte_pktmbuf_alloc_bulk and
virtio_dev_pktmbuf_prep instead of re-introducing the function that was
removed by Balazs. It should help perf a bit.
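
Something along those lines could work inside the dequeue loop (just a
sketch reusing the locals of virtio_dev_tx_async_split(); the bulk array
name is made up here):

  struct rte_mbuf *pkts_prealloc[MAX_PKT_BURST];

  /* allocate the whole burst in one call ... */
  if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts_prealloc, count))
          goto out;

  for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
          struct rte_mbuf *pkt = pkts_prealloc[pkt_idx];

          /* ... then size each mbuf per descriptor chain; buf_len
           * comes from fill_vec_buf_split() as in the patch.
           */
          if (unlikely(virtio_dev_pktmbuf_prep(dev, pkt, buf_len)))
                  break;

          /* rest of the per-packet dequeue handling as in the patch */
  }
  /* unused mbufs left in pkts_prealloc have to be freed on exit */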

>  __rte_always_inline
>  static uint16_t
>  virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
> @@ -3147,3 +3173,578 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
>  
>  	return count;
>  }
> +
> +static __rte_always_inline int
> +async_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +		  struct buf_vector *buf_vec, uint16_t nr_vec,
> +		  struct rte_mbuf *m, struct rte_mempool *mbuf_pool,
> +		  struct iovec *src_iovec, struct iovec *dst_iovec,
> +		  struct rte_vhost_iov_iter *src_it,
> +		  struct rte_vhost_iov_iter *dst_it,
> +		  struct async_nethdr *nethdr,
> +		  bool legacy_ol_flags)
> +{
> +	uint64_t buf_addr, buf_iova;
> +	uint64_t mapped_len;
> +	uint32_t tlen = 0;
> +	uint32_t buf_avail, buf_offset, buf_len;
> +	uint32_t mbuf_avail, mbuf_offset;
> +	uint32_t cpy_len, cpy_threshold;
> +	/* A counter to avoid desc dead loop chain */
> +	uint16_t vec_idx = 0;
> +	int tvec_idx = 0;
> +	struct rte_mbuf *cur = m, *prev = m;
> +	struct virtio_net_hdr tmp_hdr;
> +	struct virtio_net_hdr *hdr = NULL;
> +	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
> +
> +	buf_addr = buf_vec[vec_idx].buf_addr;
> +	buf_len = buf_vec[vec_idx].buf_len;
> +	buf_iova = buf_vec[vec_idx].buf_iova;
> +
> +	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1))
> +		return -1;
> +
> +	cpy_threshold = vq->async_threshold;
> +
> +	if (virtio_net_with_host_offload(dev)) {
> +		if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
> +			/*
> +			 * No luck, the virtio-net header doesn't fit
> +			 * in a contiguous virtual area.
> +			 */
> +			copy_vnet_hdr_from_desc(&tmp_hdr, buf_vec);
> +			hdr = &tmp_hdr;
> +		} else {
> +			hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
> +		}
> +	}
> +
> +	/*
> +	 * A virtio driver normally uses at least 2 desc buffers
> +	 * for Tx: the first for storing the header, and others
> +	 * for storing the data.
> +	 */
> +	if (unlikely(buf_len < dev->vhost_hlen)) {
> +		buf_offset = dev->vhost_hlen - buf_len;
> +		vec_idx++;
> +		buf_addr = buf_vec[vec_idx].buf_addr;
> +		buf_len = buf_vec[vec_idx].buf_len;
> +		buf_avail  = buf_len - buf_offset;
> +	} else if (buf_len == dev->vhost_hlen) {
> +		if (unlikely(++vec_idx >= nr_vec))
> +			return -1;
> +		buf_addr = buf_vec[vec_idx].buf_addr;
> +		buf_len = buf_vec[vec_idx].buf_len;
> +
> +		buf_offset = 0;
> +		buf_avail = buf_len;
> +	} else {
> +		buf_offset = dev->vhost_hlen;
> +		buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
> +	}
> +
> +	PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
> +			(uint32_t)buf_avail, 0);
> +
> +	mbuf_offset = 0;
> +	mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
> +	while (1) {
> +		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
> +
> +		while (cpy_len && cpy_len >= cpy_threshold) {
> +			void *hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
> +						buf_iova + buf_offset, cpy_len,
> +						&mapped_len);
> +
> +			if (unlikely(!hpa || mapped_len < cpy_threshold))
> +				break;
> +
> +			async_fill_vec(src_iovec + tvec_idx, hpa,
> +				(size_t)mapped_len);
> +			async_fill_vec(dst_iovec + tvec_idx,
> +				(void *)(uintptr_t)rte_pktmbuf_iova_offset(cur,
> +							mbuf_offset),
> +				(size_t)mapped_len);
> +
> +			tvec_idx++;
> +			tlen += (uint32_t)mapped_len;
> +			cpy_len -= (uint32_t)mapped_len;
> +			mbuf_avail -= (uint32_t)mapped_len;
> +			mbuf_offset += (uint32_t)mapped_len;
> +			buf_avail -= (uint32_t)mapped_len;
> +			buf_offset += (uint32_t)mapped_len;
> +		}
> +
> +		if (cpy_len) {
> +			if (vq->batch_copy_nb_elems >= vq->size ||
> +				(hdr && cur == m)) {
> +				rte_memcpy(
> +					rte_pktmbuf_mtod_offset(cur, void *,
> +							mbuf_offset),
> +					(void *)((uintptr_t)(buf_addr +
> +								buf_offset)),
> +					cpy_len);
> +			} else {
> +				batch_copy[vq->batch_copy_nb_elems].dst =
> +					rte_pktmbuf_mtod_offset(cur, void *,
> +							mbuf_offset);
> +				batch_copy[vq->batch_copy_nb_elems].src =
> +					(void *)((uintptr_t)(buf_addr +
> +								buf_offset));
> +				batch_copy[vq->batch_copy_nb_elems].len =
> +					cpy_len;
> +				vq->batch_copy_nb_elems++;
> +			}
> +
> +			mbuf_avail  -= cpy_len;
> +			mbuf_offset += cpy_len;
> +			buf_avail -= cpy_len;
> +			buf_offset += cpy_len;
> +		}
> +
> +		/* This buf reaches to its end, get the next one */
> +		if (buf_avail == 0) {
> +			if (++vec_idx >= nr_vec)
> +				break;
> +
> +			buf_addr = buf_vec[vec_idx].buf_addr;
> +			buf_len = buf_vec[vec_idx].buf_len;
> +
> +			buf_offset = 0;
> +			buf_avail = buf_len;
> +
> +			PRINT_PACKET(dev, (uintptr_t)buf_addr,
> +					(uint32_t)buf_avail, 0);
> +		}
> +
> +		/*
> +		 * This mbuf reaches to its end, get a new one
> +		 * to hold more data.
> +		 */
> +		if (mbuf_avail == 0) {
> +			cur = rte_pktmbuf_alloc(mbuf_pool);
> +			if (unlikely(cur == NULL)) {
> +				VHOST_LOG_DATA(ERR, "Failed to "
> +					"allocate memory for mbuf.\n");
> +				return -1;
> +			}
> +
> +			prev->next = cur;
> +			prev->data_len = mbuf_offset;
> +			m->nb_segs += 1;
> +			m->pkt_len += mbuf_offset;
> +			prev = cur;
> +
> +			mbuf_offset = 0;
> +			mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
> +		}
> +	}
> +
> +	prev->data_len = mbuf_offset;
> +	m->pkt_len += mbuf_offset;
> +
> +	if (hdr && tlen) {
> +		nethdr->valid = true;
> +		nethdr->hdr = *hdr;
> +	} else if (hdr)
> +		vhost_dequeue_offload(hdr, m, legacy_ol_flags);
> +
> +	if (tlen) {
> +		async_fill_iter(src_it, tlen, src_iovec, tvec_idx);
> +		async_fill_iter(dst_it, tlen, dst_iovec, tvec_idx);
> +	} else
> +		src_it->count = 0;
> +
> +	return 0;
> +}
> +
> +static __rte_always_inline uint16_t
> +async_poll_dequeue_completed_split(struct virtio_net *dev,
> +		struct vhost_virtqueue *vq, uint16_t queue_id,
> +		struct rte_mbuf **pkts, uint16_t count, bool legacy_ol_flags)
> +{
> +	uint16_t n_pkts_cpl = 0, n_pkts_put = 0;
> +	uint16_t start_idx, pkt_idx, from;
> +	struct async_inflight_info *pkts_info;
> +
> +	pkt_idx = vq->async_pkts_idx & (vq->size - 1);
> +	pkts_info = vq->async_pkts_info;
> +	start_idx = virtio_dev_rx_async_get_info_idx(pkt_idx, vq->size,
> +			vq->async_pkts_inflight_n);
> +
> +	if (count > vq->async_last_pkts_n) {
> +		n_pkts_cpl = vq->async_ops.check_completed_copies(dev->vid,
> +			queue_id, 0, count - vq->async_last_pkts_n);
> +	}
> +
> +	n_pkts_cpl += vq->async_last_pkts_n;
> +	if (unlikely(n_pkts_cpl == 0))
> +		return 0;
> +
> +	n_pkts_put = RTE_MIN(count, n_pkts_cpl);
> +
> +	for (pkt_idx = 0; pkt_idx < n_pkts_put; pkt_idx++) {
> +		from = (start_idx + pkt_idx) & (vq->size - 1);
> +		pkts[pkt_idx] = pkts_info[from].mbuf;
> +
> +		if (pkts_info[from].nethdr.valid) {
> +			vhost_dequeue_offload(&pkts_info[from].nethdr.hdr,
> +					pkts[pkt_idx], legacy_ol_flags);
> +		}
> +	}
> +	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
> +
> +	if (n_pkts_put) {
> +		/* write back completed descs to used ring */
> +		write_back_completed_descs_split(vq, n_pkts_put);
> +		/* update used ring */
> +		__atomic_add_fetch(&vq->used->idx,
> +				n_pkts_put, __ATOMIC_RELEASE);
> +
> +		vq->async_pkts_inflight_n -= n_pkts_put;
> +	}
> +
> +	return n_pkts_put;
> +}
> +
> +static __rte_always_inline uint16_t
> +virtio_dev_tx_async_split(struct virtio_net *dev,
> +		struct vhost_virtqueue *vq, uint16_t queue_id,
> +		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
> +		uint16_t count, bool legacy_ol_flags)
> +{
> +	static bool allocerr_warned;
> +	uint16_t pkt_idx;
> +	uint16_t free_entries;
> +	uint16_t slot_idx = 0;
> +	uint16_t segs_await = 0;
> +	uint16_t nr_done_pkts = 0, nr_async_pkts = 0, nr_async_cmpl_pkts = 0;
> +	uint16_t nr_async_burst = 0;
> +	uint16_t pkt_err = 0;
> +	uint16_t iovec_idx = 0, it_idx = 0;
> +
> +	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
> +	struct iovec *vec_pool = vq->vec_pool;
> +	struct iovec *src_iovec = vec_pool;
> +	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
> +	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
> +	struct async_inflight_info *pkts_info = vq->async_pkts_info;
> +
> +	struct async_pkt_index {
> +		uint16_t last_avail_idx;
> +	} async_pkts_log[MAX_PKT_BURST];
> +
> +	/**
> +	 * The ordering between avail index and
> +	 * desc reads needs to be enforced.
> +	 */
> +	free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
> +			vq->last_avail_idx;
> +	if (free_entries == 0)
> +		goto out;
> +
> +	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
> +
> +	count = RTE_MIN(count, MAX_PKT_BURST);
> +	count = RTE_MIN(count, free_entries);
> +	VHOST_LOG_DATA(DEBUG, "(%d) about to dequeue %u buffers\n",
> +			dev->vid, count);
> +
> +	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
> +		uint16_t head_idx = 0;
> +		uint16_t nr_vec = 0;
> +		uint32_t buf_len;
> +		int err;
> +		struct buf_vector buf_vec[BUF_VECTOR_MAX];
> +		struct rte_mbuf *pkt;
> +
> +		if (unlikely(fill_vec_buf_split(dev, vq, vq->last_avail_idx,
> +						&nr_vec, buf_vec,
> +						&head_idx, &buf_len,
> +						VHOST_ACCESS_RO) < 0))
> +			break;
> +
> +		pkt = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
> +		if (unlikely(pkt == NULL)) {
> +			/**
> +			 * mbuf allocation fails for jumbo packets when external
> +			 * buffer allocation is not allowed and linear buffer
> +			 * is required. Drop this packet.
> +			 */
> +			if (!allocerr_warned) {
> +				VHOST_LOG_DATA(ERR,
> +					"Failed mbuf alloc of size %d from %s on %s.\n",
> +					buf_len, mbuf_pool->name, dev->ifname);
> +				allocerr_warned = true;
> +			}
> +			break;
> +		}
> +
> +		slot_idx = (vq->async_pkts_idx + nr_async_pkts) &
> +				(vq->size - 1);
> +		err = async_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkt,
> +				mbuf_pool, &src_iovec[iovec_idx],
> +				&dst_iovec[iovec_idx], &it_pool[it_idx],
> +				&it_pool[it_idx + 1],
> +				&pkts_info[slot_idx].nethdr, legacy_ol_flags);
> +		if (unlikely(err)) {
> +			rte_pktmbuf_free(pkt);
> +			if (!allocerr_warned) {
> +				VHOST_LOG_DATA(ERR,
> +					"Failed to copy desc to mbuf on %s.\n",
> +					dev->ifname);
> +				allocerr_warned = true;
> +			}
> +			break;
> +		}
> +
> +		if (it_pool[it_idx].count) {
> +			uint16_t to = vq->async_desc_idx_split & (vq->size - 1);
> +
> +			async_fill_desc(&tdes[nr_async_burst], &it_pool[it_idx],
> +				&it_pool[it_idx + 1]);
> +			pkts_info[slot_idx].mbuf = pkt;
> +			async_pkts_log[nr_async_pkts++].last_avail_idx =
> +				vq->last_avail_idx;
> +			nr_async_burst++;
> +			iovec_idx += it_pool[it_idx].nr_segs;
> +			it_idx += 2;
> +			segs_await += it_pool[it_idx].nr_segs;
> +
> +			/* keep used desc */
> +			vq->async_descs_split[to].id = head_idx;
> +			vq->async_descs_split[to].len = 0;
> +			vq->async_desc_idx_split++;
> +		} else {
> +			update_shadow_used_ring_split(vq, head_idx, 0);
> +			pkts[nr_done_pkts++] = pkt;
> +		}
> +
> +		vq->last_avail_idx++;
> +
> +		if (unlikely((nr_async_burst >= VHOST_ASYNC_BATCH_THRESHOLD) ||
> +					((VHOST_MAX_ASYNC_VEC >> 1) -
> +					 segs_await < BUF_VECTOR_MAX))) {
> +			uint16_t nr_pkts;
> +
> +			nr_pkts = vq->async_ops.transfer_data(dev->vid,
> +					queue_id, tdes, 0, nr_async_burst);
> +			src_iovec = vec_pool;
> +			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
> +			it_idx = 0;
> +			segs_await = 0;
> +			vq->async_pkts_inflight_n += nr_pkts;
> +
> +			if (unlikely(nr_pkts < nr_async_burst)) {
> +				pkt_err = nr_async_burst - nr_pkts;
> +				nr_async_burst = 0;
> +				break;
> +			}
> +			nr_async_burst = 0;
> +		}
> +	}
> +
> +	if (nr_async_burst) {
> +		uint32_t nr_pkts;
> +
> +		nr_pkts = vq->async_ops.transfer_data(dev->vid, queue_id,
> +				tdes, 0, nr_async_burst);
> +		vq->async_pkts_inflight_n += nr_pkts;
> +
> +		if (unlikely(nr_pkts < nr_async_burst))
> +			pkt_err = nr_async_burst - nr_pkts;
> +	}
> +
> +	do_data_copy_dequeue(vq);
> +
> +	if (unlikely(pkt_err)) {
> +		uint16_t nr_err_dma = pkt_err;
> +		uint16_t nr_err_sw;
> +
> +		nr_async_pkts -= nr_err_dma;
> +
> +		/**
> +		 * revert shadow used ring and free pktmbufs for
> +		 * CPU-copied pkts after the first DMA-error pkt.
> +		 */
> +		nr_err_sw = vq->last_avail_idx -
> +			async_pkts_log[nr_async_pkts].last_avail_idx -
> +			nr_err_dma;
> +		vq->shadow_used_idx -= nr_err_sw;
> +		while (nr_err_sw-- > 0)
> +			rte_pktmbuf_free(pkts[--nr_done_pkts]);
> +
> +		/**
> +		 * recover DMA-copy related structures and free pktmbufs
> +		 * for DMA-error pkts.
> +		 */
> +		vq->async_desc_idx_split -= nr_err_dma;
> +		while (nr_err_dma-- > 0) {
> +			rte_pktmbuf_free(
> +				pkts_info[slot_idx & (vq->size - 1)].mbuf);
> +			slot_idx--;
> +		}
> +
> +		/* recover available ring */
> +		vq->last_avail_idx =
> +			async_pkts_log[nr_async_pkts].last_avail_idx;
> +	}
> +
> +	vq->async_pkts_idx += nr_async_pkts;
> +
> +	if (likely(vq->shadow_used_idx))
> +		flush_shadow_used_ring_split(dev, vq);
> +
> +out:
> +	if (nr_done_pkts < count && vq->async_pkts_inflight_n > 0) {
> +		nr_async_cmpl_pkts = async_poll_dequeue_completed_split(dev, vq,
> +					queue_id, &pkts[nr_done_pkts],
> +					count - nr_done_pkts,
> +					legacy_ol_flags);
> +		nr_done_pkts += nr_async_cmpl_pkts;
> +	}
> +	if (likely(nr_done_pkts))
> +		vhost_vring_call_split(dev, vq);
> +
> +	return nr_done_pkts;
> +}
> +
> +__rte_noinline
> +static uint16_t
> +virtio_dev_tx_async_split_legacy(struct virtio_net *dev,
> +		struct vhost_virtqueue *vq, uint16_t queue_id,
> +		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
> +		uint16_t count)
> +{
> +	return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool,
> +				pkts, count, true);

I think we don't need to support legacy offload.
It may be better to have the Vhost example to support the compliant way,
what do you think?

> +}
> +
> +__rte_noinline
> +static uint16_t
> +virtio_dev_tx_async_split_compliant(struct virtio_net *dev,
> +		struct vhost_virtqueue *vq, uint16_t queue_id,
> +		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
> +		uint16_t count)
> +{
> +	return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool,
> +				pkts, count, false);
> +}
> +
> +uint16_t
> +rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id,
> +	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
> +	int *nr_inflight)
> +{
> +	struct virtio_net *dev;
> +	struct rte_mbuf *rarp_mbuf = NULL;
> +	struct vhost_virtqueue *vq;
> +	int16_t success = 1;
> +
> +	*nr_inflight = -1;
> +
> +	dev = get_device(vid);
> +	if (!dev)
> +		return 0;
> +
> +	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
> +		VHOST_LOG_DATA(ERR,
> +			"(%d) %s: built-in vhost net backend is disabled.\n",
> +			dev->vid, __func__);
> +		return 0;
> +	}
> +
> +	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
> +		VHOST_LOG_DATA(ERR,
> +			"(%d) %s: invalid virtqueue idx %d.\n",
> +			dev->vid, __func__, queue_id);
> +		return 0;
> +	}
> +
> +	vq = dev->virtqueue[queue_id];
> +
> +	if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
> +		return 0;
> +
> +	if (unlikely(vq->enabled == 0)) {
> +		count = 0;
> +		goto out_access_unlock;
> +	}
> +
> +	if (unlikely(!vq->async_registered)) {
> +		VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
> +			dev->vid, __func__, queue_id);
> +		count = 0;
> +		goto out_access_unlock;
> +	}
> +
> +	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
> +		vhost_user_iotlb_rd_lock(vq);
> +
> +	if (unlikely(vq->access_ok == 0))
> +		if (unlikely(vring_translate(dev, vq) < 0)) {
> +			count = 0;
> +			goto out_access_unlock;
> +		}
> +
> +	/*
> +	 * Construct a RARP broadcast packet, and inject it to the "pkts"
> +	 * array, to looks like that guest actually send such packet.
> +	 *
> +	 * Check user_send_rarp() for more information.
> +	 *
> +	 * broadcast_rarp shares a cacheline in the virtio_net structure
> +	 * with some fields that are accessed during enqueue and
> +	 * __atomic_compare_exchange_n causes a write if performed compare
> +	 * and exchange. This could result in false sharing between enqueue
> +	 * and dequeue.
> +	 *
> +	 * Prevent unnecessary false sharing by reading broadcast_rarp first
> +	 * and only performing compare and exchange if the read indicates it
> +	 * is likely to be set.
> +	 */
> +	if (unlikely(__atomic_load_n(&dev->broadcast_rarp, __ATOMIC_ACQUIRE) &&
> +			__atomic_compare_exchange_n(&dev->broadcast_rarp,
> +			&success, 0, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED))) {
> +
> +		rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
> +		if (rarp_mbuf == NULL) {
> +			VHOST_LOG_DATA(ERR, "Failed to make RARP packet.\n");
> +			count = 0;
> +			goto out;
> +		}
> +		count -= 1;
> +	}
> +
> +	if (unlikely(vq_is_packed(dev)))
> +		return 0;
> +
> +	if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
> +		count = virtio_dev_tx_async_split_legacy(dev, vq, queue_id,
> +				mbuf_pool, pkts, count);
> +	else
> +		count = virtio_dev_tx_async_split_compliant(dev, vq, queue_id,
> +				mbuf_pool, pkts, count);
> +
> +out:
> +	*nr_inflight = vq->async_pkts_inflight_n;
> +
> +	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
> +		vhost_user_iotlb_rd_unlock(vq);
> +
> +out_access_unlock:
> +	rte_spinlock_unlock(&vq->access_lock);
> +
> +	if (unlikely(rarp_mbuf != NULL)) {
> +		/*
> +		 * Inject it to the head of "pkts" array, so that switch's mac
> +		 * learning table will get updated first.
> +		 */
> +		memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *));
> +		pkts[0] = rarp_mbuf;
> +		count += 1;
> +	}
> +
> +	return count;
> +}
> 


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [dpdk-dev] [PATCH v5 4/4] examples/vhost: support vhost async dequeue data path
  2021-07-05 18:11   ` [dpdk-dev] [PATCH v5 4/4] examples/vhost: support vhost async dequeue data path Wenwu Ma
@ 2021-07-13 17:01     ` Maxime Coquelin
  0 siblings, 0 replies; 50+ messages in thread
From: Maxime Coquelin @ 2021-07-13 17:01 UTC (permalink / raw)
  To: Wenwu Ma, dev; +Cc: chenbo.xia, cheng1.jiang, jiayu.hu



On 7/5/21 8:11 PM, Wenwu Ma wrote:
> This patch is to add vhost async dequeue data-path in vhost sample.
> vswitch can leverage IOAT to accelerate vhost async dequeue data-path.

"
This patch adds support for async dequeue path to Vhost example.
"

> Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
> ---
>  doc/guides/sample_app_ug/vhost.rst |   9 +-
>  examples/vhost/ioat.c              |  61 ++++++++++---
>  examples/vhost/ioat.h              |  25 ++++++
>  examples/vhost/main.c              | 140 ++++++++++++++++++++---------
>  4 files changed, 177 insertions(+), 58 deletions(-)
> 
> diff --git a/doc/guides/sample_app_ug/vhost.rst b/doc/guides/sample_app_ug/vhost.rst
> index 9afde9c7f5..63dcf181e1 100644
> --- a/doc/guides/sample_app_ug/vhost.rst
> +++ b/doc/guides/sample_app_ug/vhost.rst
> @@ -169,9 +169,12 @@ demonstrates how to use the async vhost APIs. It's used in combination with dmas
>  **--dmas**
>  This parameter is used to specify the assigned DMA device of a vhost device.
>  Async vhost-user net driver will be used if --dmas is set. For example
> ---dmas [txd0@00:04.0,txd1@00:04.1] means use DMA channel 00:04.0 for vhost
> -device 0 enqueue operation and use DMA channel 00:04.1 for vhost device 1
> -enqueue operation.
> +--dmas [txd0@00:04.0,txd1@00:04.1,rxd0@00:04.2,rxd1@00:04.3] means use
> +DMA channel 00:04.0/00:04.2 for vhost device 0 enqueue/dequeue operation
> +and use DMA channel 00:04.1/00:04.3 for vhost device 1 enqueue/dequeue
> +operation. The index of the device corresponds to the socket file in order,
> +that means vhost device 0 is created through the first socket file, vhost
> +device 1 is created through the second socket file, and so on.
>  
>  Common Issues
>  -------------
> diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c
> index bf4e033bdb..a305100b47 100644
> --- a/examples/vhost/ioat.c
> +++ b/examples/vhost/ioat.c
> @@ -21,6 +21,8 @@ struct packet_tracker {
>  
>  struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
>  
> +int vid2socketid[MAX_VHOST_DEVICE];
> +
>  int
>  open_ioat(const char *value)
>  {
> @@ -29,7 +31,7 @@ open_ioat(const char *value)
>  	char *addrs = input;
>  	char *ptrs[2];
>  	char *start, *end, *substr;
> -	int64_t vid, vring_id;
> +	int64_t socketid, vring_id;
>  	struct rte_ioat_rawdev_config config;
>  	struct rte_rawdev_info info = { .dev_private = &config };
>  	char name[32];
> @@ -60,6 +62,8 @@ open_ioat(const char *value)
>  		goto out;
>  	}
>  	while (i < args_nr) {
> +		char *txd, *rxd;
> +		bool is_txd;
>  		char *arg_temp = dma_arg[i];
>  		uint8_t sub_nr;
>  		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
> @@ -68,27 +72,38 @@ open_ioat(const char *value)
>  			goto out;
>  		}
>  
> -		start = strstr(ptrs[0], "txd");
> -		if (start == NULL) {
> +		int async_flag;
> +		txd = strstr(ptrs[0], "txd");
> +		rxd = strstr(ptrs[0], "rxd");
> +		if (txd == NULL && rxd == NULL) {
>  			ret = -1;
>  			goto out;
> +		} else if (txd) {
> +			is_txd = true;
> +			start = txd;
> +			async_flag = ASYNC_RX_VHOST;

It's confusing to set the ASYNC_RX_VHOST flag when txd is present.
IIUC, this is about the enqueue path, so TX from the Vhost point of view.
So either name the flag ASYNC_TX_VHOST or ASYNC_ENQUEUE_VHOST?

> +		} else {
> +			is_txd = false;
> +			start = rxd;
> +			async_flag = ASYNC_TX_VHOST;
>  		}

What if both are set by the user? You might want to add a check.
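
For instance (a sketch on top of the hunk above), the ambiguous case
could be rejected before parsing the index:

  txd = strstr(ptrs[0], "txd");
  rxd = strstr(ptrs[0], "rxd");
  if (txd == NULL && rxd == NULL) {
          ret = -1;
          goto out;
  } else if (txd != NULL && rxd != NULL) {
          /* entry names both txd and rxd: ambiguous, reject it */
          ret = -1;
          goto out;
  }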



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [dpdk-dev] [PATCH v5 3/4] vhost: support async dequeue for split ring
  2021-07-13 14:30     ` Maxime Coquelin
@ 2021-07-14  6:50       ` Hu, Jiayu
  2021-07-15 13:18         ` Maxime Coquelin
  2021-07-16  8:14         ` David Marchand
  0 siblings, 2 replies; 50+ messages in thread
From: Hu, Jiayu @ 2021-07-14  6:50 UTC (permalink / raw)
  To: Maxime Coquelin, Ma, WenwuX, dev; +Cc: Xia, Chenbo, Jiang, Cheng1, Wang, YuanX

Hi Maxime,

Thanks for your comments. Replies are inline.

> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Tuesday, July 13, 2021 10:30 PM
> To: Ma, WenwuX <wenwux.ma@intel.com>; dev@dpdk.org
> Cc: Xia, Chenbo <chenbo.xia@intel.com>; Jiang, Cheng1
> <cheng1.jiang@intel.com>; Hu, Jiayu <jiayu.hu@intel.com>; Wang, YuanX
> <yuanx.wang@intel.com>
> Subject: Re: [PATCH v5 3/4] vhost: support async dequeue for split ring
> >  struct async_inflight_info {
> >  	struct rte_mbuf *mbuf;
> > -	uint16_t descs; /* num of descs inflight */
> > +	union {
> > +		uint16_t descs; /* num of descs in-flight */
> > +		struct async_nethdr nethdr;
> > +	};
> >  	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
> > -};
> > +} __rte_cache_aligned;
> 
> Does it really need to be cache aligned?

How about changing to 32-byte align? So a cacheline can hold 2 objects.
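
I.e. something like (sketch of the proposed layout only):

  struct async_inflight_info {
          struct rte_mbuf *mbuf;
          union {
                  uint16_t descs; /* num of descs in-flight */
                  struct async_nethdr nethdr;
          };
          uint16_t nr_buffers; /* num of buffers in-flight for packed ring */
  } __rte_aligned(32); /* 2 objects per 64B cacheline instead of 1 */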

> 
> >
> >  /**
> >   *  dma channel feature bit definition @@ -193,4 +201,34 @@
> > __rte_experimental  uint16_t rte_vhost_poll_enqueue_completed(int vid,
> > uint16_t queue_id,
> >  		struct rte_mbuf **pkts, uint16_t count);
> >
> > +/**
> > + * This function tries to receive packets from the guest with
> > +offloading
> > + * large copies to the DMA engine. Successfully dequeued packets are
> > + * transfer completed, either by the CPU or the DMA engine, and they
> > +are
> > + * returned in "pkts". There may be other packets that are sent from
> > + * the guest but being transferred by the DMA engine, called
> > +in-flight
> > + * packets. The amount of in-flight packets by now is returned in
> > + * "nr_inflight". This function will return in-flight packets only
> > +after
> > + * the DMA engine finishes transferring.
> 
> I am not sure to understand that comment. Is it still "in-flight" if the DMA
> transfer is completed?

"in-flight" means packet copies are submitted to the DMA, but the DMA hasn't
completed copies.

> 
> Are we ensuring packets are not reordered with this way of working?

There is a threshold that can be set by users. If it is set to 0, which
means all packet copies are assigned to the DMA, the packets sent from
the guest will not be reordered.
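
With the registration code from the vhost sample, that would only mean
the following (sketch; channel_ops filled in as in the patch):

  struct rte_vhost_async_features f;

  f.async_inorder = 1;
  /* threshold 0: every copy is offloaded to the DMA engine, so CPU
   * and DMA copies cannot interleave and packet order is kept.
   */
  f.async_threshold = 0;

  rte_vhost_async_channel_register(vid, VIRTIO_TXQ, f.intval, &channel_ops);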

> 
> > + *
> > + * @param vid
> > + *  id of vhost device to dequeue data
> > + * @param queue_id
> > + *  queue id to dequeue data
> > + * @param pkts
> > + *  blank array to keep successfully dequeued packets
> > + * @param count
> > + *  size of the packet array
> > + * @param nr_inflight
> > + *  the amount of in-flight packets by now. If error occurred, its
> > + *  value is set to -1.
> > + * @return
> > + *  num of successfully dequeued packets  */ __rte_experimental
> > +uint16_t rte_vhost_async_try_dequeue_burst(int vid, uint16_t
> > +queue_id,
> > +	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t
> count,
> > +	int *nr_inflight);
> > +
> >  #endif /* _RTE_VHOST_ASYNC_H_ */
> > diff --git a/lib/vhost/version.map b/lib/vhost/version.map index
> > 9103a23cd4..a320f889cd 100644
> > --- a/lib/vhost/version.map
> > +++ b/lib/vhost/version.map
> > @@ -79,4 +79,7 @@ EXPERIMENTAL {
> >
> >  	# added in 21.05
> >  	rte_vhost_get_negotiated_protocol_features;
> > +
> > +	# added in 21.08
> > +	rte_vhost_async_try_dequeue_burst;
> >  };
> > diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c index
> > b93482587c..52237e8600 100644
> > --- a/lib/vhost/virtio_net.c
> > +++ b/lib/vhost/virtio_net.c
> > @@ -2673,6 +2673,32 @@ virtio_dev_pktmbuf_prep(struct virtio_net *dev,
> struct rte_mbuf *pkt,
> >  	return -1;
> >  }
> >
> > +/*
> > + * Allocate a host supported pktmbuf.
> > + */
> > +static __rte_always_inline struct rte_mbuf *
> > +virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
> > +			 uint32_t data_len)
> > +{
> > +	struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
> > +
> > +	if (unlikely(pkt == NULL)) {
> > +		VHOST_LOG_DATA(ERR,
> > +			"Failed to allocate memory for mbuf.\n");
> > +		return NULL;
> > +	}
> > +
> > +	if (virtio_dev_pktmbuf_prep(dev, pkt, data_len)) {
> > +		/* Data doesn't fit into the buffer and the host supports
> > +		 * only linear buffers
> > +		 */
> > +		rte_pktmbuf_free(pkt);
> > +		return NULL;
> > +	}
> > +
> > +	return pkt;
> > +}
> > +
> 
> I think you should be able to use rte_pktmbuf_alloc_bulk and
> virtio_dev_pktmbuf_prep instead of re-introducing the function that was
> removed by Balazs. It should help perf a bit.
> 
> >  __rte_always_inline
> >  static uint16_t
> >  virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue
> > *vq, @@ -3147,3 +3173,578 @@ rte_vhost_dequeue_burst(int vid,
> uint16_t
> > queue_id,
> >
> >  	return count;
> >  }
> > +
> > +
> > +static __rte_always_inline uint16_t
> > +virtio_dev_tx_async_split(struct virtio_net *dev,
> > +		struct vhost_virtqueue *vq, uint16_t queue_id,
> > +		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
> > +		uint16_t count, bool legacy_ol_flags) {
> > +	static bool allocerr_warned;
> > +	uint16_t pkt_idx;
> > +	uint16_t free_entries;
> > +	uint16_t slot_idx = 0;
> > +	uint16_t segs_await = 0;
> > +	uint16_t nr_done_pkts = 0, nr_async_pkts = 0, nr_async_cmpl_pkts =
> 0;
> > +	uint16_t nr_async_burst = 0;
> > +	uint16_t pkt_err = 0;
> > +	uint16_t iovec_idx = 0, it_idx = 0;
> > +
> > +	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
> > +	struct iovec *vec_pool = vq->vec_pool;
> > +	struct iovec *src_iovec = vec_pool;
> > +	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
> > +	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
> > +	struct async_inflight_info *pkts_info = vq->async_pkts_info;
> > +
> > +	struct async_pkt_index {
> > +		uint16_t last_avail_idx;
> > +	} async_pkts_log[MAX_PKT_BURST];
> > +
> > +	/**
> > +	 * The ordering between avail index and
> > +	 * desc reads needs to be enforced.
> > +	 */
> > +	free_entries = __atomic_load_n(&vq->avail->idx,
> __ATOMIC_ACQUIRE) -
> > +			vq->last_avail_idx;
> > +	if (free_entries == 0)
> > +		goto out;
> > +
> > +	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size -
> > +1)]);
> > +
> > +	count = RTE_MIN(count, MAX_PKT_BURST);
> > +	count = RTE_MIN(count, free_entries);
> > +	VHOST_LOG_DATA(DEBUG, "(%d) about to dequeue %u buffers\n",
> > +			dev->vid, count);
> > +
> > +	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
> > +		uint16_t head_idx = 0;
> > +		uint16_t nr_vec = 0;
> > +		uint32_t buf_len;
> > +		int err;
> > +		struct buf_vector buf_vec[BUF_VECTOR_MAX];
> > +		struct rte_mbuf *pkt;
> > +
> > +		if (unlikely(fill_vec_buf_split(dev, vq, vq->last_avail_idx,
> > +						&nr_vec, buf_vec,
> > +						&head_idx, &buf_len,
> > +						VHOST_ACCESS_RO) < 0))
> > +			break;
> > +
> > +		pkt = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
> > +		if (unlikely(pkt == NULL)) {
> > +			/**
> > +			 * mbuf allocation fails for jumbo packets when
> external
> > +			 * buffer allocation is not allowed and linear buffer
> > +			 * is required. Drop this packet.
> > +			 */
> > +			if (!allocerr_warned) {
> > +				VHOST_LOG_DATA(ERR,
> > +					"Failed mbuf alloc of size %d from %s
> on %s.\n",
> > +					buf_len, mbuf_pool->name, dev-
> >ifname);
> > +				allocerr_warned = true;
> > +			}
> > +			break;
> > +		}
> > +
> > +		slot_idx = (vq->async_pkts_idx + nr_async_pkts) &
> > +				(vq->size - 1);
> > +		err = async_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkt,
> > +				mbuf_pool, &src_iovec[iovec_idx],
> > +				&dst_iovec[iovec_idx], &it_pool[it_idx],
> > +				&it_pool[it_idx + 1],
> > +				&pkts_info[slot_idx].nethdr, legacy_ol_flags);
> > +		if (unlikely(err)) {
> > +			rte_pktmbuf_free(pkt);
> > +			if (!allocerr_warned) {
> > +				VHOST_LOG_DATA(ERR,
> > +					"Failed to copy desc to mbuf
> on %s.\n",
> > +					dev->ifname);
> > +				allocerr_warned = true;
> > +			}
> > +			break;
> > +		}
> > +
> > +		if (it_pool[it_idx].count) {
> > +			uint16_t to = vq->async_desc_idx_split & (vq->size -
> 1);
> > +
> > +			async_fill_desc(&tdes[nr_async_burst],
> &it_pool[it_idx],
> > +				&it_pool[it_idx + 1]);
> > +			pkts_info[slot_idx].mbuf = pkt;
> > +			async_pkts_log[nr_async_pkts++].last_avail_idx =
> > +				vq->last_avail_idx;
> > +			nr_async_burst++;
> > +			iovec_idx += it_pool[it_idx].nr_segs;
> > +			it_idx += 2;
> > +			segs_await += it_pool[it_idx].nr_segs;
> > +
> > +			/* keep used desc */
> > +			vq->async_descs_split[to].id = head_idx;
> > +			vq->async_descs_split[to].len = 0;
> > +			vq->async_desc_idx_split++;
> > +		} else {
> > +			update_shadow_used_ring_split(vq, head_idx, 0);
> > +			pkts[nr_done_pkts++] = pkt;
> > +		}
> > +
> > +		vq->last_avail_idx++;
> > +
> > +		if (unlikely((nr_async_burst >=
> VHOST_ASYNC_BATCH_THRESHOLD) ||
> > +					((VHOST_MAX_ASYNC_VEC >> 1) -
> > +					 segs_await < BUF_VECTOR_MAX))) {
> > +			uint16_t nr_pkts;
> > +
> > +			nr_pkts = vq->async_ops.transfer_data(dev->vid,
> > +					queue_id, tdes, 0, nr_async_burst);
> > +			src_iovec = vec_pool;
> > +			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >>
> 1);
> > +			it_idx = 0;
> > +			segs_await = 0;
> > +			vq->async_pkts_inflight_n += nr_pkts;
> > +
> > +			if (unlikely(nr_pkts < nr_async_burst)) {
> > +				pkt_err = nr_async_burst - nr_pkts;
> > +				nr_async_burst = 0;
> > +				break;
> > +			}
> > +			nr_async_burst = 0;
> > +		}
> > +	}
> > +
> > +	if (nr_async_burst) {
> > +		uint32_t nr_pkts;
> > +
> > +		nr_pkts = vq->async_ops.transfer_data(dev->vid, queue_id,
> > +				tdes, 0, nr_async_burst);
> > +		vq->async_pkts_inflight_n += nr_pkts;
> > +
> > +		if (unlikely(nr_pkts < nr_async_burst))
> > +			pkt_err = nr_async_burst - nr_pkts;
> > +	}
> > +
> > +	do_data_copy_dequeue(vq);
> > +
> > +	if (unlikely(pkt_err)) {
> > +		uint16_t nr_err_dma = pkt_err;
> > +		uint16_t nr_err_sw;
> > +
> > +		nr_async_pkts -= nr_err_dma;
> > +
> > +		/**
> > +		 * revert shadow used ring and free pktmbufs for
> > +		 * CPU-copied pkts after the first DMA-error pkt.
> > +		 */
> > +		nr_err_sw = vq->last_avail_idx -
> > +			async_pkts_log[nr_async_pkts].last_avail_idx -
> > +			nr_err_dma;
> > +		vq->shadow_used_idx -= nr_err_sw;
> > +		while (nr_err_sw-- > 0)
> > +			rte_pktmbuf_free(pkts[--nr_done_pkts]);
> > +
> > +		/**
> > +		 * recover DMA-copy related structures and free pktmbufs
> > +		 * for DMA-error pkts.
> > +		 */
> > +		vq->async_desc_idx_split -= nr_err_dma;
> > +		while (nr_err_dma-- > 0) {
> > +			rte_pktmbuf_free(
> > +				pkts_info[slot_idx & (vq->size - 1)].mbuf);
> > +			slot_idx--;
> > +		}
> > +
> > +		/* recover available ring */
> > +		vq->last_avail_idx =
> > +			async_pkts_log[nr_async_pkts].last_avail_idx;
> > +	}
> > +
> > +	vq->async_pkts_idx += nr_async_pkts;
> > +
> > +	if (likely(vq->shadow_used_idx))
> > +		flush_shadow_used_ring_split(dev, vq);
> > +
> > +out:
> > +	if (nr_done_pkts < count && vq->async_pkts_inflight_n > 0) {
> > +		nr_async_cmpl_pkts =
> async_poll_dequeue_completed_split(dev, vq,
> > +					queue_id, &pkts[nr_done_pkts],
> > +					count - nr_done_pkts,
> > +					legacy_ol_flags);
> > +		nr_done_pkts += nr_async_cmpl_pkts;
> > +	}
> > +	if (likely(nr_done_pkts))
> > +		vhost_vring_call_split(dev, vq);
> > +
> > +	return nr_done_pkts;
> > +}
> > +
> > +__rte_noinline
> > +static uint16_t
> > +virtio_dev_tx_async_split_legacy(struct virtio_net *dev,
> > +		struct vhost_virtqueue *vq, uint16_t queue_id,
> > +		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
> > +		uint16_t count)
> > +{
> > +	return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool,
> > +				pkts, count, true);
> 
> I think we don't need to support legacy offload.
> It may be better to have the Vhost example to support the compliant way,
> what do you think?

The legacy offload is disabled by RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS,
and compliant mode is disabled by default. If we don't implement legacy mode in
the async dequeue code, how to handle the case that users don't set the flag?

Thanks,
Jiayu

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [dpdk-dev] [PATCH v5 3/4] vhost: support async dequeue for split ring
  2021-07-14  6:50       ` Hu, Jiayu
@ 2021-07-15 13:18         ` Maxime Coquelin
  2021-07-16  1:10           ` Hu, Jiayu
  2021-07-16  8:14         ` David Marchand
  1 sibling, 1 reply; 50+ messages in thread
From: Maxime Coquelin @ 2021-07-15 13:18 UTC (permalink / raw)
  To: Hu, Jiayu, Ma, WenwuX, dev; +Cc: Xia, Chenbo, Jiang, Cheng1, Wang, YuanX



On 7/14/21 8:50 AM, Hu, Jiayu wrote:
> Hi Maxime,
> 
> Thanks for your comments. Replies are inline.
> 
>> -----Original Message-----
>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>> Sent: Tuesday, July 13, 2021 10:30 PM
>> To: Ma, WenwuX <wenwux.ma@intel.com>; dev@dpdk.org
>> Cc: Xia, Chenbo <chenbo.xia@intel.com>; Jiang, Cheng1
>> <cheng1.jiang@intel.com>; Hu, Jiayu <jiayu.hu@intel.com>; Wang, YuanX
>> <yuanx.wang@intel.com>
>> Subject: Re: [PATCH v5 3/4] vhost: support async dequeue for split ring
>>>  struct async_inflight_info {
>>>  	struct rte_mbuf *mbuf;
>>> -	uint16_t descs; /* num of descs inflight */
>>> +	union {
>>> +		uint16_t descs; /* num of descs in-flight */
>>> +		struct async_nethdr nethdr;
>>> +	};
>>>  	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
>>> -};
>>> +} __rte_cache_aligned;
>>
>> Does it really need to be cache aligned?
> 
> How about changing to 32-byte align? So a cacheline can hold 2 objects.

Or not forcing any alignment at all? Would there really be a performance
regression?

>>
>>>
>>>  /**
>>>   *  dma channel feature bit definition @@ -193,4 +201,34 @@
>>> __rte_experimental  uint16_t rte_vhost_poll_enqueue_completed(int vid,
>>> uint16_t queue_id,
>>>  		struct rte_mbuf **pkts, uint16_t count);
>>>
>>> +/**
>>> + * This function tries to receive packets from the guest with
>>> +offloading
>>> + * large copies to the DMA engine. Successfully dequeued packets are
>>> + * transfer completed, either by the CPU or the DMA engine, and they
>>> +are
>>> + * returned in "pkts". There may be other packets that are sent from
>>> + * the guest but being transferred by the DMA engine, called
>>> +in-flight
>>> + * packets. The amount of in-flight packets by now is returned in
>>> + * "nr_inflight". This function will return in-flight packets only
>>> +after
>>> + * the DMA engine finishes transferring.
>>
>> I am not sure to understand that comment. Is it still "in-flight" if the DMA
>> transfer is completed?
> 
> "in-flight" means packet copies are submitted to the DMA, but the DMA hasn't
> completed copies.
> 
>>
>> Are we ensuring packets are not reordered with this way of working?
> 
> There is a threshold that can be set by users. If it is set to 0, which
> means all packet copies are assigned to the DMA, the packets sent from
> the guest will not be reordered.

Reordering packets is bad in my opinion. We cannot expect the user to
know that he should set the threshold to zero to have packets ordered.

Maybe we should consider not having threshold, and so have every
descriptors handled either by the CPU (sync datapath) or by the DMA
(async datapath). Doing so would simplify the code a lot, and would make
performance/latency more predictable.

I understand that we might not get the best performance for every packet
size doing that, but that may be a tradeoff we would make to have the
feature maintainable and easily useable by the user.
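
For what it's worth, a rough sketch of what the no-threshold model looks like from
the application side, reusing the two enqueue calls already present in
examples/vhost (the choice is made once per burst instead of per copy; names and
error handling are illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <rte_mbuf.h>
#include <rte_vhost.h>
#include <rte_vhost_async.h>

#define MAX_PKT_BURST 32	/* as in examples/vhost */

static uint16_t
enqueue_burst(int vid, uint16_t queue_id, bool async_registered,
	      struct rte_mbuf **pkts, uint16_t count)
{
	/* caller keeps count <= MAX_PKT_BURST */
	if (async_registered) {
		uint32_t cpu_cpl_nr = 0;
		struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
		uint16_t n;

		/* every copy of this burst goes through the DMA channel */
		n = rte_vhost_submit_enqueue_burst(vid, queue_id, pkts, count,
						   m_cpu_cpl, &cpu_cpl_nr);
		if (cpu_cpl_nr)
			rte_pktmbuf_free_bulk(m_cpu_cpl, cpu_cpl_nr);
		return n;
	}

	/* every copy of this burst is done synchronously by the CPU */
	return rte_vhost_enqueue_burst(vid, queue_id, pkts, count);
}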

>>
>>> + *
>>> + * @param vid
>>> + *  id of vhost device to dequeue data
>>> + * @param queue_id
>>> + *  queue id to dequeue data
>>> + * @param pkts
>>> + *  blank array to keep successfully dequeued packets
>>> + * @param count
>>> + *  size of the packet array
>>> + * @param nr_inflight
>>> + *  the amount of in-flight packets by now. If error occurred, its
>>> + *  value is set to -1.
>>> + * @return
>>> + *  num of successfully dequeued packets  */ __rte_experimental
>>> +uint16_t rte_vhost_async_try_dequeue_burst(int vid, uint16_t
>>> +queue_id,
>>> +	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t
>> count,
>>> +	int *nr_inflight);
>>> +
>>>  #endif /* _RTE_VHOST_ASYNC_H_ */
>>> diff --git a/lib/vhost/version.map b/lib/vhost/version.map index
>>> 9103a23cd4..a320f889cd 100644
>>> --- a/lib/vhost/version.map
>>> +++ b/lib/vhost/version.map
>>> @@ -79,4 +79,7 @@ EXPERIMENTAL {
>>>
>>>  	# added in 21.05
>>>  	rte_vhost_get_negotiated_protocol_features;
>>> +
>>> +	# added in 21.08
>>> +	rte_vhost_async_try_dequeue_burst;
>>>  };
>>> diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c index
>>> b93482587c..52237e8600 100644
>>> --- a/lib/vhost/virtio_net.c
>>> +++ b/lib/vhost/virtio_net.c
>>> @@ -2673,6 +2673,32 @@ virtio_dev_pktmbuf_prep(struct virtio_net *dev,
>> struct rte_mbuf *pkt,
>>>  	return -1;
>>>  }
>>>
>>> +/*
>>> + * Allocate a host supported pktmbuf.
>>> + */
>>> +static __rte_always_inline struct rte_mbuf *
>>> +virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
>>> +			 uint32_t data_len)
>>> +{
>>> +	struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
>>> +
>>> +	if (unlikely(pkt == NULL)) {
>>> +		VHOST_LOG_DATA(ERR,
>>> +			"Failed to allocate memory for mbuf.\n");
>>> +		return NULL;
>>> +	}
>>> +
>>> +	if (virtio_dev_pktmbuf_prep(dev, pkt, data_len)) {
>>> +		/* Data doesn't fit into the buffer and the host supports
>>> +		 * only linear buffers
>>> +		 */
>>> +		rte_pktmbuf_free(pkt);
>>> +		return NULL;
>>> +	}
>>> +
>>> +	return pkt;
>>> +}
>>> +
>>
>> I think you should be able to use rte_pktmbuf_alloc_bulk and
>> virtio_dev_pktmbuf_prep instead of re-introducing the function that was
>> removed by Balazs. It should help perf a bit.
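
A sketch of that suggestion: allocate the whole burst with rte_pktmbuf_alloc_bulk()
and run virtio_dev_pktmbuf_prep() per packet once each descriptor chain's length is
known. This assumes it sits in lib/vhost/virtio_net.c next to
virtio_dev_pktmbuf_prep(); the helper name and the pre-computed length array are
illustrative:

static __rte_always_inline uint16_t
virtio_dev_pktmbuf_bulk_prep(struct virtio_net *dev, struct rte_mempool *mp,
		struct rte_mbuf **pkts, uint16_t count,
		const uint32_t *buf_lens)
{
	uint16_t i;

	/* one mempool operation for the whole burst */
	if (rte_pktmbuf_alloc_bulk(mp, pkts, count))
		return 0;

	for (i = 0; i < count; i++) {
		if (virtio_dev_pktmbuf_prep(dev, pkts[i], buf_lens[i])) {
			/* data doesn't fit and only linear buffers are
			 * supported: give back the unused tail of the burst */
			rte_pktmbuf_free_bulk(&pkts[i], count - i);
			break;
		}
	}

	return i;
}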
>>
>>>  __rte_always_inline
>>>  static uint16_t
>>>  virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue
>>> *vq, @@ -3147,3 +3173,578 @@ rte_vhost_dequeue_burst(int vid,
>> uint16_t
>>> queue_id,
>>>
>>>  	return count;
>>>  }
>>> +
>>> +
>>> +static __rte_always_inline uint16_t
>>> +virtio_dev_tx_async_split(struct virtio_net *dev,
>>> +		struct vhost_virtqueue *vq, uint16_t queue_id,
>>> +		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
>>> +		uint16_t count, bool legacy_ol_flags) {
>>> +	static bool allocerr_warned;
>>> +	uint16_t pkt_idx;
>>> +	uint16_t free_entries;
>>> +	uint16_t slot_idx = 0;
>>> +	uint16_t segs_await = 0;
>>> +	uint16_t nr_done_pkts = 0, nr_async_pkts = 0, nr_async_cmpl_pkts =
>> 0;
>>> +	uint16_t nr_async_burst = 0;
>>> +	uint16_t pkt_err = 0;
>>> +	uint16_t iovec_idx = 0, it_idx = 0;
>>> +
>>> +	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
>>> +	struct iovec *vec_pool = vq->vec_pool;
>>> +	struct iovec *src_iovec = vec_pool;
>>> +	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
>>> +	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
>>> +	struct async_inflight_info *pkts_info = vq->async_pkts_info;
>>> +
>>> +	struct async_pkt_index {
>>> +		uint16_t last_avail_idx;
>>> +	} async_pkts_log[MAX_PKT_BURST];
>>> +
>>> +	/**
>>> +	 * The ordering between avail index and
>>> +	 * desc reads needs to be enforced.
>>> +	 */
>>> +	free_entries = __atomic_load_n(&vq->avail->idx,
>> __ATOMIC_ACQUIRE) -
>>> +			vq->last_avail_idx;
>>> +	if (free_entries == 0)
>>> +		goto out;
>>> +
>>> +	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size -
>>> +1)]);
>>> +
>>> +	count = RTE_MIN(count, MAX_PKT_BURST);
>>> +	count = RTE_MIN(count, free_entries);
>>> +	VHOST_LOG_DATA(DEBUG, "(%d) about to dequeue %u buffers\n",
>>> +			dev->vid, count);
>>> +
>>> +	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
>>> +		uint16_t head_idx = 0;
>>> +		uint16_t nr_vec = 0;
>>> +		uint32_t buf_len;
>>> +		int err;
>>> +		struct buf_vector buf_vec[BUF_VECTOR_MAX];
>>> +		struct rte_mbuf *pkt;
>>> +
>>> +		if (unlikely(fill_vec_buf_split(dev, vq, vq->last_avail_idx,
>>> +						&nr_vec, buf_vec,
>>> +						&head_idx, &buf_len,
>>> +						VHOST_ACCESS_RO) < 0))
>>> +			break;
>>> +
>>> +		pkt = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
>>> +		if (unlikely(pkt == NULL)) {
>>> +			/**
>>> +			 * mbuf allocation fails for jumbo packets when
>> external
>>> +			 * buffer allocation is not allowed and linear buffer
>>> +			 * is required. Drop this packet.
>>> +			 */
>>> +			if (!allocerr_warned) {
>>> +				VHOST_LOG_DATA(ERR,
>>> +					"Failed mbuf alloc of size %d from %s
>> on %s.\n",
>>> +					buf_len, mbuf_pool->name, dev-
>>> ifname);
>>> +				allocerr_warned = true;
>>> +			}
>>> +			break;
>>> +		}
>>> +
>>> +		slot_idx = (vq->async_pkts_idx + nr_async_pkts) &
>>> +				(vq->size - 1);
>>> +		err = async_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkt,
>>> +				mbuf_pool, &src_iovec[iovec_idx],
>>> +				&dst_iovec[iovec_idx], &it_pool[it_idx],
>>> +				&it_pool[it_idx + 1],
>>> +				&pkts_info[slot_idx].nethdr, legacy_ol_flags);
>>> +		if (unlikely(err)) {
>>> +			rte_pktmbuf_free(pkt);
>>> +			if (!allocerr_warned) {
>>> +				VHOST_LOG_DATA(ERR,
>>> +					"Failed to copy desc to mbuf
>> on %s.\n",
>>> +					dev->ifname);
>>> +				allocerr_warned = true;
>>> +			}
>>> +			break;
>>> +		}
>>> +
>>> +		if (it_pool[it_idx].count) {
>>> +			uint16_t to = vq->async_desc_idx_split & (vq->size -
>> 1);
>>> +
>>> +			async_fill_desc(&tdes[nr_async_burst],
>> &it_pool[it_idx],
>>> +				&it_pool[it_idx + 1]);
>>> +			pkts_info[slot_idx].mbuf = pkt;
>>> +			async_pkts_log[nr_async_pkts++].last_avail_idx =
>>> +				vq->last_avail_idx;
>>> +			nr_async_burst++;
>>> +			iovec_idx += it_pool[it_idx].nr_segs;
>>> +			it_idx += 2;
>>> +			segs_await += it_pool[it_idx].nr_segs;
>>> +
>>> +			/* keep used desc */
>>> +			vq->async_descs_split[to].id = head_idx;
>>> +			vq->async_descs_split[to].len = 0;
>>> +			vq->async_desc_idx_split++;
>>> +		} else {
>>> +			update_shadow_used_ring_split(vq, head_idx, 0);
>>> +			pkts[nr_done_pkts++] = pkt;
>>> +		}
>>> +
>>> +		vq->last_avail_idx++;
>>> +
>>> +		if (unlikely((nr_async_burst >=
>> VHOST_ASYNC_BATCH_THRESHOLD) ||
>>> +					((VHOST_MAX_ASYNC_VEC >> 1) -
>>> +					 segs_await < BUF_VECTOR_MAX))) {
>>> +			uint16_t nr_pkts;
>>> +
>>> +			nr_pkts = vq->async_ops.transfer_data(dev->vid,
>>> +					queue_id, tdes, 0, nr_async_burst);
>>> +			src_iovec = vec_pool;
>>> +			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >>
>> 1);
>>> +			it_idx = 0;
>>> +			segs_await = 0;
>>> +			vq->async_pkts_inflight_n += nr_pkts;
>>> +
>>> +			if (unlikely(nr_pkts < nr_async_burst)) {
>>> +				pkt_err = nr_async_burst - nr_pkts;
>>> +				nr_async_burst = 0;
>>> +				break;
>>> +			}
>>> +			nr_async_burst = 0;
>>> +		}
>>> +	}
>>> +
>>> +	if (nr_async_burst) {
>>> +		uint32_t nr_pkts;
>>> +
>>> +		nr_pkts = vq->async_ops.transfer_data(dev->vid, queue_id,
>>> +				tdes, 0, nr_async_burst);
>>> +		vq->async_pkts_inflight_n += nr_pkts;
>>> +
>>> +		if (unlikely(nr_pkts < nr_async_burst))
>>> +			pkt_err = nr_async_burst - nr_pkts;
>>> +	}
>>> +
>>> +	do_data_copy_dequeue(vq);
>>> +
>>> +	if (unlikely(pkt_err)) {
>>> +		uint16_t nr_err_dma = pkt_err;
>>> +		uint16_t nr_err_sw;
>>> +
>>> +		nr_async_pkts -= nr_err_dma;
>>> +
>>> +		/**
>>> +		 * revert shadow used ring and free pktmbufs for
>>> +		 * CPU-copied pkts after the first DMA-error pkt.
>>> +		 */
>>> +		nr_err_sw = vq->last_avail_idx -
>>> +			async_pkts_log[nr_async_pkts].last_avail_idx -
>>> +			nr_err_dma;
>>> +		vq->shadow_used_idx -= nr_err_sw;
>>> +		while (nr_err_sw-- > 0)
>>> +			rte_pktmbuf_free(pkts[--nr_done_pkts]);
>>> +
>>> +		/**
>>> +		 * recover DMA-copy related structures and free pktmbufs
>>> +		 * for DMA-error pkts.
>>> +		 */
>>> +		vq->async_desc_idx_split -= nr_err_dma;
>>> +		while (nr_err_dma-- > 0) {
>>> +			rte_pktmbuf_free(
>>> +				pkts_info[slot_idx & (vq->size - 1)].mbuf);
>>> +			slot_idx--;
>>> +		}
>>> +
>>> +		/* recover available ring */
>>> +		vq->last_avail_idx =
>>> +			async_pkts_log[nr_async_pkts].last_avail_idx;
>>> +	}
>>> +
>>> +	vq->async_pkts_idx += nr_async_pkts;
>>> +
>>> +	if (likely(vq->shadow_used_idx))
>>> +		flush_shadow_used_ring_split(dev, vq);
>>> +
>>> +out:
>>> +	if (nr_done_pkts < count && vq->async_pkts_inflight_n > 0) {
>>> +		nr_async_cmpl_pkts =
>> async_poll_dequeue_completed_split(dev, vq,
>>> +					queue_id, &pkts[nr_done_pkts],
>>> +					count - nr_done_pkts,
>>> +					legacy_ol_flags);
>>> +		nr_done_pkts += nr_async_cmpl_pkts;
>>> +	}
>>> +	if (likely(nr_done_pkts))
>>> +		vhost_vring_call_split(dev, vq);
>>> +
>>> +	return nr_done_pkts;
>>> +}
>>> +
>>> +__rte_noinline
>>> +static uint16_t
>>> +virtio_dev_tx_async_split_legacy(struct virtio_net *dev,
>>> +		struct vhost_virtqueue *vq, uint16_t queue_id,
>>> +		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
>>> +		uint16_t count)
>>> +{
>>> +	return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool,
>>> +				pkts, count, true);
>>
>> I think we don't need to support legacy offload.
>> It may be better to have the Vhost example to support the compliant way,
>> what do you think?
> 
> Legacy offload is only disabled when RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS is set,
> and compliant mode is disabled by default. If we don't implement legacy mode in
> the async dequeue code, how do we handle the case where users don't set the flag?

Ok, that's a valid point. We seem to have no choice but to support it.
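
For context, a minimal sketch of how an application opts in to the compliant
behaviour today, with the flag names as declared in rte_vhost.h (whether the
example should do this by default is exactly the open question above):

#include <rte_vhost.h>

static int
register_compliant_socket(const char *path)
{
	/* RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS selects the compliant
	 * offload path; leaving it out keeps the legacy behaviour. */
	uint64_t flags = RTE_VHOST_USER_ASYNC_COPY |
			 RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;

	return rte_vhost_driver_register(path, flags);
}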

Thanks,
Maxime

> Thanks,
> Jiayu
> 


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [dpdk-dev] [PATCH v5 3/4] vhost: support async dequeue for split ring
  2021-07-15 13:18         ` Maxime Coquelin
@ 2021-07-16  1:10           ` Hu, Jiayu
  2021-07-16  7:45             ` Maxime Coquelin
  0 siblings, 1 reply; 50+ messages in thread
From: Hu, Jiayu @ 2021-07-16  1:10 UTC (permalink / raw)
  To: Maxime Coquelin, Ma, WenwuX, dev; +Cc: Xia, Chenbo, Jiang, Cheng1, Wang, YuanX

Hi, Maxime,

> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Thursday, July 15, 2021 9:18 PM
> To: Hu, Jiayu <jiayu.hu@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> dev@dpdk.org
> Cc: Xia, Chenbo <chenbo.xia@intel.com>; Jiang, Cheng1
> <cheng1.jiang@intel.com>; Wang, YuanX <yuanx.wang@intel.com>
> Subject: Re: [PATCH v5 3/4] vhost: support async dequeue for split ring
> 
> 
> 
> On 7/14/21 8:50 AM, Hu, Jiayu wrote:
> > Hi Maxime,
> >
> > Thanks for your comments. Applies are inline.
> >
> >> -----Original Message-----
> >> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >> Sent: Tuesday, July 13, 2021 10:30 PM
> >> To: Ma, WenwuX <wenwux.ma@intel.com>; dev@dpdk.org
> >> Cc: Xia, Chenbo <chenbo.xia@intel.com>; Jiang, Cheng1
> >> <cheng1.jiang@intel.com>; Hu, Jiayu <jiayu.hu@intel.com>; Wang, YuanX
> >> <yuanx.wang@intel.com>
> >> Subject: Re: [PATCH v5 3/4] vhost: support async dequeue for split
> >> ring
> >>>  struct async_inflight_info {
> >>>  	struct rte_mbuf *mbuf;
> >>> -	uint16_t descs; /* num of descs inflight */
> >>> +	union {
> >>> +		uint16_t descs; /* num of descs in-flight */
> >>> +		struct async_nethdr nethdr;
> >>> +	};
> >>>  	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
> >>> -};
> >>> +} __rte_cache_aligned;
> >>
> >> Does it really need to be cache aligned?
> >
> > How about changing to 32-byte align? So a cacheline can hold 2 objects.
> 
> Or not forcing any alignment at all? Would there really be a performance
> regression?
> 
> >>
> >>>
> >>>  /**
> >>>   *  dma channel feature bit definition @@ -193,4 +201,34 @@
> >>> __rte_experimental  uint16_t rte_vhost_poll_enqueue_completed(int
> >>> vid, uint16_t queue_id,
> >>>  		struct rte_mbuf **pkts, uint16_t count);
> >>>
> >>> +/**
> >>> + * This function tries to receive packets from the guest with
> >>> +offloading
> >>> + * large copies to the DMA engine. Successfully dequeued packets
> >>> +are
> >>> + * transfer completed, either by the CPU or the DMA engine, and
> >>> +they are
> >>> + * returned in "pkts". There may be other packets that are sent
> >>> +from
> >>> + * the guest but being transferred by the DMA engine, called
> >>> +in-flight
> >>> + * packets. The amount of in-flight packets by now is returned in
> >>> + * "nr_inflight". This function will return in-flight packets only
> >>> +after
> >>> + * the DMA engine finishes transferring.
> >>
> >> I am not sure to understand that comment. Is it still "in-flight" if
> >> the DMA transfer is completed?
> >
> > "in-flight" means packet copies are submitted to the DMA, but the DMA
> > hasn't completed copies.
> >
> >>
> >> Are we ensuring packets are not reordered with this way of working?
> >
> > There is a threshold can be set by users. If set it to 0, which
> > presents all packet copies assigned to the DMA, the packets sent from
> > the guest will not be reordered.
> 
> Reordering packets is bad in my opinion. We cannot expect the user to know
> that he should set the threshold to zero to have packets ordered.
> 
> Maybe we should consider not having threshold, and so have every
> descriptors handled either by the CPU (sync datapath) or by the DMA (async
> datapath). Doing so would simplify a lot the code, and would make
> performance/latency more predictable.
> 
> I understand that we might not get the best performance for every packet
> size doing that, but that may be a tradeoff we would make to have the
> feature maintainable and easily useable by the user.

I understand and agree to some extent. But before changing the existing design
of async enqueue and dequeue, we need more careful tests, as the current design
is well validated and performance looks good. So I suggest doing it in 21.11.

Thanks,
Jiayu


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [dpdk-dev] [PATCH v5 3/4] vhost: support async dequeue for split ring
  2021-07-16  1:10           ` Hu, Jiayu
@ 2021-07-16  7:45             ` Maxime Coquelin
  2021-07-16  7:55               ` Hu, Jiayu
  0 siblings, 1 reply; 50+ messages in thread
From: Maxime Coquelin @ 2021-07-16  7:45 UTC (permalink / raw)
  To: Hu, Jiayu, Ma, WenwuX, dev; +Cc: Xia, Chenbo, Jiang, Cheng1, Wang, YuanX

Hi,

On 7/16/21 3:10 AM, Hu, Jiayu wrote:
> Hi, Maxime,
> 
>> -----Original Message-----
>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>> Sent: Thursday, July 15, 2021 9:18 PM
>> To: Hu, Jiayu <jiayu.hu@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>;
>> dev@dpdk.org
>> Cc: Xia, Chenbo <chenbo.xia@intel.com>; Jiang, Cheng1
>> <cheng1.jiang@intel.com>; Wang, YuanX <yuanx.wang@intel.com>
>> Subject: Re: [PATCH v5 3/4] vhost: support async dequeue for split ring
>>
>>
>>
>> On 7/14/21 8:50 AM, Hu, Jiayu wrote:
>>> Hi Maxime,
>>>
>>> Thanks for your comments. Applies are inline.
>>>
>>>> -----Original Message-----
>>>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>>>> Sent: Tuesday, July 13, 2021 10:30 PM
>>>> To: Ma, WenwuX <wenwux.ma@intel.com>; dev@dpdk.org
>>>> Cc: Xia, Chenbo <chenbo.xia@intel.com>; Jiang, Cheng1
>>>> <cheng1.jiang@intel.com>; Hu, Jiayu <jiayu.hu@intel.com>; Wang, YuanX
>>>> <yuanx.wang@intel.com>
>>>> Subject: Re: [PATCH v5 3/4] vhost: support async dequeue for split
>>>> ring
>>>>>  struct async_inflight_info {
>>>>>  	struct rte_mbuf *mbuf;
>>>>> -	uint16_t descs; /* num of descs inflight */
>>>>> +	union {
>>>>> +		uint16_t descs; /* num of descs in-flight */
>>>>> +		struct async_nethdr nethdr;
>>>>> +	};
>>>>>  	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
>>>>> -};
>>>>> +} __rte_cache_aligned;
>>>>
>>>> Does it really need to be cache aligned?
>>>
>>> How about changing to 32-byte align? So a cacheline can hold 2 objects.
>>
>> Or not forcing any alignment at all? Would there really be a performance
>> regression?
>>
>>>>
>>>>>
>>>>>  /**
>>>>>   *  dma channel feature bit definition @@ -193,4 +201,34 @@
>>>>> __rte_experimental  uint16_t rte_vhost_poll_enqueue_completed(int
>>>>> vid, uint16_t queue_id,
>>>>>  		struct rte_mbuf **pkts, uint16_t count);
>>>>>
>>>>> +/**
>>>>> + * This function tries to receive packets from the guest with
>>>>> +offloading
>>>>> + * large copies to the DMA engine. Successfully dequeued packets
>>>>> +are
>>>>> + * transfer completed, either by the CPU or the DMA engine, and
>>>>> +they are
>>>>> + * returned in "pkts". There may be other packets that are sent
>>>>> +from
>>>>> + * the guest but being transferred by the DMA engine, called
>>>>> +in-flight
>>>>> + * packets. The amount of in-flight packets by now is returned in
>>>>> + * "nr_inflight". This function will return in-flight packets only
>>>>> +after
>>>>> + * the DMA engine finishes transferring.
>>>>
>>>> I am not sure to understand that comment. Is it still "in-flight" if
>>>> the DMA transfer is completed?
>>>
>>> "in-flight" means packet copies are submitted to the DMA, but the DMA
>>> hasn't completed copies.
>>>
>>>>
>>>> Are we ensuring packets are not reordered with this way of working?
>>>
>>> There is a threshold can be set by users. If set it to 0, which
>>> presents all packet copies assigned to the DMA, the packets sent from
>>> the guest will not be reordered.
>>
>> Reordering packets is bad in my opinion. We cannot expect the user to know
>> that he should set the threshold to zero to have packets ordered.
>>
>> Maybe we should consider not having threshold, and so have every
>> descriptors handled either by the CPU (sync datapath) or by the DMA (async
>> datapath). Doing so would simplify a lot the code, and would make
>> performance/latency more predictable.
>>
>> I understand that we might not get the best performance for every packet
>> size doing that, but that may be a tradeoff we would make to have the
>> feature maintainable and easily useable by the user.
> 
> I understand and agree in some way. But before changing the existed design
> in async enqueue and dequeue, we need more careful tests, as current design
> is well validated and performance looks good. So I suggest to do it in 21.11.

My understanding was that packets were not reordered on the enqueue path,
as I thought the used ring was written in order, but it seems I was wrong.

What kind of validation and performance testing has been done? I can
imagine reordering having a bad impact on L4+ benchmarks.

Let's first fix this for the enqueue path, then submit a new revision for
the dequeue path without packet reordering.

Regards,
Maxime

> Thanks,
> Jiayu
> 


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [dpdk-dev] [PATCH v5 3/4] vhost: support async dequeue for split ring
  2021-07-16  7:45             ` Maxime Coquelin
@ 2021-07-16  7:55               ` Hu, Jiayu
  2021-07-16  9:02                 ` Maxime Coquelin
  0 siblings, 1 reply; 50+ messages in thread
From: Hu, Jiayu @ 2021-07-16  7:55 UTC (permalink / raw)
  To: Maxime Coquelin, Ma, WenwuX, dev; +Cc: Xia, Chenbo, Jiang, Cheng1, Wang, YuanX



> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Friday, July 16, 2021 3:46 PM
> To: Hu, Jiayu <jiayu.hu@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> dev@dpdk.org
> Cc: Xia, Chenbo <chenbo.xia@intel.com>; Jiang, Cheng1
> <cheng1.jiang@intel.com>; Wang, YuanX <yuanx.wang@intel.com>
> Subject: Re: [PATCH v5 3/4] vhost: support async dequeue for split ring
> 
> Hi,
> 
> On 7/16/21 3:10 AM, Hu, Jiayu wrote:
> > Hi, Maxime,
> >
> >> -----Original Message-----
> >> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >> Sent: Thursday, July 15, 2021 9:18 PM
> >> To: Hu, Jiayu <jiayu.hu@intel.com>; Ma, WenwuX
> <wenwux.ma@intel.com>;
> >> dev@dpdk.org
> >> Cc: Xia, Chenbo <chenbo.xia@intel.com>; Jiang, Cheng1
> >> <cheng1.jiang@intel.com>; Wang, YuanX <yuanx.wang@intel.com>
> >> Subject: Re: [PATCH v5 3/4] vhost: support async dequeue for split
> >> ring
> >>
> >>
> >>
> >> On 7/14/21 8:50 AM, Hu, Jiayu wrote:
> >>> Hi Maxime,
> >>>
> >>> Thanks for your comments. Applies are inline.
> >>>
> >>>> -----Original Message-----
> >>>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >>>> Sent: Tuesday, July 13, 2021 10:30 PM
> >>>> To: Ma, WenwuX <wenwux.ma@intel.com>; dev@dpdk.org
> >>>> Cc: Xia, Chenbo <chenbo.xia@intel.com>; Jiang, Cheng1
> >>>> <cheng1.jiang@intel.com>; Hu, Jiayu <jiayu.hu@intel.com>; Wang,
> >>>> YuanX <yuanx.wang@intel.com>
> >>>> Subject: Re: [PATCH v5 3/4] vhost: support async dequeue for split
> >>>> ring
> >>>>>  struct async_inflight_info {
> >>>>>  	struct rte_mbuf *mbuf;
> >>>>> -	uint16_t descs; /* num of descs inflight */
> >>>>> +	union {
> >>>>> +		uint16_t descs; /* num of descs in-flight */
> >>>>> +		struct async_nethdr nethdr;
> >>>>> +	};
> >>>>>  	uint16_t nr_buffers; /* num of buffers inflight for packed ring
> >>>>> */ -};
> >>>>> +} __rte_cache_aligned;
> >>>>
> >>>> Does it really need to be cache aligned?
> >>>
> >>> How about changing to 32-byte align? So a cacheline can hold 2 objects.
> >>
> >> Or not forcing any alignment at all? Would there really be a
> >> performance regression?
> >>
> >>>>
> >>>>>
> >>>>>  /**
> >>>>>   *  dma channel feature bit definition @@ -193,4 +201,34 @@
> >>>>> __rte_experimental  uint16_t rte_vhost_poll_enqueue_completed(int
> >>>>> vid, uint16_t queue_id,
> >>>>>  		struct rte_mbuf **pkts, uint16_t count);
> >>>>>
> >>>>> +/**
> >>>>> + * This function tries to receive packets from the guest with
> >>>>> +offloading
> >>>>> + * large copies to the DMA engine. Successfully dequeued packets
> >>>>> +are
> >>>>> + * transfer completed, either by the CPU or the DMA engine, and
> >>>>> +they are
> >>>>> + * returned in "pkts". There may be other packets that are sent
> >>>>> +from
> >>>>> + * the guest but being transferred by the DMA engine, called
> >>>>> +in-flight
> >>>>> + * packets. The amount of in-flight packets by now is returned in
> >>>>> + * "nr_inflight". This function will return in-flight packets
> >>>>> +only after
> >>>>> + * the DMA engine finishes transferring.
> >>>>
> >>>> I am not sure to understand that comment. Is it still "in-flight"
> >>>> if the DMA transfer is completed?
> >>>
> >>> "in-flight" means packet copies are submitted to the DMA, but the
> >>> DMA hasn't completed copies.
> >>>
> >>>>
> >>>> Are we ensuring packets are not reordered with this way of working?
> >>>
> >>> There is a threshold can be set by users. If set it to 0, which
> >>> presents all packet copies assigned to the DMA, the packets sent
> >>> from the guest will not be reordered.
> >>
> >> Reordering packets is bad in my opinion. We cannot expect the user to
> >> know that he should set the threshold to zero to have packets ordered.
> >>
> >> Maybe we should consider not having threshold, and so have every
> >> descriptors handled either by the CPU (sync datapath) or by the DMA
> >> (async datapath). Doing so would simplify a lot the code, and would
> >> make performance/latency more predictable.
> >>
> >> I understand that we might not get the best performance for every
> >> packet size doing that, but that may be a tradeoff we would make to
> >> have the feature maintainable and easily useable by the user.
> >
> > I understand and agree in some way. But before changing the existed
> > design in async enqueue and dequeue, we need more careful tests, as
> > current design is well validated and performance looks good. So I suggest
> to do it in 21.11.
> 
> My understanding was that for enqueue path packets were not reordered,
> thinking the used ring was written in order, but it seems I was wrong.
> 
> What kind of validation and performance testing has been done? I can
> imagine reordering to have a bad impact on L4+ benchmarks.

Iperf and scp in V2V scenarios.

One thing to notice is that if we guarantee in-order completion, small packets will be
blocked by large packets, especially TCP control packets, which significantly increases
latency. In iperf tests, it will impact connection setup and increase latency. The current
design doesn't show big impacts on iperf and scp tests, but I am not sure about more
complex networking scenarios.

> 
> Let's first fix this for enqueue path, then submit new revision for dequeue
> path without packet reordering.

Sure. The fix needs to be done very carefully, IMO. So I'd suggest more tests
before any modification.

Thanks,
Jiayu
> 
> Regards,
> Maxime
> 
> > Thanks,
> > Jiayu
> >


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [dpdk-dev] [PATCH v5 3/4] vhost: support async dequeue for split ring
  2021-07-14  6:50       ` Hu, Jiayu
  2021-07-15 13:18         ` Maxime Coquelin
@ 2021-07-16  8:14         ` David Marchand
  2021-07-16 13:45           ` Hu, Jiayu
  1 sibling, 1 reply; 50+ messages in thread
From: David Marchand @ 2021-07-16  8:14 UTC (permalink / raw)
  To: Hu, Jiayu
  Cc: Maxime Coquelin, Ma, WenwuX, dev, Xia, Chenbo, Jiang, Cheng1,
	Wang, YuanX

On Wed, Jul 14, 2021 at 8:50 AM Hu, Jiayu <jiayu.hu@intel.com> wrote:
> > Are we ensuring packets are not reordered with this way of working?
>
> There is a threshold can be set by users. If set it to 0, which presents all
> packet copies assigned to the DMA, the packets sent from the guest will
> not be reordered.

- I find the rte_vhost_async_channel_register() signature with a
bitfield quite ugly.
We are writing software; this is not mapped to hardware... but ok, this is a
different topic.


- I don't like this threshold; it is too low level, and most users
will only see the shiny aspect "better performance" without
understanding the consequences.
By default, it leaves the door open to a _bad_ behavior, that is
packet reordering.
At a very minimum, I strongly recommend using 0 in the API.



-- 
David Marchand


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [dpdk-dev] [PATCH v5 3/4] vhost: support async dequeue for split ring
  2021-07-16  7:55               ` Hu, Jiayu
@ 2021-07-16  9:02                 ` Maxime Coquelin
  0 siblings, 0 replies; 50+ messages in thread
From: Maxime Coquelin @ 2021-07-16  9:02 UTC (permalink / raw)
  To: Hu, Jiayu, Ma, WenwuX, dev; +Cc: Xia, Chenbo, Jiang, Cheng1, Wang, YuanX



On 7/16/21 9:55 AM, Hu, Jiayu wrote:
> 
> 
>> -----Original Message-----
>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>> Sent: Friday, July 16, 2021 3:46 PM
>> To: Hu, Jiayu <jiayu.hu@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>;
>> dev@dpdk.org
>> Cc: Xia, Chenbo <chenbo.xia@intel.com>; Jiang, Cheng1
>> <cheng1.jiang@intel.com>; Wang, YuanX <yuanx.wang@intel.com>
>> Subject: Re: [PATCH v5 3/4] vhost: support async dequeue for split ring
>>
>> Hi,
>>
>> On 7/16/21 3:10 AM, Hu, Jiayu wrote:
>>> Hi, Maxime,
>>>
>>>> -----Original Message-----
>>>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>>>> Sent: Thursday, July 15, 2021 9:18 PM
>>>> To: Hu, Jiayu <jiayu.hu@intel.com>; Ma, WenwuX
>> <wenwux.ma@intel.com>;
>>>> dev@dpdk.org
>>>> Cc: Xia, Chenbo <chenbo.xia@intel.com>; Jiang, Cheng1
>>>> <cheng1.jiang@intel.com>; Wang, YuanX <yuanx.wang@intel.com>
>>>> Subject: Re: [PATCH v5 3/4] vhost: support async dequeue for split
>>>> ring
>>>>
>>>>
>>>>
>>>> On 7/14/21 8:50 AM, Hu, Jiayu wrote:
>>>>> Hi Maxime,
>>>>>
>>>>> Thanks for your comments. Applies are inline.
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>>>>>> Sent: Tuesday, July 13, 2021 10:30 PM
>>>>>> To: Ma, WenwuX <wenwux.ma@intel.com>; dev@dpdk.org
>>>>>> Cc: Xia, Chenbo <chenbo.xia@intel.com>; Jiang, Cheng1
>>>>>> <cheng1.jiang@intel.com>; Hu, Jiayu <jiayu.hu@intel.com>; Wang,
>>>>>> YuanX <yuanx.wang@intel.com>
>>>>>> Subject: Re: [PATCH v5 3/4] vhost: support async dequeue for split
>>>>>> ring
>>>>>>>  struct async_inflight_info {
>>>>>>>  	struct rte_mbuf *mbuf;
>>>>>>> -	uint16_t descs; /* num of descs inflight */
>>>>>>> +	union {
>>>>>>> +		uint16_t descs; /* num of descs in-flight */
>>>>>>> +		struct async_nethdr nethdr;
>>>>>>> +	};
>>>>>>>  	uint16_t nr_buffers; /* num of buffers inflight for packed ring
>>>>>>> */ -};
>>>>>>> +} __rte_cache_aligned;
>>>>>>
>>>>>> Does it really need to be cache aligned?
>>>>>
>>>>> How about changing to 32-byte align? So a cacheline can hold 2 objects.
>>>>
>>>> Or not forcing any alignment at all? Would there really be a
>>>> performance regression?
>>>>
>>>>>>
>>>>>>>
>>>>>>>  /**
>>>>>>>   *  dma channel feature bit definition @@ -193,4 +201,34 @@
>>>>>>> __rte_experimental  uint16_t rte_vhost_poll_enqueue_completed(int
>>>>>>> vid, uint16_t queue_id,
>>>>>>>  		struct rte_mbuf **pkts, uint16_t count);
>>>>>>>
>>>>>>> +/**
>>>>>>> + * This function tries to receive packets from the guest with
>>>>>>> +offloading
>>>>>>> + * large copies to the DMA engine. Successfully dequeued packets
>>>>>>> +are
>>>>>>> + * transfer completed, either by the CPU or the DMA engine, and
>>>>>>> +they are
>>>>>>> + * returned in "pkts". There may be other packets that are sent
>>>>>>> +from
>>>>>>> + * the guest but being transferred by the DMA engine, called
>>>>>>> +in-flight
>>>>>>> + * packets. The amount of in-flight packets by now is returned in
>>>>>>> + * "nr_inflight". This function will return in-flight packets
>>>>>>> +only after
>>>>>>> + * the DMA engine finishes transferring.
>>>>>>
>>>>>> I am not sure to understand that comment. Is it still "in-flight"
>>>>>> if the DMA transfer is completed?
>>>>>
>>>>> "in-flight" means packet copies are submitted to the DMA, but the
>>>>> DMA hasn't completed copies.
>>>>>
>>>>>>
>>>>>> Are we ensuring packets are not reordered with this way of working?
>>>>>
>>>>> There is a threshold can be set by users. If set it to 0, which
>>>>> presents all packet copies assigned to the DMA, the packets sent
>>>>> from the guest will not be reordered.
>>>>
>>>> Reordering packets is bad in my opinion. We cannot expect the user to
>>>> know that he should set the threshold to zero to have packets ordered.
>>>>
>>>> Maybe we should consider not having threshold, and so have every
>>>> descriptors handled either by the CPU (sync datapath) or by the DMA
>>>> (async datapath). Doing so would simplify a lot the code, and would
>>>> make performance/latency more predictable.
>>>>
>>>> I understand that we might not get the best performance for every
>>>> packet size doing that, but that may be a tradeoff we would make to
>>>> have the feature maintainable and easily useable by the user.
>>>
>>> I understand and agree in some way. But before changing the existed
>>> design in async enqueue and dequeue, we need more careful tests, as
>>> current design is well validated and performance looks good. So I suggest
>> to do it in 21.11.
>>
>> My understanding was that for enqueue path packets were not reordered,
>> thinking the used ring was written in order, but it seems I was wrong.
>>
>> What kind of validation and performance testing has been done? I can
>> imagine reordering to have a bad impact on L4+ benchmarks.
> 
> Iperf and scp in V2V scenarios.
> 
> One thing to notice is that if we guarantee in-order, small packets will be blocked
> by large packets, especially for control packets in TCP, which significantly increases
> latency. In iperf tests, it will impact connection setup and increase latency. Current
> design doesn't show big impacts on iperf and scp tests, but I am not sure about more
> complex networking scenarios.
> 

Ok, I see. I guess that depending on the payload size, one can see a perf
improvement if all the data segments are larger than the threshold. Or
it could cause a perf penalty if the last segments arrive before the previous
ones.

>>
>> Let's first fix this for enqueue path, then submit new revision for dequeue
>> path without packet reordering.
> 
> Sure. The way to fix it needs to be very careful, IMO. So I'd suggest more tests
> before any modification.
> 
> Thanks,
> Jiayu
>>
>> Regards,
>> Maxime
>>
>>> Thanks,
>>> Jiayu
>>>
> 


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [dpdk-dev] [PATCH v5 3/4] vhost: support async dequeue for split ring
  2021-07-16  8:14         ` David Marchand
@ 2021-07-16 13:45           ` Hu, Jiayu
  2021-07-16 13:52             ` David Marchand
  0 siblings, 1 reply; 50+ messages in thread
From: Hu, Jiayu @ 2021-07-16 13:45 UTC (permalink / raw)
  To: David Marchand
  Cc: Maxime Coquelin, Ma, WenwuX, dev, Xia, Chenbo, Jiang, Cheng1,
	Wang, YuanX



> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: Friday, July 16, 2021 4:15 PM
> To: Hu, Jiayu <jiayu.hu@intel.com>
> Cc: Maxime Coquelin <maxime.coquelin@redhat.com>; Ma, WenwuX
> <wenwux.ma@intel.com>; dev@dpdk.org; Xia, Chenbo
> <chenbo.xia@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>; Wang,
> YuanX <yuanx.wang@intel.com>
> Subject: Re: [dpdk-dev] [PATCH v5 3/4] vhost: support async dequeue for
> split ring
> 
> On Wed, Jul 14, 2021 at 8:50 AM Hu, Jiayu <jiayu.hu@intel.com> wrote:
> > > Are we ensuring packets are not reordered with this way of working?
> >
> > There is a threshold can be set by users. If set it to 0, which
> > presents all packet copies assigned to the DMA, the packets sent from
> > the guest will not be reordered.
> 
> - I find the rte_vhost_async_channel_register() signature with a bitfield quite
> ugly.
> We are writing sw, this is not mapped to hw stuff... but ok this is a different
> topic.

I have reworked the structure. Here is the link:
http://patches.dpdk.org/project/dpdk/patch/1626465089-17052-3-git-send-email-jiayu.hu@intel.com/

> 
> 
> - I don't like this threshold, this is too low level and most users will only see
> the shiny aspect "better performance" without understanding the
> consequences.
> By default, it leaves the door open to a _bad_ behavior, that is packet
> reordering.
> At a very minimum, strongly recommend to use 0 in the API.

That's a good point. But there are some reasons for exposing this value to users:
- large packets will block small packets, like TCP control packets.
- DMA efficiency. We usually see 20~30% drops when offloading 64B copies to the
DMA engine.
- the threshold is not only related to the hardware, but also to the application. The value
decides which copies are assigned to which worker, the CPU or the DMA. As async vhost
works in an asynchronous way, the threshold value decides how much work can be done in
parallel. It's not only about which DMA engine and platform we use, but also what
computation the CPU has been assigned. Different users will have different values.

I totally understand the worry about reordering. But simple iperf tests in our lab show
positive results with a threshold set. We need more careful tests before modifying
it, IMHO.
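
For illustration, a self-contained sketch of the split the threshold controls; the
helper and its arguments are made up for this example and are not the lib/vhost
code itself:

#include <stdint.h>
#include <string.h>
#include <sys/uio.h>

/* Copies of at least 'threshold' bytes are recorded as an iovec pair and
 * handed to the DMA callback later; smaller ones are done by the CPU right
 * away. A threshold of 0 therefore sends every copy to the DMA, which keeps
 * completion in order at the cost of DMA efficiency on small segments. */
static int
stage_copy(void *dst, void *src, size_t len, size_t threshold,
	   struct iovec *dma_src, struct iovec *dma_dst, unsigned int *nr_dma)
{
	if (len >= threshold) {
		dma_src[*nr_dma].iov_base = src;
		dma_src[*nr_dma].iov_len = len;
		dma_dst[*nr_dma].iov_base = dst;
		dma_dst[*nr_dma].iov_len = len;
		(*nr_dma)++;
		return 1;	/* will complete asynchronously */
	}

	memcpy(dst, src, len);	/* completes synchronously */
	return 0;
}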

Thanks,
Jiayu
> 
> 
> 
> --
> David Marchand


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [dpdk-dev] [PATCH v5 3/4] vhost: support async dequeue for split ring
  2021-07-16 13:45           ` Hu, Jiayu
@ 2021-07-16 13:52             ` David Marchand
  2021-07-16 14:00               ` Hu, Jiayu
  0 siblings, 1 reply; 50+ messages in thread
From: David Marchand @ 2021-07-16 13:52 UTC (permalink / raw)
  To: Hu, Jiayu
  Cc: Maxime Coquelin, Ma, WenwuX, dev, Xia, Chenbo, Jiang, Cheng1,
	Wang, YuanX

On Fri, Jul 16, 2021 at 3:45 PM Hu, Jiayu <jiayu.hu@intel.com> wrote:
> > - I don't like this threshold, this is too low level and most users will only see
> > the shiny aspect "better performance" without understanding the
> > consequences.
> > By default, it leaves the door open to a _bad_ behavior, that is packet
> > reordering.
> > At a very minimum, strongly recommend to use 0 in the API.
>
> That's a good point. But there are some reasons of open this value to users:
> - large packets will block small packets, like control packets of TCP.
> - dma efficiency. We usually see 20~30% drops because of offloading 64B copies to
> dma engine.
> - the threshold is not only related to hardware, but also application. The value decides
> which copies are assigned to which worker, the CPU or the DMA. As async vhost works
> in an asynchronous way, the threshold value decides how many works can be done in
> parallel. It's not only about what DMA engine and what platform we use, but also what
> computation the CPU has been assigned. Different users will have different values.
>
> I totally understand the worry about reordering. But simple iperf tests show positive
> results with setting threshold in our lab. We need more careful tests before modifying
> it, IMHO.

If you need more time, then please take it.
The generic DMA API could have an impact on this feature too.

Why the rush to merge this now?


-- 
David Marchand


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [dpdk-dev] [PATCH v5 3/4] vhost: support async dequeue for split ring
  2021-07-16 13:52             ` David Marchand
@ 2021-07-16 14:00               ` Hu, Jiayu
  0 siblings, 0 replies; 50+ messages in thread
From: Hu, Jiayu @ 2021-07-16 14:00 UTC (permalink / raw)
  To: David Marchand
  Cc: Maxime Coquelin, Ma, WenwuX, dev, Xia, Chenbo, Jiang, Cheng1,
	Wang, YuanX



> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: Friday, July 16, 2021 9:53 PM
> To: Hu, Jiayu <jiayu.hu@intel.com>
> Cc: Maxime Coquelin <maxime.coquelin@redhat.com>; Ma, WenwuX
> <wenwux.ma@intel.com>; dev@dpdk.org; Xia, Chenbo
> <chenbo.xia@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>; Wang,
> YuanX <yuanx.wang@intel.com>
> Subject: Re: [dpdk-dev] [PATCH v5 3/4] vhost: support async dequeue for
> split ring
> 
> On Fri, Jul 16, 2021 at 3:45 PM Hu, Jiayu <jiayu.hu@intel.com> wrote:
> > > - I don't like this threshold, this is too low level and most users
> > > will only see the shiny aspect "better performance" without
> > > understanding the consequences.
> > > By default, it leaves the door open to a _bad_ behavior, that is
> > > packet reordering.
> > > At a very minimum, strongly recommend to use 0 in the API.
> >
> > That's a good point. But there are some reasons of open this value to users:
> > - large packets will block small packets, like control packets of TCP.
> > - dma efficiency. We usually see 20~30% drops because of offloading
> > 64B copies to dma engine.
> > - the threshold is not only related to hardware, but also application.
> > The value decides which copies are assigned to which worker, the CPU
> > or the DMA. As async vhost works in an asynchronous way, the threshold
> > value decides how many works can be done in parallel. It's not only
> > about what DMA engine and what platform we use, but also what
> computation the CPU has been assigned. Different users will have different
> values.
> >
> > I totally understand the worry about reordering. But simple iperf
> > tests show positive results with setting threshold in our lab. We need
> > more careful tests before modifying it, IMHO.
> 
> If you need more time, then please take it.
> The dma generic API could have an impact on this feature too.
> 
> Why the rush for merging this now?

Whether to change the threshold is still under discussion. No data shows that setting
it to 0 is better than the current design. On the contrary, our intensive lab
data shows positive results with this design. Please don't mix that question with this
dequeue feature. Especially since, for now, the app can set it to 0 if reordering really
matters a lot.
> 
> 
> --
> David Marchand


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v6 0/4] support async dequeue for split ring
  2021-06-02  8:31 [dpdk-dev] [PATCH 0/1] lib/vhost: support async dequeue for split ring Yuan Wang
                   ` (4 preceding siblings ...)
  2021-07-05 18:11 ` [dpdk-dev] [PATCH v5 0/4] support async dequeue for split ring Wenwu Ma
@ 2021-07-16 19:18 ` Wenwu Ma
  2021-07-16 19:18   ` [dpdk-dev] [PATCH v6 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
                     ` (3 more replies)
  2021-07-21 14:20 ` [dpdk-dev] [PATCH v7 0/4] support async dequeue for split ring Wenwu Ma
  6 siblings, 4 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-07-16 19:18 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, jiayu.hu, Wenwu Ma

This patch implements asynchronous dequeue data path for split ring.
A new asynchronous dequeue function is introduced. With this function,
the application can try to receive packets from the guest with offloading
large copies to the DMA engine, thus saving precious CPU cycles.

v6:
- Allocate and free packets in bulk.
- struct async_inflight_info aligned to 32 bytes.
- Change ASYNC_RX_VHOST to ASYNV_ENQUEUE_VHOST and
  ASYNC_TX_VHOST to ASYNV_DEQUEUE_VHOST.

v5:
- DMA address use IOVA instead of VA.

v4:
- Fix wrong packet index issue in async dequeue and improve
  the performance of small packet copies.

v3:
- Fix compilation warning and error in arm platform.
- Restore the removed function virtio_dev_pktmbuf_alloc;
  async dequeue allocates packets separately.

v2:
- Refactor vhost datapath as preliminary patch for this series.
- The change of using new API in examples/vhost is put into a
  dedicated patch.
- Check queue_id value before using it.
- Async dequeue performance enhancement. 160% performance improvement
  for v2 vs. v1.
- Async dequeue API name change from rte_vhost_try_dequeue_burst to
  rte_vhost_async_try_dequeue_burst.
- Completed packets update the used ring directly.

Wenwu Ma (3):
  examples/vhost: refactor vhost enqueue and dequeue datapaths
  examples/vhost: use a new API to query remaining ring space
  examples/vhost: support vhost async dequeue data path

Yuan Wang (1):
  vhost: support async dequeue for split ring

 doc/guides/prog_guide/vhost_lib.rst |   9 +
 doc/guides/sample_app_ug/vhost.rst  |   9 +-
 examples/vhost/ioat.c               |  67 +++-
 examples/vhost/ioat.h               |  25 ++
 examples/vhost/main.c               | 224 +++++++----
 examples/vhost/main.h               |  33 +-
 examples/vhost/virtio_net.c         |  16 +-
 lib/vhost/rte_vhost_async.h         |  39 +-
 lib/vhost/version.map               |   3 +
 lib/vhost/virtio_net.c              | 586 ++++++++++++++++++++++++++++
 10 files changed, 904 insertions(+), 107 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v6 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths
  2021-07-16 19:18 ` [dpdk-dev] [PATCH v6 0/4] support async dequeue for split ring Wenwu Ma
@ 2021-07-16 19:18   ` Wenwu Ma
  2021-07-16 19:18   ` [dpdk-dev] [PATCH v6 2/4] examples/vhost: use a new API to query remaining ring space Wenwu Ma
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-07-16 19:18 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, jiayu.hu, Wenwu Ma

Previously, we checked a flag to decide which enqueue/dequeue
functions to call in the data path.

Now, we use an ops table that is initialized when the Vhost device is
created, so that the data path can call the ops directly without any
further flag checks.

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 examples/vhost/main.c       | 112 ++++++++++++++++++++----------------
 examples/vhost/main.h       |  33 +++++++++--
 examples/vhost/virtio_net.c |  16 +++++-
 3 files changed, 105 insertions(+), 56 deletions(-)

diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index d2179eadb9..aebdc3a566 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -106,6 +106,8 @@ static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
 static char *socket_files;
 static int nb_sockets;
 
+static struct vhost_queue_ops vdev_queue_ops[MAX_VHOST_DEVICE];
+
 /* empty vmdq configuration structure. Filled in programatically */
 static struct rte_eth_conf vmdq_conf_default = {
 	.rxmode = {
@@ -885,27 +887,8 @@ drain_vhost(struct vhost_dev *vdev)
 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
 
-	if (builtin_net_driver) {
-		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
-	} else if (async_vhost_driver) {
-		uint32_t cpu_cpl_nr = 0;
-		uint16_t enqueue_fail = 0;
-		struct rte_mbuf *m_cpu_cpl[nr_xmit];
-
-		complete_async_pkts(vdev);
-		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
-					m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
-
-		if (cpu_cpl_nr)
-			free_pkts(m_cpu_cpl, cpu_cpl_nr);
-
-		enqueue_fail = nr_xmit - ret;
-		if (enqueue_fail)
-			free_pkts(&m[ret], nr_xmit - ret);
-	} else {
-		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
-						m, nr_xmit);
-	}
+	ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
+					VIRTIO_RXQ, m, nr_xmit);
 
 	if (enable_stats) {
 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
@@ -1184,6 +1167,36 @@ drain_mbuf_table(struct mbuf_table *tx_q)
 	}
 }
 
+uint16_t
+async_enqueue_pkts(struct vhost_dev *vdev, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint32_t rx_count)
+{
+	uint16_t enqueue_count;
+	uint32_t cpu_cpl_nr = 0;
+	uint16_t enqueue_fail = 0;
+	struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
+
+	complete_async_pkts(vdev);
+	enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
+				queue_id, pkts, rx_count,
+				m_cpu_cpl, &cpu_cpl_nr);
+	if (cpu_cpl_nr)
+		free_pkts(m_cpu_cpl, cpu_cpl_nr);
+
+	enqueue_fail = rx_count - enqueue_count;
+	if (enqueue_fail)
+		free_pkts(&pkts[enqueue_count], enqueue_fail);
+
+	return enqueue_count;
+}
+
+uint16_t
+sync_enqueue_pkts(struct vhost_dev *vdev, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint32_t rx_count)
+{
+	return rte_vhost_enqueue_burst(vdev->vid, queue_id, pkts, rx_count);
+}
+
 static __rte_always_inline void
 drain_eth_rx(struct vhost_dev *vdev)
 {
@@ -1214,29 +1227,8 @@ drain_eth_rx(struct vhost_dev *vdev)
 		}
 	}
 
-	if (builtin_net_driver) {
-		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
-						pkts, rx_count);
-	} else if (async_vhost_driver) {
-		uint32_t cpu_cpl_nr = 0;
-		uint16_t enqueue_fail = 0;
-		struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
-
-		complete_async_pkts(vdev);
-		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
-					VIRTIO_RXQ, pkts, rx_count,
-					m_cpu_cpl, &cpu_cpl_nr);
-		if (cpu_cpl_nr)
-			free_pkts(m_cpu_cpl, cpu_cpl_nr);
-
-		enqueue_fail = rx_count - enqueue_count;
-		if (enqueue_fail)
-			free_pkts(&pkts[enqueue_count], enqueue_fail);
-
-	} else {
-		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
-						pkts, rx_count);
-	}
+	enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
+						VIRTIO_RXQ, pkts, rx_count);
 
 	if (enable_stats) {
 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
@@ -1249,6 +1241,14 @@ drain_eth_rx(struct vhost_dev *vdev)
 		free_pkts(pkts, rx_count);
 }
 
+uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count)
+{
+	return rte_vhost_dequeue_burst(dev->vid, queue_id,
+					mbuf_pool, pkts, count);
+}
+
 static __rte_always_inline void
 drain_virtio_tx(struct vhost_dev *vdev)
 {
@@ -1256,13 +1256,8 @@ drain_virtio_tx(struct vhost_dev *vdev)
 	uint16_t count;
 	uint16_t i;
 
-	if (builtin_net_driver) {
-		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
-					pkts, MAX_PKT_BURST);
-	} else {
-		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
-					mbuf_pool, pkts, MAX_PKT_BURST);
-	}
+	count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
+				VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);
 
 	/* setup VMDq for the first packet */
 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
@@ -1436,6 +1431,21 @@ new_device(int vid)
 		}
 	}
 
+	if (builtin_net_driver) {
+		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
+		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
+	} else {
+		if (async_vhost_driver) {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+							async_enqueue_pkts;
+		} else {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+							sync_enqueue_pkts;
+		}
+
+		vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
+	}
+
 	if (builtin_net_driver)
 		vs_vhost_net_setup(vdev);
 
diff --git a/examples/vhost/main.h b/examples/vhost/main.h
index 0ccdce4b4a..7cd8a11a45 100644
--- a/examples/vhost/main.h
+++ b/examples/vhost/main.h
@@ -60,6 +60,19 @@ struct vhost_dev {
 	struct vhost_queue queues[MAX_QUEUE_PAIRS * 2];
 } __rte_cache_aligned;
 
+typedef uint16_t (*vhost_enqueue_burst_t)(struct vhost_dev *dev,
+			uint16_t queue_id, struct rte_mbuf **pkts,
+			uint32_t count);
+
+typedef uint16_t (*vhost_dequeue_burst_t)(struct vhost_dev *dev,
+			uint16_t queue_id, struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
+
+struct vhost_queue_ops {
+	vhost_enqueue_burst_t enqueue_pkt_burst;
+	vhost_dequeue_burst_t dequeue_pkt_burst;
+};
+
 TAILQ_HEAD(vhost_dev_tailq_list, vhost_dev);
 
 
@@ -84,9 +97,21 @@ struct lcore_info {
 void vs_vhost_net_setup(struct vhost_dev *dev);
 void vs_vhost_net_remove(struct vhost_dev *dev);
 uint16_t vs_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mbuf **pkts, uint32_t count);
+
+uint16_t builtin_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mbuf **pkts, uint32_t count);
+uint16_t builtin_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
+uint16_t sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 			 struct rte_mbuf **pkts, uint32_t count);
-
-uint16_t vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
-			 struct rte_mempool *mbuf_pool,
-			 struct rte_mbuf **pkts, uint16_t count);
+uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
+uint16_t async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			 struct rte_mbuf **pkts, uint32_t count);
+uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
 #endif /* _MAIN_H_ */
diff --git a/examples/vhost/virtio_net.c b/examples/vhost/virtio_net.c
index 9064fc3a82..2432a96566 100644
--- a/examples/vhost/virtio_net.c
+++ b/examples/vhost/virtio_net.c
@@ -238,6 +238,13 @@ vs_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 	return count;
 }
 
+uint16_t
+builtin_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint32_t count)
+{
+	return vs_enqueue_pkts(dev, queue_id, pkts, count);
+}
+
 static __rte_always_inline int
 dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 	    struct rte_mbuf *m, uint16_t desc_idx,
@@ -363,7 +370,7 @@ dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 	return 0;
 }
 
-uint16_t
+static uint16_t
 vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
 {
@@ -440,3 +447,10 @@ vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 
 	return i;
 }
+
+uint16_t
+builtin_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+{
+	return vs_dequeue_pkts(dev, queue_id, mbuf_pool, pkts, count);
+}
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v6 2/4] examples/vhost: use a new API to query remaining ring space
  2021-07-16 19:18 ` [dpdk-dev] [PATCH v6 0/4] support async dequeue for split ring Wenwu Ma
  2021-07-16 19:18   ` [dpdk-dev] [PATCH v6 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
@ 2021-07-16 19:18   ` Wenwu Ma
  2021-07-16 19:18   ` [dpdk-dev] [PATCH v6 3/4] vhost: support async dequeue for split ring Wenwu Ma
  2021-07-16 19:18   ` [dpdk-dev] [PATCH v6 4/4] examples/vhost: support vhost async dequeue data path Wenwu Ma
  3 siblings, 0 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-07-16 19:18 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, jiayu.hu, Wenwu Ma

A new API for querying the remaining descriptor ring capacity
is available, so use it instead of tracking the ring space manually.

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 examples/vhost/ioat.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c
index 2a2c2d7202..bf4e033bdb 100644
--- a/examples/vhost/ioat.c
+++ b/examples/vhost/ioat.c
@@ -17,7 +17,6 @@ struct packet_tracker {
 	unsigned short next_read;
 	unsigned short next_write;
 	unsigned short last_remain;
-	unsigned short ioat_space;
 };
 
 struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
@@ -113,7 +112,6 @@ open_ioat(const char *value)
 			goto out;
 		}
 		rte_rawdev_start(dev_id);
-		cb_tracker[dev_id].ioat_space = IOAT_RING_SIZE - 1;
 		dma_info->nr++;
 		i++;
 	}
@@ -140,7 +138,7 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
 			src = descs[i_desc].src;
 			dst = descs[i_desc].dst;
 			i_seg = 0;
-			if (cb_tracker[dev_id].ioat_space < src->nr_segs)
+			if (rte_ioat_burst_capacity(dev_id) < src->nr_segs)
 				break;
 			while (i_seg < src->nr_segs) {
 				rte_ioat_enqueue_copy(dev_id,
@@ -155,7 +153,6 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
 			}
 			write &= mask;
 			cb_tracker[dev_id].size_track[write] = src->nr_segs;
-			cb_tracker[dev_id].ioat_space -= src->nr_segs;
 			write++;
 		}
 	} else {
@@ -194,7 +191,6 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 		if (n_seg == 0)
 			return 0;
 
-		cb_tracker[dev_id].ioat_space += n_seg;
 		n_seg += cb_tracker[dev_id].last_remain;
 
 		read = cb_tracker[dev_id].next_read;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v6 3/4] vhost: support async dequeue for split ring
  2021-07-16 19:18 ` [dpdk-dev] [PATCH v6 0/4] support async dequeue for split ring Wenwu Ma
  2021-07-16 19:18   ` [dpdk-dev] [PATCH v6 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
  2021-07-16 19:18   ` [dpdk-dev] [PATCH v6 2/4] examples/vhost: use a new API to query remaining ring space Wenwu Ma
@ 2021-07-16 19:18   ` Wenwu Ma
  2021-07-16 19:18   ` [dpdk-dev] [PATCH v6 4/4] examples/vhost: support vhost async dequeue data path Wenwu Ma
  3 siblings, 0 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-07-16 19:18 UTC (permalink / raw)
  To: dev
  Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, jiayu.hu, Yuan Wang, Wenwu Ma

From: Yuan Wang <yuanx.wang@intel.com>

This patch implements asynchronous dequeue data path for split ring.
A new asynchronous dequeue function is introduced. With this function,
the application can try to receive packets from the guest with
offloading large copies to the async channel, thus saving precious CPU
cycles.

Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 doc/guides/prog_guide/vhost_lib.rst |   9 +
 lib/vhost/rte_vhost_async.h         |  39 +-
 lib/vhost/version.map               |   3 +
 lib/vhost/virtio_net.c              | 586 ++++++++++++++++++++++++++++
 4 files changed, 635 insertions(+), 2 deletions(-)

diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
index d18fb98910..bf90a2663b 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -281,6 +281,15 @@ The following is an overview of some key Vhost API functions:
   Poll enqueue completion status from async data path. Completed packets
   are returned to applications through ``pkts``.
 
+* ``rte_vhost_async_try_dequeue_burst(vid, queue_id, mbuf_pool, pkts, count, nr_inflight)``
+
+  This function tries to receive packets from the guest with offloading
+  large copies to the async channel. The packets that are transfer completed
+  are returned in ``pkts``. The other packets that their copies are submitted
+  to the async channel but not completed are called "in-flight packets".
+  This function will not return in-flight packets until their copies are
+  completed by the async channel.
+
 Vhost-user Implementations
 --------------------------
 
diff --git a/lib/vhost/rte_vhost_async.h b/lib/vhost/rte_vhost_async.h
index 6faa31f5ad..04d7588217 100644
--- a/lib/vhost/rte_vhost_async.h
+++ b/lib/vhost/rte_vhost_async.h
@@ -83,12 +83,20 @@ struct rte_vhost_async_channel_ops {
 		uint16_t max_packets);
 };
 
+struct async_nethdr {
+	struct virtio_net_hdr hdr;
+	bool valid;
+};
+
 /**
- * inflight async packet information
+ * in-flight async packet information
  */
 struct async_inflight_info {
 	struct rte_mbuf *mbuf;
-	uint16_t descs; /* num of descs inflight */
+	union {
+		uint16_t descs; /* num of descs in-flight */
+		struct async_nethdr nethdr;
+	};
 	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
 };
 
@@ -193,4 +201,31 @@ __rte_experimental
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count);
 
+/**
+ * This function tries to receive packets from the guest, offloading
+ * large copies to the async channel. Packets whose copies have completed
+ * are returned in "pkts". Packets whose copies have been submitted to the
+ * async channel but have not yet completed are called "in-flight packets".
+ * This function does not return in-flight packets until their copies are
+ * completed by the async channel.
+ *
+ * @param vid
+ *  id of vhost device to dequeue data
+ * @param queue_id
+ *  queue id to dequeue data
+ * @param pkts
+ *  blank array to keep successfully dequeued packets
+ * @param count
+ *  size of the packet array
+ * @param nr_inflight
+ *  the amount of in-flight packets. If error occurred, its value is set to -1.
+ * @return
+ *  num of successfully dequeued packets
+ */
+__rte_experimental
+uint16_t
+rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
+	int *nr_inflight);
+
 #endif /* _RTE_VHOST_ASYNC_H_ */
diff --git a/lib/vhost/version.map b/lib/vhost/version.map
index 9103a23cd4..a320f889cd 100644
--- a/lib/vhost/version.map
+++ b/lib/vhost/version.map
@@ -79,4 +79,7 @@ EXPERIMENTAL {
 
 	# added in 21.05
 	rte_vhost_get_negotiated_protocol_features;
+
+	# added in 21.08
+	rte_vhost_async_try_dequeue_burst;
 };
diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index b93482587c..58317d7b75 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -3147,3 +3147,589 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 
 	return count;
 }
+
+static __rte_always_inline int
+async_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		  struct buf_vector *buf_vec, uint16_t nr_vec,
+		  struct rte_mbuf *m, struct rte_mempool *mbuf_pool,
+		  struct iovec *src_iovec, struct iovec *dst_iovec,
+		  struct rte_vhost_iov_iter *src_it,
+		  struct rte_vhost_iov_iter *dst_it,
+		  struct async_nethdr *nethdr,
+		  bool legacy_ol_flags)
+{
+	uint64_t buf_addr, buf_iova;
+	uint64_t mapped_len;
+	uint32_t tlen = 0;
+	uint32_t buf_avail, buf_offset, buf_len;
+	uint32_t mbuf_avail, mbuf_offset;
+	uint32_t cpy_len, cpy_threshold;
+	/* A counter to avoid desc dead loop chain */
+	uint16_t vec_idx = 0;
+	int tvec_idx = 0;
+	struct rte_mbuf *cur = m, *prev = m;
+	struct virtio_net_hdr tmp_hdr;
+	struct virtio_net_hdr *hdr = NULL;
+	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
+
+	buf_addr = buf_vec[vec_idx].buf_addr;
+	buf_len = buf_vec[vec_idx].buf_len;
+	buf_iova = buf_vec[vec_idx].buf_iova;
+
+	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1))
+		return -1;
+
+	cpy_threshold = vq->async_threshold;
+
+	if (virtio_net_with_host_offload(dev)) {
+		if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
+			/*
+			 * No luck, the virtio-net header doesn't fit
+			 * in a contiguous virtual area.
+			 */
+			copy_vnet_hdr_from_desc(&tmp_hdr, buf_vec);
+			hdr = &tmp_hdr;
+		} else {
+			hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
+		}
+	}
+
+	/*
+	 * A virtio driver normally uses at least 2 desc buffers
+	 * for Tx: the first for storing the header, and others
+	 * for storing the data.
+	 */
+	if (unlikely(buf_len < dev->vhost_hlen)) {
+		buf_offset = dev->vhost_hlen - buf_len;
+		vec_idx++;
+		buf_addr = buf_vec[vec_idx].buf_addr;
+		buf_len = buf_vec[vec_idx].buf_len;
+		buf_avail  = buf_len - buf_offset;
+	} else if (buf_len == dev->vhost_hlen) {
+		if (unlikely(++vec_idx >= nr_vec))
+			return -1;
+		buf_addr = buf_vec[vec_idx].buf_addr;
+		buf_len = buf_vec[vec_idx].buf_len;
+
+		buf_offset = 0;
+		buf_avail = buf_len;
+	} else {
+		buf_offset = dev->vhost_hlen;
+		buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
+	}
+
+	PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
+			(uint32_t)buf_avail, 0);
+
+	mbuf_offset = 0;
+	mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
+	while (1) {
+		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
+
+		while (cpy_len && cpy_len >= cpy_threshold) {
+			void *hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
+						buf_iova + buf_offset, cpy_len,
+						&mapped_len);
+
+			if (unlikely(!hpa || mapped_len < cpy_threshold))
+				break;
+
+			async_fill_vec(src_iovec + tvec_idx, hpa,
+				(size_t)mapped_len);
+			async_fill_vec(dst_iovec + tvec_idx,
+				(void *)(uintptr_t)rte_pktmbuf_iova_offset(cur,
+							mbuf_offset),
+				(size_t)mapped_len);
+
+			tvec_idx++;
+			tlen += (uint32_t)mapped_len;
+			cpy_len -= (uint32_t)mapped_len;
+			mbuf_avail -= (uint32_t)mapped_len;
+			mbuf_offset += (uint32_t)mapped_len;
+			buf_avail -= (uint32_t)mapped_len;
+			buf_offset += (uint32_t)mapped_len;
+		}
+
+		if (cpy_len) {
+			if (vq->batch_copy_nb_elems >= vq->size ||
+				(hdr && cur == m)) {
+				rte_memcpy(
+					rte_pktmbuf_mtod_offset(cur, void *,
+							mbuf_offset),
+					(void *)((uintptr_t)(buf_addr +
+								buf_offset)),
+					cpy_len);
+			} else {
+				batch_copy[vq->batch_copy_nb_elems].dst =
+					rte_pktmbuf_mtod_offset(cur, void *,
+							mbuf_offset);
+				batch_copy[vq->batch_copy_nb_elems].src =
+					(void *)((uintptr_t)(buf_addr +
+								buf_offset));
+				batch_copy[vq->batch_copy_nb_elems].len =
+					cpy_len;
+				vq->batch_copy_nb_elems++;
+			}
+
+			mbuf_avail  -= cpy_len;
+			mbuf_offset += cpy_len;
+			buf_avail -= cpy_len;
+			buf_offset += cpy_len;
+		}
+
+		/* This buf reaches to its end, get the next one */
+		if (buf_avail == 0) {
+			if (++vec_idx >= nr_vec)
+				break;
+
+			buf_addr = buf_vec[vec_idx].buf_addr;
+			buf_len = buf_vec[vec_idx].buf_len;
+
+			buf_offset = 0;
+			buf_avail = buf_len;
+
+			PRINT_PACKET(dev, (uintptr_t)buf_addr,
+					(uint32_t)buf_avail, 0);
+		}
+
+		/*
+		 * This mbuf reaches to its end, get a new one
+		 * to hold more data.
+		 */
+		if (mbuf_avail == 0) {
+			cur = rte_pktmbuf_alloc(mbuf_pool);
+			if (unlikely(cur == NULL)) {
+				VHOST_LOG_DATA(ERR, "Failed to "
+					"allocate memory for mbuf.\n");
+				return -1;
+			}
+
+			prev->next = cur;
+			prev->data_len = mbuf_offset;
+			m->nb_segs += 1;
+			m->pkt_len += mbuf_offset;
+			prev = cur;
+
+			mbuf_offset = 0;
+			mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
+		}
+	}
+
+	prev->data_len = mbuf_offset;
+	m->pkt_len += mbuf_offset;
+
+	if (hdr && tlen) {
+		nethdr->valid = true;
+		nethdr->hdr = *hdr;
+	} else if (hdr)
+		vhost_dequeue_offload(hdr, m, legacy_ol_flags);
+
+	if (tlen) {
+		async_fill_iter(src_it, tlen, src_iovec, tvec_idx);
+		async_fill_iter(dst_it, tlen, dst_iovec, tvec_idx);
+	} else
+		src_it->count = 0;
+
+	return 0;
+}
+
+static __rte_always_inline uint16_t
+async_poll_dequeue_completed_split(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint16_t count, bool legacy_ol_flags)
+{
+	uint16_t n_pkts_cpl = 0, n_pkts_put = 0;
+	uint16_t start_idx, pkt_idx, from;
+	struct async_inflight_info *pkts_info;
+
+	pkt_idx = vq->async_pkts_idx & (vq->size - 1);
+	pkts_info = vq->async_pkts_info;
+	start_idx = virtio_dev_rx_async_get_info_idx(pkt_idx, vq->size,
+			vq->async_pkts_inflight_n);
+
+	if (count > vq->async_last_pkts_n) {
+		n_pkts_cpl = vq->async_ops.check_completed_copies(dev->vid,
+			queue_id, 0, count - vq->async_last_pkts_n);
+	}
+
+	n_pkts_cpl += vq->async_last_pkts_n;
+	if (unlikely(n_pkts_cpl == 0))
+		return 0;
+
+	n_pkts_put = RTE_MIN(count, n_pkts_cpl);
+
+	for (pkt_idx = 0; pkt_idx < n_pkts_put; pkt_idx++) {
+		from = (start_idx + pkt_idx) & (vq->size - 1);
+		pkts[pkt_idx] = pkts_info[from].mbuf;
+
+		if (pkts_info[from].nethdr.valid) {
+			vhost_dequeue_offload(&pkts_info[from].nethdr.hdr,
+					pkts[pkt_idx], legacy_ol_flags);
+		}
+	}
+	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
+
+	if (n_pkts_put) {
+		/* write back completed descs to used ring */
+		write_back_completed_descs_split(vq, n_pkts_put);
+		/* update used ring */
+		__atomic_add_fetch(&vq->used->idx,
+				n_pkts_put, __ATOMIC_RELEASE);
+
+		vq->async_pkts_inflight_n -= n_pkts_put;
+	}
+
+	return n_pkts_put;
+}
+
+static __rte_always_inline uint16_t
+virtio_dev_tx_async_split(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t count, bool legacy_ol_flags)
+{
+	static bool allocerr_warned;
+	bool dropped = false;
+	uint16_t pkt_idx;
+	uint16_t free_entries;
+	uint16_t slot_idx = 0;
+	uint16_t segs_await = 0;
+	uint16_t nr_done_pkts = 0, nr_async_pkts = 0, nr_async_cmpl_pkts = 0;
+	uint16_t nr_async_burst = 0;
+	uint16_t pkt_err = 0;
+	uint16_t iovec_idx = 0, it_idx = 0;
+	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
+	struct iovec *vec_pool = vq->vec_pool;
+	struct iovec *src_iovec = vec_pool;
+	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
+	struct rte_mbuf *pkts_prealloc[MAX_PKT_BURST];
+
+	struct async_pkt_index {
+		uint16_t last_avail_idx;
+	} async_pkts_log[MAX_PKT_BURST];
+
+	/**
+	 * The ordering between avail index and
+	 * desc reads needs to be enforced.
+	 */
+	free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
+			vq->last_avail_idx;
+	if (free_entries == 0)
+		goto out;
+
+	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
+
+	count = RTE_MIN(count, MAX_PKT_BURST);
+	count = RTE_MIN(count, free_entries);
+	VHOST_LOG_DATA(DEBUG, "(%d) about to dequeue %u buffers\n",
+			dev->vid, count);
+
+	if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts_prealloc, count))
+		goto out;
+
+	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+		uint16_t head_idx = 0;
+		uint16_t nr_vec = 0;
+		uint32_t buf_len;
+		int err;
+		struct buf_vector buf_vec[BUF_VECTOR_MAX];
+		struct rte_mbuf *pkt;
+
+		if (unlikely(fill_vec_buf_split(dev, vq, vq->last_avail_idx,
+						&nr_vec, buf_vec,
+						&head_idx, &buf_len,
+						VHOST_ACCESS_RO) < 0))
+			break;
+
+		err = virtio_dev_pktmbuf_prep(dev, pkts_prealloc[pkt_idx],
+						buf_len);
+		if (unlikely(err)) {
+			/**
+			 * mbuf allocation fails for jumbo packets when external
+			 * buffer allocation is not allowed and linear buffer
+			 * is required. Drop this packet.
+			 */
+			if (!allocerr_warned) {
+				VHOST_LOG_DATA(ERR,
+					"Failed mbuf alloc of size %d from %s on %s.\n",
+					buf_len, mbuf_pool->name, dev->ifname);
+				allocerr_warned = true;
+			}
+			dropped = true;
+			break;
+		}
+
+		pkt = pkts_prealloc[pkt_idx];
+
+		slot_idx = (vq->async_pkts_idx + nr_async_pkts) &
+				(vq->size - 1);
+		err = async_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkt,
+				mbuf_pool, &src_iovec[iovec_idx],
+				&dst_iovec[iovec_idx], &it_pool[it_idx],
+				&it_pool[it_idx + 1],
+				&pkts_info[slot_idx].nethdr, legacy_ol_flags);
+		if (unlikely(err)) {
+			if (!allocerr_warned) {
+				VHOST_LOG_DATA(ERR,
+					"Failed to copy desc to mbuf on %s.\n",
+					dev->ifname);
+				allocerr_warned = true;
+			}
+			dropped = true;
+			break;
+		}
+
+		if (it_pool[it_idx].count) {
+			uint16_t to = vq->async_desc_idx_split & (vq->size - 1);
+
+			async_fill_desc(&tdes[nr_async_burst], &it_pool[it_idx],
+				&it_pool[it_idx + 1]);
+			pkts_info[slot_idx].mbuf = pkt;
+			async_pkts_log[nr_async_pkts++].last_avail_idx =
+				vq->last_avail_idx;
+			nr_async_burst++;
+			iovec_idx += it_pool[it_idx].nr_segs;
+			it_idx += 2;
+			segs_await += it_pool[it_idx].nr_segs;
+
+			/* keep used desc */
+			vq->async_descs_split[to].id = head_idx;
+			vq->async_descs_split[to].len = 0;
+			vq->async_desc_idx_split++;
+		} else {
+			update_shadow_used_ring_split(vq, head_idx, 0);
+			pkts[nr_done_pkts++] = pkt;
+		}
+
+		vq->last_avail_idx++;
+
+		if (unlikely((nr_async_burst >= VHOST_ASYNC_BATCH_THRESHOLD) ||
+					((VHOST_MAX_ASYNC_VEC >> 1) -
+					 segs_await < BUF_VECTOR_MAX))) {
+			uint16_t nr_pkts;
+
+			nr_pkts = vq->async_ops.transfer_data(dev->vid,
+					queue_id, tdes, 0, nr_async_burst);
+			src_iovec = vec_pool;
+			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+			it_idx = 0;
+			segs_await = 0;
+			vq->async_pkts_inflight_n += nr_pkts;
+
+			if (unlikely(nr_pkts < nr_async_burst)) {
+				pkt_err = nr_async_burst - nr_pkts;
+				nr_async_burst = 0;
+				break;
+			}
+			nr_async_burst = 0;
+		}
+	}
+
+	if (unlikely(dropped))
+		rte_pktmbuf_free_bulk(&pkts_prealloc[pkt_idx], count - pkt_idx);
+
+	if (nr_async_burst) {
+		uint32_t nr_pkts;
+
+		nr_pkts = vq->async_ops.transfer_data(dev->vid, queue_id,
+				tdes, 0, nr_async_burst);
+		vq->async_pkts_inflight_n += nr_pkts;
+
+		if (unlikely(nr_pkts < nr_async_burst))
+			pkt_err = nr_async_burst - nr_pkts;
+	}
+
+	do_data_copy_dequeue(vq);
+
+	if (unlikely(pkt_err)) {
+		uint16_t nr_err_dma = pkt_err;
+		uint16_t nr_err_sw;
+
+		nr_async_pkts -= nr_err_dma;
+
+		/**
+		 * revert shadow used ring and free pktmbufs for
+		 * CPU-copied pkts after the first DMA-error pkt.
+		 */
+		nr_err_sw = vq->last_avail_idx -
+			async_pkts_log[nr_async_pkts].last_avail_idx -
+			nr_err_dma;
+		vq->shadow_used_idx -= nr_err_sw;
+		while (nr_err_sw-- > 0)
+			rte_pktmbuf_free(pkts[--nr_done_pkts]);
+
+		/**
+		 * recover DMA-copy related structures and free pktmbufs
+		 * for DMA-error pkts.
+		 */
+		vq->async_desc_idx_split -= nr_err_dma;
+		while (nr_err_dma-- > 0) {
+			rte_pktmbuf_free(
+				pkts_info[slot_idx & (vq->size - 1)].mbuf);
+			slot_idx--;
+		}
+
+		/* recover available ring */
+		vq->last_avail_idx =
+			async_pkts_log[nr_async_pkts].last_avail_idx;
+	}
+
+	vq->async_pkts_idx += nr_async_pkts;
+
+	if (likely(vq->shadow_used_idx))
+		flush_shadow_used_ring_split(dev, vq);
+
+out:
+	if (nr_done_pkts < count && vq->async_pkts_inflight_n > 0) {
+		nr_async_cmpl_pkts = async_poll_dequeue_completed_split(dev, vq,
+					queue_id, &pkts[nr_done_pkts],
+					count - nr_done_pkts,
+					legacy_ol_flags);
+		nr_done_pkts += nr_async_cmpl_pkts;
+	}
+	if (likely(nr_done_pkts))
+		vhost_vring_call_split(dev, vq);
+
+	return nr_done_pkts;
+}
+
+__rte_noinline
+static uint16_t
+virtio_dev_tx_async_split_legacy(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t count)
+{
+	return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool,
+				pkts, count, true);
+}
+
+__rte_noinline
+static uint16_t
+virtio_dev_tx_async_split_compliant(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t count)
+{
+	return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool,
+				pkts, count, false);
+}
+
+uint16_t
+rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
+	int *nr_inflight)
+{
+	struct virtio_net *dev;
+	struct rte_mbuf *rarp_mbuf = NULL;
+	struct vhost_virtqueue *vq;
+	int16_t success = 1;
+
+	*nr_inflight = -1;
+
+	dev = get_device(vid);
+	if (!dev)
+		return 0;
+
+	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
+		VHOST_LOG_DATA(ERR,
+			"(%d) %s: built-in vhost net backend is disabled.\n",
+			dev->vid, __func__);
+		return 0;
+	}
+
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
+		VHOST_LOG_DATA(ERR,
+			"(%d) %s: invalid virtqueue idx %d.\n",
+			dev->vid, __func__, queue_id);
+		return 0;
+	}
+
+	vq = dev->virtqueue[queue_id];
+
+	if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
+		return 0;
+
+	if (unlikely(vq->enabled == 0)) {
+		count = 0;
+		goto out_access_unlock;
+	}
+
+	if (unlikely(!vq->async_registered)) {
+		VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
+			dev->vid, __func__, queue_id);
+		count = 0;
+		goto out_access_unlock;
+	}
+
+	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+		vhost_user_iotlb_rd_lock(vq);
+
+	if (unlikely(vq->access_ok == 0))
+		if (unlikely(vring_translate(dev, vq) < 0)) {
+			count = 0;
+			goto out_access_unlock;
+		}
+
+	/*
+	 * Construct a RARP broadcast packet, and inject it to the "pkts"
+	 * array, to looks like that guest actually send such packet.
+	 *
+	 * Check user_send_rarp() for more information.
+	 *
+	 * broadcast_rarp shares a cacheline in the virtio_net structure
+	 * with some fields that are accessed during enqueue and
+	 * __atomic_compare_exchange_n causes a write if performed compare
+	 * and exchange. This could result in false sharing between enqueue
+	 * and dequeue.
+	 *
+	 * Prevent unnecessary false sharing by reading broadcast_rarp first
+	 * and only performing compare and exchange if the read indicates it
+	 * is likely to be set.
+	 */
+	if (unlikely(__atomic_load_n(&dev->broadcast_rarp, __ATOMIC_ACQUIRE) &&
+			__atomic_compare_exchange_n(&dev->broadcast_rarp,
+			&success, 0, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED))) {
+
+		rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
+		if (rarp_mbuf == NULL) {
+			VHOST_LOG_DATA(ERR, "Failed to make RARP packet.\n");
+			count = 0;
+			goto out;
+		}
+		count -= 1;
+	}
+
+	if (unlikely(vq_is_packed(dev)))
+		return 0;
+
+	if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
+		count = virtio_dev_tx_async_split_legacy(dev, vq, queue_id,
+				mbuf_pool, pkts, count);
+	else
+		count = virtio_dev_tx_async_split_compliant(dev, vq, queue_id,
+				mbuf_pool, pkts, count);
+
+out:
+	*nr_inflight = vq->async_pkts_inflight_n;
+
+	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+		vhost_user_iotlb_rd_unlock(vq);
+
+out_access_unlock:
+	rte_spinlock_unlock(&vq->access_lock);
+
+	if (unlikely(rarp_mbuf != NULL)) {
+		/*
+		 * Inject it to the head of "pkts" array, so that switch's mac
+		 * learning table will get updated first.
+		 */
+		memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *));
+		pkts[0] = rarp_mbuf;
+		count += 1;
+	}
+
+	return count;
+}
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v6 4/4] examples/vhost: support vhost async dequeue data path
  2021-07-16 19:18 ` [dpdk-dev] [PATCH v6 0/4] support async dequeue for split ring Wenwu Ma
                     ` (2 preceding siblings ...)
  2021-07-16 19:18   ` [dpdk-dev] [PATCH v6 3/4] vhost: support async dequeue for split ring Wenwu Ma
@ 2021-07-16 19:18   ` Wenwu Ma
  3 siblings, 0 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-07-16 19:18 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, jiayu.hu, Wenwu Ma

This patch adds the vhost async dequeue data path to the vhost sample.
The vswitch can leverage IOAT to accelerate the vhost async dequeue
data path.
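
For illustration, the per-direction registration this patch adds in
vhost_async_channel_register() can be condensed into the sketch below
(the callbacks and flag helpers come from ioat.h in this series; the
dma_type check and error reporting of the real code are omitted, and
the VIRTIO_* defines are restated here so the sketch is self-contained):

#include <rte_vhost_async.h>
#include "ioat.h"	/* ioat_*_cb(), ASYNC_*_VHOST, get_async_flag_by_vid() */

#define VIRTIO_RXQ 0	/* guest RX ring = host enqueue side */
#define VIRTIO_TXQ 1	/* guest TX ring = host dequeue side */

/* One async channel per direction, gated by the per-socket --dmas flags. */
static int
register_async_channels(int vid)
{
	struct rte_vhost_async_channel_ops ops = {
		.transfer_data = ioat_transfer_data_cb,
		.check_completed_copies = ioat_check_completed_copies_cb,
	};
	struct rte_vhost_async_features f;
	int ret = 0;

	f.intval = 0;
	f.async_inorder = 1;
	f.async_threshold = 256;

	if (get_async_flag_by_vid(vid) & ASYNC_ENQUEUE_VHOST)
		ret |= rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
				f.intval, &ops);
	if (get_async_flag_by_vid(vid) & ASYNC_DEQUEUE_VHOST)
		ret |= rte_vhost_async_channel_register(vid, VIRTIO_TXQ,
				f.intval, &ops);
	return ret;
}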

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 doc/guides/sample_app_ug/vhost.rst |   9 +-
 examples/vhost/ioat.c              |  61 ++++++++++---
 examples/vhost/ioat.h              |  25 ++++++
 examples/vhost/main.c              | 140 ++++++++++++++++++++---------
 4 files changed, 177 insertions(+), 58 deletions(-)

diff --git a/doc/guides/sample_app_ug/vhost.rst b/doc/guides/sample_app_ug/vhost.rst
index 9afde9c7f5..63dcf181e1 100644
--- a/doc/guides/sample_app_ug/vhost.rst
+++ b/doc/guides/sample_app_ug/vhost.rst
@@ -169,9 +169,12 @@ demonstrates how to use the async vhost APIs. It's used in combination with dmas
 **--dmas**
 This parameter is used to specify the assigned DMA device of a vhost device.
 Async vhost-user net driver will be used if --dmas is set. For example
---dmas [txd0@00:04.0,txd1@00:04.1] means use DMA channel 00:04.0 for vhost
-device 0 enqueue operation and use DMA channel 00:04.1 for vhost device 1
-enqueue operation.
+--dmas [txd0@00:04.0,txd1@00:04.1,rxd0@00:04.2,rxd1@00:04.3] means use
+DMA channel 00:04.0/00:04.2 for vhost device 0 enqueue/dequeue operation
+and use DMA channel 00:04.1/00:04.3 for vhost device 1 enqueue/dequeue
+operation. The index of the device corresponds to the socket file in order,
+that means vhost device 0 is created through the first socket file, vhost
+device 1 is created through the second socket file, and so on.
 
 Common Issues
 -------------
diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c
index bf4e033bdb..8bd379d084 100644
--- a/examples/vhost/ioat.c
+++ b/examples/vhost/ioat.c
@@ -21,6 +21,8 @@ struct packet_tracker {
 
 struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
 
+int vid2socketid[MAX_VHOST_DEVICE];
+
 int
 open_ioat(const char *value)
 {
@@ -29,7 +31,7 @@ open_ioat(const char *value)
 	char *addrs = input;
 	char *ptrs[2];
 	char *start, *end, *substr;
-	int64_t vid, vring_id;
+	int64_t socketid, vring_id;
 	struct rte_ioat_rawdev_config config;
 	struct rte_rawdev_info info = { .dev_private = &config };
 	char name[32];
@@ -60,6 +62,7 @@ open_ioat(const char *value)
 		goto out;
 	}
 	while (i < args_nr) {
+		bool is_txd;
 		char *arg_temp = dma_arg[i];
 		uint8_t sub_nr;
 		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
@@ -68,27 +71,39 @@ open_ioat(const char *value)
 			goto out;
 		}
 
-		start = strstr(ptrs[0], "txd");
-		if (start == NULL) {
+		int async_flag;
+		char *txd, *rxd;
+		txd = strstr(ptrs[0], "txd");
+		rxd = strstr(ptrs[0], "rxd");
+		if (txd) {
+			is_txd = true;
+			start = txd;
+			async_flag = ASYNC_ENQUEUE_VHOST;
+		} else if (rxd) {
+			is_txd = false;
+			start = rxd;
+			async_flag = ASYNC_DEQUEUE_VHOST;
+		} else {
 			ret = -1;
 			goto out;
 		}
 
 		start += 3;
-		vid = strtol(start, &end, 0);
+		socketid = strtol(start, &end, 0);
 		if (end == start) {
 			ret = -1;
 			goto out;
 		}
 
-		vring_id = 0 + VIRTIO_RXQ;
+		vring_id = is_txd ? VIRTIO_RXQ : VIRTIO_TXQ;
+
 		if (rte_pci_addr_parse(ptrs[1],
-				&(dma_info + vid)->dmas[vring_id].addr) < 0) {
+			&(dma_info + socketid)->dmas[vring_id].addr) < 0) {
 			ret = -1;
 			goto out;
 		}
 
-		rte_pci_device_name(&(dma_info + vid)->dmas[vring_id].addr,
+		rte_pci_device_name(&(dma_info + socketid)->dmas[vring_id].addr,
 				name, sizeof(name));
 		dev_id = rte_rawdev_get_dev_id(name);
 		if (dev_id == (uint16_t)(-ENODEV) ||
@@ -103,8 +118,9 @@ open_ioat(const char *value)
 			goto out;
 		}
 
-		(dma_info + vid)->dmas[vring_id].dev_id = dev_id;
-		(dma_info + vid)->dmas[vring_id].is_valid = true;
+		(dma_info + socketid)->dmas[vring_id].dev_id = dev_id;
+		(dma_info + socketid)->dmas[vring_id].is_valid = true;
+		(dma_info + socketid)->async_flag |= async_flag;
 		config.ring_size = IOAT_RING_SIZE;
 		config.hdls_disable = true;
 		if (rte_rawdev_configure(dev_id, &info, sizeof(config)) < 0) {
@@ -126,13 +142,16 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data, uint16_t count)
 {
 	uint32_t i_desc;
-	uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2 + VIRTIO_RXQ].dev_id;
 	struct rte_vhost_iov_iter *src = NULL;
 	struct rte_vhost_iov_iter *dst = NULL;
 	unsigned long i_seg;
 	unsigned short mask = MAX_ENQUEUED_SIZE - 1;
-	unsigned short write = cb_tracker[dev_id].next_write;
 
+	if (queue_id >= MAX_RING_COUNT)
+		return -1;
+
+	uint16_t dev_id = dma_bind[vid2socketid[vid]].dmas[queue_id].dev_id;
+	unsigned short write = cb_tracker[dev_id].next_write;
 	if (!opaque_data) {
 		for (i_desc = 0; i_desc < count; i_desc++) {
 			src = descs[i_desc].src;
@@ -170,16 +189,16 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data,
 		uint16_t max_packets)
 {
-	if (!opaque_data) {
+	if (!opaque_data && (queue_id < MAX_RING_COUNT)) {
 		uintptr_t dump[255];
 		int n_seg;
 		unsigned short read, write;
 		unsigned short nb_packet = 0;
 		unsigned short mask = MAX_ENQUEUED_SIZE - 1;
 		unsigned short i;
+		uint16_t dev_id;
 
-		uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2
-				+ VIRTIO_RXQ].dev_id;
+		dev_id = dma_bind[vid2socketid[vid]].dmas[queue_id].dev_id;
 		n_seg = rte_ioat_completed_ops(dev_id, 255, NULL, NULL, dump, dump);
 		if (n_seg < 0) {
 			RTE_LOG(ERR,
@@ -215,4 +234,18 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 	return -1;
 }
 
+uint32_t get_async_flag_by_vid(int vid)
+{
+	return dma_bind[vid2socketid[vid]].async_flag;
+}
+
+uint32_t get_async_flag_by_socketid(int socketid)
+{
+	return dma_bind[socketid].async_flag;
+}
+
+void init_vid2socketid_array(int vid, int socketid)
+{
+	vid2socketid[vid] = socketid;
+}
 #endif /* RTE_RAW_IOAT */
diff --git a/examples/vhost/ioat.h b/examples/vhost/ioat.h
index 1aa28ed6a3..3a85c94c8a 100644
--- a/examples/vhost/ioat.h
+++ b/examples/vhost/ioat.h
@@ -12,6 +12,9 @@
 #define MAX_VHOST_DEVICE 1024
 #define IOAT_RING_SIZE 4096
 #define MAX_ENQUEUED_SIZE 4096
+#define MAX_RING_COUNT	2
+#define ASYNC_ENQUEUE_VHOST	1
+#define ASYNC_DEQUEUE_VHOST	2
 
 struct dma_info {
 	struct rte_pci_addr addr;
@@ -20,6 +23,7 @@ struct dma_info {
 };
 
 struct dma_for_vhost {
+	int async_flag;
 	struct dma_info dmas[RTE_MAX_QUEUES_PER_PORT * 2];
 	uint16_t nr;
 };
@@ -36,6 +40,10 @@ uint32_t
 ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data,
 		uint16_t max_packets);
+
+uint32_t get_async_flag_by_vid(int vid);
+uint32_t get_async_flag_by_socketid(int socketid);
+void init_vid2socketid_array(int vid, int socketid);
 #else
 static int open_ioat(const char *value __rte_unused)
 {
@@ -59,5 +67,22 @@ ioat_check_completed_copies_cb(int vid __rte_unused,
 {
 	return -1;
 }
+
+static uint32_t
+get_async_flag_by_vid(int vid __rte_unused)
+{
+	return 0;
+}
+
+static uint32_t
+get_async_flag_by_socketid(int socketid __rte_unused)
+{
+	return 0;
+}
+
+static void
+init_vid2socketid_array(int vid __rte_unused, int socketid __rte_unused)
+{
+}
 #endif
 #endif /* _IOAT_H_ */
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index aebdc3a566..314184b447 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -93,8 +93,6 @@ static int client_mode;
 
 static int builtin_net_driver;
 
-static int async_vhost_driver;
-
 static char *dma_type;
 
 /* Specify timeout (in useconds) between retries on RX. */
@@ -679,7 +677,6 @@ us_vhost_parse_args(int argc, char **argv)
 				us_vhost_usage(prgname);
 				return -1;
 			}
-			async_vhost_driver = 1;
 			break;
 
 		case OPT_CLIENT_NUM:
@@ -897,7 +894,7 @@ drain_vhost(struct vhost_dev *vdev)
 				__ATOMIC_SEQ_CST);
 	}
 
-	if (!async_vhost_driver)
+	if ((get_async_flag_by_vid(vdev->vid) & ASYNC_ENQUEUE_VHOST) == 0)
 		free_pkts(m, nr_xmit);
 }
 
@@ -1237,10 +1234,19 @@ drain_eth_rx(struct vhost_dev *vdev)
 				__ATOMIC_SEQ_CST);
 	}
 
-	if (!async_vhost_driver)
+	if ((get_async_flag_by_vid(vdev->vid) & ASYNC_ENQUEUE_VHOST) == 0)
 		free_pkts(pkts, rx_count);
 }
 
+uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+				struct rte_mempool *mbuf_pool,
+				struct rte_mbuf **pkts, uint16_t count)
+{
+	int nr_inflight;
+	return rte_vhost_async_try_dequeue_burst(dev->vid, queue_id,
+			mbuf_pool, pkts, count, &nr_inflight);
+}
+
 uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 			struct rte_mempool *mbuf_pool,
 			struct rte_mbuf **pkts, uint16_t count)
@@ -1392,12 +1398,90 @@ destroy_device(int vid)
 		"(%d) device has been removed from data core\n",
 		vdev->vid);
 
-	if (async_vhost_driver)
+	if (get_async_flag_by_vid(vid) & ASYNC_ENQUEUE_VHOST)
 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
+	if (get_async_flag_by_vid(vid) & ASYNC_DEQUEUE_VHOST)
+		rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
 
 	rte_free(vdev);
 }
 
+static int
+get_socketid_by_vid(int vid)
+{
+	int i;
+	char ifname[PATH_MAX];
+	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
+
+	for (i = 0; i < nb_sockets; i++) {
+		char *file = socket_files + i * PATH_MAX;
+		if (strcmp(file, ifname) == 0)
+			return i;
+	}
+
+	return -1;
+}
+
+static int
+init_vhost_queue_ops(int vid)
+{
+	int socketid = get_socketid_by_vid(vid);
+	if (socketid == -1)
+		return -1;
+
+	init_vid2socketid_array(vid, socketid);
+	if (builtin_net_driver) {
+		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
+		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
+	} else {
+		if (get_async_flag_by_vid(vid) & ASYNC_ENQUEUE_VHOST) {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+						async_enqueue_pkts;
+		} else {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+						sync_enqueue_pkts;
+		}
+
+		if (get_async_flag_by_vid(vid) & ASYNC_DEQUEUE_VHOST) {
+			vdev_queue_ops[vid].dequeue_pkt_burst =
+						async_dequeue_pkts;
+		} else {
+			vdev_queue_ops[vid].dequeue_pkt_burst =
+						sync_dequeue_pkts;
+		}
+	}
+
+	return 0;
+}
+
+static int
+vhost_async_channel_register(int vid)
+{
+	int ret = 0;
+	struct rte_vhost_async_features f;
+	struct rte_vhost_async_channel_ops channel_ops;
+
+	if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
+		channel_ops.transfer_data = ioat_transfer_data_cb;
+		channel_ops.check_completed_copies =
+			ioat_check_completed_copies_cb;
+
+		f.async_inorder = 1;
+		f.async_threshold = 256;
+
+		if (get_async_flag_by_vid(vid) & ASYNC_ENQUEUE_VHOST) {
+			ret |= rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
+					f.intval, &channel_ops);
+		}
+		if (get_async_flag_by_vid(vid) & ASYNC_DEQUEUE_VHOST) {
+			ret |= rte_vhost_async_channel_register(vid, VIRTIO_TXQ,
+					f.intval, &channel_ops);
+		}
+	}
+
+	return ret;
+}
+
 /*
  * A new device is added to a data core. First the device is added to the main linked list
  * and then allocated to a specific data core.
@@ -1431,20 +1515,8 @@ new_device(int vid)
 		}
 	}
 
-	if (builtin_net_driver) {
-		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
-		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
-	} else {
-		if (async_vhost_driver) {
-			vdev_queue_ops[vid].enqueue_pkt_burst =
-							async_enqueue_pkts;
-		} else {
-			vdev_queue_ops[vid].enqueue_pkt_burst =
-							sync_enqueue_pkts;
-		}
-
-		vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
-	}
+	if (init_vhost_queue_ops(vid) != 0)
+		return -1;
 
 	if (builtin_net_driver)
 		vs_vhost_net_setup(vdev);
@@ -1473,28 +1545,13 @@ new_device(int vid)
 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
 
+	int ret = vhost_async_channel_register(vid);
+
 	RTE_LOG(INFO, VHOST_DATA,
 		"(%d) device has been added to data core %d\n",
 		vid, vdev->coreid);
 
-	if (async_vhost_driver) {
-		struct rte_vhost_async_features f;
-		struct rte_vhost_async_channel_ops channel_ops;
-
-		if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
-			channel_ops.transfer_data = ioat_transfer_data_cb;
-			channel_ops.check_completed_copies =
-				ioat_check_completed_copies_cb;
-
-			f.async_inorder = 1;
-			f.async_threshold = 256;
-
-			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
-				f.intval, &channel_ops);
-		}
-	}
-
-	return 0;
+	return ret;
 }
 
 /*
@@ -1735,10 +1792,11 @@ main(int argc, char *argv[])
 	for (i = 0; i < nb_sockets; i++) {
 		char *file = socket_files + i * PATH_MAX;
 
-		if (async_vhost_driver)
-			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
+		uint64_t flag = flags;
+		if (get_async_flag_by_socketid(i) != 0)
+			flag |= RTE_VHOST_USER_ASYNC_COPY;
 
-		ret = rte_vhost_driver_register(file, flags);
+		ret = rte_vhost_driver_register(file, flag);
 		if (ret != 0) {
 			unregister_drivers(i);
 			rte_exit(EXIT_FAILURE,
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [dpdk-dev] [PATCH v7 0/4] support async dequeue for split ring
  2021-07-21 14:20 ` [dpdk-dev] [PATCH v7 0/4] support async dequeue for split ring Wenwu Ma
@ 2021-07-21  2:31   ` Wang, Yinan
  2021-07-21 14:20   ` [dpdk-dev] [PATCH v7 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 50+ messages in thread
From: Wang, Yinan @ 2021-07-21  2:31 UTC (permalink / raw)
  To: Ma, WenwuX, dev
  Cc: maxime.coquelin, Xia, Chenbo, Jiang, Cheng1, Hu, Jiayu, Ma, WenwuX

Tested-by: Yinan Wang <yinan.wang@intel.com>

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Wenwu Ma
> Sent: 2021?7?21? 22:21
> To: dev@dpdk.org
> Cc: maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>;
> Jiang, Cheng1 <cheng1.jiang@intel.com>; Hu, Jiayu <jiayu.hu@intel.com>;
> Ma, WenwuX <wenwux.ma@intel.com>
> Subject: [dpdk-dev] [PATCH v7 0/4] support async dequeue for split ring
> 
> This patch implements asynchronous dequeue data path for split ring.
> A new asynchronous dequeue function is introduced. With this function,
> the application can try to receive packets from the guest with offloading
> large copies to the DMA engine, thus saving precious CPU cycles.
> 
> v7:
> - Add Reviewed-by to comments in examples/vhost.
> 
> v6:
> - Allocate and free packets in bulk.
> - struct async_inflight_info aligned to 32 bits.
> - Change ASYNC_RX_VHOST to ASYNV_ENQUEUE_VHOST and
>   ASYNC_TX_VHOST to ASYNV_DEQUEUE_VHOST.
> 
> v5:
> - DMA address use IOVA instead of VA.
> 
> v4:
> - Fix wrong packet index issue in async dequeue improve
>   the performance of small packet copies.
> 
> v3:
> - Fix compilation warning and error in arm platform.
> - Restore the removed function virtio_dev_pktmbuf_alloc,
>   async dequeue allocate packets in separate.
> 
> v2:
> - Refactor vhost datapath as preliminary patch for this series.
> - The change of using new API in examples/vhost is put into a
>   dedicated patch.
> - Check queue_id value before using it.
> - Async dequeue performance enhancement. 160% performance
> improvement
>   for v2 vs. v1.
> - Async dequeue API name change from rte_vhost_try_dequeue_burst to
>   rte_vhost_async_try_dequeue_burst.
> - The completed package updates the used ring directly.
> 
> Wenwu Ma (3):
>   examples/vhost: refactor vhost enqueue and dequeue datapaths
>   examples/vhost: use a new API to query remaining ring space
>   examples/vhost: support vhost async dequeue data path
> 
> Yuan Wang (1):
>   vhost: support async dequeue for split ring
> 
>  doc/guides/prog_guide/vhost_lib.rst |   9 +
>  doc/guides/sample_app_ug/vhost.rst  |   9 +-
>  examples/vhost/ioat.c               |  67 +++-
>  examples/vhost/ioat.h               |  25 ++
>  examples/vhost/main.c               | 224 +++++++----
>  examples/vhost/main.h               |  33 +-
>  examples/vhost/virtio_net.c         |  16 +-
>  lib/vhost/rte_vhost_async.h         |  39 +-
>  lib/vhost/version.map               |   3 +
>  lib/vhost/virtio_net.c              | 586 ++++++++++++++++++++++++++++
>  10 files changed, 904 insertions(+), 107 deletions(-)
> 
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v7 0/4] support async dequeue for split ring
  2021-06-02  8:31 [dpdk-dev] [PATCH 0/1] lib/vhost: support async dequeue for split ring Yuan Wang
                   ` (5 preceding siblings ...)
  2021-07-16 19:18 ` [dpdk-dev] [PATCH v6 0/4] support async dequeue for split ring Wenwu Ma
@ 2021-07-21 14:20 ` Wenwu Ma
  2021-07-21  2:31   ` Wang, Yinan
                     ` (4 more replies)
  6 siblings, 5 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-07-21 14:20 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, jiayu.hu, Wenwu Ma

This patch implements asynchronous dequeue data path for split ring.
A new asynchronous dequeue function is introduced. With this function,
the application can try to receive packets from the guest with offloading
large copies to the DMA engine, thus saving precious CPU cycles.

v7:
- Add Reviewed-by tags to the examples/vhost patches.

v6:
- Allocate and free packets in bulk.
- struct async_inflight_info aligned to 32 bits.
- Change ASYNC_RX_VHOST to ASYNC_ENQUEUE_VHOST and
  ASYNC_TX_VHOST to ASYNC_DEQUEUE_VHOST.

v5:
- DMA address use IOVA instead of VA.

v4:
- Fix wrong packet index issue in async dequeue to improve
  the performance of small packet copies.

v3:
- Fix compilation warning and error in arm platform.
- Restore the removed function virtio_dev_pktmbuf_alloc;
  async dequeue allocates packets separately.

v2:
- Refactor vhost datapath as preliminary patch for this series.
- The change to use the new API in examples/vhost is put into a
  dedicated patch.
- Check queue_id value before using it.
- Async dequeue performance enhancement. 160% performance improvement
  for v2 vs. v1.
- Async dequeue API name change from rte_vhost_try_dequeue_burst to
  rte_vhost_async_try_dequeue_burst.
- The completed packets update the used ring directly.

Wenwu Ma (3):
  examples/vhost: refactor vhost enqueue and dequeue datapaths
  examples/vhost: use a new API to query remaining ring space
  examples/vhost: support vhost async dequeue data path

Yuan Wang (1):
  vhost: support async dequeue for split ring

 doc/guides/prog_guide/vhost_lib.rst |   9 +
 doc/guides/sample_app_ug/vhost.rst  |   9 +-
 examples/vhost/ioat.c               |  67 +++-
 examples/vhost/ioat.h               |  25 ++
 examples/vhost/main.c               | 224 +++++++----
 examples/vhost/main.h               |  33 +-
 examples/vhost/virtio_net.c         |  16 +-
 lib/vhost/rte_vhost_async.h         |  39 +-
 lib/vhost/version.map               |   3 +
 lib/vhost/virtio_net.c              | 586 ++++++++++++++++++++++++++++
 10 files changed, 904 insertions(+), 107 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v7 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths
  2021-07-21 14:20 ` [dpdk-dev] [PATCH v7 0/4] support async dequeue for split ring Wenwu Ma
  2021-07-21  2:31   ` Wang, Yinan
@ 2021-07-21 14:20   ` Wenwu Ma
  2021-07-21 14:20   ` [dpdk-dev] [PATCH v7 2/4] examples/vhost: use a new API to query remaining ring space Wenwu Ma
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-07-21 14:20 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, jiayu.hu, Wenwu Ma

Previously, the data path checked a flag to decide which
enqueue/dequeue functions to call.

Now, we use an ops table that is initialized when the vhost device
is created, so the data path can call the ops directly without any
further flag checks.
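
For illustration, a stripped-down standalone sketch of the dispatch
pattern (the names here are hypothetical stand-ins; in the patch the
table is vdev_queue_ops[] in examples/vhost/main.c):

#include <stdint.h>

/* Hypothetical stand-ins for struct vhost_dev / struct rte_mbuf. */
struct dev;
struct pkt;

typedef uint16_t (*enqueue_burst_t)(struct dev *d, uint16_t queue_id,
				struct pkt **pkts, uint32_t count);

struct queue_ops {
	enqueue_burst_t enqueue_pkt_burst;
};

#define MAX_DEVICES 1024
static struct queue_ops ops_tbl[MAX_DEVICES];

/* Chosen once, when the device is created... */
static void
init_ops(int vid, int use_async,
		enqueue_burst_t async_fn, enqueue_burst_t sync_fn)
{
	ops_tbl[vid].enqueue_pkt_burst = use_async ? async_fn : sync_fn;
}

/* ...then the data path calls through the table, with no flag check. */
static uint16_t
enqueue_burst(int vid, struct dev *d, struct pkt **pkts, uint32_t count)
{
	return ops_tbl[vid].enqueue_pkt_burst(d, 0, pkts, count);
}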

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
 examples/vhost/main.c       | 112 ++++++++++++++++++++----------------
 examples/vhost/main.h       |  33 +++++++++--
 examples/vhost/virtio_net.c |  16 +++++-
 3 files changed, 105 insertions(+), 56 deletions(-)

diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index d2179eadb9..aebdc3a566 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -106,6 +106,8 @@ static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
 static char *socket_files;
 static int nb_sockets;
 
+static struct vhost_queue_ops vdev_queue_ops[MAX_VHOST_DEVICE];
+
 /* empty vmdq configuration structure. Filled in programatically */
 static struct rte_eth_conf vmdq_conf_default = {
 	.rxmode = {
@@ -885,27 +887,8 @@ drain_vhost(struct vhost_dev *vdev)
 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
 
-	if (builtin_net_driver) {
-		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
-	} else if (async_vhost_driver) {
-		uint32_t cpu_cpl_nr = 0;
-		uint16_t enqueue_fail = 0;
-		struct rte_mbuf *m_cpu_cpl[nr_xmit];
-
-		complete_async_pkts(vdev);
-		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
-					m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
-
-		if (cpu_cpl_nr)
-			free_pkts(m_cpu_cpl, cpu_cpl_nr);
-
-		enqueue_fail = nr_xmit - ret;
-		if (enqueue_fail)
-			free_pkts(&m[ret], nr_xmit - ret);
-	} else {
-		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
-						m, nr_xmit);
-	}
+	ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
+					VIRTIO_RXQ, m, nr_xmit);
 
 	if (enable_stats) {
 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
@@ -1184,6 +1167,36 @@ drain_mbuf_table(struct mbuf_table *tx_q)
 	}
 }
 
+uint16_t
+async_enqueue_pkts(struct vhost_dev *vdev, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint32_t rx_count)
+{
+	uint16_t enqueue_count;
+	uint32_t cpu_cpl_nr = 0;
+	uint16_t enqueue_fail = 0;
+	struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
+
+	complete_async_pkts(vdev);
+	enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
+				queue_id, pkts, rx_count,
+				m_cpu_cpl, &cpu_cpl_nr);
+	if (cpu_cpl_nr)
+		free_pkts(m_cpu_cpl, cpu_cpl_nr);
+
+	enqueue_fail = rx_count - enqueue_count;
+	if (enqueue_fail)
+		free_pkts(&pkts[enqueue_count], enqueue_fail);
+
+	return enqueue_count;
+}
+
+uint16_t
+sync_enqueue_pkts(struct vhost_dev *vdev, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint32_t rx_count)
+{
+	return rte_vhost_enqueue_burst(vdev->vid, queue_id, pkts, rx_count);
+}
+
 static __rte_always_inline void
 drain_eth_rx(struct vhost_dev *vdev)
 {
@@ -1214,29 +1227,8 @@ drain_eth_rx(struct vhost_dev *vdev)
 		}
 	}
 
-	if (builtin_net_driver) {
-		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
-						pkts, rx_count);
-	} else if (async_vhost_driver) {
-		uint32_t cpu_cpl_nr = 0;
-		uint16_t enqueue_fail = 0;
-		struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
-
-		complete_async_pkts(vdev);
-		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
-					VIRTIO_RXQ, pkts, rx_count,
-					m_cpu_cpl, &cpu_cpl_nr);
-		if (cpu_cpl_nr)
-			free_pkts(m_cpu_cpl, cpu_cpl_nr);
-
-		enqueue_fail = rx_count - enqueue_count;
-		if (enqueue_fail)
-			free_pkts(&pkts[enqueue_count], enqueue_fail);
-
-	} else {
-		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
-						pkts, rx_count);
-	}
+	enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
+						VIRTIO_RXQ, pkts, rx_count);
 
 	if (enable_stats) {
 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
@@ -1249,6 +1241,14 @@ drain_eth_rx(struct vhost_dev *vdev)
 		free_pkts(pkts, rx_count);
 }
 
+uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count)
+{
+	return rte_vhost_dequeue_burst(dev->vid, queue_id,
+					mbuf_pool, pkts, count);
+}
+
 static __rte_always_inline void
 drain_virtio_tx(struct vhost_dev *vdev)
 {
@@ -1256,13 +1256,8 @@ drain_virtio_tx(struct vhost_dev *vdev)
 	uint16_t count;
 	uint16_t i;
 
-	if (builtin_net_driver) {
-		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
-					pkts, MAX_PKT_BURST);
-	} else {
-		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
-					mbuf_pool, pkts, MAX_PKT_BURST);
-	}
+	count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
+				VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);
 
 	/* setup VMDq for the first packet */
 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
@@ -1436,6 +1431,21 @@ new_device(int vid)
 		}
 	}
 
+	if (builtin_net_driver) {
+		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
+		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
+	} else {
+		if (async_vhost_driver) {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+							async_enqueue_pkts;
+		} else {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+							sync_enqueue_pkts;
+		}
+
+		vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
+	}
+
 	if (builtin_net_driver)
 		vs_vhost_net_setup(vdev);
 
diff --git a/examples/vhost/main.h b/examples/vhost/main.h
index 0ccdce4b4a..7cd8a11a45 100644
--- a/examples/vhost/main.h
+++ b/examples/vhost/main.h
@@ -60,6 +60,19 @@ struct vhost_dev {
 	struct vhost_queue queues[MAX_QUEUE_PAIRS * 2];
 } __rte_cache_aligned;
 
+typedef uint16_t (*vhost_enqueue_burst_t)(struct vhost_dev *dev,
+			uint16_t queue_id, struct rte_mbuf **pkts,
+			uint32_t count);
+
+typedef uint16_t (*vhost_dequeue_burst_t)(struct vhost_dev *dev,
+			uint16_t queue_id, struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
+
+struct vhost_queue_ops {
+	vhost_enqueue_burst_t enqueue_pkt_burst;
+	vhost_dequeue_burst_t dequeue_pkt_burst;
+};
+
 TAILQ_HEAD(vhost_dev_tailq_list, vhost_dev);
 
 
@@ -84,9 +97,21 @@ struct lcore_info {
 void vs_vhost_net_setup(struct vhost_dev *dev);
 void vs_vhost_net_remove(struct vhost_dev *dev);
 uint16_t vs_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mbuf **pkts, uint32_t count);
+
+uint16_t builtin_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mbuf **pkts, uint32_t count);
+uint16_t builtin_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
+uint16_t sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 			 struct rte_mbuf **pkts, uint32_t count);
-
-uint16_t vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
-			 struct rte_mempool *mbuf_pool,
-			 struct rte_mbuf **pkts, uint16_t count);
+uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
+uint16_t async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			 struct rte_mbuf **pkts, uint32_t count);
+uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+			struct rte_mempool *mbuf_pool,
+			struct rte_mbuf **pkts, uint16_t count);
 #endif /* _MAIN_H_ */
diff --git a/examples/vhost/virtio_net.c b/examples/vhost/virtio_net.c
index 9064fc3a82..2432a96566 100644
--- a/examples/vhost/virtio_net.c
+++ b/examples/vhost/virtio_net.c
@@ -238,6 +238,13 @@ vs_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 	return count;
 }
 
+uint16_t
+builtin_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint32_t count)
+{
+	return vs_enqueue_pkts(dev, queue_id, pkts, count);
+}
+
 static __rte_always_inline int
 dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 	    struct rte_mbuf *m, uint16_t desc_idx,
@@ -363,7 +370,7 @@ dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 	return 0;
 }
 
-uint16_t
+static uint16_t
 vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
 {
@@ -440,3 +447,10 @@ vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 
 	return i;
 }
+
+uint16_t
+builtin_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+{
+	return vs_dequeue_pkts(dev, queue_id, mbuf_pool, pkts, count);
+}
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v7 2/4] examples/vhost: use a new API to query remaining ring space
  2021-07-21 14:20 ` [dpdk-dev] [PATCH v7 0/4] support async dequeue for split ring Wenwu Ma
  2021-07-21  2:31   ` Wang, Yinan
  2021-07-21 14:20   ` [dpdk-dev] [PATCH v7 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
@ 2021-07-21 14:20   ` Wenwu Ma
  2021-07-21 14:20   ` [dpdk-dev] [PATCH v7 3/4] vhost: support async dequeue for split ring Wenwu Ma
  2021-07-21 14:20   ` [dpdk-dev] [PATCH v7 4/4] examples/vhost: support vhost async dequeue data path Wenwu Ma
  4 siblings, 0 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-07-21 14:20 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, jiayu.hu, Wenwu Ma

A new API for querying the remaining descriptor ring capacity,
rte_ioat_burst_capacity(), is available, so use it instead of
tracking the ring space manually in the IOAT callbacks.
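
A minimal sketch of the pattern, assuming the IOAT rawdev header
exposes rte_ioat_burst_capacity() as used in the diff below:

#include <rte_ioat_rawdev.h>

/* Ask the driver for the remaining ring space instead of tracking it
 * by hand; submit a packet only if all of its segments will fit. */
static int
ioat_has_room(int dev_id, unsigned int nr_segs)
{
	return rte_ioat_burst_capacity(dev_id) >= nr_segs;
}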

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
 examples/vhost/ioat.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c
index 2a2c2d7202..bf4e033bdb 100644
--- a/examples/vhost/ioat.c
+++ b/examples/vhost/ioat.c
@@ -17,7 +17,6 @@ struct packet_tracker {
 	unsigned short next_read;
 	unsigned short next_write;
 	unsigned short last_remain;
-	unsigned short ioat_space;
 };
 
 struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
@@ -113,7 +112,6 @@ open_ioat(const char *value)
 			goto out;
 		}
 		rte_rawdev_start(dev_id);
-		cb_tracker[dev_id].ioat_space = IOAT_RING_SIZE - 1;
 		dma_info->nr++;
 		i++;
 	}
@@ -140,7 +138,7 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
 			src = descs[i_desc].src;
 			dst = descs[i_desc].dst;
 			i_seg = 0;
-			if (cb_tracker[dev_id].ioat_space < src->nr_segs)
+			if (rte_ioat_burst_capacity(dev_id) < src->nr_segs)
 				break;
 			while (i_seg < src->nr_segs) {
 				rte_ioat_enqueue_copy(dev_id,
@@ -155,7 +153,6 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
 			}
 			write &= mask;
 			cb_tracker[dev_id].size_track[write] = src->nr_segs;
-			cb_tracker[dev_id].ioat_space -= src->nr_segs;
 			write++;
 		}
 	} else {
@@ -194,7 +191,6 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 		if (n_seg == 0)
 			return 0;
 
-		cb_tracker[dev_id].ioat_space += n_seg;
 		n_seg += cb_tracker[dev_id].last_remain;
 
 		read = cb_tracker[dev_id].next_read;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v7 3/4] vhost: support async dequeue for split ring
  2021-07-21 14:20 ` [dpdk-dev] [PATCH v7 0/4] support async dequeue for split ring Wenwu Ma
                     ` (2 preceding siblings ...)
  2021-07-21 14:20   ` [dpdk-dev] [PATCH v7 2/4] examples/vhost: use a new API to query remaining ring space Wenwu Ma
@ 2021-07-21 14:20   ` Wenwu Ma
  2021-07-21 14:20   ` [dpdk-dev] [PATCH v7 4/4] examples/vhost: support vhost async dequeue data path Wenwu Ma
  4 siblings, 0 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-07-21 14:20 UTC (permalink / raw)
  To: dev
  Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, jiayu.hu, Yuan Wang, Wenwu Ma

From: Yuan Wang <yuanx.wang@intel.com>

This patch implements asynchronous dequeue data path for split ring.
A new asynchronous dequeue function is introduced. With this function,
the application can try to receive packets from the guest with
offloading large copies to the async channel, thus saving precious CPU
cycles.
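
To illustrate the in-flight semantics, a hypothetical teardown-time
drain loop; everything except the API call itself is a placeholder,
and it relies on nr_inflight dropping to zero once the async channel
has completed all submitted copies:

#include <rte_mbuf.h>
#include <rte_vhost_async.h>

#define VIRTIO_TXQ 1	/* guest TX ring = host dequeue side */
#define DRAIN_BURST 32	/* hypothetical burst size */

/* Keep calling the API until it reports no more in-flight packets
 * (or an error), freeing whatever completes in the meantime. */
static void
drain_async_dequeue(int vid, struct rte_mempool *mbuf_pool)
{
	struct rte_mbuf *pkts[DRAIN_BURST];
	int nr_inflight = 0;
	uint16_t n;

	do {
		n = rte_vhost_async_try_dequeue_burst(vid, VIRTIO_TXQ,
				mbuf_pool, pkts, DRAIN_BURST, &nr_inflight);
		rte_pktmbuf_free_bulk(pkts, n);
	} while (nr_inflight > 0);
}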

Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
---
 doc/guides/prog_guide/vhost_lib.rst |   9 +
 lib/vhost/rte_vhost_async.h         |  39 +-
 lib/vhost/version.map               |   3 +
 lib/vhost/virtio_net.c              | 586 ++++++++++++++++++++++++++++
 4 files changed, 635 insertions(+), 2 deletions(-)

diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
index d18fb98910..bf90a2663b 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -281,6 +281,15 @@ The following is an overview of some key Vhost API functions:
   Poll enqueue completion status from async data path. Completed packets
   are returned to applications through ``pkts``.
 
+* ``rte_vhost_async_try_dequeue_burst(vid, queue_id, mbuf_pool, pkts, count, nr_inflight)``
+
+  This function tries to receive packets from the guest, offloading
+  large copies to the async channel. Packets whose copies have completed
+  are returned in ``pkts``. Packets whose copies have been submitted to
+  the async channel but have not yet completed are called "in-flight
+  packets". This function does not return in-flight packets until their
+  copies are completed by the async channel.
+
 Vhost-user Implementations
 --------------------------
 
diff --git a/lib/vhost/rte_vhost_async.h b/lib/vhost/rte_vhost_async.h
index 6faa31f5ad..04d7588217 100644
--- a/lib/vhost/rte_vhost_async.h
+++ b/lib/vhost/rte_vhost_async.h
@@ -83,12 +83,20 @@ struct rte_vhost_async_channel_ops {
 		uint16_t max_packets);
 };
 
+struct async_nethdr {
+	struct virtio_net_hdr hdr;
+	bool valid;
+};
+
 /**
- * inflight async packet information
+ * in-flight async packet information
  */
 struct async_inflight_info {
 	struct rte_mbuf *mbuf;
-	uint16_t descs; /* num of descs inflight */
+	union {
+		uint16_t descs; /* num of descs in-flight */
+		struct async_nethdr nethdr;
+	};
 	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
 };
 
@@ -193,4 +201,31 @@ __rte_experimental
 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 		struct rte_mbuf **pkts, uint16_t count);
 
+/**
+ * This function tries to receive packets from the guest, offloading
+ * large copies to the async channel. Packets whose copies have completed
+ * are returned in "pkts". Packets whose copies have been submitted to the
+ * async channel but have not yet completed are called "in-flight packets".
+ * This function does not return in-flight packets until their copies are
+ * completed by the async channel.
+ *
+ * @param vid
+ *  id of vhost device to dequeue data
+ * @param queue_id
+ *  queue id to dequeue data
+ * @param pkts
+ *  blank array to keep successfully dequeued packets
+ * @param count
+ *  size of the packet array
+ * @param nr_inflight
+ *  the amount of in-flight packets. If error occurred, its value is set to -1.
+ * @return
+ *  num of successfully dequeued packets
+ */
+__rte_experimental
+uint16_t
+rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
+	int *nr_inflight);
+
 #endif /* _RTE_VHOST_ASYNC_H_ */
diff --git a/lib/vhost/version.map b/lib/vhost/version.map
index 9103a23cd4..a320f889cd 100644
--- a/lib/vhost/version.map
+++ b/lib/vhost/version.map
@@ -79,4 +79,7 @@ EXPERIMENTAL {
 
 	# added in 21.05
 	rte_vhost_get_negotiated_protocol_features;
+
+	# added in 21.08
+	rte_vhost_async_try_dequeue_burst;
 };
diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index b93482587c..58317d7b75 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -3147,3 +3147,589 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 
 	return count;
 }
+
+static __rte_always_inline int
+async_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		  struct buf_vector *buf_vec, uint16_t nr_vec,
+		  struct rte_mbuf *m, struct rte_mempool *mbuf_pool,
+		  struct iovec *src_iovec, struct iovec *dst_iovec,
+		  struct rte_vhost_iov_iter *src_it,
+		  struct rte_vhost_iov_iter *dst_it,
+		  struct async_nethdr *nethdr,
+		  bool legacy_ol_flags)
+{
+	uint64_t buf_addr, buf_iova;
+	uint64_t mapped_len;
+	uint32_t tlen = 0;
+	uint32_t buf_avail, buf_offset, buf_len;
+	uint32_t mbuf_avail, mbuf_offset;
+	uint32_t cpy_len, cpy_threshold;
+	/* A counter to avoid desc dead loop chain */
+	uint16_t vec_idx = 0;
+	int tvec_idx = 0;
+	struct rte_mbuf *cur = m, *prev = m;
+	struct virtio_net_hdr tmp_hdr;
+	struct virtio_net_hdr *hdr = NULL;
+	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
+
+	buf_addr = buf_vec[vec_idx].buf_addr;
+	buf_len = buf_vec[vec_idx].buf_len;
+	buf_iova = buf_vec[vec_idx].buf_iova;
+
+	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1))
+		return -1;
+
+	cpy_threshold = vq->async_threshold;
+
+	if (virtio_net_with_host_offload(dev)) {
+		if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
+			/*
+			 * No luck, the virtio-net header doesn't fit
+			 * in a contiguous virtual area.
+			 */
+			copy_vnet_hdr_from_desc(&tmp_hdr, buf_vec);
+			hdr = &tmp_hdr;
+		} else {
+			hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
+		}
+	}
+
+	/*
+	 * A virtio driver normally uses at least 2 desc buffers
+	 * for Tx: the first for storing the header, and others
+	 * for storing the data.
+	 */
+	if (unlikely(buf_len < dev->vhost_hlen)) {
+		buf_offset = dev->vhost_hlen - buf_len;
+		vec_idx++;
+		buf_addr = buf_vec[vec_idx].buf_addr;
+		buf_len = buf_vec[vec_idx].buf_len;
+		buf_avail  = buf_len - buf_offset;
+	} else if (buf_len == dev->vhost_hlen) {
+		if (unlikely(++vec_idx >= nr_vec))
+			return -1;
+		buf_addr = buf_vec[vec_idx].buf_addr;
+		buf_len = buf_vec[vec_idx].buf_len;
+
+		buf_offset = 0;
+		buf_avail = buf_len;
+	} else {
+		buf_offset = dev->vhost_hlen;
+		buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
+	}
+
+	PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
+			(uint32_t)buf_avail, 0);
+
+	mbuf_offset = 0;
+	mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
+	while (1) {
+		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
+
+		while (cpy_len && cpy_len >= cpy_threshold) {
+			void *hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
+						buf_iova + buf_offset, cpy_len,
+						&mapped_len);
+
+			if (unlikely(!hpa || mapped_len < cpy_threshold))
+				break;
+
+			async_fill_vec(src_iovec + tvec_idx, hpa,
+				(size_t)mapped_len);
+			async_fill_vec(dst_iovec + tvec_idx,
+				(void *)(uintptr_t)rte_pktmbuf_iova_offset(cur,
+							mbuf_offset),
+				(size_t)mapped_len);
+
+			tvec_idx++;
+			tlen += (uint32_t)mapped_len;
+			cpy_len -= (uint32_t)mapped_len;
+			mbuf_avail -= (uint32_t)mapped_len;
+			mbuf_offset += (uint32_t)mapped_len;
+			buf_avail -= (uint32_t)mapped_len;
+			buf_offset += (uint32_t)mapped_len;
+		}
+
+		if (cpy_len) {
+			if (vq->batch_copy_nb_elems >= vq->size ||
+				(hdr && cur == m)) {
+				rte_memcpy(
+					rte_pktmbuf_mtod_offset(cur, void *,
+							mbuf_offset),
+					(void *)((uintptr_t)(buf_addr +
+								buf_offset)),
+					cpy_len);
+			} else {
+				batch_copy[vq->batch_copy_nb_elems].dst =
+					rte_pktmbuf_mtod_offset(cur, void *,
+							mbuf_offset);
+				batch_copy[vq->batch_copy_nb_elems].src =
+					(void *)((uintptr_t)(buf_addr +
+								buf_offset));
+				batch_copy[vq->batch_copy_nb_elems].len =
+					cpy_len;
+				vq->batch_copy_nb_elems++;
+			}
+
+			mbuf_avail  -= cpy_len;
+			mbuf_offset += cpy_len;
+			buf_avail -= cpy_len;
+			buf_offset += cpy_len;
+		}
+
+		/* This buf has reached its end, get the next one */
+		if (buf_avail == 0) {
+			if (++vec_idx >= nr_vec)
+				break;
+
+			buf_addr = buf_vec[vec_idx].buf_addr;
+			buf_len = buf_vec[vec_idx].buf_len;
+
+			buf_offset = 0;
+			buf_avail = buf_len;
+
+			PRINT_PACKET(dev, (uintptr_t)buf_addr,
+					(uint32_t)buf_avail, 0);
+		}
+
+		/*
+		 * This mbuf has reached its end, allocate a new one
+		 * to hold more data.
+		 */
+		if (mbuf_avail == 0) {
+			cur = rte_pktmbuf_alloc(mbuf_pool);
+			if (unlikely(cur == NULL)) {
+				VHOST_LOG_DATA(ERR, "Failed to "
+					"allocate memory for mbuf.\n");
+				return -1;
+			}
+
+			prev->next = cur;
+			prev->data_len = mbuf_offset;
+			m->nb_segs += 1;
+			m->pkt_len += mbuf_offset;
+			prev = cur;
+
+			mbuf_offset = 0;
+			mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
+		}
+	}
+
+	prev->data_len = mbuf_offset;
+	m->pkt_len += mbuf_offset;
+
+	if (hdr && tlen) {
+		nethdr->valid = true;
+		nethdr->hdr = *hdr;
+	} else if (hdr)
+		vhost_dequeue_offload(hdr, m, legacy_ol_flags);
+
+	if (tlen) {
+		async_fill_iter(src_it, tlen, src_iovec, tvec_idx);
+		async_fill_iter(dst_it, tlen, dst_iovec, tvec_idx);
+	} else
+		src_it->count = 0;
+
+	return 0;
+}
+
+static __rte_always_inline uint16_t
+async_poll_dequeue_completed_split(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mbuf **pkts, uint16_t count, bool legacy_ol_flags)
+{
+	uint16_t n_pkts_cpl = 0, n_pkts_put = 0;
+	uint16_t start_idx, pkt_idx, from;
+	struct async_inflight_info *pkts_info;
+
+	pkt_idx = vq->async_pkts_idx & (vq->size - 1);
+	pkts_info = vq->async_pkts_info;
+	start_idx = virtio_dev_rx_async_get_info_idx(pkt_idx, vq->size,
+			vq->async_pkts_inflight_n);
+
+	if (count > vq->async_last_pkts_n) {
+		n_pkts_cpl = vq->async_ops.check_completed_copies(dev->vid,
+			queue_id, 0, count - vq->async_last_pkts_n);
+	}
+
+	n_pkts_cpl += vq->async_last_pkts_n;
+	if (unlikely(n_pkts_cpl == 0))
+		return 0;
+
+	n_pkts_put = RTE_MIN(count, n_pkts_cpl);
+
+	for (pkt_idx = 0; pkt_idx < n_pkts_put; pkt_idx++) {
+		from = (start_idx + pkt_idx) & (vq->size - 1);
+		pkts[pkt_idx] = pkts_info[from].mbuf;
+
+		if (pkts_info[from].nethdr.valid) {
+			vhost_dequeue_offload(&pkts_info[from].nethdr.hdr,
+					pkts[pkt_idx], legacy_ol_flags);
+		}
+	}
+	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
+
+	if (n_pkts_put) {
+		/* write back completed descs to used ring */
+		write_back_completed_descs_split(vq, n_pkts_put);
+		/* update used ring */
+		__atomic_add_fetch(&vq->used->idx,
+				n_pkts_put, __ATOMIC_RELEASE);
+
+		vq->async_pkts_inflight_n -= n_pkts_put;
+	}
+
+	return n_pkts_put;
+}
+
+static __rte_always_inline uint16_t
+virtio_dev_tx_async_split(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t count, bool legacy_ol_flags)
+{
+	static bool allocerr_warned;
+	bool dropped = false;
+	uint16_t pkt_idx;
+	uint16_t free_entries;
+	uint16_t slot_idx = 0;
+	uint16_t segs_await = 0;
+	uint16_t nr_done_pkts = 0, nr_async_pkts = 0, nr_async_cmpl_pkts = 0;
+	uint16_t nr_async_burst = 0;
+	uint16_t pkt_err = 0;
+	uint16_t iovec_idx = 0, it_idx = 0;
+	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
+	struct iovec *vec_pool = vq->vec_pool;
+	struct iovec *src_iovec = vec_pool;
+	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
+	struct rte_mbuf *pkts_prealloc[MAX_PKT_BURST];
+
+	struct async_pkt_index {
+		uint16_t last_avail_idx;
+	} async_pkts_log[MAX_PKT_BURST];
+
+	/**
+	 * The ordering between avail index and
+	 * desc reads needs to be enforced.
+	 */
+	free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
+			vq->last_avail_idx;
+	if (free_entries == 0)
+		goto out;
+
+	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
+
+	count = RTE_MIN(count, MAX_PKT_BURST);
+	count = RTE_MIN(count, free_entries);
+	VHOST_LOG_DATA(DEBUG, "(%d) about to dequeue %u buffers\n",
+			dev->vid, count);
+
+	if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts_prealloc, count))
+		goto out;
+
+	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+		uint16_t head_idx = 0;
+		uint16_t nr_vec = 0;
+		uint32_t buf_len;
+		int err;
+		struct buf_vector buf_vec[BUF_VECTOR_MAX];
+		struct rte_mbuf *pkt;
+
+		if (unlikely(fill_vec_buf_split(dev, vq, vq->last_avail_idx,
+						&nr_vec, buf_vec,
+						&head_idx, &buf_len,
+						VHOST_ACCESS_RO) < 0))
+			break;
+
+		err = virtio_dev_pktmbuf_prep(dev, pkts_prealloc[pkt_idx],
+						buf_len);
+		if (unlikely(err)) {
+			/**
+			 * mbuf allocation fails for jumbo packets when external
+			 * buffer allocation is not allowed and linear buffer
+			 * is required. Drop this packet.
+			 */
+			if (!allocerr_warned) {
+				VHOST_LOG_DATA(ERR,
+					"Failed mbuf alloc of size %d from %s on %s.\n",
+					buf_len, mbuf_pool->name, dev->ifname);
+				allocerr_warned = true;
+			}
+			dropped = true;
+			break;
+		}
+
+		pkt = pkts_prealloc[pkt_idx];
+
+		slot_idx = (vq->async_pkts_idx + nr_async_pkts) &
+				(vq->size - 1);
+		err = async_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkt,
+				mbuf_pool, &src_iovec[iovec_idx],
+				&dst_iovec[iovec_idx], &it_pool[it_idx],
+				&it_pool[it_idx + 1],
+				&pkts_info[slot_idx].nethdr, legacy_ol_flags);
+		if (unlikely(err)) {
+			if (!allocerr_warned) {
+				VHOST_LOG_DATA(ERR,
+					"Failed to copy desc to mbuf on %s.\n",
+					dev->ifname);
+				allocerr_warned = true;
+			}
+			dropped = true;
+			break;
+		}
+
+		if (it_pool[it_idx].count) {
+			uint16_t to = vq->async_desc_idx_split & (vq->size - 1);
+
+			async_fill_desc(&tdes[nr_async_burst], &it_pool[it_idx],
+				&it_pool[it_idx + 1]);
+			pkts_info[slot_idx].mbuf = pkt;
+			async_pkts_log[nr_async_pkts++].last_avail_idx =
+				vq->last_avail_idx;
+			nr_async_burst++;
+			iovec_idx += it_pool[it_idx].nr_segs;
+			segs_await += it_pool[it_idx].nr_segs;
+			it_idx += 2;
+
+			/* keep used desc */
+			vq->async_descs_split[to].id = head_idx;
+			vq->async_descs_split[to].len = 0;
+			vq->async_desc_idx_split++;
+		} else {
+			update_shadow_used_ring_split(vq, head_idx, 0);
+			pkts[nr_done_pkts++] = pkt;
+		}
+
+		vq->last_avail_idx++;
+
+		if (unlikely((nr_async_burst >= VHOST_ASYNC_BATCH_THRESHOLD) ||
+					((VHOST_MAX_ASYNC_VEC >> 1) -
+					 segs_await < BUF_VECTOR_MAX))) {
+			uint16_t nr_pkts;
+
+			nr_pkts = vq->async_ops.transfer_data(dev->vid,
+					queue_id, tdes, 0, nr_async_burst);
+			src_iovec = vec_pool;
+			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+			it_idx = 0;
+			segs_await = 0;
+			vq->async_pkts_inflight_n += nr_pkts;
+
+			if (unlikely(nr_pkts < nr_async_burst)) {
+				pkt_err = nr_async_burst - nr_pkts;
+				nr_async_burst = 0;
+				break;
+			}
+			nr_async_burst = 0;
+		}
+	}
+
+	if (unlikely(dropped))
+		rte_pktmbuf_free_bulk(&pkts_prealloc[pkt_idx], count - pkt_idx);
+
+	if (nr_async_burst) {
+		uint32_t nr_pkts;
+
+		nr_pkts = vq->async_ops.transfer_data(dev->vid, queue_id,
+				tdes, 0, nr_async_burst);
+		vq->async_pkts_inflight_n += nr_pkts;
+
+		if (unlikely(nr_pkts < nr_async_burst))
+			pkt_err = nr_async_burst - nr_pkts;
+	}
+
+	do_data_copy_dequeue(vq);
+
+	if (unlikely(pkt_err)) {
+		uint16_t nr_err_dma = pkt_err;
+		uint16_t nr_err_sw;
+
+		nr_async_pkts -= nr_err_dma;
+
+		/**
+		 * revert shadow used ring and free pktmbufs for
+		 * CPU-copied pkts after the first DMA-error pkt.
+		 */
+		nr_err_sw = vq->last_avail_idx -
+			async_pkts_log[nr_async_pkts].last_avail_idx -
+			nr_err_dma;
+		vq->shadow_used_idx -= nr_err_sw;
+		while (nr_err_sw-- > 0)
+			rte_pktmbuf_free(pkts[--nr_done_pkts]);
+
+		/**
+		 * recover DMA-copy related structures and free pktmbufs
+		 * for DMA-error pkts.
+		 */
+		vq->async_desc_idx_split -= nr_err_dma;
+		while (nr_err_dma-- > 0) {
+			rte_pktmbuf_free(
+				pkts_info[slot_idx & (vq->size - 1)].mbuf);
+			slot_idx--;
+		}
+
+		/* recover available ring */
+		vq->last_avail_idx =
+			async_pkts_log[nr_async_pkts].last_avail_idx;
+	}
+
+	vq->async_pkts_idx += nr_async_pkts;
+
+	if (likely(vq->shadow_used_idx))
+		flush_shadow_used_ring_split(dev, vq);
+
+out:
+	if (nr_done_pkts < count && vq->async_pkts_inflight_n > 0) {
+		nr_async_cmpl_pkts = async_poll_dequeue_completed_split(dev, vq,
+					queue_id, &pkts[nr_done_pkts],
+					count - nr_done_pkts,
+					legacy_ol_flags);
+		nr_done_pkts += nr_async_cmpl_pkts;
+	}
+	if (likely(nr_done_pkts))
+		vhost_vring_call_split(dev, vq);
+
+	return nr_done_pkts;
+}
+
+__rte_noinline
+static uint16_t
+virtio_dev_tx_async_split_legacy(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t count)
+{
+	return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool,
+				pkts, count, true);
+}
+
+__rte_noinline
+static uint16_t
+virtio_dev_tx_async_split_compliant(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint16_t queue_id,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t count)
+{
+	return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool,
+				pkts, count, false);
+}
+
+uint16_t
+rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
+	int *nr_inflight)
+{
+	struct virtio_net *dev;
+	struct rte_mbuf *rarp_mbuf = NULL;
+	struct vhost_virtqueue *vq;
+	int16_t success = 1;
+
+	*nr_inflight = -1;
+
+	dev = get_device(vid);
+	if (!dev)
+		return 0;
+
+	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
+		VHOST_LOG_DATA(ERR,
+			"(%d) %s: built-in vhost net backend is disabled.\n",
+			dev->vid, __func__);
+		return 0;
+	}
+
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
+		VHOST_LOG_DATA(ERR,
+			"(%d) %s: invalid virtqueue idx %d.\n",
+			dev->vid, __func__, queue_id);
+		return 0;
+	}
+
+	vq = dev->virtqueue[queue_id];
+
+	if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
+		return 0;
+
+	if (unlikely(vq->enabled == 0)) {
+		count = 0;
+		goto out_access_unlock;
+	}
+
+	if (unlikely(!vq->async_registered)) {
+		VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
+			dev->vid, __func__, queue_id);
+		count = 0;
+		goto out_access_unlock;
+	}
+
+	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+		vhost_user_iotlb_rd_lock(vq);
+
+	if (unlikely(vq->access_ok == 0))
+		if (unlikely(vring_translate(dev, vq) < 0)) {
+			count = 0;
+			goto out;
+		}
+
+	/*
+	 * Construct a RARP broadcast packet and inject it into the "pkts"
+	 * array, so that it looks like the guest actually sent such a packet.
+	 *
+	 * Check user_send_rarp() for more information.
+	 *
+	 * broadcast_rarp shares a cacheline in the virtio_net structure
+	 * with some fields that are accessed during enqueue, and
+	 * __atomic_compare_exchange_n causes a write if it performs the
+	 * compare and exchange. This could result in false sharing between
+	 * enqueue and dequeue.
+	 *
+	 * Prevent unnecessary false sharing by reading broadcast_rarp first
+	 * and only performing compare and exchange if the read indicates it
+	 * is likely to be set.
+	 */
+	if (unlikely(__atomic_load_n(&dev->broadcast_rarp, __ATOMIC_ACQUIRE) &&
+			__atomic_compare_exchange_n(&dev->broadcast_rarp,
+			&success, 0, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED))) {
+
+		rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
+		if (rarp_mbuf == NULL) {
+			VHOST_LOG_DATA(ERR, "Failed to make RARP packet.\n");
+			count = 0;
+			goto out;
+		}
+		count -= 1;
+	}
+
+	if (unlikely(vq_is_packed(dev))) {
+		count = 0;
+		goto out;
+	}
+
+	if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
+		count = virtio_dev_tx_async_split_legacy(dev, vq, queue_id,
+				mbuf_pool, pkts, count);
+	else
+		count = virtio_dev_tx_async_split_compliant(dev, vq, queue_id,
+				mbuf_pool, pkts, count);
+
+out:
+	*nr_inflight = vq->async_pkts_inflight_n;
+
+	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+		vhost_user_iotlb_rd_unlock(vq);
+
+out_access_unlock:
+	rte_spinlock_unlock(&vq->access_lock);
+
+	if (unlikely(rarp_mbuf != NULL)) {
+		/*
+		 * Inject it at the head of the "pkts" array, so that the
+		 * switch's MAC learning table gets updated first.
+		 */
+		memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *));
+		pkts[0] = rarp_mbuf;
+		count += 1;
+	}
+
+	return count;
+}
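
Because in-flight packets are only returned once the async channel has
finished their copies, an application should keep polling until
nr_inflight drops to zero before it stops using a virtqueue. A possible
drain loop, reusing the application-side names from the earlier sketch
(again an illustration, not part of this patch), could be:

    /* Drain a guest TX virtqueue until no async copies remain in flight. */
    static void
    app_drain_guest_tx(int vid, uint16_t queue_id, struct rte_mempool *mbuf_pool)
    {
        struct rte_mbuf *pkts[APP_PKT_BURST];
        int nr_inflight = 1;
        uint16_t nr_rx;

        while (nr_inflight > 0) {
            nr_rx = rte_vhost_async_try_dequeue_burst(vid, queue_id,
                    mbuf_pool, pkts, APP_PKT_BURST, &nr_inflight);
            if (nr_rx)
                rte_pktmbuf_free_bulk(pkts, nr_rx);
            if (nr_inflight < 0)
                break;    /* invalid device or queue id */
        }
    }
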
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v7 4/4] examples/vhost: support vhost async dequeue data path
  2021-07-21 14:20 ` [dpdk-dev] [PATCH v7 0/4] support async dequeue for split ring Wenwu Ma
                     ` (3 preceding siblings ...)
  2021-07-21 14:20   ` [dpdk-dev] [PATCH v7 3/4] vhost: support async dequeue for split ring Wenwu Ma
@ 2021-07-21 14:20   ` Wenwu Ma
  4 siblings, 0 replies; 50+ messages in thread
From: Wenwu Ma @ 2021-07-21 14:20 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, chenbo.xia, cheng1.jiang, jiayu.hu, Wenwu Ma

This patch adds the vhost async dequeue data path to the vhost sample.
The vswitch can leverage IOAT to accelerate this data path.

Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
 doc/guides/sample_app_ug/vhost.rst |   9 +-
 examples/vhost/ioat.c              |  61 ++++++++++---
 examples/vhost/ioat.h              |  25 ++++++
 examples/vhost/main.c              | 140 ++++++++++++++++++++---------
 4 files changed, 177 insertions(+), 58 deletions(-)

diff --git a/doc/guides/sample_app_ug/vhost.rst b/doc/guides/sample_app_ug/vhost.rst
index 9afde9c7f5..63dcf181e1 100644
--- a/doc/guides/sample_app_ug/vhost.rst
+++ b/doc/guides/sample_app_ug/vhost.rst
@@ -169,9 +169,12 @@ demonstrates how to use the async vhost APIs. It's used in combination with dmas
 **--dmas**
 This parameter is used to specify the assigned DMA device of a vhost device.
 Async vhost-user net driver will be used if --dmas is set. For example
---dmas [txd0@00:04.0,txd1@00:04.1] means use DMA channel 00:04.0 for vhost
-device 0 enqueue operation and use DMA channel 00:04.1 for vhost device 1
-enqueue operation.
+--dmas [txd0@00:04.0,txd1@00:04.1,rxd0@00:04.2,rxd1@00:04.3] means using
+DMA channel 00:04.0/00:04.2 for vhost device 0 enqueue/dequeue operations
+and DMA channel 00:04.1/00:04.3 for vhost device 1 enqueue/dequeue
+operations. The device index follows the order of the socket files: vhost
+device 0 is created through the first socket file, vhost device 1 through
+the second socket file, and so on.
 
 Common Issues
 -------------
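
For reference (not part of this patch), with two vhost-user sockets a
possible invocation combining enqueue and dequeue acceleration could look
as follows; the core list, socket paths and DMA device addresses are
placeholders to be adapted to the actual setup:

    ./dpdk-vhost -l 2-4 -n 4 -- -p 0x1 --mergeable 1 --vm2vm 1 \
        --socket-file /tmp/vhost-net0 --socket-file /tmp/vhost-net1 \
        --dmas [txd0@00:04.0,rxd0@00:04.1,txd1@00:04.2,rxd1@00:04.3] \
        --client

Here vhost device 0 (first socket file) uses 00:04.0/00:04.1 for its
enqueue/dequeue path and vhost device 1 uses 00:04.2/00:04.3.
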
diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c
index bf4e033bdb..8bd379d084 100644
--- a/examples/vhost/ioat.c
+++ b/examples/vhost/ioat.c
@@ -21,6 +21,8 @@ struct packet_tracker {
 
 struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
 
+int vid2socketid[MAX_VHOST_DEVICE];
+
 int
 open_ioat(const char *value)
 {
@@ -29,7 +31,7 @@ open_ioat(const char *value)
 	char *addrs = input;
 	char *ptrs[2];
 	char *start, *end, *substr;
-	int64_t vid, vring_id;
+	int64_t socketid, vring_id;
 	struct rte_ioat_rawdev_config config;
 	struct rte_rawdev_info info = { .dev_private = &config };
 	char name[32];
@@ -60,6 +62,7 @@ open_ioat(const char *value)
 		goto out;
 	}
 	while (i < args_nr) {
+		bool is_txd;
 		char *arg_temp = dma_arg[i];
 		uint8_t sub_nr;
 		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
@@ -68,27 +71,39 @@ open_ioat(const char *value)
 			goto out;
 		}
 
-		start = strstr(ptrs[0], "txd");
-		if (start == NULL) {
+		int async_flag;
+		char *txd, *rxd;
+		txd = strstr(ptrs[0], "txd");
+		rxd = strstr(ptrs[0], "rxd");
+		if (txd) {
+			is_txd = true;
+			start = txd;
+			async_flag = ASYNC_ENQUEUE_VHOST;
+		} else if (rxd) {
+			is_txd = false;
+			start = rxd;
+			async_flag = ASYNC_DEQUEUE_VHOST;
+		} else {
 			ret = -1;
 			goto out;
 		}
 
 		start += 3;
-		vid = strtol(start, &end, 0);
+		socketid = strtol(start, &end, 0);
 		if (end == start) {
 			ret = -1;
 			goto out;
 		}
 
-		vring_id = 0 + VIRTIO_RXQ;
+		vring_id = is_txd ? VIRTIO_RXQ : VIRTIO_TXQ;
+
 		if (rte_pci_addr_parse(ptrs[1],
-				&(dma_info + vid)->dmas[vring_id].addr) < 0) {
+			&(dma_info + socketid)->dmas[vring_id].addr) < 0) {
 			ret = -1;
 			goto out;
 		}
 
-		rte_pci_device_name(&(dma_info + vid)->dmas[vring_id].addr,
+		rte_pci_device_name(&(dma_info + socketid)->dmas[vring_id].addr,
 				name, sizeof(name));
 		dev_id = rte_rawdev_get_dev_id(name);
 		if (dev_id == (uint16_t)(-ENODEV) ||
@@ -103,8 +118,9 @@ open_ioat(const char *value)
 			goto out;
 		}
 
-		(dma_info + vid)->dmas[vring_id].dev_id = dev_id;
-		(dma_info + vid)->dmas[vring_id].is_valid = true;
+		(dma_info + socketid)->dmas[vring_id].dev_id = dev_id;
+		(dma_info + socketid)->dmas[vring_id].is_valid = true;
+		(dma_info + socketid)->async_flag |= async_flag;
 		config.ring_size = IOAT_RING_SIZE;
 		config.hdls_disable = true;
 		if (rte_rawdev_configure(dev_id, &info, sizeof(config)) < 0) {
@@ -126,13 +142,16 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data, uint16_t count)
 {
 	uint32_t i_desc;
-	uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2 + VIRTIO_RXQ].dev_id;
 	struct rte_vhost_iov_iter *src = NULL;
 	struct rte_vhost_iov_iter *dst = NULL;
 	unsigned long i_seg;
 	unsigned short mask = MAX_ENQUEUED_SIZE - 1;
-	unsigned short write = cb_tracker[dev_id].next_write;
 
+	if (queue_id >= MAX_RING_COUNT)
+		return -1;
+
+	uint16_t dev_id = dma_bind[vid2socketid[vid]].dmas[queue_id].dev_id;
+	unsigned short write = cb_tracker[dev_id].next_write;
 	if (!opaque_data) {
 		for (i_desc = 0; i_desc < count; i_desc++) {
 			src = descs[i_desc].src;
@@ -170,16 +189,16 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data,
 		uint16_t max_packets)
 {
-	if (!opaque_data) {
+	if (!opaque_data && (queue_id < MAX_RING_COUNT)) {
 		uintptr_t dump[255];
 		int n_seg;
 		unsigned short read, write;
 		unsigned short nb_packet = 0;
 		unsigned short mask = MAX_ENQUEUED_SIZE - 1;
 		unsigned short i;
+		uint16_t dev_id;
 
-		uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2
-				+ VIRTIO_RXQ].dev_id;
+		dev_id = dma_bind[vid2socketid[vid]].dmas[queue_id].dev_id;
 		n_seg = rte_ioat_completed_ops(dev_id, 255, NULL, NULL, dump, dump);
 		if (n_seg < 0) {
 			RTE_LOG(ERR,
@@ -215,4 +234,18 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 	return -1;
 }
 
+uint32_t get_async_flag_by_vid(int vid)
+{
+	return dma_bind[vid2socketid[vid]].async_flag;
+}
+
+uint32_t get_async_flag_by_socketid(int socketid)
+{
+	return dma_bind[socketid].async_flag;
+}
+
+void init_vid2socketid_array(int vid, int socketid)
+{
+	vid2socketid[vid] = socketid;
+}
 #endif /* RTE_RAW_IOAT */
diff --git a/examples/vhost/ioat.h b/examples/vhost/ioat.h
index 1aa28ed6a3..3a85c94c8a 100644
--- a/examples/vhost/ioat.h
+++ b/examples/vhost/ioat.h
@@ -12,6 +12,9 @@
 #define MAX_VHOST_DEVICE 1024
 #define IOAT_RING_SIZE 4096
 #define MAX_ENQUEUED_SIZE 4096
+#define MAX_RING_COUNT	2
+#define ASYNC_ENQUEUE_VHOST	1
+#define ASYNC_DEQUEUE_VHOST	2
 
 struct dma_info {
 	struct rte_pci_addr addr;
@@ -20,6 +23,7 @@ struct dma_info {
 };
 
 struct dma_for_vhost {
+	int async_flag;
 	struct dma_info dmas[RTE_MAX_QUEUES_PER_PORT * 2];
 	uint16_t nr;
 };
@@ -36,6 +40,10 @@ uint32_t
 ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data,
 		uint16_t max_packets);
+
+uint32_t get_async_flag_by_vid(int vid);
+uint32_t get_async_flag_by_socketid(int socketid);
+void init_vid2socketid_array(int vid, int socketid);
 #else
 static int open_ioat(const char *value __rte_unused)
 {
@@ -59,5 +67,22 @@ ioat_check_completed_copies_cb(int vid __rte_unused,
 {
 	return -1;
 }
+
+static uint32_t
+get_async_flag_by_vid(int vid __rte_unused)
+{
+	return 0;
+}
+
+static uint32_t
+get_async_flag_by_socketid(int socketid __rte_unused)
+{
+	return 0;
+}
+
+static void
+init_vid2socketid_array(int vid __rte_unused, int socketid __rte_unused)
+{
+}
 #endif
 #endif /* _IOAT_H_ */
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index aebdc3a566..314184b447 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -93,8 +93,6 @@ static int client_mode;
 
 static int builtin_net_driver;
 
-static int async_vhost_driver;
-
 static char *dma_type;
 
 /* Specify timeout (in useconds) between retries on RX. */
@@ -679,7 +677,6 @@ us_vhost_parse_args(int argc, char **argv)
 				us_vhost_usage(prgname);
 				return -1;
 			}
-			async_vhost_driver = 1;
 			break;
 
 		case OPT_CLIENT_NUM:
@@ -897,7 +894,7 @@ drain_vhost(struct vhost_dev *vdev)
 				__ATOMIC_SEQ_CST);
 	}
 
-	if (!async_vhost_driver)
+	if ((get_async_flag_by_vid(vdev->vid) & ASYNC_ENQUEUE_VHOST) == 0)
 		free_pkts(m, nr_xmit);
 }
 
@@ -1237,10 +1234,19 @@ drain_eth_rx(struct vhost_dev *vdev)
 				__ATOMIC_SEQ_CST);
 	}
 
-	if (!async_vhost_driver)
+	if ((get_async_flag_by_vid(vdev->vid) & ASYNC_ENQUEUE_VHOST) == 0)
 		free_pkts(pkts, rx_count);
 }
 
+uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
+				struct rte_mempool *mbuf_pool,
+				struct rte_mbuf **pkts, uint16_t count)
+{
+	int nr_inflight;
+	return rte_vhost_async_try_dequeue_burst(dev->vid, queue_id,
+			mbuf_pool, pkts, count, &nr_inflight);
+}
+
 uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
 			struct rte_mempool *mbuf_pool,
 			struct rte_mbuf **pkts, uint16_t count)
@@ -1392,12 +1398,90 @@ destroy_device(int vid)
 		"(%d) device has been removed from data core\n",
 		vdev->vid);
 
-	if (async_vhost_driver)
+	if (get_async_flag_by_vid(vid) & ASYNC_ENQUEUE_VHOST)
 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
+	if (get_async_flag_by_vid(vid) & ASYNC_DEQUEUE_VHOST)
+		rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
 
 	rte_free(vdev);
 }
 
+static int
+get_socketid_by_vid(int vid)
+{
+	int i;
+	char ifname[PATH_MAX];
+	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
+
+	for (i = 0; i < nb_sockets; i++) {
+		char *file = socket_files + i * PATH_MAX;
+		if (strcmp(file, ifname) == 0)
+			return i;
+	}
+
+	return -1;
+}
+
+static int
+init_vhost_queue_ops(int vid)
+{
+	int socketid = get_socketid_by_vid(vid);
+	if (socketid == -1)
+		return -1;
+
+	init_vid2socketid_array(vid, socketid);
+	if (builtin_net_driver) {
+		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
+		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
+	} else {
+		if (get_async_flag_by_vid(vid) & ASYNC_ENQUEUE_VHOST) {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+						async_enqueue_pkts;
+		} else {
+			vdev_queue_ops[vid].enqueue_pkt_burst =
+						sync_enqueue_pkts;
+		}
+
+		if (get_async_flag_by_vid(vid) & ASYNC_DEQUEUE_VHOST) {
+			vdev_queue_ops[vid].dequeue_pkt_burst =
+						async_dequeue_pkts;
+		} else {
+			vdev_queue_ops[vid].dequeue_pkt_burst =
+						sync_dequeue_pkts;
+		}
+	}
+
+	return 0;
+}
+
+static int
+vhost_async_channel_register(int vid)
+{
+	int ret = 0;
+	struct rte_vhost_async_features f;
+	struct rte_vhost_async_channel_ops channel_ops;
+
+	if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
+		channel_ops.transfer_data = ioat_transfer_data_cb;
+		channel_ops.check_completed_copies =
+			ioat_check_completed_copies_cb;
+
+		f.async_inorder = 1;
+		f.async_threshold = 256;
+
+		if (get_async_flag_by_vid(vid) & ASYNC_ENQUEUE_VHOST) {
+			ret |= rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
+					f.intval, &channel_ops);
+		}
+		if (get_async_flag_by_vid(vid) & ASYNC_DEQUEUE_VHOST) {
+			ret |= rte_vhost_async_channel_register(vid, VIRTIO_TXQ,
+					f.intval, &channel_ops);
+		}
+	}
+
+	return ret;
+}
+
 /*
  * A new device is added to a data core. First the device is added to the main linked list
  * and then allocated to a specific data core.
@@ -1431,20 +1515,8 @@ new_device(int vid)
 		}
 	}
 
-	if (builtin_net_driver) {
-		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
-		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
-	} else {
-		if (async_vhost_driver) {
-			vdev_queue_ops[vid].enqueue_pkt_burst =
-							async_enqueue_pkts;
-		} else {
-			vdev_queue_ops[vid].enqueue_pkt_burst =
-							sync_enqueue_pkts;
-		}
-
-		vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
-	}
+	if (init_vhost_queue_ops(vid) != 0)
+		return -1;
 
 	if (builtin_net_driver)
 		vs_vhost_net_setup(vdev);
@@ -1473,28 +1545,13 @@ new_device(int vid)
 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
 
+	int ret = vhost_async_channel_register(vid);
+
 	RTE_LOG(INFO, VHOST_DATA,
 		"(%d) device has been added to data core %d\n",
 		vid, vdev->coreid);
 
-	if (async_vhost_driver) {
-		struct rte_vhost_async_features f;
-		struct rte_vhost_async_channel_ops channel_ops;
-
-		if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
-			channel_ops.transfer_data = ioat_transfer_data_cb;
-			channel_ops.check_completed_copies =
-				ioat_check_completed_copies_cb;
-
-			f.async_inorder = 1;
-			f.async_threshold = 256;
-
-			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
-				f.intval, &channel_ops);
-		}
-	}
-
-	return 0;
+	return ret;
 }
 
 /*
@@ -1735,10 +1792,11 @@ main(int argc, char *argv[])
 	for (i = 0; i < nb_sockets; i++) {
 		char *file = socket_files + i * PATH_MAX;
 
-		if (async_vhost_driver)
-			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
+		uint64_t flag = flags;
+		if (get_async_flag_by_socketid(i) != 0)
+			flag |= RTE_VHOST_USER_ASYNC_COPY;
 
-		ret = rte_vhost_driver_register(file, flags);
+		ret = rte_vhost_driver_register(file, flag);
 		if (ret != 0) {
 			unregister_drivers(i);
 			rte_exit(EXIT_FAILURE,
-- 
2.25.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

Thread overview: 50+ messages
2021-06-02  8:31 [dpdk-dev] [PATCH 0/1] lib/vhost: support async dequeue for split ring Yuan Wang
2021-06-02  8:31 ` [dpdk-dev] [PATCH 1/1] " Yuan Wang
2021-06-07 16:17   ` Maxime Coquelin
2021-06-09  1:21     ` Hu, Jiayu
2021-06-18 20:03 ` [dpdk-dev] [PATCH v2 0/4] vhost: " Wenwu Ma
2021-06-18 14:10   ` Maxime Coquelin
2021-06-18 20:03   ` [dpdk-dev] [PATCH v2 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
2021-06-18 20:03   ` [dpdk-dev] [PATCH v2 2/4] examples/vhost: use a new API to query remaining ring space Wenwu Ma
2021-06-18 20:03   ` [dpdk-dev] [PATCH v2 3/4] vhost: support async dequeue for split ring Wenwu Ma
2021-06-18 20:03   ` [dpdk-dev] [PATCH v2 4/4] examples/vhost: support vhost async dequeue data path Wenwu Ma
2021-06-23 15:00 ` [dpdk-dev] [PATCH v3 0/4] vhost: support async dequeue for split ring Wenwu Ma
2021-06-23 15:00   ` [dpdk-dev] [PATCH v3 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
2021-06-23 15:00   ` [dpdk-dev] [PATCH v3 2/4] examples/vhost: use a new API to query remaining ring space Wenwu Ma
2021-06-23 15:00   ` [dpdk-dev] [PATCH v3 3/4] vhost: support async dequeue for split ring Wenwu Ma
2021-06-23 15:00   ` [dpdk-dev] [PATCH v3 4/4] examples/vhost: support vhost async dequeue data path Wenwu Ma
2021-06-30 19:27 ` [dpdk-dev] [PATCH v4 0/4] support async dequeue for split ring Wenwu Ma
2021-06-30 19:27   ` [dpdk-dev] [PATCH v4 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
2021-06-30 19:27   ` [dpdk-dev] [PATCH v4 2/4] examples/vhost: use a new API to query remaining ring space Wenwu Ma
2021-06-30 19:27   ` [dpdk-dev] [PATCH v4 3/4] vhost: support async dequeue for split ring Wenwu Ma
2021-06-30 19:27   ` [dpdk-dev] [PATCH v4 4/4] examples/vhost: support vhost async dequeue data path Wenwu Ma
2021-07-05 18:11 ` [dpdk-dev] [PATCH v5 0/4] support async dequeue for split ring Wenwu Ma
2021-07-05 18:11   ` [dpdk-dev] [PATCH v5 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
2021-07-13 13:34     ` Maxime Coquelin
2021-07-05 18:11   ` [dpdk-dev] [PATCH v5 2/4] examples/vhost: use a new API to query remaining ring space Wenwu Ma
2021-07-13 13:36     ` Maxime Coquelin
2021-07-05 18:11   ` [dpdk-dev] [PATCH v5 3/4] vhost: support async dequeue for split ring Wenwu Ma
2021-07-13 14:30     ` Maxime Coquelin
2021-07-14  6:50       ` Hu, Jiayu
2021-07-15 13:18         ` Maxime Coquelin
2021-07-16  1:10           ` Hu, Jiayu
2021-07-16  7:45             ` Maxime Coquelin
2021-07-16  7:55               ` Hu, Jiayu
2021-07-16  9:02                 ` Maxime Coquelin
2021-07-16  8:14         ` David Marchand
2021-07-16 13:45           ` Hu, Jiayu
2021-07-16 13:52             ` David Marchand
2021-07-16 14:00               ` Hu, Jiayu
2021-07-05 18:11   ` [dpdk-dev] [PATCH v5 4/4] examples/vhost: support vhost async dequeue data path Wenwu Ma
2021-07-13 17:01     ` Maxime Coquelin
2021-07-16 19:18 ` [dpdk-dev] [PATCH v6 0/4] support async dequeue for split ring Wenwu Ma
2021-07-16 19:18   ` [dpdk-dev] [PATCH v6 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
2021-07-16 19:18   ` [dpdk-dev] [PATCH v6 2/4] examples/vhost: use a new API to query remaining ring space Wenwu Ma
2021-07-16 19:18   ` [dpdk-dev] [PATCH v6 3/4] vhost: support async dequeue for split ring Wenwu Ma
2021-07-16 19:18   ` [dpdk-dev] [PATCH v6 4/4] examples/vhost: support vhost async dequeue data path Wenwu Ma
2021-07-21 14:20 ` [dpdk-dev] [PATCH v7 0/4] support async dequeue for split ring Wenwu Ma
2021-07-21  2:31   ` Wang, Yinan
2021-07-21 14:20   ` [dpdk-dev] [PATCH v7 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths Wenwu Ma
2021-07-21 14:20   ` [dpdk-dev] [PATCH v7 2/4] examples/vhost: use a new API to query remaining ring space Wenwu Ma
2021-07-21 14:20   ` [dpdk-dev] [PATCH v7 3/4] vhost: support async dequeue for split ring Wenwu Ma
2021-07-21 14:20   ` [dpdk-dev] [PATCH v7 4/4] examples/vhost: support vhost async dequeue data path Wenwu Ma
