DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH v1 0/4] optimize async data path
@ 2020-09-11  1:53 Patrick Fu
  2020-09-11  1:53 ` [dpdk-dev] [PATCH v1 1/4] vhost: simplify async copy completion Patrick Fu
                   ` (6 more replies)
  0 siblings, 7 replies; 36+ messages in thread
From: Patrick Fu @ 2020-09-11  1:53 UTC (permalink / raw)
  To: dev, maxime.coquelin, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang, patrick.fu

This series applies optimizations and fixes to the vhost
async data path.

Patrick Fu (4):
  vhost: simplify async copy completion
  vhost: dynamically alloc async memory
  vhost: fix async vec buf overrun
  vhost: fix async register/unregister deadlock

 lib/librte_vhost/rte_vhost_async.h |  15 ++-
 lib/librte_vhost/vhost.c           |  66 ++++++------
 lib/librte_vhost/vhost.h           |  14 +--
 lib/librte_vhost/vhost_user.c      |  10 +-
 lib/librte_vhost/virtio_net.c      | 156 ++++++++++++-----------------
 5 files changed, 127 insertions(+), 134 deletions(-)

-- 
2.18.4


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [dpdk-dev] [PATCH v1 1/4] vhost: simplify async copy completion
  2020-09-11  1:53 [dpdk-dev] [PATCH v1 0/4] optimize async data path Patrick Fu
@ 2020-09-11  1:53 ` Patrick Fu
  2020-09-23  9:07   ` Maxime Coquelin
  2020-09-11  1:53 ` [dpdk-dev] [PATCH v1 2/4] vhost: dynamically alloc async memory Patrick Fu
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 36+ messages in thread
From: Patrick Fu @ 2020-09-11  1:53 UTC (permalink / raw)
  To: dev, maxime.coquelin, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang, patrick.fu

The current async ops allow the check_completed_copies() callback to
return an arbitrary number of async iov segments finished by the backend
async devices. This design makes it complex for vhost to handle a
partially completed transfer of a single packet (i.e. a transfer that
completes in the middle of an async descriptor) and prevents application
callbacks from leveraging hardware capabilities to offload the work.
Thus, this patch requires the check_completed_copies() callback to
return the number of completed async memory descriptors, which is
aligned with the async transfer data ops callback. The vhost async data
path is revised to work with the new ops definition, which provides
cleaner and simpler processing.

Signed-off-by: Patrick Fu <patrick.fu@intel.com>
---
 lib/librte_vhost/rte_vhost_async.h |  15 ++-
 lib/librte_vhost/vhost.c           |  20 ++--
 lib/librte_vhost/vhost.h           |   8 +-
 lib/librte_vhost/vhost_user.c      |   6 +-
 lib/librte_vhost/virtio_net.c      | 149 ++++++++++++-----------------
 5 files changed, 88 insertions(+), 110 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost_async.h b/lib/librte_vhost/rte_vhost_async.h
index cb6284539..c73bd7c99 100644
--- a/lib/librte_vhost/rte_vhost_async.h
+++ b/lib/librte_vhost/rte_vhost_async.h
@@ -76,13 +76,26 @@ struct rte_vhost_async_channel_ops {
 	 * @param max_packets
 	 *  max number of packets could be completed
 	 * @return
-	 *  number of iov segments completed
+	 *  number of async descs completed
 	 */
 	uint32_t (*check_completed_copies)(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data,
 		uint16_t max_packets);
 };
 
+/**
+ * inflight async packet information
+ */
+struct async_inflight_info {
+	union {
+		uint32_t info;
+		struct {
+			uint16_t descs; /* num of descs inflight */
+			uint16_t segs; /* iov segs inflight */
+		};
+	};
+};
+
 /**
  *  dma channel feature bit definition
  */
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 8f20a0818..eca507836 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -333,8 +333,8 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
 		rte_free(vq->shadow_used_split);
 		if (vq->async_pkts_pending)
 			rte_free(vq->async_pkts_pending);
-		if (vq->async_pending_info)
-			rte_free(vq->async_pending_info);
+		if (vq->async_pkts_info)
+			rte_free(vq->async_pkts_info);
 	}
 	rte_free(vq->batch_copy_elems);
 	rte_mempool_free(vq->iotlb_pool);
@@ -1573,15 +1573,15 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	vq->async_pkts_pending = rte_malloc(NULL,
 			vq->size * sizeof(uintptr_t),
 			RTE_CACHE_LINE_SIZE);
-	vq->async_pending_info = rte_malloc(NULL,
-			vq->size * sizeof(uint64_t),
+	vq->async_pkts_info = rte_malloc(NULL,
+			vq->size * sizeof(struct async_inflight_info),
 			RTE_CACHE_LINE_SIZE);
-	if (!vq->async_pkts_pending || !vq->async_pending_info) {
+	if (!vq->async_pkts_pending || !vq->async_pkts_info) {
 		if (vq->async_pkts_pending)
 			rte_free(vq->async_pkts_pending);
 
-		if (vq->async_pending_info)
-			rte_free(vq->async_pending_info);
+		if (vq->async_pkts_info)
+			rte_free(vq->async_pkts_info);
 
 		VHOST_LOG_CONFIG(ERR,
 				"async register failed: cannot allocate memory for vq data "
@@ -1635,9 +1635,9 @@ int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
 		vq->async_pkts_pending = NULL;
 	}
 
-	if (vq->async_pending_info) {
-		rte_free(vq->async_pending_info);
-		vq->async_pending_info = NULL;
+	if (vq->async_pkts_info) {
+		rte_free(vq->async_pkts_info);
+		vq->async_pkts_info = NULL;
 	}
 
 	vq->async_ops.transfer_data = NULL;
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 632f66d53..28aa77380 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -46,8 +46,6 @@
 
 #define MAX_PKT_BURST 32
 
-#define ASYNC_MAX_POLL_SEG 255
-
 #define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST * 2)
 #define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 2)
 
@@ -225,12 +223,10 @@ struct vhost_virtqueue {
 
 	/* async data transfer status */
 	uintptr_t	**async_pkts_pending;
-	#define		ASYNC_PENDING_INFO_N_MSK 0xFFFF
-	#define		ASYNC_PENDING_INFO_N_SFT 16
-	uint64_t	*async_pending_info;
+	struct async_inflight_info *async_pkts_info;
 	uint16_t	async_pkts_idx;
 	uint16_t	async_pkts_inflight_n;
-	uint16_t	async_last_seg_n;
+	uint16_t	async_last_pkts_n;
 
 	/* vq async features */
 	bool		async_inorder;
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index c3c924fae..f6cdbede7 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -2006,10 +2006,10 @@ vhost_user_get_vring_base(struct virtio_net **pdev,
 		vq->shadow_used_split = NULL;
 		if (vq->async_pkts_pending)
 			rte_free(vq->async_pkts_pending);
-		if (vq->async_pending_info)
-			rte_free(vq->async_pending_info);
+		if (vq->async_pkts_info)
+			rte_free(vq->async_pkts_info);
 		vq->async_pkts_pending = NULL;
-		vq->async_pending_info = NULL;
+		vq->async_pkts_info = NULL;
 	}
 
 	rte_free(vq->batch_copy_elems);
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index bd9303c8a..70c3377fb 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1473,37 +1473,6 @@ virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
 		(vq_size - n_inflight + pkts_idx) & (vq_size - 1);
 }
 
-static __rte_always_inline void
-virtio_dev_rx_async_submit_split_err(struct virtio_net *dev,
-	struct vhost_virtqueue *vq, uint16_t queue_id,
-	uint16_t last_idx, uint16_t shadow_idx)
-{
-	uint16_t start_idx, pkts_idx, vq_size;
-	uint64_t *async_pending_info;
-
-	pkts_idx = vq->async_pkts_idx;
-	async_pending_info = vq->async_pending_info;
-	vq_size = vq->size;
-	start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
-		vq_size, vq->async_pkts_inflight_n);
-
-	while (likely((start_idx & (vq_size - 1)) != pkts_idx)) {
-		uint64_t n_seg =
-			async_pending_info[(start_idx) & (vq_size - 1)] >>
-			ASYNC_PENDING_INFO_N_SFT;
-
-		while (n_seg)
-			n_seg -= vq->async_ops.check_completed_copies(dev->vid,
-				queue_id, 0, 1);
-	}
-
-	vq->async_pkts_inflight_n = 0;
-	vq->batch_copy_nb_elems = 0;
-
-	vq->shadow_used_idx = shadow_idx;
-	vq->last_avail_idx = last_idx;
-}
-
 static __rte_noinline uint32_t
 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct vhost_virtqueue *vq, uint16_t queue_id,
@@ -1512,7 +1481,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
 	uint16_t num_buffers;
 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
-	uint16_t avail_head, last_idx, shadow_idx;
+	uint16_t avail_head;
 
 	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
 	struct iovec *vec_pool = vq->vec_pool;
@@ -1522,11 +1491,11 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct rte_vhost_iov_iter *src_it = it_pool;
 	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
 	uint16_t n_free_slot, slot_idx;
+	uint16_t pkt_err = 0;
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
 	int n_pkts = 0;
 
 	avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
-	last_idx = vq->last_avail_idx;
-	shadow_idx = vq->shadow_used_idx;
 
 	/*
 	 * The ordering between avail index and
@@ -1565,14 +1534,14 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 		if (src_it->count) {
 			async_fill_desc(&tdes[pkt_burst_idx], src_it, dst_it);
 			pkt_burst_idx++;
-			vq->async_pending_info[slot_idx] =
-				num_buffers | (src_it->nr_segs << 16);
+			pkts_info[slot_idx].descs = num_buffers;
+			pkts_info[slot_idx].segs = src_it->nr_segs;
 			src_iovec += src_it->nr_segs;
 			dst_iovec += dst_it->nr_segs;
 			src_it += 2;
 			dst_it += 2;
 		} else {
-			vq->async_pending_info[slot_idx] = num_buffers;
+			pkts_info[slot_idx].info = num_buffers;
 			vq->async_pkts_inflight_n++;
 		}
 
@@ -1586,35 +1555,46 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
 			src_it = it_pool;
 			dst_it = it_pool + 1;
+			vq->async_pkts_inflight_n += n_pkts;
 
 			if (unlikely(n_pkts < (int)pkt_burst_idx)) {
-				vq->async_pkts_inflight_n +=
-					n_pkts > 0 ? n_pkts : 0;
-				virtio_dev_rx_async_submit_split_err(dev,
-					vq, queue_id, last_idx, shadow_idx);
-				return 0;
+				/*
+				 * log error packets number here and do actual
+				 * error processing when applications poll
+				 * completion
+				 */
+				pkt_err = pkt_burst_idx - n_pkts;
+				pkt_burst_idx = 0;
+				break;
 			}
 
 			pkt_burst_idx = 0;
-			vq->async_pkts_inflight_n += n_pkts;
 		}
 	}
 
 	if (pkt_burst_idx) {
 		n_pkts = vq->async_ops.transfer_data(dev->vid,
 				queue_id, tdes, 0, pkt_burst_idx);
-		if (unlikely(n_pkts < (int)pkt_burst_idx)) {
-			vq->async_pkts_inflight_n += n_pkts > 0 ? n_pkts : 0;
-			virtio_dev_rx_async_submit_split_err(dev, vq, queue_id,
-				last_idx, shadow_idx);
-			return 0;
-		}
-
 		vq->async_pkts_inflight_n += n_pkts;
+
+		if (unlikely(n_pkts < (int)pkt_burst_idx))
+			pkt_err = pkt_burst_idx - n_pkts;
 	}
 
 	do_data_copy_enqueue(dev, vq);
 
+	if (unlikely(pkt_err)) {
+		do {
+			if (pkts_info[slot_idx].segs)
+				pkt_err--;
+			vq->last_avail_idx -= pkts_info[slot_idx].descs;
+			vq->shadow_used_idx -= pkts_info[slot_idx].descs;
+			vq->async_pkts_inflight_n--;
+			slot_idx--;
+			pkt_idx--;
+		} while (pkt_err);
+	}
+
 	n_free_slot = vq->size - vq->async_pkts_idx;
 	if (n_free_slot > pkt_idx) {
 		rte_memcpy(&vq->async_pkts_pending[vq->async_pkts_idx],
@@ -1640,10 +1620,10 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 {
 	struct virtio_net *dev = get_device(vid);
 	struct vhost_virtqueue *vq;
-	uint16_t n_segs_cpl, n_pkts_put = 0, n_descs = 0;
+	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
 	uint16_t start_idx, pkts_idx, vq_size;
 	uint16_t n_inflight;
-	uint64_t *async_pending_info;
+	struct async_inflight_info *pkts_info;
 
 	if (!dev)
 		return 0;
@@ -1657,51 +1637,38 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 
 	vq = dev->virtqueue[queue_id];
 
+	if (unlikely(!vq->async_registered)) {
+		VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
+			dev->vid, __func__, queue_id);
+		return 0;
+	}
+
 	rte_spinlock_lock(&vq->access_lock);
 
 	n_inflight = vq->async_pkts_inflight_n;
 	pkts_idx = vq->async_pkts_idx;
-	async_pending_info = vq->async_pending_info;
+	pkts_info = vq->async_pkts_info;
 	vq_size = vq->size;
 	start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
 		vq_size, vq->async_pkts_inflight_n);
 
-	n_segs_cpl = vq->async_ops.check_completed_copies(vid, queue_id,
-		0, ASYNC_MAX_POLL_SEG - vq->async_last_seg_n) +
-		vq->async_last_seg_n;
+	if (count > vq->async_last_pkts_n)
+		n_pkts_cpl = vq->async_ops.check_completed_copies(vid,
+			queue_id, 0, count - vq->async_last_pkts_n);
+	n_pkts_cpl += vq->async_last_pkts_n;
 
 	rte_smp_wmb();
 
 	while (likely((n_pkts_put < count) && n_inflight)) {
-		uint64_t info = async_pending_info[
-			(start_idx + n_pkts_put) & (vq_size - 1)];
-		uint64_t n_segs;
+		uint16_t info_idx = (start_idx + n_pkts_put) & (vq_size - 1);
 		n_pkts_put++;
 		n_inflight--;
-		n_descs += info & ASYNC_PENDING_INFO_N_MSK;
-		n_segs = info >> ASYNC_PENDING_INFO_N_SFT;
-
-		if (n_segs) {
-			if (unlikely(n_segs_cpl < n_segs)) {
-				n_pkts_put--;
-				n_inflight++;
-				n_descs -= info & ASYNC_PENDING_INFO_N_MSK;
-				if (n_segs_cpl) {
-					async_pending_info[
-						(start_idx + n_pkts_put) &
-						(vq_size - 1)] =
-					((n_segs - n_segs_cpl) <<
-					 ASYNC_PENDING_INFO_N_SFT) |
-					(info & ASYNC_PENDING_INFO_N_MSK);
-					n_segs_cpl = 0;
-				}
-				break;
-			}
-			n_segs_cpl -= n_segs;
-		}
+		n_descs += pkts_info[info_idx].descs;
+		if (pkts_info[info_idx].segs)
+			n_pkts_cpl--;
 	}
 
-	vq->async_last_seg_n = n_segs_cpl;
+	vq->async_last_pkts_n = n_pkts_cpl;
 
 	if (n_pkts_put) {
 		vq->async_pkts_inflight_n = n_inflight;
@@ -1710,16 +1677,18 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 					n_descs, __ATOMIC_RELEASE);
 			vhost_vring_call_split(dev, vq);
 		}
-	}
 
-	if (start_idx + n_pkts_put <= vq_size) {
-		rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
-			n_pkts_put * sizeof(uintptr_t));
-	} else {
-		rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
-			(vq_size - start_idx) * sizeof(uintptr_t));
-		rte_memcpy(&pkts[vq_size - start_idx], vq->async_pkts_pending,
-			(n_pkts_put - vq_size + start_idx) * sizeof(uintptr_t));
+		if (start_idx + n_pkts_put <= vq_size) {
+			rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
+				n_pkts_put * sizeof(uintptr_t));
+		} else {
+			rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
+				(vq_size - start_idx) * sizeof(uintptr_t));
+			rte_memcpy(&pkts[vq_size - start_idx],
+				vq->async_pkts_pending,
+				(n_pkts_put + start_idx - vq_size) *
+				sizeof(uintptr_t));
+		}
 	}
 
 	rte_spinlock_unlock(&vq->access_lock);
-- 
2.18.4


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [dpdk-dev] [PATCH v1 2/4] vhost: dynamically alloc async memory
  2020-09-11  1:53 [dpdk-dev] [PATCH v1 0/4] optimize async data path Patrick Fu
  2020-09-11  1:53 ` [dpdk-dev] [PATCH v1 1/4] vhost: simplify async copy completion Patrick Fu
@ 2020-09-11  1:53 ` Patrick Fu
  2020-09-23  9:15   ` Maxime Coquelin
  2020-09-11  1:53 ` [dpdk-dev] [PATCH v1 3/4] vhost: fix async vec buf overrun Patrick Fu
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 36+ messages in thread
From: Patrick Fu @ 2020-09-11  1:53 UTC (permalink / raw)
  To: dev, maxime.coquelin, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang, patrick.fu

Allocate the async internal memory buffers with rte_malloc(), replacing
the array declarations inside the vq structure. Dynamic allocation helps
to reduce the memory footprint when the async path is not registered.

Signed-off-by: Patrick Fu <patrick.fu@intel.com>
---
 lib/librte_vhost/vhost.c | 49 ++++++++++++++++++++++++----------------
 lib/librte_vhost/vhost.h |  4 ++--
 2 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index eca507836..ba374da67 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -324,6 +324,24 @@ cleanup_device(struct virtio_net *dev, int destroy)
 	}
 }
 
+static void
+vhost_free_async_mem(struct vhost_virtqueue *vq)
+{
+	if (vq->async_pkts_pending)
+		rte_free(vq->async_pkts_pending);
+	if (vq->async_pkts_info)
+		rte_free(vq->async_pkts_info);
+	if (vq->it_pool)
+		rte_free(vq->it_pool);
+	if (vq->vec_pool)
+		rte_free(vq->vec_pool);
+
+	vq->async_pkts_pending = NULL;
+	vq->async_pkts_info = NULL;
+	vq->it_pool = NULL;
+	vq->vec_pool = NULL;
+}
+
 void
 free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
 {
@@ -331,10 +349,7 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
 		rte_free(vq->shadow_used_packed);
 	else {
 		rte_free(vq->shadow_used_split);
-		if (vq->async_pkts_pending)
-			rte_free(vq->async_pkts_pending);
-		if (vq->async_pkts_info)
-			rte_free(vq->async_pkts_info);
+		vhost_free_async_mem(vq);
 	}
 	rte_free(vq->batch_copy_elems);
 	rte_mempool_free(vq->iotlb_pool);
@@ -1576,13 +1591,15 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	vq->async_pkts_info = rte_malloc(NULL,
 			vq->size * sizeof(struct async_inflight_info),
 			RTE_CACHE_LINE_SIZE);
-	if (!vq->async_pkts_pending || !vq->async_pkts_info) {
-		if (vq->async_pkts_pending)
-			rte_free(vq->async_pkts_pending);
-
-		if (vq->async_pkts_info)
-			rte_free(vq->async_pkts_info);
-
+	vq->it_pool = rte_malloc(NULL,
+			VHOST_MAX_ASYNC_IT * sizeof(struct rte_vhost_iov_iter),
+			RTE_CACHE_LINE_SIZE);
+	vq->vec_pool = rte_malloc(NULL,
+			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
+			RTE_CACHE_LINE_SIZE);
+	if (!vq->async_pkts_pending || !vq->async_pkts_info ||
+		!vq->it_pool || !vq->vec_pool) {
+		vhost_free_async_mem(vq);
 		VHOST_LOG_CONFIG(ERR,
 				"async register failed: cannot allocate memory for vq data "
 				"(vid %d, qid: %d)\n", vid, queue_id);
@@ -1630,15 +1647,7 @@ int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
 		goto out;
 	}
 
-	if (vq->async_pkts_pending) {
-		rte_free(vq->async_pkts_pending);
-		vq->async_pkts_pending = NULL;
-	}
-
-	if (vq->async_pkts_info) {
-		rte_free(vq->async_pkts_info);
-		vq->async_pkts_info = NULL;
-	}
+	vhost_free_async_mem(vq);
 
 	vq->async_ops.transfer_data = NULL;
 	vq->async_ops.check_completed_copies = NULL;
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 28aa77380..0af0ac23d 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -218,8 +218,8 @@ struct vhost_virtqueue {
 	/* operation callbacks for async dma */
 	struct rte_vhost_async_channel_ops	async_ops;
 
-	struct rte_vhost_iov_iter it_pool[VHOST_MAX_ASYNC_IT];
-	struct iovec vec_pool[VHOST_MAX_ASYNC_VEC];
+	struct rte_vhost_iov_iter *it_pool;
+	struct iovec *vec_pool;
 
 	/* async data transfer status */
 	uintptr_t	**async_pkts_pending;
-- 
2.18.4


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [dpdk-dev] [PATCH v1 3/4] vhost: fix async vec buf overrun
  2020-09-11  1:53 [dpdk-dev] [PATCH v1 0/4] optimize async data path Patrick Fu
  2020-09-11  1:53 ` [dpdk-dev] [PATCH v1 1/4] vhost: simplify async copy completion Patrick Fu
  2020-09-11  1:53 ` [dpdk-dev] [PATCH v1 2/4] vhost: dynamically alloc async memory Patrick Fu
@ 2020-09-11  1:53 ` Patrick Fu
  2020-09-23  9:21   ` Maxime Coquelin
  2020-09-11  1:53 ` [dpdk-dev] [PATCH v1 4/4] vhost: fix async register/unregister deadlock Patrick Fu
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 36+ messages in thread
From: Patrick Fu @ 2020-09-11  1:53 UTC (permalink / raw)
  To: dev, maxime.coquelin, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang, patrick.fu

Add a check on the async vector buffer usage to prevent a buffer
overrun. If the remaining vector buffer is not sufficient for the next
packet's iov creation, an async transfer will be triggered immediately
to free the vector buffer.

Fixes: 78639d54563a ("vhost: introduce async enqueue registration API")

Signed-off-by: Patrick Fu <patrick.fu@intel.com>
---
 lib/librte_vhost/vhost.h      | 2 +-
 lib/librte_vhost/virtio_net.c | 7 ++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 0af0ac23d..5e669e28f 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -47,7 +47,7 @@
 #define MAX_PKT_BURST 32
 
 #define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST * 2)
-#define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 2)
+#define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 4)
 
 #define PACKED_DESC_ENQUEUE_USED_FLAG(w)	\
 	((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 70c3377fb..18b91836a 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1492,6 +1492,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
 	uint16_t n_free_slot, slot_idx;
 	uint16_t pkt_err = 0;
+	uint16_t segs_await = 0;
 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
 	int n_pkts = 0;
 
@@ -1540,6 +1541,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			dst_iovec += dst_it->nr_segs;
 			src_it += 2;
 			dst_it += 2;
+			segs_await += src_it->nr_segs;
 		} else {
 			pkts_info[slot_idx].info = num_buffers;
 			vq->async_pkts_inflight_n++;
@@ -1548,13 +1550,16 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 		vq->last_avail_idx += num_buffers;
 
 		if (pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
-				(pkt_idx == count - 1 && pkt_burst_idx)) {
+				(pkt_idx == count - 1 && pkt_burst_idx) ||
+				VHOST_MAX_ASYNC_VEC / 2 - segs_await <
+				BUF_VECTOR_MAX) {
 			n_pkts = vq->async_ops.transfer_data(dev->vid,
 					queue_id, tdes, 0, pkt_burst_idx);
 			src_iovec = vec_pool;
 			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
 			src_it = it_pool;
 			dst_it = it_pool + 1;
+			segs_await = 0;
 			vq->async_pkts_inflight_n += n_pkts;
 
 			if (unlikely(n_pkts < (int)pkt_burst_idx)) {
-- 
2.18.4


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [dpdk-dev] [PATCH v1 4/4] vhost: fix async register/unregister deadlock
  2020-09-11  1:53 [dpdk-dev] [PATCH v1 0/4] optimize async data path Patrick Fu
                   ` (2 preceding siblings ...)
  2020-09-11  1:53 ` [dpdk-dev] [PATCH v1 3/4] vhost: fix async vec buf overrun Patrick Fu
@ 2020-09-11  1:53 ` Patrick Fu
  2020-09-29  6:29 ` [dpdk-dev] [PATCH v2 0/4] optimize async data path Patrick Fu
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 36+ messages in thread
From: Patrick Fu @ 2020-09-11  1:53 UTC (permalink / raw)
  To: dev, maxime.coquelin, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang, patrick.fu

When the async register/unregister functions are invoked in certain
vhost event callbacks (e.g. vring state change), a deadlock may occur
due to recursive spinlock acquisition. This patch removes the
unnecessary spinlock from the register API and uses the trylock()
primitive in the unregister API to avoid the deadlock.

Fixes: 78639d54563a ("vhost: introduce async enqueue registration API")

Signed-off-by: Patrick Fu <patrick.fu@intel.com>
---
 lib/librte_vhost/vhost.c      | 13 +++++++------
 lib/librte_vhost/vhost_user.c |  4 ++--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index ba374da67..8c2fee6b6 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -1576,8 +1576,6 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 		ops->transfer_data == NULL))
 		return -1;
 
-	rte_spinlock_lock(&vq->access_lock);
-
 	if (unlikely(vq->async_registered)) {
 		VHOST_LOG_CONFIG(ERR,
 			"async register failed: channel already registered "
@@ -1615,8 +1613,6 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	vq->async_registered = true;
 
 reg_out:
-	rte_spinlock_unlock(&vq->access_lock);
-
 	return 0;
 }
 
@@ -1635,10 +1631,15 @@ int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
 		return ret;
 
 	ret = 0;
-	rte_spinlock_lock(&vq->access_lock);
 
 	if (!vq->async_registered)
-		goto out;
+		return ret;
+
+	if (!rte_spinlock_trylock(&vq->access_lock)) {
+		VHOST_LOG_CONFIG(ERR, "Failed to unregister async channel. "
+			"virt queue busy.\n");
+		return -1;
+	}
 
 	if (vq->async_pkts_inflight_n) {
 		VHOST_LOG_CONFIG(ERR, "Failed to unregister async channel. "
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index f6cdbede7..39cd57aeb 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -2043,9 +2043,9 @@ vhost_user_set_vring_enable(struct virtio_net **pdev,
 		"set queue enable: %d to qp idx: %d\n",
 		enable, index);
 
-	if (!enable && dev->virtqueue[index]->async_registered) {
+	if (enable && dev->virtqueue[index]->async_registered) {
 		if (dev->virtqueue[index]->async_pkts_inflight_n) {
-			VHOST_LOG_CONFIG(ERR, "failed to disable vring. "
+			VHOST_LOG_CONFIG(ERR, "failed to enable vring. "
 			"async inflight packets must be completed first\n");
 			return RTE_VHOST_MSG_RESULT_ERR;
 		}
-- 
2.18.4


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [dpdk-dev] [PATCH v1 1/4] vhost: simplify async copy completion
  2020-09-11  1:53 ` [dpdk-dev] [PATCH v1 1/4] vhost: simplify async copy completion Patrick Fu
@ 2020-09-23  9:07   ` Maxime Coquelin
  0 siblings, 0 replies; 36+ messages in thread
From: Maxime Coquelin @ 2020-09-23  9:07 UTC (permalink / raw)
  To: Patrick Fu, dev, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang



On 9/11/20 3:53 AM, Patrick Fu wrote:
> Current async ops allows check_completed_copies() callback to return
> arbitrary number of async iov segments finished from backend async
> devices. This design creates complexity for vhost to handle breaking
> transfer of a single packet (i.e. transfer completes in the middle
> of a async descriptor) and prevents application callbacks from
> leveraging hardware capability to offload the work. Thus, this patch
> enforces the check_completed_copies() callback to return the number
> of async memory descriptors, which is aligned with async transfer
> data ops callbacks. vHost async data path are revised to work with
> new ops define, which provides a clean and simplified processing.
> 
> Signed-off-by: Patrick Fu <patrick.fu@intel.com>
> ---
>  lib/librte_vhost/rte_vhost_async.h |  15 ++-
>  lib/librte_vhost/vhost.c           |  20 ++--
>  lib/librte_vhost/vhost.h           |   8 +-
>  lib/librte_vhost/vhost_user.c      |   6 +-
>  lib/librte_vhost/virtio_net.c      | 149 ++++++++++++-----------------
>  5 files changed, 88 insertions(+), 110 deletions(-)
> 

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [dpdk-dev] [PATCH v1 2/4] vhost: dynamically alloc async memory
  2020-09-11  1:53 ` [dpdk-dev] [PATCH v1 2/4] vhost: dynamically alloc async memory Patrick Fu
@ 2020-09-23  9:15   ` Maxime Coquelin
  2020-09-29  5:55     ` Fu, Patrick
  0 siblings, 1 reply; 36+ messages in thread
From: Maxime Coquelin @ 2020-09-23  9:15 UTC (permalink / raw)
  To: Patrick Fu, dev, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang

s/alloc/allocate/

On 9/11/20 3:53 AM, Patrick Fu wrote:
> alloc async internal memory buffer by rte_malloc(), replacing array

Allocate async internal memory buffer with rte_malloc()

> declaration inside vq structure. Dynamic allocation can help to save
> memory footprint when async path is not registered.
> 
> Signed-off-by: Patrick Fu <patrick.fu@intel.com>
> ---
>  lib/librte_vhost/vhost.c | 49 ++++++++++++++++++++++++----------------
>  lib/librte_vhost/vhost.h |  4 ++--
>  2 files changed, 31 insertions(+), 22 deletions(-)
> 
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index eca507836..ba374da67 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -324,6 +324,24 @@ cleanup_device(struct virtio_net *dev, int destroy)
>  	}
>  }
>  
> +static void
> +vhost_free_async_mem(struct vhost_virtqueue *vq)
> +{
> +	if (vq->async_pkts_pending)
> +		rte_free(vq->async_pkts_pending);
> +	if (vq->async_pkts_info)
> +		rte_free(vq->async_pkts_info);
> +	if (vq->it_pool)
> +		rte_free(vq->it_pool);
> +	if (vq->vec_pool)
> +		rte_free(vq->vec_pool);
> +
> +	vq->async_pkts_pending = NULL;
> +	vq->async_pkts_info = NULL;
> +	vq->it_pool = NULL;
> +	vq->vec_pool = NULL;
> +}
> +
>  void
>  free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
>  {
> @@ -331,10 +349,7 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
>  		rte_free(vq->shadow_used_packed);
>  	else {
>  		rte_free(vq->shadow_used_split);
> -		if (vq->async_pkts_pending)
> -			rte_free(vq->async_pkts_pending);
> -		if (vq->async_pkts_info)
> -			rte_free(vq->async_pkts_info);
> +		vhost_free_async_mem(vq);
>  	}
>  	rte_free(vq->batch_copy_elems);
>  	rte_mempool_free(vq->iotlb_pool);
> @@ -1576,13 +1591,15 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
>  	vq->async_pkts_info = rte_malloc(NULL,
>  			vq->size * sizeof(struct async_inflight_info),
>  			RTE_CACHE_LINE_SIZE);
> -	if (!vq->async_pkts_pending || !vq->async_pkts_info) {
> -		if (vq->async_pkts_pending)
> -			rte_free(vq->async_pkts_pending);
> -
> -		if (vq->async_pkts_info)
> -			rte_free(vq->async_pkts_info);
> -
> +	vq->it_pool = rte_malloc(NULL,
> +			VHOST_MAX_ASYNC_IT * sizeof(struct rte_vhost_iov_iter),
> +			RTE_CACHE_LINE_SIZE);
> +	vq->vec_pool = rte_malloc(NULL,
> +			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
> +			RTE_CACHE_LINE_SIZE);
> +	if (!vq->async_pkts_pending || !vq->async_pkts_info ||
> +		!vq->it_pool || !vq->vec_pool) {
> +		vhost_free_async_mem(vq);
>  		VHOST_LOG_CONFIG(ERR,
>  				"async register failed: cannot allocate memory for vq data "
>  				"(vid %d, qid: %d)\n", vid, queue_id);
> @@ -1630,15 +1647,7 @@ int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
>  		goto out;
>  	}
>  
> -	if (vq->async_pkts_pending) {
> -		rte_free(vq->async_pkts_pending);
> -		vq->async_pkts_pending = NULL;
> -	}
> -
> -	if (vq->async_pkts_info) {
> -		rte_free(vq->async_pkts_info);
> -		vq->async_pkts_info = NULL;
> -	}
> +	vhost_free_async_mem(vq);
>  
>  	vq->async_ops.transfer_data = NULL;
>  	vq->async_ops.check_completed_copies = NULL;
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 28aa77380..0af0ac23d 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -218,8 +218,8 @@ struct vhost_virtqueue {
>  	/* operation callbacks for async dma */
>  	struct rte_vhost_async_channel_ops	async_ops;
>  
> -	struct rte_vhost_iov_iter it_pool[VHOST_MAX_ASYNC_IT];
> -	struct iovec vec_pool[VHOST_MAX_ASYNC_VEC];
> +	struct rte_vhost_iov_iter *it_pool;
> +	struct iovec *vec_pool;
>  
>  	/* async data transfer status */
>  	uintptr_t	**async_pkts_pending;
> 

I think you should also take care of reallocating on the same NUMA node
as the ring (see numa_realloc()).


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [dpdk-dev] [PATCH v1 3/4] vhost: fix async vec buf overrun
  2020-09-11  1:53 ` [dpdk-dev] [PATCH v1 3/4] vhost: fix async vec buf overrun Patrick Fu
@ 2020-09-23  9:21   ` Maxime Coquelin
  2020-09-29  2:23     ` Fu, Patrick
  0 siblings, 1 reply; 36+ messages in thread
From: Maxime Coquelin @ 2020-09-23  9:21 UTC (permalink / raw)
  To: Patrick Fu, dev, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang

s/buf/buffer/

On 9/11/20 3:53 AM, Patrick Fu wrote:
> Add check on the async vec buffer usage to prevent the buf overrun.

s/vec/vector/

s/buf/buffer/


> If vec buf is not sufficient to prepare for next packet's iov

same here

> creation, an async transfer will be triggered immediately to free
> the vec buf.
> 
> Fixes: 78639d54563a ("vhost: introduce async enqueue registration API")
> 
> Signed-off-by: Patrick Fu <patrick.fu@intel.com>
> ---
>  lib/librte_vhost/vhost.h      | 2 +-
>  lib/librte_vhost/virtio_net.c | 7 ++++++-
>  2 files changed, 7 insertions(+), 2 deletions(-)
> 
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 0af0ac23d..5e669e28f 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -47,7 +47,7 @@
>  #define MAX_PKT_BURST 32
>  
>  #define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST * 2)
> -#define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 2)
> +#define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 4)
>  
>  #define PACKED_DESC_ENQUEUE_USED_FLAG(w)	\
>  	((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 70c3377fb..18b91836a 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -1492,6 +1492,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
>  	uint16_t n_free_slot, slot_idx;
>  	uint16_t pkt_err = 0;
> +	uint16_t segs_await = 0;
>  	struct async_inflight_info *pkts_info = vq->async_pkts_info;
>  	int n_pkts = 0;
>  
> @@ -1540,6 +1541,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  			dst_iovec += dst_it->nr_segs;
>  			src_it += 2;
>  			dst_it += 2;
> +			segs_await += src_it->nr_segs;
>  		} else {
>  			pkts_info[slot_idx].info = num_buffers;
>  			vq->async_pkts_inflight_n++;
> @@ -1548,13 +1550,16 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  		vq->last_avail_idx += num_buffers;
>  
>  		if (pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
> -				(pkt_idx == count - 1 && pkt_burst_idx)) {
> +				(pkt_idx == count - 1 && pkt_burst_idx) ||
> +				VHOST_MAX_ASYNC_VEC / 2 - segs_await <
> +				BUF_VECTOR_MAX) {

Parentheses may help.

Or better to refactor the code for better readability.

>  			n_pkts = vq->async_ops.transfer_data(dev->vid,
>  					queue_id, tdes, 0, pkt_burst_idx);
>  			src_iovec = vec_pool;
>  			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
>  			src_it = it_pool;
>  			dst_it = it_pool + 1;
> +			segs_await = 0;
>  			vq->async_pkts_inflight_n += n_pkts;
>  
>  			if (unlikely(n_pkts < (int)pkt_burst_idx)) {
> 


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [dpdk-dev] [PATCH v1 3/4] vhost: fix async vec buf overrun
  2020-09-23  9:21   ` Maxime Coquelin
@ 2020-09-29  2:23     ` Fu, Patrick
  0 siblings, 0 replies; 36+ messages in thread
From: Fu, Patrick @ 2020-09-29  2:23 UTC (permalink / raw)
  To: Maxime Coquelin, dev, Xia, Chenbo; +Cc: Wang, Zhihong, Jiang, Cheng1



> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Wednesday, September 23, 2020 5:22 PM
> To: Fu, Patrick <patrick.fu@intel.com>; dev@dpdk.org; Xia, Chenbo
> <chenbo.xia@intel.com>
> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Jiang, Cheng1
> <cheng1.jiang@intel.com>
> Subject: Re: [PATCH v1 3/4] vhost: fix async vec buf overrun
> 
> s/buf/buffer/
> 
> On 9/11/20 3:53 AM, Patrick Fu wrote:
> > Add check on the async vec buffer usage to prevent the buf overrun.
> 
> s/vec/vector/
> 
> s/buf/buffer/
> 
> 
> > If vec buf is not sufficient to prepare for next packet's iov
> 
> same here
> 
Fix in v2

> >  		if (pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
> > -				(pkt_idx == count - 1 && pkt_burst_idx)) {
> > +				(pkt_idx == count - 1 && pkt_burst_idx) ||
> > +				VHOST_MAX_ASYNC_VEC / 2 - segs_await <
> > +				BUF_VECTOR_MAX) {
> 
> Parenthesis may help.
> 
> Or better to refactor the code for better readability.
> 
Refactoring the code seems to be a little bit tricky. I will add parentheses together with some comments to improve readability.

Thanks,

Patrick

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [dpdk-dev] [PATCH v1 2/4] vhost: dynamically alloc async memory
  2020-09-23  9:15   ` Maxime Coquelin
@ 2020-09-29  5:55     ` Fu, Patrick
  0 siblings, 0 replies; 36+ messages in thread
From: Fu, Patrick @ 2020-09-29  5:55 UTC (permalink / raw)
  To: Maxime Coquelin, dev, Xia, Chenbo; +Cc: Wang, Zhihong, Jiang, Cheng1



> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Wednesday, September 23, 2020 5:15 PM
> To: Fu, Patrick <patrick.fu@intel.com>; dev@dpdk.org; Xia, Chenbo
> <chenbo.xia@intel.com>
> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Jiang, Cheng1
> <cheng1.jiang@intel.com>
> Subject: Re: [PATCH v1 2/4] vhost: dynamically alloc async memory
> 
> s/alloc/allocate/
> 
Fix in v2

> On 9/11/20 3:53 AM, Patrick Fu wrote:
> > alloc async internal memory buffer by rte_malloc(), replacing array
> 
> Allocate async internal memory buffer with rte_malloc()
> 
Fix in v2

> > index 28aa77380..0af0ac23d 100644
> > --- a/lib/librte_vhost/vhost.h
> > +++ b/lib/librte_vhost/vhost.h
> > @@ -218,8 +218,8 @@ struct vhost_virtqueue {
> >  	/* operation callbacks for async dma */
> >  	struct rte_vhost_async_channel_ops	async_ops;
> >
> > -	struct rte_vhost_iov_iter it_pool[VHOST_MAX_ASYNC_IT];
> > -	struct iovec vec_pool[VHOST_MAX_ASYNC_VEC];
> > +	struct rte_vhost_iov_iter *it_pool;
> > +	struct iovec *vec_pool;
> >
> >  	/* async data transfer status */
> >  	uintptr_t	**async_pkts_pending;
> >
> 
> I think you should also take care of reallocating on the same numa node
> the ring is (seel numa_realloc().
Agree, will add numa based allocation in v2

Thanks,

Patrick

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [dpdk-dev] [PATCH v2 0/4] optimize async data path
  2020-09-11  1:53 [dpdk-dev] [PATCH v1 0/4] optimize async data path Patrick Fu
                   ` (3 preceding siblings ...)
  2020-09-11  1:53 ` [dpdk-dev] [PATCH v1 4/4] vhost: fix async register/unregister deadlock Patrick Fu
@ 2020-09-29  6:29 ` Patrick Fu
  2020-09-29  6:29   ` [dpdk-dev] [PATCH v2 1/4] vhost: simplify async copy completion Patrick Fu
                     ` (3 more replies)
  2020-09-29  9:29 ` [dpdk-dev] [PATCH v3 0/4] optimize async data path Patrick Fu
  2020-10-13  1:45 ` [dpdk-dev] [PATCH v4 0/4] optimize async data path Patrick Fu
  6 siblings, 4 replies; 36+ messages in thread
From: Patrick Fu @ 2020-09-29  6:29 UTC (permalink / raw)
  To: dev, maxime.coquelin, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang, patrick.fu

This series applies optimization and fixes to the vhost
async data path.

v2:
 - minor rewordings on commit message
 - minor fix on poll_enqueue_completed to correct a packet
   number calculation issue
 - allocate async buffer memory on the same NUMA node as the vq
 - add some comments in data path to improve readability

Patrick Fu (4):
  vhost: simplify async copy completion
  vhost: dynamically allocate async memory
  vhost: fix async vector buffer overrun
  vhost: fix async register/unregister deadlock

 lib/librte_vhost/rte_vhost_async.h |  15 ++-
 lib/librte_vhost/vhost.c           |  80 ++++++++------
 lib/librte_vhost/vhost.h           |  14 +--
 lib/librte_vhost/vhost_user.c      |  10 +-
 lib/librte_vhost/virtio_net.c      | 164 +++++++++++++----------------
 5 files changed, 146 insertions(+), 137 deletions(-)

-- 
2.18.4


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [dpdk-dev] [PATCH v2 1/4] vhost: simplify async copy completion
  2020-09-29  6:29 ` [dpdk-dev] [PATCH v2 0/4] optimize async data path Patrick Fu
@ 2020-09-29  6:29   ` Patrick Fu
  2020-09-29  6:29   ` [dpdk-dev] [PATCH v2 2/4] vhost: dynamically allocate async memory Patrick Fu
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 36+ messages in thread
From: Patrick Fu @ 2020-09-29  6:29 UTC (permalink / raw)
  To: dev, maxime.coquelin, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang, patrick.fu

The current async ops allow the check_completed_copies() callback to
return an arbitrary number of async iov segments finished by backend
async devices. This design makes it complex for vhost to handle a
partially completed transfer of a single packet (i.e. a transfer that
completes in the middle of an async descriptor) and prevents application
callbacks from leveraging hardware capability to offload the work. Thus,
this patch requires the check_completed_copies() callback to return the
number of async memory descriptors completed, which is aligned with the
async transfer data ops callbacks. The vhost async data path is revised
to work with the new ops definition, which provides clean and simplified
processing.

Signed-off-by: Patrick Fu <patrick.fu@intel.com>
---
 lib/librte_vhost/rte_vhost_async.h |  15 ++-
 lib/librte_vhost/vhost.c           |  20 ++--
 lib/librte_vhost/vhost.h           |   8 +-
 lib/librte_vhost/vhost_user.c      |   6 +-
 lib/librte_vhost/virtio_net.c      | 151 ++++++++++++-----------------
 5 files changed, 90 insertions(+), 110 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost_async.h b/lib/librte_vhost/rte_vhost_async.h
index cb6284539..c73bd7c99 100644
--- a/lib/librte_vhost/rte_vhost_async.h
+++ b/lib/librte_vhost/rte_vhost_async.h
@@ -76,13 +76,26 @@ struct rte_vhost_async_channel_ops {
 	 * @param max_packets
 	 *  max number of packets could be completed
 	 * @return
-	 *  number of iov segments completed
+	 *  number of async descs completed
 	 */
 	uint32_t (*check_completed_copies)(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data,
 		uint16_t max_packets);
 };
 
+/**
+ * inflight async packet information
+ */
+struct async_inflight_info {
+	union {
+		uint32_t info;
+		struct {
+			uint16_t descs; /* num of descs inflight */
+			uint16_t segs; /* iov segs inflight */
+		};
+	};
+};
+
 /**
  *  dma channel feature bit definition
  */
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 8f20a0818..eca507836 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -333,8 +333,8 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
 		rte_free(vq->shadow_used_split);
 		if (vq->async_pkts_pending)
 			rte_free(vq->async_pkts_pending);
-		if (vq->async_pending_info)
-			rte_free(vq->async_pending_info);
+		if (vq->async_pkts_info)
+			rte_free(vq->async_pkts_info);
 	}
 	rte_free(vq->batch_copy_elems);
 	rte_mempool_free(vq->iotlb_pool);
@@ -1573,15 +1573,15 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	vq->async_pkts_pending = rte_malloc(NULL,
 			vq->size * sizeof(uintptr_t),
 			RTE_CACHE_LINE_SIZE);
-	vq->async_pending_info = rte_malloc(NULL,
-			vq->size * sizeof(uint64_t),
+	vq->async_pkts_info = rte_malloc(NULL,
+			vq->size * sizeof(struct async_inflight_info),
 			RTE_CACHE_LINE_SIZE);
-	if (!vq->async_pkts_pending || !vq->async_pending_info) {
+	if (!vq->async_pkts_pending || !vq->async_pkts_info) {
 		if (vq->async_pkts_pending)
 			rte_free(vq->async_pkts_pending);
 
-		if (vq->async_pending_info)
-			rte_free(vq->async_pending_info);
+		if (vq->async_pkts_info)
+			rte_free(vq->async_pkts_info);
 
 		VHOST_LOG_CONFIG(ERR,
 				"async register failed: cannot allocate memory for vq data "
@@ -1635,9 +1635,9 @@ int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
 		vq->async_pkts_pending = NULL;
 	}
 
-	if (vq->async_pending_info) {
-		rte_free(vq->async_pending_info);
-		vq->async_pending_info = NULL;
+	if (vq->async_pkts_info) {
+		rte_free(vq->async_pkts_info);
+		vq->async_pkts_info = NULL;
 	}
 
 	vq->async_ops.transfer_data = NULL;
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 73a1ed889..155a832c1 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -46,8 +46,6 @@
 
 #define MAX_PKT_BURST 32
 
-#define ASYNC_MAX_POLL_SEG 255
-
 #define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST * 2)
 #define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 2)
 
@@ -225,12 +223,10 @@ struct vhost_virtqueue {
 
 	/* async data transfer status */
 	uintptr_t	**async_pkts_pending;
-	#define		ASYNC_PENDING_INFO_N_MSK 0xFFFF
-	#define		ASYNC_PENDING_INFO_N_SFT 16
-	uint64_t	*async_pending_info;
+	struct async_inflight_info *async_pkts_info;
 	uint16_t	async_pkts_idx;
 	uint16_t	async_pkts_inflight_n;
-	uint16_t	async_last_seg_n;
+	uint16_t	async_last_pkts_n;
 
 	/* vq async features */
 	bool		async_inorder;
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index 501218e19..67d2a2d43 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -2006,10 +2006,10 @@ vhost_user_get_vring_base(struct virtio_net **pdev,
 		vq->shadow_used_split = NULL;
 		if (vq->async_pkts_pending)
 			rte_free(vq->async_pkts_pending);
-		if (vq->async_pending_info)
-			rte_free(vq->async_pending_info);
+		if (vq->async_pkts_info)
+			rte_free(vq->async_pkts_info);
 		vq->async_pkts_pending = NULL;
-		vq->async_pending_info = NULL;
+		vq->async_pkts_info = NULL;
 	}
 
 	rte_free(vq->batch_copy_elems);
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index bd9303c8a..68ead9a71 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1473,37 +1473,6 @@ virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
 		(vq_size - n_inflight + pkts_idx) & (vq_size - 1);
 }
 
-static __rte_always_inline void
-virtio_dev_rx_async_submit_split_err(struct virtio_net *dev,
-	struct vhost_virtqueue *vq, uint16_t queue_id,
-	uint16_t last_idx, uint16_t shadow_idx)
-{
-	uint16_t start_idx, pkts_idx, vq_size;
-	uint64_t *async_pending_info;
-
-	pkts_idx = vq->async_pkts_idx;
-	async_pending_info = vq->async_pending_info;
-	vq_size = vq->size;
-	start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
-		vq_size, vq->async_pkts_inflight_n);
-
-	while (likely((start_idx & (vq_size - 1)) != pkts_idx)) {
-		uint64_t n_seg =
-			async_pending_info[(start_idx) & (vq_size - 1)] >>
-			ASYNC_PENDING_INFO_N_SFT;
-
-		while (n_seg)
-			n_seg -= vq->async_ops.check_completed_copies(dev->vid,
-				queue_id, 0, 1);
-	}
-
-	vq->async_pkts_inflight_n = 0;
-	vq->batch_copy_nb_elems = 0;
-
-	vq->shadow_used_idx = shadow_idx;
-	vq->last_avail_idx = last_idx;
-}
-
 static __rte_noinline uint32_t
 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct vhost_virtqueue *vq, uint16_t queue_id,
@@ -1512,7 +1481,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
 	uint16_t num_buffers;
 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
-	uint16_t avail_head, last_idx, shadow_idx;
+	uint16_t avail_head;
 
 	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
 	struct iovec *vec_pool = vq->vec_pool;
@@ -1522,11 +1491,11 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct rte_vhost_iov_iter *src_it = it_pool;
 	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
 	uint16_t n_free_slot, slot_idx;
+	uint16_t pkt_err = 0;
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
 	int n_pkts = 0;
 
 	avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
-	last_idx = vq->last_avail_idx;
-	shadow_idx = vq->shadow_used_idx;
 
 	/*
 	 * The ordering between avail index and
@@ -1565,14 +1534,14 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 		if (src_it->count) {
 			async_fill_desc(&tdes[pkt_burst_idx], src_it, dst_it);
 			pkt_burst_idx++;
-			vq->async_pending_info[slot_idx] =
-				num_buffers | (src_it->nr_segs << 16);
+			pkts_info[slot_idx].descs = num_buffers;
+			pkts_info[slot_idx].segs = src_it->nr_segs;
 			src_iovec += src_it->nr_segs;
 			dst_iovec += dst_it->nr_segs;
 			src_it += 2;
 			dst_it += 2;
 		} else {
-			vq->async_pending_info[slot_idx] = num_buffers;
+			pkts_info[slot_idx].info = num_buffers;
 			vq->async_pkts_inflight_n++;
 		}
 
@@ -1586,35 +1555,46 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
 			src_it = it_pool;
 			dst_it = it_pool + 1;
+			vq->async_pkts_inflight_n += n_pkts;
 
 			if (unlikely(n_pkts < (int)pkt_burst_idx)) {
-				vq->async_pkts_inflight_n +=
-					n_pkts > 0 ? n_pkts : 0;
-				virtio_dev_rx_async_submit_split_err(dev,
-					vq, queue_id, last_idx, shadow_idx);
-				return 0;
+				/*
+				 * log error packets number here and do actual
+				 * error processing when applications poll
+				 * completion
+				 */
+				pkt_err = pkt_burst_idx - n_pkts;
+				pkt_burst_idx = 0;
+				break;
 			}
 
 			pkt_burst_idx = 0;
-			vq->async_pkts_inflight_n += n_pkts;
 		}
 	}
 
 	if (pkt_burst_idx) {
 		n_pkts = vq->async_ops.transfer_data(dev->vid,
 				queue_id, tdes, 0, pkt_burst_idx);
-		if (unlikely(n_pkts < (int)pkt_burst_idx)) {
-			vq->async_pkts_inflight_n += n_pkts > 0 ? n_pkts : 0;
-			virtio_dev_rx_async_submit_split_err(dev, vq, queue_id,
-				last_idx, shadow_idx);
-			return 0;
-		}
-
 		vq->async_pkts_inflight_n += n_pkts;
+
+		if (unlikely(n_pkts < (int)pkt_burst_idx))
+			pkt_err = pkt_burst_idx - n_pkts;
 	}
 
 	do_data_copy_enqueue(dev, vq);
 
+	if (unlikely(pkt_err)) {
+		do {
+			if (pkts_info[slot_idx].segs)
+				pkt_err--;
+			vq->last_avail_idx -= pkts_info[slot_idx].descs;
+			vq->shadow_used_idx -= pkts_info[slot_idx].descs;
+			vq->async_pkts_inflight_n--;
+			slot_idx--;
+			pkt_idx--;
+		} while (pkt_err);
+	}
+
 	n_free_slot = vq->size - vq->async_pkts_idx;
 	if (n_free_slot > pkt_idx) {
 		rte_memcpy(&vq->async_pkts_pending[vq->async_pkts_idx],
@@ -1640,10 +1620,10 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 {
 	struct virtio_net *dev = get_device(vid);
 	struct vhost_virtqueue *vq;
-	uint16_t n_segs_cpl, n_pkts_put = 0, n_descs = 0;
+	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
 	uint16_t start_idx, pkts_idx, vq_size;
 	uint16_t n_inflight;
-	uint64_t *async_pending_info;
+	struct async_inflight_info *pkts_info;
 
 	if (!dev)
 		return 0;
@@ -1657,51 +1637,40 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 
 	vq = dev->virtqueue[queue_id];
 
+	if (unlikely(!vq->async_registered)) {
+		VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
+			dev->vid, __func__, queue_id);
+		return 0;
+	}
+
 	rte_spinlock_lock(&vq->access_lock);
 
 	n_inflight = vq->async_pkts_inflight_n;
 	pkts_idx = vq->async_pkts_idx;
-	async_pending_info = vq->async_pending_info;
+	pkts_info = vq->async_pkts_info;
 	vq_size = vq->size;
 	start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
 		vq_size, vq->async_pkts_inflight_n);
 
-	n_segs_cpl = vq->async_ops.check_completed_copies(vid, queue_id,
-		0, ASYNC_MAX_POLL_SEG - vq->async_last_seg_n) +
-		vq->async_last_seg_n;
+	if (count > vq->async_last_pkts_n)
+		n_pkts_cpl = vq->async_ops.check_completed_copies(vid,
+			queue_id, 0, count - vq->async_last_pkts_n);
+	n_pkts_cpl += vq->async_last_pkts_n;
 
 	rte_smp_wmb();
 
 	while (likely((n_pkts_put < count) && n_inflight)) {
-		uint64_t info = async_pending_info[
-			(start_idx + n_pkts_put) & (vq_size - 1)];
-		uint64_t n_segs;
+		uint16_t info_idx = (start_idx + n_pkts_put) & (vq_size - 1);
+		if (n_pkts_cpl && pkts_info[info_idx].segs)
+			n_pkts_cpl--;
+		else if (!n_pkts_cpl && pkts_info[info_idx].segs)
+			break;
 		n_pkts_put++;
 		n_inflight--;
-		n_descs += info & ASYNC_PENDING_INFO_N_MSK;
-		n_segs = info >> ASYNC_PENDING_INFO_N_SFT;
-
-		if (n_segs) {
-			if (unlikely(n_segs_cpl < n_segs)) {
-				n_pkts_put--;
-				n_inflight++;
-				n_descs -= info & ASYNC_PENDING_INFO_N_MSK;
-				if (n_segs_cpl) {
-					async_pending_info[
-						(start_idx + n_pkts_put) &
-						(vq_size - 1)] =
-					((n_segs - n_segs_cpl) <<
-					 ASYNC_PENDING_INFO_N_SFT) |
-					(info & ASYNC_PENDING_INFO_N_MSK);
-					n_segs_cpl = 0;
-				}
-				break;
-			}
-			n_segs_cpl -= n_segs;
-		}
+		n_descs += pkts_info[info_idx].descs;
 	}
 
-	vq->async_last_seg_n = n_segs_cpl;
+	vq->async_last_pkts_n = n_pkts_cpl;
 
 	if (n_pkts_put) {
 		vq->async_pkts_inflight_n = n_inflight;
@@ -1710,16 +1679,18 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 					n_descs, __ATOMIC_RELEASE);
 			vhost_vring_call_split(dev, vq);
 		}
-	}
 
-	if (start_idx + n_pkts_put <= vq_size) {
-		rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
-			n_pkts_put * sizeof(uintptr_t));
-	} else {
-		rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
-			(vq_size - start_idx) * sizeof(uintptr_t));
-		rte_memcpy(&pkts[vq_size - start_idx], vq->async_pkts_pending,
-			(n_pkts_put - vq_size + start_idx) * sizeof(uintptr_t));
+		if (start_idx + n_pkts_put <= vq_size) {
+			rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
+				n_pkts_put * sizeof(uintptr_t));
+		} else {
+			rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
+				(vq_size - start_idx) * sizeof(uintptr_t));
+			rte_memcpy(&pkts[vq_size - start_idx],
+				vq->async_pkts_pending,
+				(n_pkts_put + start_idx - vq_size) *
+				sizeof(uintptr_t));
+		}
 	}
 
 	rte_spinlock_unlock(&vq->access_lock);
-- 
2.18.4


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [dpdk-dev] [PATCH v2 2/4] vhost: dynamically allocate async memory
  2020-09-29  6:29 ` [dpdk-dev] [PATCH v2 0/4] optimize async data path Patrick Fu
  2020-09-29  6:29   ` [dpdk-dev] [PATCH v2 1/4] vhost: simplify async copy completion Patrick Fu
@ 2020-09-29  6:29   ` Patrick Fu
  2020-09-29  6:29   ` [dpdk-dev] [PATCH v2 3/4] vhost: fix async vector buffer overrun Patrick Fu
  2020-09-29  6:29   ` [dpdk-dev] [PATCH v2 4/4] vhost: fix async register/unregister deadlock Patrick Fu
  3 siblings, 0 replies; 36+ messages in thread
From: Patrick Fu @ 2020-09-29  6:29 UTC (permalink / raw)
  To: dev, maxime.coquelin, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang, patrick.fu

Allocate the async internal memory buffers with rte_malloc(), replacing
the array declarations inside the vq structure. Dynamic allocation helps
to reduce the memory footprint when the async path is not registered.

Signed-off-by: Patrick Fu <patrick.fu@intel.com>
---
 lib/librte_vhost/vhost.c | 65 +++++++++++++++++++++++++---------------
 lib/librte_vhost/vhost.h |  4 +--
 2 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index eca507836..919a5bf7f 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -324,6 +324,24 @@ cleanup_device(struct virtio_net *dev, int destroy)
 	}
 }
 
+static void
+vhost_free_async_mem(struct vhost_virtqueue *vq)
+{
+	if (vq->async_pkts_pending)
+		rte_free(vq->async_pkts_pending);
+	if (vq->async_pkts_info)
+		rte_free(vq->async_pkts_info);
+	if (vq->it_pool)
+		rte_free(vq->it_pool);
+	if (vq->vec_pool)
+		rte_free(vq->vec_pool);
+
+	vq->async_pkts_pending = NULL;
+	vq->async_pkts_info = NULL;
+	vq->it_pool = NULL;
+	vq->vec_pool = NULL;
+}
+
 void
 free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
 {
@@ -331,10 +349,7 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
 		rte_free(vq->shadow_used_packed);
 	else {
 		rte_free(vq->shadow_used_split);
-		if (vq->async_pkts_pending)
-			rte_free(vq->async_pkts_pending);
-		if (vq->async_pkts_info)
-			rte_free(vq->async_pkts_info);
+		vhost_free_async_mem(vq);
 	}
 	rte_free(vq->batch_copy_elems);
 	rte_mempool_free(vq->iotlb_pool);
@@ -1538,6 +1553,7 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	struct vhost_virtqueue *vq;
 	struct virtio_net *dev = get_device(vid);
 	struct rte_vhost_async_features f;
+	int node;
 
 	if (dev == NULL || ops == NULL)
 		return -1;
@@ -1570,19 +1586,28 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 		goto reg_out;
 	}
 
-	vq->async_pkts_pending = rte_malloc(NULL,
+	if (get_mempolicy(&node, NULL, 0, vq, MPOL_F_NODE | MPOL_F_ADDR)) {
+		VHOST_LOG_CONFIG(ERR,
+			"unable to get numa information in async register. "
+			"allocating async buffer memory on the caller thread node\n");
+		node = SOCKET_ID_ANY;
+	}
+
+	vq->async_pkts_pending = rte_malloc_socket(NULL,
 			vq->size * sizeof(uintptr_t),
-			RTE_CACHE_LINE_SIZE);
-	vq->async_pkts_info = rte_malloc(NULL,
+			RTE_CACHE_LINE_SIZE, node);
+	vq->async_pkts_info = rte_malloc_socket(NULL,
 			vq->size * sizeof(struct async_inflight_info),
-			RTE_CACHE_LINE_SIZE);
-	if (!vq->async_pkts_pending || !vq->async_pkts_info) {
-		if (vq->async_pkts_pending)
-			rte_free(vq->async_pkts_pending);
-
-		if (vq->async_pkts_info)
-			rte_free(vq->async_pkts_info);
-
+			RTE_CACHE_LINE_SIZE, node);
+	vq->it_pool = rte_malloc_socket(NULL,
+			VHOST_MAX_ASYNC_IT * sizeof(struct rte_vhost_iov_iter),
+			RTE_CACHE_LINE_SIZE, node);
+	vq->vec_pool = rte_malloc_socket(NULL,
+			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
+			RTE_CACHE_LINE_SIZE, node);
+	if (!vq->async_pkts_pending || !vq->async_pkts_info ||
+		!vq->it_pool || !vq->vec_pool) {
+		vhost_free_async_mem(vq);
 		VHOST_LOG_CONFIG(ERR,
 				"async register failed: cannot allocate memory for vq data "
 				"(vid %d, qid: %d)\n", vid, queue_id);
@@ -1630,15 +1655,7 @@ int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
 		goto out;
 	}
 
-	if (vq->async_pkts_pending) {
-		rte_free(vq->async_pkts_pending);
-		vq->async_pkts_pending = NULL;
-	}
-
-	if (vq->async_pkts_info) {
-		rte_free(vq->async_pkts_info);
-		vq->async_pkts_info = NULL;
-	}
+	vhost_free_async_mem(vq);
 
 	vq->async_ops.transfer_data = NULL;
 	vq->async_ops.check_completed_copies = NULL;
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 155a832c1..f0ee00c73 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -218,8 +218,8 @@ struct vhost_virtqueue {
 	/* operation callbacks for async dma */
 	struct rte_vhost_async_channel_ops	async_ops;
 
-	struct rte_vhost_iov_iter it_pool[VHOST_MAX_ASYNC_IT];
-	struct iovec vec_pool[VHOST_MAX_ASYNC_VEC];
+	struct rte_vhost_iov_iter *it_pool;
+	struct iovec *vec_pool;
 
 	/* async data transfer status */
 	uintptr_t	**async_pkts_pending;
-- 
2.18.4


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [dpdk-dev] [PATCH v2 3/4] vhost: fix async vector buffer overrun
  2020-09-29  6:29 ` [dpdk-dev] [PATCH v2 0/4] optimize async data path Patrick Fu
  2020-09-29  6:29   ` [dpdk-dev] [PATCH v2 1/4] vhost: simplify async copy completion Patrick Fu
  2020-09-29  6:29   ` [dpdk-dev] [PATCH v2 2/4] vhost: dynamically allocate async memory Patrick Fu
@ 2020-09-29  6:29   ` Patrick Fu
  2020-09-29  6:29   ` [dpdk-dev] [PATCH v2 4/4] vhost: fix async register/unregister deadlock Patrick Fu
  3 siblings, 0 replies; 36+ messages in thread
From: Patrick Fu @ 2020-09-29  6:29 UTC (permalink / raw)
  To: dev, maxime.coquelin, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang, patrick.fu

Add a check on the async vector buffer usage to prevent buffer overrun.
If the unused vector buffer is not sufficient for the next packet's iov
creation, an async transfer will be triggered immediately to free the
vector buffer.

Fixes: 78639d54563a ("vhost: introduce async enqueue registration API")

Signed-off-by: Patrick Fu <patrick.fu@intel.com>
---
 lib/librte_vhost/vhost.h      |  2 +-
 lib/librte_vhost/virtio_net.c | 13 ++++++++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index f0ee00c73..34197ce99 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -47,7 +47,7 @@
 #define MAX_PKT_BURST 32
 
 #define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST * 2)
-#define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 2)
+#define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 4)
 
 #define PACKED_DESC_ENQUEUE_USED_FLAG(w)	\
 	((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 68ead9a71..6ba64c12e 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1492,6 +1492,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
 	uint16_t n_free_slot, slot_idx;
 	uint16_t pkt_err = 0;
+	uint16_t segs_await = 0;
 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
 	int n_pkts = 0;
 
@@ -1540,6 +1541,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			dst_iovec += dst_it->nr_segs;
 			src_it += 2;
 			dst_it += 2;
+			segs_await += src_it->nr_segs;
 		} else {
 			pkts_info[slot_idx].info = num_buffers;
 			vq->async_pkts_inflight_n++;
@@ -1547,14 +1549,23 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 
 		vq->last_avail_idx += num_buffers;
 
+		/*
+		 * conditions to trigger async device transfer:
+		 * - bufferred packet number reaches transfer threshold
+		 * - this is the last packet in the burst enqueue
+		 * - unused async iov number is less than max vhost vector
+		 */
 		if (pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
-				(pkt_idx == count - 1 && pkt_burst_idx)) {
+			(pkt_idx == count - 1 && pkt_burst_idx) ||
+			(VHOST_MAX_ASYNC_VEC / 2 - segs_await <
+			BUF_VECTOR_MAX)) {
 			n_pkts = vq->async_ops.transfer_data(dev->vid,
 					queue_id, tdes, 0, pkt_burst_idx);
 			src_iovec = vec_pool;
 			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
 			src_it = it_pool;
 			dst_it = it_pool + 1;
+			segs_await = 0;
 			vq->async_pkts_inflight_n += n_pkts;
 
 			if (unlikely(n_pkts < (int)pkt_burst_idx)) {
-- 
2.18.4


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [dpdk-dev] [PATCH v2 4/4] vhost: fix async register/unregister deadlock
  2020-09-29  6:29 ` [dpdk-dev] [PATCH v2 0/4] optimize async data path Patrick Fu
                     ` (2 preceding siblings ...)
  2020-09-29  6:29   ` [dpdk-dev] [PATCH v2 3/4] vhost: fix async vector buffer overrun Patrick Fu
@ 2020-09-29  6:29   ` Patrick Fu
  3 siblings, 0 replies; 36+ messages in thread
From: Patrick Fu @ 2020-09-29  6:29 UTC (permalink / raw)
  To: dev, maxime.coquelin, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang, patrick.fu

When the async register/unregister functions are invoked in certain vhost
event callbacks (e.g. vring state change), a deadlock may occur due to
recursive spinlock acquisition. This patch removes the unnecessary
spinlock from the register API and uses the trylock() primitive in the
unregister API to avoid the deadlock.

Fixes: 78639d54563a ("vhost: introduce async enqueue registration API")

Signed-off-by: Patrick Fu <patrick.fu@intel.com>
---
 lib/librte_vhost/vhost.c      | 13 +++++++------
 lib/librte_vhost/vhost_user.c |  4 ++--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 919a5bf7f..c247fc900 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -1577,8 +1577,6 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 		ops->transfer_data == NULL))
 		return -1;
 
-	rte_spinlock_lock(&vq->access_lock);
-
 	if (unlikely(vq->async_registered)) {
 		VHOST_LOG_CONFIG(ERR,
 			"async register failed: channel already registered "
@@ -1623,8 +1621,6 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	vq->async_registered = true;
 
 reg_out:
-	rte_spinlock_unlock(&vq->access_lock);
-
 	return 0;
 }
 
@@ -1643,10 +1639,15 @@ int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
 		return ret;
 
 	ret = 0;
-	rte_spinlock_lock(&vq->access_lock);
 
 	if (!vq->async_registered)
-		goto out;
+		return ret;
+
+	if (!rte_spinlock_trylock(&vq->access_lock)) {
+		VHOST_LOG_CONFIG(ERR, "Failed to unregister async channel. "
+			"virt queue busy.\n");
+		return -1;
+	}
 
 	if (vq->async_pkts_inflight_n) {
 		VHOST_LOG_CONFIG(ERR, "Failed to unregister async channel. "
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index 67d2a2d43..c8d74bde6 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -2043,9 +2043,9 @@ vhost_user_set_vring_enable(struct virtio_net **pdev,
 		"set queue enable: %d to qp idx: %d\n",
 		enable, index);
 
-	if (!enable && dev->virtqueue[index]->async_registered) {
+	if (enable && dev->virtqueue[index]->async_registered) {
 		if (dev->virtqueue[index]->async_pkts_inflight_n) {
-			VHOST_LOG_CONFIG(ERR, "failed to disable vring. "
+			VHOST_LOG_CONFIG(ERR, "failed to enable vring. "
 			"async inflight packets must be completed first\n");
 			return RTE_VHOST_MSG_RESULT_ERR;
 		}
-- 
2.18.4


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [dpdk-dev] [PATCH v3 0/4] optimize async data path
  2020-09-11  1:53 [dpdk-dev] [PATCH v1 0/4] optimize async data path Patrick Fu
                   ` (4 preceding siblings ...)
  2020-09-29  6:29 ` [dpdk-dev] [PATCH v2 0/4] optimize async data path Patrick Fu
@ 2020-09-29  9:29 ` Patrick Fu
  2020-09-29  9:29   ` [dpdk-dev] [PATCH v3 1/4] vhost: simplify async copy completion Patrick Fu
                     ` (3 more replies)
  2020-10-13  1:45 ` [dpdk-dev] [PATCH v4 0/4] optimize async data path Patrick Fu
  6 siblings, 4 replies; 36+ messages in thread
From: Patrick Fu @ 2020-09-29  9:29 UTC (permalink / raw)
  To: dev, maxime.coquelin, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang, patrick.fu

This series applies optimization and fixes to the vhost
async data path.

v3:
 - fix a typo in vhost error log (checkpatch warning)
 - fix travis-robot ci build warning on aarch64

v2:
 - minor rewordings on commit message
 - minor fix on poll_enqueue_completed to correct a packet
   number calculation issue
 - allocate async buffer memory on the same numa with vq
 - add some comments in data path to improve readability

Patrick Fu (4):
  vhost: simplify async copy completion
  vhost: dynamically allocate async memory
  vhost: fix async vector buffer overrun
  vhost: fix async register/unregister deadlock

 lib/librte_vhost/rte_vhost_async.h |  15 ++-
 lib/librte_vhost/vhost.c           |  80 ++++++++------
 lib/librte_vhost/vhost.h           |  14 +--
 lib/librte_vhost/vhost_user.c      |  10 +-
 lib/librte_vhost/virtio_net.c      | 164 +++++++++++++----------------
 5 files changed, 146 insertions(+), 137 deletions(-)

-- 
2.18.4


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [dpdk-dev] [PATCH v3 1/4] vhost: simplify async copy completion
  2020-09-29  9:29 ` [dpdk-dev] [PATCH v3 0/4] optimize async data path Patrick Fu
@ 2020-09-29  9:29   ` Patrick Fu
  2020-10-05 13:46     ` Maxime Coquelin
  2020-09-29  9:29   ` [dpdk-dev] [PATCH v3 2/4] vhost: dynamically allocate async memory Patrick Fu
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 36+ messages in thread
From: Patrick Fu @ 2020-09-29  9:29 UTC (permalink / raw)
  To: dev, maxime.coquelin, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang, patrick.fu

The current async ops allow the check_completed_copies() callback to
return an arbitrary number of async iov segments finished by backend
async devices. This design makes it complex for vhost to handle a
partially completed transfer of a single packet (i.e. a transfer that
completes in the middle of an async descriptor) and prevents
application callbacks from leveraging hardware capability to offload
the work. Thus, this patch requires the check_completed_copies()
callback to return the number of async memory descriptors, which is
aligned with the async transfer data ops callbacks. The vhost async
data path is revised to work with the new ops definition, which
provides cleaner and simpler processing.

Signed-off-by: Patrick Fu <patrick.fu@intel.com>
---
 lib/librte_vhost/rte_vhost_async.h |  15 ++-
 lib/librte_vhost/vhost.c           |  20 ++--
 lib/librte_vhost/vhost.h           |   8 +-
 lib/librte_vhost/vhost_user.c      |   6 +-
 lib/librte_vhost/virtio_net.c      | 151 ++++++++++++-----------------
 5 files changed, 90 insertions(+), 110 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost_async.h b/lib/librte_vhost/rte_vhost_async.h
index cb6284539..c73bd7c99 100644
--- a/lib/librte_vhost/rte_vhost_async.h
+++ b/lib/librte_vhost/rte_vhost_async.h
@@ -76,13 +76,26 @@ struct rte_vhost_async_channel_ops {
 	 * @param max_packets
 	 *  max number of packets could be completed
 	 * @return
-	 *  number of iov segments completed
+	 *  number of async descs completed
 	 */
 	uint32_t (*check_completed_copies)(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data,
 		uint16_t max_packets);
 };
 
+/**
+ * inflight async packet information
+ */
+struct async_inflight_info {
+	union {
+		uint32_t info;
+		struct {
+			uint16_t descs; /* num of descs inflight */
+			uint16_t segs; /* iov segs inflight */
+		};
+	};
+};
+
 /**
  *  dma channel feature bit definition
  */
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 8f20a0818..eca507836 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -333,8 +333,8 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
 		rte_free(vq->shadow_used_split);
 		if (vq->async_pkts_pending)
 			rte_free(vq->async_pkts_pending);
-		if (vq->async_pending_info)
-			rte_free(vq->async_pending_info);
+		if (vq->async_pkts_info)
+			rte_free(vq->async_pkts_info);
 	}
 	rte_free(vq->batch_copy_elems);
 	rte_mempool_free(vq->iotlb_pool);
@@ -1573,15 +1573,15 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	vq->async_pkts_pending = rte_malloc(NULL,
 			vq->size * sizeof(uintptr_t),
 			RTE_CACHE_LINE_SIZE);
-	vq->async_pending_info = rte_malloc(NULL,
-			vq->size * sizeof(uint64_t),
+	vq->async_pkts_info = rte_malloc(NULL,
+			vq->size * sizeof(struct async_inflight_info),
 			RTE_CACHE_LINE_SIZE);
-	if (!vq->async_pkts_pending || !vq->async_pending_info) {
+	if (!vq->async_pkts_pending || !vq->async_pkts_info) {
 		if (vq->async_pkts_pending)
 			rte_free(vq->async_pkts_pending);
 
-		if (vq->async_pending_info)
-			rte_free(vq->async_pending_info);
+		if (vq->async_pkts_info)
+			rte_free(vq->async_pkts_info);
 
 		VHOST_LOG_CONFIG(ERR,
 				"async register failed: cannot allocate memory for vq data "
@@ -1635,9 +1635,9 @@ int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
 		vq->async_pkts_pending = NULL;
 	}
 
-	if (vq->async_pending_info) {
-		rte_free(vq->async_pending_info);
-		vq->async_pending_info = NULL;
+	if (vq->async_pkts_info) {
+		rte_free(vq->async_pkts_info);
+		vq->async_pkts_info = NULL;
 	}
 
 	vq->async_ops.transfer_data = NULL;
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 73a1ed889..155a832c1 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -46,8 +46,6 @@
 
 #define MAX_PKT_BURST 32
 
-#define ASYNC_MAX_POLL_SEG 255
-
 #define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST * 2)
 #define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 2)
 
@@ -225,12 +223,10 @@ struct vhost_virtqueue {
 
 	/* async data transfer status */
 	uintptr_t	**async_pkts_pending;
-	#define		ASYNC_PENDING_INFO_N_MSK 0xFFFF
-	#define		ASYNC_PENDING_INFO_N_SFT 16
-	uint64_t	*async_pending_info;
+	struct async_inflight_info *async_pkts_info;
 	uint16_t	async_pkts_idx;
 	uint16_t	async_pkts_inflight_n;
-	uint16_t	async_last_seg_n;
+	uint16_t	async_last_pkts_n;
 
 	/* vq async features */
 	bool		async_inorder;
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index 501218e19..67d2a2d43 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -2006,10 +2006,10 @@ vhost_user_get_vring_base(struct virtio_net **pdev,
 		vq->shadow_used_split = NULL;
 		if (vq->async_pkts_pending)
 			rte_free(vq->async_pkts_pending);
-		if (vq->async_pending_info)
-			rte_free(vq->async_pending_info);
+		if (vq->async_pkts_info)
+			rte_free(vq->async_pkts_info);
 		vq->async_pkts_pending = NULL;
-		vq->async_pending_info = NULL;
+		vq->async_pkts_info = NULL;
 	}
 
 	rte_free(vq->batch_copy_elems);
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index bd9303c8a..68ead9a71 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1473,37 +1473,6 @@ virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
 		(vq_size - n_inflight + pkts_idx) & (vq_size - 1);
 }
 
-static __rte_always_inline void
-virtio_dev_rx_async_submit_split_err(struct virtio_net *dev,
-	struct vhost_virtqueue *vq, uint16_t queue_id,
-	uint16_t last_idx, uint16_t shadow_idx)
-{
-	uint16_t start_idx, pkts_idx, vq_size;
-	uint64_t *async_pending_info;
-
-	pkts_idx = vq->async_pkts_idx;
-	async_pending_info = vq->async_pending_info;
-	vq_size = vq->size;
-	start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
-		vq_size, vq->async_pkts_inflight_n);
-
-	while (likely((start_idx & (vq_size - 1)) != pkts_idx)) {
-		uint64_t n_seg =
-			async_pending_info[(start_idx) & (vq_size - 1)] >>
-			ASYNC_PENDING_INFO_N_SFT;
-
-		while (n_seg)
-			n_seg -= vq->async_ops.check_completed_copies(dev->vid,
-				queue_id, 0, 1);
-	}
-
-	vq->async_pkts_inflight_n = 0;
-	vq->batch_copy_nb_elems = 0;
-
-	vq->shadow_used_idx = shadow_idx;
-	vq->last_avail_idx = last_idx;
-}
-
 static __rte_noinline uint32_t
 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct vhost_virtqueue *vq, uint16_t queue_id,
@@ -1512,7 +1481,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
 	uint16_t num_buffers;
 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
-	uint16_t avail_head, last_idx, shadow_idx;
+	uint16_t avail_head;
 
 	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
 	struct iovec *vec_pool = vq->vec_pool;
@@ -1522,11 +1491,11 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct rte_vhost_iov_iter *src_it = it_pool;
 	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
 	uint16_t n_free_slot, slot_idx;
+	uint16_t pkt_err = 0;
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
 	int n_pkts = 0;
 
 	avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
-	last_idx = vq->last_avail_idx;
-	shadow_idx = vq->shadow_used_idx;
 
 	/*
 	 * The ordering between avail index and
@@ -1565,14 +1534,14 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 		if (src_it->count) {
 			async_fill_desc(&tdes[pkt_burst_idx], src_it, dst_it);
 			pkt_burst_idx++;
-			vq->async_pending_info[slot_idx] =
-				num_buffers | (src_it->nr_segs << 16);
+			pkts_info[slot_idx].descs = num_buffers;
+			pkts_info[slot_idx].segs = src_it->nr_segs;
 			src_iovec += src_it->nr_segs;
 			dst_iovec += dst_it->nr_segs;
 			src_it += 2;
 			dst_it += 2;
 		} else {
-			vq->async_pending_info[slot_idx] = num_buffers;
+			pkts_info[slot_idx].info = num_buffers;
 			vq->async_pkts_inflight_n++;
 		}
 
@@ -1586,35 +1555,46 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
 			src_it = it_pool;
 			dst_it = it_pool + 1;
+			vq->async_pkts_inflight_n += n_pkts;
 
 			if (unlikely(n_pkts < (int)pkt_burst_idx)) {
-				vq->async_pkts_inflight_n +=
-					n_pkts > 0 ? n_pkts : 0;
-				virtio_dev_rx_async_submit_split_err(dev,
-					vq, queue_id, last_idx, shadow_idx);
-				return 0;
+				/*
+				 * log error packets number here and do actual
+				 * error processing when applications poll
+				 * completion
+				 */
+				pkt_err = pkt_burst_idx - n_pkts;
+				pkt_burst_idx = 0;
+				break;
 			}
 
 			pkt_burst_idx = 0;
-			vq->async_pkts_inflight_n += n_pkts;
 		}
 	}
 
 	if (pkt_burst_idx) {
 		n_pkts = vq->async_ops.transfer_data(dev->vid,
 				queue_id, tdes, 0, pkt_burst_idx);
-		if (unlikely(n_pkts < (int)pkt_burst_idx)) {
-			vq->async_pkts_inflight_n += n_pkts > 0 ? n_pkts : 0;
-			virtio_dev_rx_async_submit_split_err(dev, vq, queue_id,
-				last_idx, shadow_idx);
-			return 0;
-		}
-
 		vq->async_pkts_inflight_n += n_pkts;
+
+		if (unlikely(n_pkts < (int)pkt_burst_idx))
+			pkt_err = pkt_burst_idx - n_pkts;
 	}
 
 	do_data_copy_enqueue(dev, vq);
 
+	if (unlikely(pkt_err)) {
+		do {
+			if (pkts_info[slot_idx].segs)
+				pkt_err--;
+			vq->last_avail_idx -= pkts_info[slot_idx].descs;
+			vq->shadow_used_idx -= pkts_info[slot_idx].descs;
+			vq->async_pkts_inflight_n--;
+			slot_idx--;
+			pkt_idx--;
+		} while (pkt_err);
+	}
+
 	n_free_slot = vq->size - vq->async_pkts_idx;
 	if (n_free_slot > pkt_idx) {
 		rte_memcpy(&vq->async_pkts_pending[vq->async_pkts_idx],
@@ -1640,10 +1620,10 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 {
 	struct virtio_net *dev = get_device(vid);
 	struct vhost_virtqueue *vq;
-	uint16_t n_segs_cpl, n_pkts_put = 0, n_descs = 0;
+	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
 	uint16_t start_idx, pkts_idx, vq_size;
 	uint16_t n_inflight;
-	uint64_t *async_pending_info;
+	struct async_inflight_info *pkts_info;
 
 	if (!dev)
 		return 0;
@@ -1657,51 +1637,40 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 
 	vq = dev->virtqueue[queue_id];
 
+	if (unlikely(!vq->async_registered)) {
+		VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
+			dev->vid, __func__, queue_id);
+		return 0;
+	}
+
 	rte_spinlock_lock(&vq->access_lock);
 
 	n_inflight = vq->async_pkts_inflight_n;
 	pkts_idx = vq->async_pkts_idx;
-	async_pending_info = vq->async_pending_info;
+	pkts_info = vq->async_pkts_info;
 	vq_size = vq->size;
 	start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
 		vq_size, vq->async_pkts_inflight_n);
 
-	n_segs_cpl = vq->async_ops.check_completed_copies(vid, queue_id,
-		0, ASYNC_MAX_POLL_SEG - vq->async_last_seg_n) +
-		vq->async_last_seg_n;
+	if (count > vq->async_last_pkts_n)
+		n_pkts_cpl = vq->async_ops.check_completed_copies(vid,
+			queue_id, 0, count - vq->async_last_pkts_n);
+	n_pkts_cpl += vq->async_last_pkts_n;
 
 	rte_smp_wmb();
 
 	while (likely((n_pkts_put < count) && n_inflight)) {
-		uint64_t info = async_pending_info[
-			(start_idx + n_pkts_put) & (vq_size - 1)];
-		uint64_t n_segs;
+		uint16_t info_idx = (start_idx + n_pkts_put) & (vq_size - 1);
+		if (n_pkts_cpl && pkts_info[info_idx].segs)
+			n_pkts_cpl--;
+		else if (!n_pkts_cpl && pkts_info[info_idx].segs)
+			break;
 		n_pkts_put++;
 		n_inflight--;
-		n_descs += info & ASYNC_PENDING_INFO_N_MSK;
-		n_segs = info >> ASYNC_PENDING_INFO_N_SFT;
-
-		if (n_segs) {
-			if (unlikely(n_segs_cpl < n_segs)) {
-				n_pkts_put--;
-				n_inflight++;
-				n_descs -= info & ASYNC_PENDING_INFO_N_MSK;
-				if (n_segs_cpl) {
-					async_pending_info[
-						(start_idx + n_pkts_put) &
-						(vq_size - 1)] =
-					((n_segs - n_segs_cpl) <<
-					 ASYNC_PENDING_INFO_N_SFT) |
-					(info & ASYNC_PENDING_INFO_N_MSK);
-					n_segs_cpl = 0;
-				}
-				break;
-			}
-			n_segs_cpl -= n_segs;
-		}
+		n_descs += pkts_info[info_idx].descs;
 	}
 
-	vq->async_last_seg_n = n_segs_cpl;
+	vq->async_last_pkts_n = n_pkts_cpl;
 
 	if (n_pkts_put) {
 		vq->async_pkts_inflight_n = n_inflight;
@@ -1710,16 +1679,18 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 					n_descs, __ATOMIC_RELEASE);
 			vhost_vring_call_split(dev, vq);
 		}
-	}
 
-	if (start_idx + n_pkts_put <= vq_size) {
-		rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
-			n_pkts_put * sizeof(uintptr_t));
-	} else {
-		rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
-			(vq_size - start_idx) * sizeof(uintptr_t));
-		rte_memcpy(&pkts[vq_size - start_idx], vq->async_pkts_pending,
-			(n_pkts_put - vq_size + start_idx) * sizeof(uintptr_t));
+		if (start_idx + n_pkts_put <= vq_size) {
+			rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
+				n_pkts_put * sizeof(uintptr_t));
+		} else {
+			rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
+				(vq_size - start_idx) * sizeof(uintptr_t));
+			rte_memcpy(&pkts[vq_size - start_idx],
+				vq->async_pkts_pending,
+				(n_pkts_put + start_idx - vq_size) *
+				sizeof(uintptr_t));
+		}
 	}
 
 	rte_spinlock_unlock(&vq->access_lock);
-- 
2.18.4


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [dpdk-dev] [PATCH v3 2/4] vhost: dynamically allocate async memory
  2020-09-29  9:29 ` [dpdk-dev] [PATCH v3 0/4] optimize async data path Patrick Fu
  2020-09-29  9:29   ` [dpdk-dev] [PATCH v3 1/4] vhost: simplify async copy completion Patrick Fu
@ 2020-09-29  9:29   ` Patrick Fu
  2020-10-05 13:50     ` Maxime Coquelin
  2020-09-29  9:29   ` [dpdk-dev] [PATCH v3 3/4] vhost: fix async vector buffer overrun Patrick Fu
  2020-09-29  9:29   ` [dpdk-dev] [PATCH v3 4/4] vhost: fix async register/unregister deadlock Patrick Fu
  3 siblings, 1 reply; 36+ messages in thread
From: Patrick Fu @ 2020-09-29  9:29 UTC (permalink / raw)
  To: dev, maxime.coquelin, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang, patrick.fu

Allocate the async internal memory buffers with rte_malloc(), replacing
the array declarations inside the vq structure. Dynamic allocation
reduces the memory footprint when the async path is not registered.

Signed-off-by: Patrick Fu <patrick.fu@intel.com>
---
 lib/librte_vhost/vhost.c | 69 ++++++++++++++++++++++++++--------------
 lib/librte_vhost/vhost.h |  4 +--
 2 files changed, 47 insertions(+), 26 deletions(-)

diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index eca507836..05b578c2f 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -324,6 +324,24 @@ cleanup_device(struct virtio_net *dev, int destroy)
 	}
 }
 
+static void
+vhost_free_async_mem(struct vhost_virtqueue *vq)
+{
+	if (vq->async_pkts_pending)
+		rte_free(vq->async_pkts_pending);
+	if (vq->async_pkts_info)
+		rte_free(vq->async_pkts_info);
+	if (vq->it_pool)
+		rte_free(vq->it_pool);
+	if (vq->vec_pool)
+		rte_free(vq->vec_pool);
+
+	vq->async_pkts_pending = NULL;
+	vq->async_pkts_info = NULL;
+	vq->it_pool = NULL;
+	vq->vec_pool = NULL;
+}
+
 void
 free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
 {
@@ -331,10 +349,7 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
 		rte_free(vq->shadow_used_packed);
 	else {
 		rte_free(vq->shadow_used_split);
-		if (vq->async_pkts_pending)
-			rte_free(vq->async_pkts_pending);
-		if (vq->async_pkts_info)
-			rte_free(vq->async_pkts_info);
+		vhost_free_async_mem(vq);
 	}
 	rte_free(vq->batch_copy_elems);
 	rte_mempool_free(vq->iotlb_pool);
@@ -1538,6 +1553,7 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	struct vhost_virtqueue *vq;
 	struct virtio_net *dev = get_device(vid);
 	struct rte_vhost_async_features f;
+	int node;
 
 	if (dev == NULL || ops == NULL)
 		return -1;
@@ -1570,19 +1586,32 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 		goto reg_out;
 	}
 
-	vq->async_pkts_pending = rte_malloc(NULL,
+#ifdef RTE_LIBRTE_VHOST_NUMA
+	if (get_mempolicy(&node, NULL, 0, vq, MPOL_F_NODE | MPOL_F_ADDR)) {
+		VHOST_LOG_CONFIG(ERR,
+			"unable to get numa information in async register. "
+			"allocating async buffer memory on the caller thread node\n");
+		node = SOCKET_ID_ANY;
+	}
+#else
+	node = SOCKET_ID_ANY;
+#endif
+
+	vq->async_pkts_pending = rte_malloc_socket(NULL,
 			vq->size * sizeof(uintptr_t),
-			RTE_CACHE_LINE_SIZE);
-	vq->async_pkts_info = rte_malloc(NULL,
+			RTE_CACHE_LINE_SIZE, node);
+	vq->async_pkts_info = rte_malloc_socket(NULL,
 			vq->size * sizeof(struct async_inflight_info),
-			RTE_CACHE_LINE_SIZE);
-	if (!vq->async_pkts_pending || !vq->async_pkts_info) {
-		if (vq->async_pkts_pending)
-			rte_free(vq->async_pkts_pending);
-
-		if (vq->async_pkts_info)
-			rte_free(vq->async_pkts_info);
-
+			RTE_CACHE_LINE_SIZE, node);
+	vq->it_pool = rte_malloc_socket(NULL,
+			VHOST_MAX_ASYNC_IT * sizeof(struct rte_vhost_iov_iter),
+			RTE_CACHE_LINE_SIZE, node);
+	vq->vec_pool = rte_malloc_socket(NULL,
+			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
+			RTE_CACHE_LINE_SIZE, node);
+	if (!vq->async_pkts_pending || !vq->async_pkts_info ||
+		!vq->it_pool || !vq->vec_pool) {
+		vhost_free_async_mem(vq);
 		VHOST_LOG_CONFIG(ERR,
 				"async register failed: cannot allocate memory for vq data "
 				"(vid %d, qid: %d)\n", vid, queue_id);
@@ -1630,15 +1659,7 @@ int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
 		goto out;
 	}
 
-	if (vq->async_pkts_pending) {
-		rte_free(vq->async_pkts_pending);
-		vq->async_pkts_pending = NULL;
-	}
-
-	if (vq->async_pkts_info) {
-		rte_free(vq->async_pkts_info);
-		vq->async_pkts_info = NULL;
-	}
+	vhost_free_async_mem(vq);
 
 	vq->async_ops.transfer_data = NULL;
 	vq->async_ops.check_completed_copies = NULL;
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 155a832c1..f0ee00c73 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -218,8 +218,8 @@ struct vhost_virtqueue {
 	/* operation callbacks for async dma */
 	struct rte_vhost_async_channel_ops	async_ops;
 
-	struct rte_vhost_iov_iter it_pool[VHOST_MAX_ASYNC_IT];
-	struct iovec vec_pool[VHOST_MAX_ASYNC_VEC];
+	struct rte_vhost_iov_iter *it_pool;
+	struct iovec *vec_pool;
 
 	/* async data transfer status */
 	uintptr_t	**async_pkts_pending;
-- 
2.18.4


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [dpdk-dev] [PATCH v3 3/4] vhost: fix async vector buffer overrun
  2020-09-29  9:29 ` [dpdk-dev] [PATCH v3 0/4] optimize async data path Patrick Fu
  2020-09-29  9:29   ` [dpdk-dev] [PATCH v3 1/4] vhost: simplify async copy completion Patrick Fu
  2020-09-29  9:29   ` [dpdk-dev] [PATCH v3 2/4] vhost: dynamically allocate async memory Patrick Fu
@ 2020-09-29  9:29   ` Patrick Fu
  2020-10-05 14:19     ` Maxime Coquelin
  2020-09-29  9:29   ` [dpdk-dev] [PATCH v3 4/4] vhost: fix async register/unregister deadlock Patrick Fu
  3 siblings, 1 reply; 36+ messages in thread
From: Patrick Fu @ 2020-09-29  9:29 UTC (permalink / raw)
  To: dev, maxime.coquelin, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang, patrick.fu

Add a check on the async vector buffer usage to prevent a buffer
overrun. If the unused vector buffer is not sufficient to prepare the
next packet's iov, an async transfer is triggered immediately to free
the vector buffer.

Fixes: 78639d54563a ("vhost: introduce async enqueue registration API")

Signed-off-by: Patrick Fu <patrick.fu@intel.com>
---
 lib/librte_vhost/vhost.h      |  2 +-
 lib/librte_vhost/virtio_net.c | 13 ++++++++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index f0ee00c73..34197ce99 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -47,7 +47,7 @@
 #define MAX_PKT_BURST 32
 
 #define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST * 2)
-#define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 2)
+#define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 4)
 
 #define PACKED_DESC_ENQUEUE_USED_FLAG(w)	\
 	((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 68ead9a71..60cd5e2b1 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1492,6 +1492,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
 	uint16_t n_free_slot, slot_idx;
 	uint16_t pkt_err = 0;
+	uint16_t segs_await = 0;
 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
 	int n_pkts = 0;
 
@@ -1540,6 +1541,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			dst_iovec += dst_it->nr_segs;
 			src_it += 2;
 			dst_it += 2;
+			segs_await += src_it->nr_segs;
 		} else {
 			pkts_info[slot_idx].info = num_buffers;
 			vq->async_pkts_inflight_n++;
@@ -1547,14 +1549,23 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 
 		vq->last_avail_idx += num_buffers;
 
+		/*
+		 * conditions to trigger async device transfer:
+		 * - buffered packet number reaches transfer threshold
+		 * - this is the last packet in the burst enqueue
+		 * - unused async iov number is less than max vhost vector
+		 */
 		if (pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
-				(pkt_idx == count - 1 && pkt_burst_idx)) {
+			(pkt_idx == count - 1 && pkt_burst_idx) ||
+			(VHOST_MAX_ASYNC_VEC / 2 - segs_await <
+			BUF_VECTOR_MAX)) {
 			n_pkts = vq->async_ops.transfer_data(dev->vid,
 					queue_id, tdes, 0, pkt_burst_idx);
 			src_iovec = vec_pool;
 			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
 			src_it = it_pool;
 			dst_it = it_pool + 1;
+			segs_await = 0;
 			vq->async_pkts_inflight_n += n_pkts;
 
 			if (unlikely(n_pkts < (int)pkt_burst_idx)) {
-- 
2.18.4


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [dpdk-dev] [PATCH v3 4/4] vhost: fix async register/unregister deadlock
  2020-09-29  9:29 ` [dpdk-dev] [PATCH v3 0/4] optimize async data path Patrick Fu
                     ` (2 preceding siblings ...)
  2020-09-29  9:29   ` [dpdk-dev] [PATCH v3 3/4] vhost: fix async vector buffer overrun Patrick Fu
@ 2020-09-29  9:29   ` Patrick Fu
  2020-10-05 14:25     ` Maxime Coquelin
  3 siblings, 1 reply; 36+ messages in thread
From: Patrick Fu @ 2020-09-29  9:29 UTC (permalink / raw)
  To: dev, maxime.coquelin, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang, patrick.fu

When the async register/unregister functions are invoked in certain
vhost event callbacks (e.g. vring state change), a deadlock may occur
due to recursive spinlock acquisition. This patch removes the
unnecessary spinlock from the register API and uses the trylock()
primitive in the unregister API to avoid the deadlock.

Fixes: 78639d54563a ("vhost: introduce async enqueue registration API")

Signed-off-by: Patrick Fu <patrick.fu@intel.com>
---
 lib/librte_vhost/vhost.c      | 13 +++++++------
 lib/librte_vhost/vhost_user.c |  4 ++--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 05b578c2f..ba504a00a 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -1577,8 +1577,6 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 		ops->transfer_data == NULL))
 		return -1;
 
-	rte_spinlock_lock(&vq->access_lock);
-
 	if (unlikely(vq->async_registered)) {
 		VHOST_LOG_CONFIG(ERR,
 			"async register failed: channel already registered "
@@ -1627,8 +1625,6 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	vq->async_registered = true;
 
 reg_out:
-	rte_spinlock_unlock(&vq->access_lock);
-
 	return 0;
 }
 
@@ -1647,10 +1643,15 @@ int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
 		return ret;
 
 	ret = 0;
-	rte_spinlock_lock(&vq->access_lock);
 
 	if (!vq->async_registered)
-		goto out;
+		return ret;
+
+	if (!rte_spinlock_trylock(&vq->access_lock)) {
+		VHOST_LOG_CONFIG(ERR, "Failed to unregister async channel. "
+			"virt queue busy.\n");
+		return -1;
+	}
 
 	if (vq->async_pkts_inflight_n) {
 		VHOST_LOG_CONFIG(ERR, "Failed to unregister async channel. "
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index 67d2a2d43..c8d74bde6 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -2043,9 +2043,9 @@ vhost_user_set_vring_enable(struct virtio_net **pdev,
 		"set queue enable: %d to qp idx: %d\n",
 		enable, index);
 
-	if (!enable && dev->virtqueue[index]->async_registered) {
+	if (enable && dev->virtqueue[index]->async_registered) {
 		if (dev->virtqueue[index]->async_pkts_inflight_n) {
-			VHOST_LOG_CONFIG(ERR, "failed to disable vring. "
+			VHOST_LOG_CONFIG(ERR, "failed to enable vring. "
 			"async inflight packets must be completed first\n");
 			return RTE_VHOST_MSG_RESULT_ERR;
 		}
-- 
2.18.4


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/4] vhost: simplify async copy completion
  2020-09-29  9:29   ` [dpdk-dev] [PATCH v3 1/4] vhost: simplify async copy completion Patrick Fu
@ 2020-10-05 13:46     ` Maxime Coquelin
  2020-10-09 11:16       ` Fu, Patrick
  0 siblings, 1 reply; 36+ messages in thread
From: Maxime Coquelin @ 2020-10-05 13:46 UTC (permalink / raw)
  To: Patrick Fu, dev, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang



On 9/29/20 11:29 AM, Patrick Fu wrote:
> Current async ops allows check_completed_copies() callback to return
> arbitrary number of async iov segments finished from backend async
> devices. This design creates complexity for vhost to handle breaking
> transfer of a single packet (i.e. transfer completes in the middle
> of a async descriptor) and prevents application callbacks from
> leveraging hardware capability to offload the work. Thus, this patch
> enforces the check_completed_copies() callback to return the number
> of async memory descriptors, which is aligned with async transfer
> data ops callbacks. vHost async data path are revised to work with
> new ops define, which provides a clean and simplified processing.
> 
> Signed-off-by: Patrick Fu <patrick.fu@intel.com>
> ---
>  lib/librte_vhost/rte_vhost_async.h |  15 ++-
>  lib/librte_vhost/vhost.c           |  20 ++--
>  lib/librte_vhost/vhost.h           |   8 +-
>  lib/librte_vhost/vhost_user.c      |   6 +-
>  lib/librte_vhost/virtio_net.c      | 151 ++++++++++++-----------------
>  5 files changed, 90 insertions(+), 110 deletions(-)
> 
> diff --git a/lib/librte_vhost/rte_vhost_async.h b/lib/librte_vhost/rte_vhost_async.h
> index cb6284539..c73bd7c99 100644
> --- a/lib/librte_vhost/rte_vhost_async.h
> +++ b/lib/librte_vhost/rte_vhost_async.h
> @@ -76,13 +76,26 @@ struct rte_vhost_async_channel_ops {
>  	 * @param max_packets
>  	 *  max number of packets could be completed
>  	 * @return
> -	 *  number of iov segments completed
> +	 *  number of async descs completed
>  	 */
>  	uint32_t (*check_completed_copies)(int vid, uint16_t queue_id,
>  		struct rte_vhost_async_status *opaque_data,
>  		uint16_t max_packets);
>  };
>  
> +/**
> + * inflight async packet information
> + */
> +struct async_inflight_info {
> +	union {
> +		uint32_t info;
> +		struct {
> +			uint16_t descs; /* num of descs inflight */
> +			uint16_t segs; /* iov segs inflight */
> +		};
> +	};

I think you can remove the union.

> +};
> +
>  /**
>   *  dma channel feature bit definition
>   */
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index 8f20a0818..eca507836 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -333,8 +333,8 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
>  		rte_free(vq->shadow_used_split);
>  		if (vq->async_pkts_pending)
>  			rte_free(vq->async_pkts_pending);
> -		if (vq->async_pending_info)
> -			rte_free(vq->async_pending_info);
> +		if (vq->async_pkts_info)
> +			rte_free(vq->async_pkts_info);
>  	}
>  	rte_free(vq->batch_copy_elems);
>  	rte_mempool_free(vq->iotlb_pool);
> @@ -1573,15 +1573,15 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
>  	vq->async_pkts_pending = rte_malloc(NULL,
>  			vq->size * sizeof(uintptr_t),
>  			RTE_CACHE_LINE_SIZE);
> -	vq->async_pending_info = rte_malloc(NULL,
> -			vq->size * sizeof(uint64_t),
> +	vq->async_pkts_info = rte_malloc(NULL,
> +			vq->size * sizeof(struct async_inflight_info),
>  			RTE_CACHE_LINE_SIZE);
> -	if (!vq->async_pkts_pending || !vq->async_pending_info) {
> +	if (!vq->async_pkts_pending || !vq->async_pkts_info) {
>  		if (vq->async_pkts_pending)
>  			rte_free(vq->async_pkts_pending);
>  
> -		if (vq->async_pending_info)
> -			rte_free(vq->async_pending_info);
> +		if (vq->async_pkts_info)
> +			rte_free(vq->async_pkts_info);
>  
>  		VHOST_LOG_CONFIG(ERR,
>  				"async register failed: cannot allocate memory for vq data "
> @@ -1635,9 +1635,9 @@ int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
>  		vq->async_pkts_pending = NULL;
>  	}
>  
> -	if (vq->async_pending_info) {
> -		rte_free(vq->async_pending_info);
> -		vq->async_pending_info = NULL;
> +	if (vq->async_pkts_info) {
> +		rte_free(vq->async_pkts_info);
> +		vq->async_pkts_info = NULL;
>  	}
>  
>  	vq->async_ops.transfer_data = NULL;
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 73a1ed889..155a832c1 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -46,8 +46,6 @@
>  
>  #define MAX_PKT_BURST 32
>  
> -#define ASYNC_MAX_POLL_SEG 255
> -
>  #define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST * 2)
>  #define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 2)
>  
> @@ -225,12 +223,10 @@ struct vhost_virtqueue {
>  
>  	/* async data transfer status */
>  	uintptr_t	**async_pkts_pending;
> -	#define		ASYNC_PENDING_INFO_N_MSK 0xFFFF
> -	#define		ASYNC_PENDING_INFO_N_SFT 16
> -	uint64_t	*async_pending_info;
> +	struct async_inflight_info *async_pkts_info;
>  	uint16_t	async_pkts_idx;
>  	uint16_t	async_pkts_inflight_n;
> -	uint16_t	async_last_seg_n;
> +	uint16_t	async_last_pkts_n;
>  
>  	/* vq async features */
>  	bool		async_inorder;
> diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
> index 501218e19..67d2a2d43 100644
> --- a/lib/librte_vhost/vhost_user.c
> +++ b/lib/librte_vhost/vhost_user.c
> @@ -2006,10 +2006,10 @@ vhost_user_get_vring_base(struct virtio_net **pdev,
>  		vq->shadow_used_split = NULL;
>  		if (vq->async_pkts_pending)
>  			rte_free(vq->async_pkts_pending);
> -		if (vq->async_pending_info)
> -			rte_free(vq->async_pending_info);
> +		if (vq->async_pkts_info)
> +			rte_free(vq->async_pkts_info);
>  		vq->async_pkts_pending = NULL;
> -		vq->async_pending_info = NULL;
> +		vq->async_pkts_info = NULL;
>  	}
>  
>  	rte_free(vq->batch_copy_elems);
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index bd9303c8a..68ead9a71 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -1473,37 +1473,6 @@ virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
>  		(vq_size - n_inflight + pkts_idx) & (vq_size - 1);
>  }
>  
> -static __rte_always_inline void
> -virtio_dev_rx_async_submit_split_err(struct virtio_net *dev,
> -	struct vhost_virtqueue *vq, uint16_t queue_id,
> -	uint16_t last_idx, uint16_t shadow_idx)
> -{
> -	uint16_t start_idx, pkts_idx, vq_size;
> -	uint64_t *async_pending_info;
> -
> -	pkts_idx = vq->async_pkts_idx;
> -	async_pending_info = vq->async_pending_info;
> -	vq_size = vq->size;
> -	start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
> -		vq_size, vq->async_pkts_inflight_n);
> -
> -	while (likely((start_idx & (vq_size - 1)) != pkts_idx)) {
> -		uint64_t n_seg =
> -			async_pending_info[(start_idx) & (vq_size - 1)] >>
> -			ASYNC_PENDING_INFO_N_SFT;
> -
> -		while (n_seg)
> -			n_seg -= vq->async_ops.check_completed_copies(dev->vid,
> -				queue_id, 0, 1);
> -	}
> -
> -	vq->async_pkts_inflight_n = 0;
> -	vq->batch_copy_nb_elems = 0;
> -
> -	vq->shadow_used_idx = shadow_idx;
> -	vq->last_avail_idx = last_idx;
> -}
> -
>  static __rte_noinline uint32_t
>  virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  	struct vhost_virtqueue *vq, uint16_t queue_id,
> @@ -1512,7 +1481,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
>  	uint16_t num_buffers;
>  	struct buf_vector buf_vec[BUF_VECTOR_MAX];
> -	uint16_t avail_head, last_idx, shadow_idx;
> +	uint16_t avail_head;
>  
>  	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
>  	struct iovec *vec_pool = vq->vec_pool;
> @@ -1522,11 +1491,11 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  	struct rte_vhost_iov_iter *src_it = it_pool;
>  	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
>  	uint16_t n_free_slot, slot_idx;
> +	uint16_t pkt_err = 0;
> +	struct async_inflight_info *pkts_info = vq->async_pkts_info;
>  	int n_pkts = 0;
>  
>  	avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
> -	last_idx = vq->last_avail_idx;
> -	shadow_idx = vq->shadow_used_idx;
>  
>  	/*
>  	 * The ordering between avail index and
> @@ -1565,14 +1534,14 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  		if (src_it->count) {
>  			async_fill_desc(&tdes[pkt_burst_idx], src_it, dst_it);
>  			pkt_burst_idx++;
> -			vq->async_pending_info[slot_idx] =
> -				num_buffers | (src_it->nr_segs << 16);
> +			pkts_info[slot_idx].descs = num_buffers;
> +			pkts_info[slot_idx].segs = src_it->nr_segs;
>  			src_iovec += src_it->nr_segs;
>  			dst_iovec += dst_it->nr_segs;
>  			src_it += 2;
>  			dst_it += 2;
>  		} else {
> -			vq->async_pending_info[slot_idx] = num_buffers;
> +			pkts_info[slot_idx].info = num_buffers;
>  			vq->async_pkts_inflight_n++;
>  		}
>  
> @@ -1586,35 +1555,46 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
>  			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
>  			src_it = it_pool;
>  			dst_it = it_pool + 1;
> +			vq->async_pkts_inflight_n += n_pkts;
>  
>  			if (unlikely(n_pkts < (int)pkt_burst_idx)) {
> -				vq->async_pkts_inflight_n +=
> -					n_pkts > 0 ? n_pkts : 0;
> -				virtio_dev_rx_async_submit_split_err(dev,
> -					vq, queue_id, last_idx, shadow_idx);
> -				return 0;
> +				/*
> +				 * log error packets number here and do actual
> +				 * error processing when applications poll
> +				 * completion
> +				 */
> +				pkt_err = pkt_burst_idx - n_pkts;
> +				pkt_burst_idx = 0;
> +				break;
>  			}
>  
>  			pkt_burst_idx = 0;
> -			vq->async_pkts_inflight_n += n_pkts;
>  		}
>  	}
>  
>  	if (pkt_burst_idx) {
>  		n_pkts = vq->async_ops.transfer_data(dev->vid,
>  				queue_id, tdes, 0, pkt_burst_idx);
> -		if (unlikely(n_pkts < (int)pkt_burst_idx)) {
> -			vq->async_pkts_inflight_n += n_pkts > 0 ? n_pkts : 0;
> -			virtio_dev_rx_async_submit_split_err(dev, vq, queue_id,
> -				last_idx, shadow_idx);
> -			return 0;
> -		}
> -
>  		vq->async_pkts_inflight_n += n_pkts;
> +
> +		if (unlikely(n_pkts < (int)pkt_burst_idx))
> +			pkt_err = pkt_burst_idx - n_pkts;
>  	}
>  
>  	do_data_copy_enqueue(dev, vq);
>  
> +	if (unlikely(pkt_err)) {
> +		do {
> +			if (pkts_info[slot_idx].segs)
> +				pkt_err--;
> +			vq->last_avail_idx -= pkts_info[slot_idx].descs;
> +			vq->shadow_used_idx -= pkts_info[slot_idx].descs;
> +			vq->async_pkts_inflight_n--;
> +			slot_idx--;
> +			pkt_idx--;
> +		} while (pkt_err);

That does not sound really robust.
Maybe you could exit also on slot_idx < 0 to avoid out of range
accesses?

} while (pkt_err && slot_idx >= 0)

> +	}
> +
>  	n_free_slot = vq->size - vq->async_pkts_idx;
>  	if (n_free_slot > pkt_idx) {
>  		rte_memcpy(&vq->async_pkts_pending[vq->async_pkts_idx],
> @@ -1640,10 +1620,10 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
>  {
>  	struct virtio_net *dev = get_device(vid);
>  	struct vhost_virtqueue *vq;
> -	uint16_t n_segs_cpl, n_pkts_put = 0, n_descs = 0;
> +	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
>  	uint16_t start_idx, pkts_idx, vq_size;
>  	uint16_t n_inflight;
> -	uint64_t *async_pending_info;
> +	struct async_inflight_info *pkts_info;
>  
>  	if (!dev)
>  		return 0;
> @@ -1657,51 +1637,40 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
>  
>  	vq = dev->virtqueue[queue_id];
>  
> +	if (unlikely(!vq->async_registered)) {
> +		VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
> +			dev->vid, __func__, queue_id);
> +		return 0;
> +	}
> +
>  	rte_spinlock_lock(&vq->access_lock);
>  
>  	n_inflight = vq->async_pkts_inflight_n;
>  	pkts_idx = vq->async_pkts_idx;
> -	async_pending_info = vq->async_pending_info;
> +	pkts_info = vq->async_pkts_info;
>  	vq_size = vq->size;
>  	start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
>  		vq_size, vq->async_pkts_inflight_n);
>  
> -	n_segs_cpl = vq->async_ops.check_completed_copies(vid, queue_id,
> -		0, ASYNC_MAX_POLL_SEG - vq->async_last_seg_n) +
> -		vq->async_last_seg_n;
> +	if (count > vq->async_last_pkts_n)
> +		n_pkts_cpl = vq->async_ops.check_completed_copies(vid,
> +			queue_id, 0, count - vq->async_last_pkts_n);
> +	n_pkts_cpl += vq->async_last_pkts_n;
>  
>  	rte_smp_wmb();
>  
>  	while (likely((n_pkts_put < count) && n_inflight)) {
> -		uint64_t info = async_pending_info[
> -			(start_idx + n_pkts_put) & (vq_size - 1)];
> -		uint64_t n_segs;
> +		uint16_t info_idx = (start_idx + n_pkts_put) & (vq_size - 1);
> +		if (n_pkts_cpl && pkts_info[info_idx].segs)
> +			n_pkts_cpl--;
> +		else if (!n_pkts_cpl && pkts_info[info_idx].segs)
> +			break;
>  		n_pkts_put++;
>  		n_inflight--;
> -		n_descs += info & ASYNC_PENDING_INFO_N_MSK;
> -		n_segs = info >> ASYNC_PENDING_INFO_N_SFT;
> -
> -		if (n_segs) {
> -			if (unlikely(n_segs_cpl < n_segs)) {
> -				n_pkts_put--;
> -				n_inflight++;
> -				n_descs -= info & ASYNC_PENDING_INFO_N_MSK;
> -				if (n_segs_cpl) {
> -					async_pending_info[
> -						(start_idx + n_pkts_put) &
> -						(vq_size - 1)] =
> -					((n_segs - n_segs_cpl) <<
> -					 ASYNC_PENDING_INFO_N_SFT) |
> -					(info & ASYNC_PENDING_INFO_N_MSK);
> -					n_segs_cpl = 0;
> -				}
> -				break;
> -			}
> -			n_segs_cpl -= n_segs;
> -		}
> +		n_descs += pkts_info[info_idx].descs;
>  	}
>  
> -	vq->async_last_seg_n = n_segs_cpl;
> +	vq->async_last_pkts_n = n_pkts_cpl;
>  
>  	if (n_pkts_put) {
>  		vq->async_pkts_inflight_n = n_inflight;
> @@ -1710,16 +1679,18 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
>  					n_descs, __ATOMIC_RELEASE);
>  			vhost_vring_call_split(dev, vq);
>  		}
> -	}
>  
> -	if (start_idx + n_pkts_put <= vq_size) {
> -		rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
> -			n_pkts_put * sizeof(uintptr_t));
> -	} else {
> -		rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
> -			(vq_size - start_idx) * sizeof(uintptr_t));
> -		rte_memcpy(&pkts[vq_size - start_idx], vq->async_pkts_pending,
> -			(n_pkts_put - vq_size + start_idx) * sizeof(uintptr_t));
> +		if (start_idx + n_pkts_put <= vq_size) {
> +			rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
> +				n_pkts_put * sizeof(uintptr_t));
> +		} else {
> +			rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
> +				(vq_size - start_idx) * sizeof(uintptr_t));
> +			rte_memcpy(&pkts[vq_size - start_idx],
> +				vq->async_pkts_pending,
> +				(n_pkts_put + start_idx - vq_size) *
> +				sizeof(uintptr_t));
> +		}
>  	}
>  
>  	rte_spinlock_unlock(&vq->access_lock);
> 


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [dpdk-dev] [PATCH v3 2/4] vhost: dynamically allocate async memory
  2020-09-29  9:29   ` [dpdk-dev] [PATCH v3 2/4] vhost: dynamically allocate async memory Patrick Fu
@ 2020-10-05 13:50     ` Maxime Coquelin
  0 siblings, 0 replies; 36+ messages in thread
From: Maxime Coquelin @ 2020-10-05 13:50 UTC (permalink / raw)
  To: Patrick Fu, dev, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang



On 9/29/20 11:29 AM, Patrick Fu wrote:
> Allocate async internal memory buffer by rte_malloc(), replacing array
> declaration inside vq structure. Dynamic allocation can help to save
> memory footprint when async path is not registered.
> 
> Signed-off-by: Patrick Fu <patrick.fu@intel.com>
> ---
>  lib/librte_vhost/vhost.c | 69 ++++++++++++++++++++++++++--------------
>  lib/librte_vhost/vhost.h |  4 +--
>  2 files changed, 47 insertions(+), 26 deletions(-)
> 

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [dpdk-dev] [PATCH v3 3/4] vhost: fix async vector buffer overrun
  2020-09-29  9:29   ` [dpdk-dev] [PATCH v3 3/4] vhost: fix async vector buffer overrun Patrick Fu
@ 2020-10-05 14:19     ` Maxime Coquelin
  0 siblings, 0 replies; 36+ messages in thread
From: Maxime Coquelin @ 2020-10-05 14:19 UTC (permalink / raw)
  To: Patrick Fu, dev, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang



On 9/29/20 11:29 AM, Patrick Fu wrote:
> Add check on the async vector buffer usage to prevent the buf overrun.
> If the unused vector buffer is not sufficient to prepare for next
> packet's iov creation, an async transfer will be triggered immediately
> to free the vector buffer.
> 
> Fixes: 78639d54563a ("vhost: introduce async enqueue registration API")
> 
> Signed-off-by: Patrick Fu <patrick.fu@intel.com>
> ---
>  lib/librte_vhost/vhost.h      |  2 +-
>  lib/librte_vhost/virtio_net.c | 13 ++++++++++++-
>  2 files changed, 13 insertions(+), 2 deletions(-)

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [dpdk-dev] [PATCH v3 4/4] vhost: fix async register/unregister deadlock
  2020-09-29  9:29   ` [dpdk-dev] [PATCH v3 4/4] vhost: fix async register/unregister deadlock Patrick Fu
@ 2020-10-05 14:25     ` Maxime Coquelin
  2020-10-09 10:54       ` Fu, Patrick
  0 siblings, 1 reply; 36+ messages in thread
From: Maxime Coquelin @ 2020-10-05 14:25 UTC (permalink / raw)
  To: Patrick Fu, dev, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang



On 9/29/20 11:29 AM, Patrick Fu wrote:
> When async register/unregister function is invoked in certain vhost
> event callbacks (e.g. vring state change), deadlock may occur due to
> recursive spinlock acquire. This patch removes unnecessary spinlock
> from register API and use trylock() primitive in the unregister API
> to avoid deadlock.

I am not convinced it is unnecessary in every case.


> Fixes: 78639d54563a ("vhost: introduce async enqueue registration API")
> 
> Signed-off-by: Patrick Fu <patrick.fu@intel.com>
> ---
>  lib/librte_vhost/vhost.c      | 13 +++++++------
>  lib/librte_vhost/vhost_user.c |  4 ++--
>  2 files changed, 9 insertions(+), 8 deletions(-)
> 
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index 05b578c2f..ba504a00a 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -1577,8 +1577,6 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
>  		ops->transfer_data == NULL))
>  		return -1;
>  
> -	rte_spinlock_lock(&vq->access_lock);
> -

What if a virtqueue is being destroyed while this function is called?

>  	if (unlikely(vq->async_registered)) {
>  		VHOST_LOG_CONFIG(ERR,
>  			"async register failed: channel already registered "
> @@ -1627,8 +1625,6 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
>  	vq->async_registered = true;
>  
>  reg_out:
> -	rte_spinlock_unlock(&vq->access_lock);
> -
>  	return 0;
>  }
>  
> @@ -1647,10 +1643,15 @@ int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
>  		return ret;
>  
>  	ret = 0;
> -	rte_spinlock_lock(&vq->access_lock);
>  
>  	if (!vq->async_registered)
> -		goto out;
> +		return ret;
> +
> +	if (!rte_spinlock_trylock(&vq->access_lock)) {
> +		VHOST_LOG_CONFIG(ERR, "Failed to unregister async channel. "
> +			"virt queue busy.\n");
> +		return -1;
> +	}
>  
>  	if (vq->async_pkts_inflight_n) {
>  		VHOST_LOG_CONFIG(ERR, "Failed to unregister async channel. "
> diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
> index 67d2a2d43..c8d74bde6 100644
> --- a/lib/librte_vhost/vhost_user.c
> +++ b/lib/librte_vhost/vhost_user.c
> @@ -2043,9 +2043,9 @@ vhost_user_set_vring_enable(struct virtio_net **pdev,
>  		"set queue enable: %d to qp idx: %d\n",
>  		enable, index);
>  
> -	if (!enable && dev->virtqueue[index]->async_registered) {
> +	if (enable && dev->virtqueue[index]->async_registered) {
>  		if (dev->virtqueue[index]->async_pkts_inflight_n) {
> -			VHOST_LOG_CONFIG(ERR, "failed to disable vring. "
> +			VHOST_LOG_CONFIG(ERR, "failed to enable vring. "
>  			"async inflight packets must be completed first\n");
>  			return RTE_VHOST_MSG_RESULT_ERR;
>  		}
> 


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [dpdk-dev] [PATCH v3 4/4] vhost: fix async register/unregister deadlock
  2020-10-05 14:25     ` Maxime Coquelin
@ 2020-10-09 10:54       ` Fu, Patrick
  0 siblings, 0 replies; 36+ messages in thread
From: Fu, Patrick @ 2020-10-09 10:54 UTC (permalink / raw)
  To: Maxime Coquelin, dev, Xia, Chenbo; +Cc: Wang, Zhihong, Jiang, Cheng1



> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Monday, October 5, 2020 10:26 PM
> To: Fu, Patrick <patrick.fu@intel.com>; dev@dpdk.org; Xia, Chenbo
> <chenbo.xia@intel.com>
> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Jiang, Cheng1
> <cheng1.jiang@intel.com>
> Subject: Re: [PATCH v3 4/4] vhost: fix async register/unregister deadlock
> 
> 
> 
> On 9/29/20 11:29 AM, Patrick Fu wrote:
> > When async register/unregister function is invoked in certain vhost
> > event callbacks (e.g. vring state change), deadlock may occur due to
> > recursive spinlock acquire. This patch removes unnecessary spinlock
> > from register API and use trylock() primitive in the unregister API
> > to avoid deadlock.
> 
> I am not convinced it is unnecessary in every cases.
> 
> 
> > Fixes: 78639d54563a ("vhost: introduce async enqueue registration API")
> >
> > Signed-off-by: Patrick Fu <patrick.fu@intel.com>
> > ---
> >  lib/librte_vhost/vhost.c      | 13 +++++++------
> >  lib/librte_vhost/vhost_user.c |  4 ++--
> >  2 files changed, 9 insertions(+), 8 deletions(-)
> >
> > diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> > index 05b578c2f..ba504a00a 100644
> > --- a/lib/librte_vhost/vhost.c
> > +++ b/lib/librte_vhost/vhost.c
> > @@ -1577,8 +1577,6 @@ int rte_vhost_async_channel_register(int vid,
> uint16_t queue_id,
> >  		ops->transfer_data == NULL))
> >  		return -1;
> >
> > -	rte_spinlock_lock(&vq->access_lock);
> > -
> 
> What if a virtqueue is being destroyed while this function is called?
> 

Yes, this is a scenario where access_lock may help. I will add the spinlock back in v4.

Thanks,

Patrick

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/4] vhost: simplify async copy completion
  2020-10-05 13:46     ` Maxime Coquelin
@ 2020-10-09 11:16       ` Fu, Patrick
  0 siblings, 0 replies; 36+ messages in thread
From: Fu, Patrick @ 2020-10-09 11:16 UTC (permalink / raw)
  To: Maxime Coquelin, dev, Xia, Chenbo; +Cc: Wang, Zhihong, Jiang, Cheng1

> > On 9/29/20 11:29 AM, Patrick Fu wrote:
> > Current async ops allows check_completed_copies() callback to return
> > arbitrary number of async iov segments finished from backend async
> > devices. This design creates complexity for vhost to handle breaking
> > transfer of a single packet (i.e. transfer completes in the middle
> > of a async descriptor) and prevents application callbacks from
> > leveraging hardware capability to offload the work. Thus, this patch
> > enforces the check_completed_copies() callback to return the number
> > of async memory descriptors, which is aligned with async transfer
> > data ops callbacks. vHost async data path are revised to work with
> > new ops define, which provides a clean and simplified processing.
> >
> > Signed-off-by: Patrick Fu <patrick.fu@intel.com>
> > ---
> >  lib/librte_vhost/rte_vhost_async.h |  15 ++-
> >  lib/librte_vhost/vhost.c           |  20 ++--
> >  lib/librte_vhost/vhost.h           |   8 +-
> >  lib/librte_vhost/vhost_user.c      |   6 +-
> >  lib/librte_vhost/virtio_net.c      | 151 ++++++++++++-----------------
> >  5 files changed, 90 insertions(+), 110 deletions(-)
> >
> > diff --git a/lib/librte_vhost/rte_vhost_async.h
> b/lib/librte_vhost/rte_vhost_async.h
> > index cb6284539..c73bd7c99 100644
> > --- a/lib/librte_vhost/rte_vhost_async.h
> > +++ b/lib/librte_vhost/rte_vhost_async.h
> > @@ -76,13 +76,26 @@ struct rte_vhost_async_channel_ops {
> >  	 * @param max_packets
> >  	 *  max number of packets could be completed
> >  	 * @return
> > -	 *  number of iov segments completed
> > +	 *  number of async descs completed
> >  	 */
> >  	uint32_t (*check_completed_copies)(int vid, uint16_t queue_id,
> >  		struct rte_vhost_async_status *opaque_data,
> >  		uint16_t max_packets);
> >  };
> >
> > +/**
> > + * inflight async packet information
> > + */
> > +struct async_inflight_info {
> > +	union {
> > +		uint32_t info;
> > +		struct {
> > +			uint16_t descs; /* num of descs inflight */
> > +			uint16_t segs; /* iov segs inflight */
> > +		};
> > +	};
> 
> I think you can removing the union.
> 
There are some cases in the code where we want to set descs=N and segs=0. In such cases, assigning info=N directly is faster than setting descs and segs separately. So I would prefer to keep the union.

> > +};
> > +
> >  /**
> >   *  dma channel feature bit definition
> >   */
> > diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> > index 8f20a0818..eca507836 100644
> > --- a/lib/librte_vhost/vhost.c
> > +++ b/lib/librte_vhost/vhost.c
> > @@ -333,8 +333,8 @@ free_vq(struct virtio_net *dev, struct
> vhost_virtqueue *vq)
> >  		rte_free(vq->shadow_used_split);
> >  		if (vq->async_pkts_pending)
> >  			rte_free(vq->async_pkts_pending);
> > -		if (vq->async_pending_info)
> > -			rte_free(vq->async_pending_info);
> > +		if (vq->async_pkts_info)
> > +			rte_free(vq->async_pkts_info);
> >  	}
> >  	rte_free(vq->batch_copy_elems);
> >  	rte_mempool_free(vq->iotlb_pool);
> > @@ -1573,15 +1573,15 @@ int rte_vhost_async_channel_register(int vid,
> uint16_t queue_id,
> >  	vq->async_pkts_pending = rte_malloc(NULL,
> >  			vq->size * sizeof(uintptr_t),
> >  			RTE_CACHE_LINE_SIZE);
> > -	vq->async_pending_info = rte_malloc(NULL,
> > -			vq->size * sizeof(uint64_t),
> > +	vq->async_pkts_info = rte_malloc(NULL,
> > +			vq->size * sizeof(struct async_inflight_info),
> >  			RTE_CACHE_LINE_SIZE);
> > -	if (!vq->async_pkts_pending || !vq->async_pending_info) {
> > +	if (!vq->async_pkts_pending || !vq->async_pkts_info) {
> >  		if (vq->async_pkts_pending)
> >  			rte_free(vq->async_pkts_pending);
> >
> > -		if (vq->async_pending_info)
> > -			rte_free(vq->async_pending_info);
> > +		if (vq->async_pkts_info)
> > +			rte_free(vq->async_pkts_info);
> >
> >  		VHOST_LOG_CONFIG(ERR,
> >  				"async register failed: cannot allocate
> memory for vq data "
> > @@ -1635,9 +1635,9 @@ int rte_vhost_async_channel_unregister(int vid,
> uint16_t queue_id)
> >  		vq->async_pkts_pending = NULL;
> >  	}
> >
> > -	if (vq->async_pending_info) {
> > -		rte_free(vq->async_pending_info);
> > -		vq->async_pending_info = NULL;
> > +	if (vq->async_pkts_info) {
> > +		rte_free(vq->async_pkts_info);
> > +		vq->async_pkts_info = NULL;
> >  	}
> >
> >  	vq->async_ops.transfer_data = NULL;
> > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> > index 73a1ed889..155a832c1 100644
> > --- a/lib/librte_vhost/vhost.h
> > +++ b/lib/librte_vhost/vhost.h
> > @@ -46,8 +46,6 @@
> >
> >  #define MAX_PKT_BURST 32
> >
> > -#define ASYNC_MAX_POLL_SEG 255
> > -
> >  #define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST * 2)
> >  #define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 2)
> >
> > @@ -225,12 +223,10 @@ struct vhost_virtqueue {
> >
> >  	/* async data transfer status */
> >  	uintptr_t	**async_pkts_pending;
> > -	#define		ASYNC_PENDING_INFO_N_MSK 0xFFFF
> > -	#define		ASYNC_PENDING_INFO_N_SFT 16
> > -	uint64_t	*async_pending_info;
> > +	struct async_inflight_info *async_pkts_info;
> >  	uint16_t	async_pkts_idx;
> >  	uint16_t	async_pkts_inflight_n;
> > -	uint16_t	async_last_seg_n;
> > +	uint16_t	async_last_pkts_n;
> >
> >  	/* vq async features */
> >  	bool		async_inorder;
> > diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
> > index 501218e19..67d2a2d43 100644
> > --- a/lib/librte_vhost/vhost_user.c
> > +++ b/lib/librte_vhost/vhost_user.c
> > @@ -2006,10 +2006,10 @@ vhost_user_get_vring_base(struct virtio_net
> **pdev,
> >  		vq->shadow_used_split = NULL;
> >  		if (vq->async_pkts_pending)
> >  			rte_free(vq->async_pkts_pending);
> > -		if (vq->async_pending_info)
> > -			rte_free(vq->async_pending_info);
> > +		if (vq->async_pkts_info)
> > +			rte_free(vq->async_pkts_info);
> >  		vq->async_pkts_pending = NULL;
> > -		vq->async_pending_info = NULL;
> > +		vq->async_pkts_info = NULL;
> >  	}
> >
> >  	rte_free(vq->batch_copy_elems);
> > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> > index bd9303c8a..68ead9a71 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -1473,37 +1473,6 @@ virtio_dev_rx_async_get_info_idx(uint16_t
> pkts_idx,
> >  		(vq_size - n_inflight + pkts_idx) & (vq_size - 1);
> >  }
> >
> > -static __rte_always_inline void
> > -virtio_dev_rx_async_submit_split_err(struct virtio_net *dev,
> > -	struct vhost_virtqueue *vq, uint16_t queue_id,
> > -	uint16_t last_idx, uint16_t shadow_idx)
> > -{
> > -	uint16_t start_idx, pkts_idx, vq_size;
> > -	uint64_t *async_pending_info;
> > -
> > -	pkts_idx = vq->async_pkts_idx;
> > -	async_pending_info = vq->async_pending_info;
> > -	vq_size = vq->size;
> > -	start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
> > -		vq_size, vq->async_pkts_inflight_n);
> > -
> > -	while (likely((start_idx & (vq_size - 1)) != pkts_idx)) {
> > -		uint64_t n_seg =
> > -			async_pending_info[(start_idx) & (vq_size - 1)] >>
> > -			ASYNC_PENDING_INFO_N_SFT;
> > -
> > -		while (n_seg)
> > -			n_seg -= vq-
> >async_ops.check_completed_copies(dev->vid,
> > -				queue_id, 0, 1);
> > -	}
> > -
> > -	vq->async_pkts_inflight_n = 0;
> > -	vq->batch_copy_nb_elems = 0;
> > -
> > -	vq->shadow_used_idx = shadow_idx;
> > -	vq->last_avail_idx = last_idx;
> > -}
> > -
> >  static __rte_noinline uint32_t
> >  virtio_dev_rx_async_submit_split(struct virtio_net *dev,
> >  	struct vhost_virtqueue *vq, uint16_t queue_id,
> > @@ -1512,7 +1481,7 @@ virtio_dev_rx_async_submit_split(struct
> virtio_net *dev,
> >  	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
> >  	uint16_t num_buffers;
> >  	struct buf_vector buf_vec[BUF_VECTOR_MAX];
> > -	uint16_t avail_head, last_idx, shadow_idx;
> > +	uint16_t avail_head;
> >
> >  	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
> >  	struct iovec *vec_pool = vq->vec_pool;
> > @@ -1522,11 +1491,11 @@ virtio_dev_rx_async_submit_split(struct
> virtio_net *dev,
> >  	struct rte_vhost_iov_iter *src_it = it_pool;
> >  	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
> >  	uint16_t n_free_slot, slot_idx;
> > +	uint16_t pkt_err = 0;
> > +	struct async_inflight_info *pkts_info = vq->async_pkts_info;
> >  	int n_pkts = 0;
> >
> >  	avail_head = __atomic_load_n(&vq->avail->idx,
> __ATOMIC_ACQUIRE);
> > -	last_idx = vq->last_avail_idx;
> > -	shadow_idx = vq->shadow_used_idx;
> >
> >  	/*
> >  	 * The ordering between avail index and
> > @@ -1565,14 +1534,14 @@ virtio_dev_rx_async_submit_split(struct
> virtio_net *dev,
> >  		if (src_it->count) {
> >  			async_fill_desc(&tdes[pkt_burst_idx], src_it, dst_it);
> >  			pkt_burst_idx++;
> > -			vq->async_pending_info[slot_idx] =
> > -				num_buffers | (src_it->nr_segs << 16);
> > +			pkts_info[slot_idx].descs = num_buffers;
> > +			pkts_info[slot_idx].segs = src_it->nr_segs;
> >  			src_iovec += src_it->nr_segs;
> >  			dst_iovec += dst_it->nr_segs;
> >  			src_it += 2;
> >  			dst_it += 2;
> >  		} else {
> > -			vq->async_pending_info[slot_idx] = num_buffers;
> > +			pkts_info[slot_idx].info = num_buffers;
> >  			vq->async_pkts_inflight_n++;
> >  		}
> >
> > @@ -1586,35 +1555,46 @@ virtio_dev_rx_async_submit_split(struct
> virtio_net *dev,
> >  			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >>
> 1);
> >  			src_it = it_pool;
> >  			dst_it = it_pool + 1;
> > +			vq->async_pkts_inflight_n += n_pkts;
> >
> >  			if (unlikely(n_pkts < (int)pkt_burst_idx)) {
> > -				vq->async_pkts_inflight_n +=
> > -					n_pkts > 0 ? n_pkts : 0;
> > -				virtio_dev_rx_async_submit_split_err(dev,
> > -					vq, queue_id, last_idx, shadow_idx);
> > -				return 0;
> > +				/*
> > +				 * log error packets number here and do
> actual
> > +				 * error processing when applications poll
> > +				 * completion
> > +				 */
> > +				pkt_err = pkt_burst_idx - n_pkts;
> > +				pkt_burst_idx = 0;
> > +				break;
> >  			}
> >
> >  			pkt_burst_idx = 0;
> > -			vq->async_pkts_inflight_n += n_pkts;
> >  		}
> >  	}
> >
> >  	if (pkt_burst_idx) {
> >  		n_pkts = vq->async_ops.transfer_data(dev->vid,
> >  				queue_id, tdes, 0, pkt_burst_idx);
> > -		if (unlikely(n_pkts < (int)pkt_burst_idx)) {
> > -			vq->async_pkts_inflight_n += n_pkts > 0 ? n_pkts : 0;
> > -			virtio_dev_rx_async_submit_split_err(dev, vq,
> queue_id,
> > -				last_idx, shadow_idx);
> > -			return 0;
> > -		}
> > -
> >  		vq->async_pkts_inflight_n += n_pkts;
> > +
> > +		if (unlikely(n_pkts < (int)pkt_burst_idx))
> > +			pkt_err = pkt_burst_idx - n_pkts;
> >  	}
> >
> >  	do_data_copy_enqueue(dev, vq);
> >
> > +	if (unlikely(pkt_err)) {
> > +		do {
> > +			if (pkts_info[slot_idx].segs)
> > +				pkt_err--;
> > +			vq->last_avail_idx -= pkts_info[slot_idx].descs;
> > +			vq->shadow_used_idx -= pkts_info[slot_idx].descs;
> > +			vq->async_pkts_inflight_n--;
> > +			slot_idx--;
> > +			pkt_idx--;
> > +		} while (pkt_err);
> 
> That does not sound really robust.
> Maybe you could exit also on slot_idx < 0 to avoid out of range
> accesses?
> 
> } while (pkt_err && slot_idx >= 0)
>
slot_idx is an unsigned type and is designed to work like a ring index. But I do need to send a new patch to handle the case where "slot_idx==0".
For robustness, I can add a check: "pkt_idx > 0".

Thanks,

Patrick


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [dpdk-dev] [PATCH v4 0/4] optimize async data path
  2020-09-11  1:53 [dpdk-dev] [PATCH v1 0/4] optimize async data path Patrick Fu
                   ` (5 preceding siblings ...)
  2020-09-29  9:29 ` [dpdk-dev] [PATCH v3 0/4] optimize async data path Patrick Fu
@ 2020-10-13  1:45 ` Patrick Fu
  2020-10-13  1:45   ` [dpdk-dev] [PATCH v4 1/4] vhost: simplify async copy completion Patrick Fu
                     ` (4 more replies)
  6 siblings, 5 replies; 36+ messages in thread
From: Patrick Fu @ 2020-10-13  1:45 UTC (permalink / raw)
  To: dev, maxime.coquelin, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang, patrick.fu

This series applies optimization and fixes to the vhost
async data path.

v4:
 - fix transfer error handling in async submit function (patch 1/4)
 - add spinlock in async register function (patch 4/4)
 - no changes in patch 2/4 & 3/4

v3:
 - fix a typo in vhost error log (checkpatch warning)
 - fix travis-robot ci build warning on aarch64

v2:
 - minor rewordings on commit message
 - minor fix in poll_enqueue_completed to correct a packet
   number calculation issue
 - allocate async buffer memory on the same numa with vq
 - add some comments in data path to improve readability

Patrick Fu (4):
  vhost: simplify async copy completion
  vhost: dynamically allocate async memory
  vhost: fix async vector buffer overrun
  vhost: fix async unregister deadlock

 lib/librte_vhost/rte_vhost_async.h |  15 ++-
 lib/librte_vhost/vhost.c           |  80 +++++++++-----
 lib/librte_vhost/vhost.h           |  14 +--
 lib/librte_vhost/vhost_user.c      |  10 +-
 lib/librte_vhost/virtio_net.c      | 162 +++++++++++++----------------
 5 files changed, 148 insertions(+), 133 deletions(-)

-- 
2.18.4


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [dpdk-dev] [PATCH v4 1/4] vhost: simplify async copy completion
  2020-10-13  1:45 ` [dpdk-dev] [PATCH v4 0/4] optimize async data path Patrick Fu
@ 2020-10-13  1:45   ` Patrick Fu
  2020-10-14  9:28     ` Maxime Coquelin
  2020-10-13  1:45   ` [dpdk-dev] [PATCH v4 2/4] vhost: dynamically allocate async memory Patrick Fu
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 36+ messages in thread
From: Patrick Fu @ 2020-10-13  1:45 UTC (permalink / raw)
  To: dev, maxime.coquelin, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang, patrick.fu

The current async ops allow the check_completed_copies() callback to
return an arbitrary number of async iov segments finished by backend
async devices. This design makes it complex for vhost to handle a
partially completed transfer of a single packet (i.e. a transfer that
completes in the middle of an async descriptor) and prevents application
callbacks from leveraging hardware capability to offload the work. Thus,
this patch requires the check_completed_copies() callback to return the
number of async memory descriptors, which is aligned with the async
transfer data ops callbacks. The vhost async data path is revised to work
with the new ops definition, which provides clean and simplified
processing.

Signed-off-by: Patrick Fu <patrick.fu@intel.com>
---
 lib/librte_vhost/rte_vhost_async.h |  15 ++-
 lib/librte_vhost/vhost.c           |  20 ++--
 lib/librte_vhost/vhost.h           |   8 +-
 lib/librte_vhost/vhost_user.c      |   6 +-
 lib/librte_vhost/virtio_net.c      | 149 ++++++++++++-----------------
 5 files changed, 88 insertions(+), 110 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost_async.h b/lib/librte_vhost/rte_vhost_async.h
index cb6284539..c73bd7c99 100644
--- a/lib/librte_vhost/rte_vhost_async.h
+++ b/lib/librte_vhost/rte_vhost_async.h
@@ -76,13 +76,26 @@ struct rte_vhost_async_channel_ops {
 	 * @param max_packets
 	 *  max number of packets could be completed
 	 * @return
-	 *  number of iov segments completed
+	 *  number of async descs completed
 	 */
 	uint32_t (*check_completed_copies)(int vid, uint16_t queue_id,
 		struct rte_vhost_async_status *opaque_data,
 		uint16_t max_packets);
 };
 
+/**
+ * inflight async packet information
+ */
+struct async_inflight_info {
+	union {
+		uint32_t info;
+		struct {
+			uint16_t descs; /* num of descs inflight */
+			uint16_t segs; /* iov segs inflight */
+		};
+	};
+};
+
 /**
  *  dma channel feature bit definition
  */
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index c7cd34e42..80adc8e59 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -333,8 +333,8 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
 		rte_free(vq->shadow_used_split);
 		if (vq->async_pkts_pending)
 			rte_free(vq->async_pkts_pending);
-		if (vq->async_pending_info)
-			rte_free(vq->async_pending_info);
+		if (vq->async_pkts_info)
+			rte_free(vq->async_pkts_info);
 	}
 	rte_free(vq->batch_copy_elems);
 	rte_mempool_free(vq->iotlb_pool);
@@ -1559,15 +1559,15 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	vq->async_pkts_pending = rte_malloc(NULL,
 			vq->size * sizeof(uintptr_t),
 			RTE_CACHE_LINE_SIZE);
-	vq->async_pending_info = rte_malloc(NULL,
-			vq->size * sizeof(uint64_t),
+	vq->async_pkts_info = rte_malloc(NULL,
+			vq->size * sizeof(struct async_inflight_info),
 			RTE_CACHE_LINE_SIZE);
-	if (!vq->async_pkts_pending || !vq->async_pending_info) {
+	if (!vq->async_pkts_pending || !vq->async_pkts_info) {
 		if (vq->async_pkts_pending)
 			rte_free(vq->async_pkts_pending);
 
-		if (vq->async_pending_info)
-			rte_free(vq->async_pending_info);
+		if (vq->async_pkts_info)
+			rte_free(vq->async_pkts_info);
 
 		VHOST_LOG_CONFIG(ERR,
 				"async register failed: cannot allocate memory for vq data "
@@ -1621,9 +1621,9 @@ int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
 		vq->async_pkts_pending = NULL;
 	}
 
-	if (vq->async_pending_info) {
-		rte_free(vq->async_pending_info);
-		vq->async_pending_info = NULL;
+	if (vq->async_pkts_info) {
+		rte_free(vq->async_pkts_info);
+		vq->async_pkts_info = NULL;
 	}
 
 	vq->async_ops.transfer_data = NULL;
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 20ccdc9bd..08bfc41b3 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -46,8 +46,6 @@
 
 #define MAX_PKT_BURST 32
 
-#define ASYNC_MAX_POLL_SEG 255
-
 #define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST * 2)
 #define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 2)
 
@@ -205,12 +203,10 @@ struct vhost_virtqueue {
 
 	/* async data transfer status */
 	uintptr_t	**async_pkts_pending;
-	#define		ASYNC_PENDING_INFO_N_MSK 0xFFFF
-	#define		ASYNC_PENDING_INFO_N_SFT 16
-	uint64_t	*async_pending_info;
+	struct async_inflight_info *async_pkts_info;
 	uint16_t	async_pkts_idx;
 	uint16_t	async_pkts_inflight_n;
-	uint16_t	async_last_seg_n;
+	uint16_t	async_last_pkts_n;
 
 	/* vq async features */
 	bool		async_inorder;
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index 5d1fb9e86..1656ec736 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -1943,10 +1943,10 @@ vhost_user_get_vring_base(struct virtio_net **pdev,
 		vq->shadow_used_split = NULL;
 		if (vq->async_pkts_pending)
 			rte_free(vq->async_pkts_pending);
-		if (vq->async_pending_info)
-			rte_free(vq->async_pending_info);
+		if (vq->async_pkts_info)
+			rte_free(vq->async_pkts_info);
 		vq->async_pkts_pending = NULL;
-		vq->async_pending_info = NULL;
+		vq->async_pkts_info = NULL;
 	}
 
 	rte_free(vq->batch_copy_elems);
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 1b233279c..19ac92c85 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1474,37 +1474,6 @@ virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
 		(vq_size - n_inflight + pkts_idx) & (vq_size - 1);
 }
 
-static __rte_always_inline void
-virtio_dev_rx_async_submit_split_err(struct virtio_net *dev,
-	struct vhost_virtqueue *vq, uint16_t queue_id,
-	uint16_t last_idx, uint16_t shadow_idx)
-{
-	uint16_t start_idx, pkts_idx, vq_size;
-	uint64_t *async_pending_info;
-
-	pkts_idx = vq->async_pkts_idx;
-	async_pending_info = vq->async_pending_info;
-	vq_size = vq->size;
-	start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
-		vq_size, vq->async_pkts_inflight_n);
-
-	while (likely((start_idx & (vq_size - 1)) != pkts_idx)) {
-		uint64_t n_seg =
-			async_pending_info[(start_idx) & (vq_size - 1)] >>
-			ASYNC_PENDING_INFO_N_SFT;
-
-		while (n_seg)
-			n_seg -= vq->async_ops.check_completed_copies(dev->vid,
-				queue_id, 0, 1);
-	}
-
-	vq->async_pkts_inflight_n = 0;
-	vq->batch_copy_nb_elems = 0;
-
-	vq->shadow_used_idx = shadow_idx;
-	vq->last_avail_idx = last_idx;
-}
-
 static __rte_noinline uint32_t
 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct vhost_virtqueue *vq, uint16_t queue_id,
@@ -1513,7 +1482,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
 	uint16_t num_buffers;
 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
-	uint16_t avail_head, last_idx, shadow_idx;
+	uint16_t avail_head;
 
 	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
 	struct iovec *vec_pool = vq->vec_pool;
@@ -1523,11 +1492,11 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct rte_vhost_iov_iter *src_it = it_pool;
 	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
 	uint16_t n_free_slot, slot_idx;
+	uint16_t pkt_err = 0;
+	struct async_inflight_info *pkts_info = vq->async_pkts_info;
 	int n_pkts = 0;
 
 	avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
-	last_idx = vq->last_avail_idx;
-	shadow_idx = vq->shadow_used_idx;
 
 	/*
 	 * The ordering between avail index and
@@ -1566,14 +1535,14 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 		if (src_it->count) {
 			async_fill_desc(&tdes[pkt_burst_idx], src_it, dst_it);
 			pkt_burst_idx++;
-			vq->async_pending_info[slot_idx] =
-				num_buffers | (src_it->nr_segs << 16);
+			pkts_info[slot_idx].descs = num_buffers;
+			pkts_info[slot_idx].segs = src_it->nr_segs;
 			src_iovec += src_it->nr_segs;
 			dst_iovec += dst_it->nr_segs;
 			src_it += 2;
 			dst_it += 2;
 		} else {
-			vq->async_pending_info[slot_idx] = num_buffers;
+			pkts_info[slot_idx].info = num_buffers;
 			vq->async_pkts_inflight_n++;
 		}
 
@@ -1587,35 +1556,44 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
 			src_it = it_pool;
 			dst_it = it_pool + 1;
+			vq->async_pkts_inflight_n += n_pkts;
 
 			if (unlikely(n_pkts < (int)pkt_burst_idx)) {
-				vq->async_pkts_inflight_n +=
-					n_pkts > 0 ? n_pkts : 0;
-				virtio_dev_rx_async_submit_split_err(dev,
-					vq, queue_id, last_idx, shadow_idx);
-				return 0;
+				/*
+				 * log error packets number here and do actual
+				 * error processing when applications poll
+				 * completion
+				 */
+				pkt_err = pkt_burst_idx - n_pkts;
+				pkt_burst_idx = 0;
+				break;
 			}
 
 			pkt_burst_idx = 0;
-			vq->async_pkts_inflight_n += n_pkts;
 		}
 	}
 
 	if (pkt_burst_idx) {
 		n_pkts = vq->async_ops.transfer_data(dev->vid,
 				queue_id, tdes, 0, pkt_burst_idx);
-		if (unlikely(n_pkts < (int)pkt_burst_idx)) {
-			vq->async_pkts_inflight_n += n_pkts > 0 ? n_pkts : 0;
-			virtio_dev_rx_async_submit_split_err(dev, vq, queue_id,
-				last_idx, shadow_idx);
-			return 0;
-		}
-
 		vq->async_pkts_inflight_n += n_pkts;
+
+		if (unlikely(n_pkts < (int)pkt_burst_idx))
+			pkt_err = pkt_burst_idx - n_pkts;
 	}
 
 	do_data_copy_enqueue(dev, vq);
 
+	while (unlikely(pkt_err && pkt_idx)) {
+		if (pkts_info[slot_idx].segs)
+			pkt_err--;
+		vq->last_avail_idx -= pkts_info[slot_idx].descs;
+		vq->shadow_used_idx -= pkts_info[slot_idx].descs;
+		vq->async_pkts_inflight_n--;
+		slot_idx = (slot_idx - 1) & (vq->size - 1);
+		pkt_idx--;
+	}
+
 	n_free_slot = vq->size - vq->async_pkts_idx;
 	if (n_free_slot > pkt_idx) {
 		rte_memcpy(&vq->async_pkts_pending[vq->async_pkts_idx],
@@ -1641,10 +1619,10 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 {
 	struct virtio_net *dev = get_device(vid);
 	struct vhost_virtqueue *vq;
-	uint16_t n_segs_cpl, n_pkts_put = 0, n_descs = 0;
+	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
 	uint16_t start_idx, pkts_idx, vq_size;
 	uint16_t n_inflight;
-	uint64_t *async_pending_info;
+	struct async_inflight_info *pkts_info;
 
 	if (!dev)
 		return 0;
@@ -1658,51 +1636,40 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 
 	vq = dev->virtqueue[queue_id];
 
+	if (unlikely(!vq->async_registered)) {
+		VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
+			dev->vid, __func__, queue_id);
+		return 0;
+	}
+
 	rte_spinlock_lock(&vq->access_lock);
 
 	n_inflight = vq->async_pkts_inflight_n;
 	pkts_idx = vq->async_pkts_idx;
-	async_pending_info = vq->async_pending_info;
+	pkts_info = vq->async_pkts_info;
 	vq_size = vq->size;
 	start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
 		vq_size, vq->async_pkts_inflight_n);
 
-	n_segs_cpl = vq->async_ops.check_completed_copies(vid, queue_id,
-		0, ASYNC_MAX_POLL_SEG - vq->async_last_seg_n) +
-		vq->async_last_seg_n;
+	if (count > vq->async_last_pkts_n)
+		n_pkts_cpl = vq->async_ops.check_completed_copies(vid,
+			queue_id, 0, count - vq->async_last_pkts_n);
+	n_pkts_cpl += vq->async_last_pkts_n;
 
 	rte_smp_wmb();
 
 	while (likely((n_pkts_put < count) && n_inflight)) {
-		uint64_t info = async_pending_info[
-			(start_idx + n_pkts_put) & (vq_size - 1)];
-		uint64_t n_segs;
+		uint16_t info_idx = (start_idx + n_pkts_put) & (vq_size - 1);
+		if (n_pkts_cpl && pkts_info[info_idx].segs)
+			n_pkts_cpl--;
+		else if (!n_pkts_cpl && pkts_info[info_idx].segs)
+			break;
 		n_pkts_put++;
 		n_inflight--;
-		n_descs += info & ASYNC_PENDING_INFO_N_MSK;
-		n_segs = info >> ASYNC_PENDING_INFO_N_SFT;
-
-		if (n_segs) {
-			if (unlikely(n_segs_cpl < n_segs)) {
-				n_pkts_put--;
-				n_inflight++;
-				n_descs -= info & ASYNC_PENDING_INFO_N_MSK;
-				if (n_segs_cpl) {
-					async_pending_info[
-						(start_idx + n_pkts_put) &
-						(vq_size - 1)] =
-					((n_segs - n_segs_cpl) <<
-					 ASYNC_PENDING_INFO_N_SFT) |
-					(info & ASYNC_PENDING_INFO_N_MSK);
-					n_segs_cpl = 0;
-				}
-				break;
-			}
-			n_segs_cpl -= n_segs;
-		}
+		n_descs += pkts_info[info_idx].descs;
 	}
 
-	vq->async_last_seg_n = n_segs_cpl;
+	vq->async_last_pkts_n = n_pkts_cpl;
 
 	if (n_pkts_put) {
 		vq->async_pkts_inflight_n = n_inflight;
@@ -1711,16 +1678,18 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 					n_descs, __ATOMIC_RELEASE);
 			vhost_vring_call_split(dev, vq);
 		}
-	}
 
-	if (start_idx + n_pkts_put <= vq_size) {
-		rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
-			n_pkts_put * sizeof(uintptr_t));
-	} else {
-		rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
-			(vq_size - start_idx) * sizeof(uintptr_t));
-		rte_memcpy(&pkts[vq_size - start_idx], vq->async_pkts_pending,
-			(n_pkts_put - vq_size + start_idx) * sizeof(uintptr_t));
+		if (start_idx + n_pkts_put <= vq_size) {
+			rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
+				n_pkts_put * sizeof(uintptr_t));
+		} else {
+			rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
+				(vq_size - start_idx) * sizeof(uintptr_t));
+			rte_memcpy(&pkts[vq_size - start_idx],
+				vq->async_pkts_pending,
+				(n_pkts_put + start_idx - vq_size) *
+				sizeof(uintptr_t));
+		}
 	}
 
 	rte_spinlock_unlock(&vq->access_lock);
-- 
2.18.4


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [dpdk-dev] [PATCH v4 2/4] vhost: dynamically allocate async memory
  2020-10-13  1:45 ` [dpdk-dev] [PATCH v4 0/4] optimize async data path Patrick Fu
  2020-10-13  1:45   ` [dpdk-dev] [PATCH v4 1/4] vhost: simplify async copy completion Patrick Fu
@ 2020-10-13  1:45   ` Patrick Fu
  2020-10-14  9:30     ` Maxime Coquelin
  2020-10-13  1:45   ` [dpdk-dev] [PATCH v4 3/4] vhost: fix async vector buffer overrun Patrick Fu
                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 36+ messages in thread
From: Patrick Fu @ 2020-10-13  1:45 UTC (permalink / raw)
  To: dev, maxime.coquelin, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang, patrick.fu

Allocate async internal memory buffer by rte_malloc(), replacing array
declaration inside vq structure. Dynamic allocation can help to save
memory footprint when async path is not registered.

Signed-off-by: Patrick Fu <patrick.fu@intel.com>
---
 lib/librte_vhost/vhost.c | 69 ++++++++++++++++++++++++++--------------
 lib/librte_vhost/vhost.h |  4 +--
 2 files changed, 47 insertions(+), 26 deletions(-)

diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 80adc8e59..323565898 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -324,6 +324,24 @@ cleanup_device(struct virtio_net *dev, int destroy)
 	}
 }
 
+static void
+vhost_free_async_mem(struct vhost_virtqueue *vq)
+{
+	if (vq->async_pkts_pending)
+		rte_free(vq->async_pkts_pending);
+	if (vq->async_pkts_info)
+		rte_free(vq->async_pkts_info);
+	if (vq->it_pool)
+		rte_free(vq->it_pool);
+	if (vq->vec_pool)
+		rte_free(vq->vec_pool);
+
+	vq->async_pkts_pending = NULL;
+	vq->async_pkts_info = NULL;
+	vq->it_pool = NULL;
+	vq->vec_pool = NULL;
+}
+
 void
 free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
 {
@@ -331,10 +349,7 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
 		rte_free(vq->shadow_used_packed);
 	else {
 		rte_free(vq->shadow_used_split);
-		if (vq->async_pkts_pending)
-			rte_free(vq->async_pkts_pending);
-		if (vq->async_pkts_info)
-			rte_free(vq->async_pkts_info);
+		vhost_free_async_mem(vq);
 	}
 	rte_free(vq->batch_copy_elems);
 	rte_mempool_free(vq->iotlb_pool);
@@ -1524,6 +1539,7 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 	struct vhost_virtqueue *vq;
 	struct virtio_net *dev = get_device(vid);
 	struct rte_vhost_async_features f;
+	int node;
 
 	if (dev == NULL || ops == NULL)
 		return -1;
@@ -1556,19 +1572,32 @@ int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
 		goto reg_out;
 	}
 
-	vq->async_pkts_pending = rte_malloc(NULL,
+#ifdef RTE_LIBRTE_VHOST_NUMA
+	if (get_mempolicy(&node, NULL, 0, vq, MPOL_F_NODE | MPOL_F_ADDR)) {
+		VHOST_LOG_CONFIG(ERR,
+			"unable to get numa information in async register. "
+			"allocating async buffer memory on the caller thread node\n");
+		node = SOCKET_ID_ANY;
+	}
+#else
+	node = SOCKET_ID_ANY;
+#endif
+
+	vq->async_pkts_pending = rte_malloc_socket(NULL,
 			vq->size * sizeof(uintptr_t),
-			RTE_CACHE_LINE_SIZE);
-	vq->async_pkts_info = rte_malloc(NULL,
+			RTE_CACHE_LINE_SIZE, node);
+	vq->async_pkts_info = rte_malloc_socket(NULL,
 			vq->size * sizeof(struct async_inflight_info),
-			RTE_CACHE_LINE_SIZE);
-	if (!vq->async_pkts_pending || !vq->async_pkts_info) {
-		if (vq->async_pkts_pending)
-			rte_free(vq->async_pkts_pending);
-
-		if (vq->async_pkts_info)
-			rte_free(vq->async_pkts_info);
-
+			RTE_CACHE_LINE_SIZE, node);
+	vq->it_pool = rte_malloc_socket(NULL,
+			VHOST_MAX_ASYNC_IT * sizeof(struct rte_vhost_iov_iter),
+			RTE_CACHE_LINE_SIZE, node);
+	vq->vec_pool = rte_malloc_socket(NULL,
+			VHOST_MAX_ASYNC_VEC * sizeof(struct iovec),
+			RTE_CACHE_LINE_SIZE, node);
+	if (!vq->async_pkts_pending || !vq->async_pkts_info ||
+		!vq->it_pool || !vq->vec_pool) {
+		vhost_free_async_mem(vq);
 		VHOST_LOG_CONFIG(ERR,
 				"async register failed: cannot allocate memory for vq data "
 				"(vid %d, qid: %d)\n", vid, queue_id);
@@ -1616,15 +1645,7 @@ int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
 		goto out;
 	}
 
-	if (vq->async_pkts_pending) {
-		rte_free(vq->async_pkts_pending);
-		vq->async_pkts_pending = NULL;
-	}
-
-	if (vq->async_pkts_info) {
-		rte_free(vq->async_pkts_info);
-		vq->async_pkts_info = NULL;
-	}
+	vhost_free_async_mem(vq);
 
 	vq->async_ops.transfer_data = NULL;
 	vq->async_ops.check_completed_copies = NULL;
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 08bfc41b3..7d1a8d2b7 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -198,8 +198,8 @@ struct vhost_virtqueue {
 	/* operation callbacks for async dma */
 	struct rte_vhost_async_channel_ops	async_ops;
 
-	struct rte_vhost_iov_iter it_pool[VHOST_MAX_ASYNC_IT];
-	struct iovec vec_pool[VHOST_MAX_ASYNC_VEC];
+	struct rte_vhost_iov_iter *it_pool;
+	struct iovec *vec_pool;
 
 	/* async data transfer status */
 	uintptr_t	**async_pkts_pending;
-- 
2.18.4


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [dpdk-dev] [PATCH v4 3/4] vhost: fix async vector buffer overrun
  2020-10-13  1:45 ` [dpdk-dev] [PATCH v4 0/4] optimize async data path Patrick Fu
  2020-10-13  1:45   ` [dpdk-dev] [PATCH v4 1/4] vhost: simplify async copy completion Patrick Fu
  2020-10-13  1:45   ` [dpdk-dev] [PATCH v4 2/4] vhost: dynamically allocate async memory Patrick Fu
@ 2020-10-13  1:45   ` Patrick Fu
  2020-10-14  9:33     ` Maxime Coquelin
  2020-10-13  1:45   ` [dpdk-dev] [PATCH v4 4/4] vhost: fix async unregister deadlock Patrick Fu
  2020-10-15 15:40   ` [dpdk-dev] [PATCH v4 0/4] optimize async data path Maxime Coquelin
  4 siblings, 1 reply; 36+ messages in thread
From: Patrick Fu @ 2020-10-13  1:45 UTC (permalink / raw)
  To: dev, maxime.coquelin, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang, patrick.fu

Add a check on the async vector buffer usage to prevent buffer overrun.
If the unused vector buffer is not sufficient to prepare the next
packet's iov creation, an async transfer will be triggered immediately
to free the vector buffer.

Fixes: 78639d54563a ("vhost: introduce async enqueue registration API")

Signed-off-by: Patrick Fu <patrick.fu@intel.com>
---
 lib/librte_vhost/vhost.h      |  2 +-
 lib/librte_vhost/virtio_net.c | 13 ++++++++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 7d1a8d2b7..75d79f80a 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -47,7 +47,7 @@
 #define MAX_PKT_BURST 32
 
 #define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST * 2)
-#define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 2)
+#define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 4)
 
 #define PACKED_DESC_ENQUEUE_USED_FLAG(w)	\
 	((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 19ac92c85..74214e81a 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1493,6 +1493,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
 	uint16_t n_free_slot, slot_idx;
 	uint16_t pkt_err = 0;
+	uint16_t segs_await = 0;
 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
 	int n_pkts = 0;
 
@@ -1541,6 +1542,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 			dst_iovec += dst_it->nr_segs;
 			src_it += 2;
 			dst_it += 2;
+			segs_await += src_it->nr_segs;
 		} else {
 			pkts_info[slot_idx].info = num_buffers;
 			vq->async_pkts_inflight_n++;
@@ -1548,14 +1550,23 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 
 		vq->last_avail_idx += num_buffers;
 
+		/*
+		 * conditions to trigger async device transfer:
+		 * - buffered packet number reaches transfer threshold
+		 * - this is the last packet in the burst enqueue
+		 * - unused async iov number is less than max vhost vector
+		 */
 		if (pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
-				(pkt_idx == count - 1 && pkt_burst_idx)) {
+			(pkt_idx == count - 1 && pkt_burst_idx) ||
+			(VHOST_MAX_ASYNC_VEC / 2 - segs_await <
+			BUF_VECTOR_MAX)) {
 			n_pkts = vq->async_ops.transfer_data(dev->vid,
 					queue_id, tdes, 0, pkt_burst_idx);
 			src_iovec = vec_pool;
 			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
 			src_it = it_pool;
 			dst_it = it_pool + 1;
+			segs_await = 0;
 			vq->async_pkts_inflight_n += n_pkts;
 
 			if (unlikely(n_pkts < (int)pkt_burst_idx)) {
-- 
2.18.4


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [dpdk-dev] [PATCH v4 4/4] vhost: fix async unregister deadlock
  2020-10-13  1:45 ` [dpdk-dev] [PATCH v4 0/4] optimize async data path Patrick Fu
                     ` (2 preceding siblings ...)
  2020-10-13  1:45   ` [dpdk-dev] [PATCH v4 3/4] vhost: fix async vector buffer overrun Patrick Fu
@ 2020-10-13  1:45   ` Patrick Fu
  2020-10-14  9:34     ` Maxime Coquelin
  2020-10-15 15:40   ` [dpdk-dev] [PATCH v4 0/4] optimize async data path Maxime Coquelin
  4 siblings, 1 reply; 36+ messages in thread
From: Patrick Fu @ 2020-10-13  1:45 UTC (permalink / raw)
  To: dev, maxime.coquelin, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang, patrick.fu

When the async unregister function is invoked from certain vhost event
callbacks (e.g. vring state change), a deadlock may occur due to
recursive spinlock acquisition. This patch uses the trylock() primitive
in the unregister API to avoid the deadlock.

Fixes: 78639d54563a ("vhost: introduce async enqueue registration API")

Signed-off-by: Patrick Fu <patrick.fu@intel.com>
---
 lib/librte_vhost/vhost.c      | 9 +++++++--
 lib/librte_vhost/vhost_user.c | 4 ++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 323565898..6068c38ec 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -1633,10 +1633,15 @@ int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
 		return ret;
 
 	ret = 0;
-	rte_spinlock_lock(&vq->access_lock);
 
 	if (!vq->async_registered)
-		goto out;
+		return ret;
+
+	if (!rte_spinlock_trylock(&vq->access_lock)) {
+		VHOST_LOG_CONFIG(ERR, "Failed to unregister async channel. "
+			"virt queue busy.\n");
+		return -1;
+	}
 
 	if (vq->async_pkts_inflight_n) {
 		VHOST_LOG_CONFIG(ERR, "Failed to unregister async channel. "
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index 1656ec736..d20c8c57a 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -1980,9 +1980,9 @@ vhost_user_set_vring_enable(struct virtio_net **pdev,
 		"set queue enable: %d to qp idx: %d\n",
 		enable, index);
 
-	if (!enable && dev->virtqueue[index]->async_registered) {
+	if (enable && dev->virtqueue[index]->async_registered) {
 		if (dev->virtqueue[index]->async_pkts_inflight_n) {
-			VHOST_LOG_CONFIG(ERR, "failed to disable vring. "
+			VHOST_LOG_CONFIG(ERR, "failed to enable vring. "
 			"async inflight packets must be completed first\n");
 			return RTE_VHOST_MSG_RESULT_ERR;
 		}
-- 
2.18.4


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [dpdk-dev] [PATCH v4 1/4] vhost: simplify async copy completion
  2020-10-13  1:45   ` [dpdk-dev] [PATCH v4 1/4] vhost: simplify async copy completion Patrick Fu
@ 2020-10-14  9:28     ` Maxime Coquelin
  0 siblings, 0 replies; 36+ messages in thread
From: Maxime Coquelin @ 2020-10-14  9:28 UTC (permalink / raw)
  To: Patrick Fu, dev, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang



On 10/13/20 3:45 AM, Patrick Fu wrote:
> Current async ops allows check_completed_copies() callback to return
> arbitrary number of async iov segments finished from backend async
> devices. This design creates complexity for vhost to handle breaking
> transfer of a single packet (i.e. transfer completes in the middle
> of a async descriptor) and prevents application callbacks from
> leveraging hardware capability to offload the work. Thus, this patch
> enforces the check_completed_copies() callback to return the number
> of async memory descriptors, which is aligned with async transfer
> data ops callbacks. vHost async data path are revised to work with
> new ops define, which provides a clean and simplified processing.
> 
> Signed-off-by: Patrick Fu <patrick.fu@intel.com>
> ---
>  lib/librte_vhost/rte_vhost_async.h |  15 ++-
>  lib/librte_vhost/vhost.c           |  20 ++--
>  lib/librte_vhost/vhost.h           |   8 +-
>  lib/librte_vhost/vhost_user.c      |   6 +-
>  lib/librte_vhost/virtio_net.c      | 149 ++++++++++++-----------------
>  5 files changed, 88 insertions(+), 110 deletions(-)
> 

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [dpdk-dev] [PATCH v4 2/4] vhost: dynamically allocate async memory
  2020-10-13  1:45   ` [dpdk-dev] [PATCH v4 2/4] vhost: dynamically allocate async memory Patrick Fu
@ 2020-10-14  9:30     ` Maxime Coquelin
  0 siblings, 0 replies; 36+ messages in thread
From: Maxime Coquelin @ 2020-10-14  9:30 UTC (permalink / raw)
  To: Patrick Fu, dev, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang



On 10/13/20 3:45 AM, Patrick Fu wrote:
> Allocate async internal memory buffer by rte_malloc(), replacing array
> declaration inside vq structure. Dynamic allocation can help to save
> memory footprint when async path is not registered.
> 
> Signed-off-by: Patrick Fu <patrick.fu@intel.com>
> ---
>  lib/librte_vhost/vhost.c | 69 ++++++++++++++++++++++++++--------------
>  lib/librte_vhost/vhost.h |  4 +--
>  2 files changed, 47 insertions(+), 26 deletions(-)
> 

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [dpdk-dev] [PATCH v4 3/4] vhost: fix async vector buffer overrun
  2020-10-13  1:45   ` [dpdk-dev] [PATCH v4 3/4] vhost: fix async vector buffer overrun Patrick Fu
@ 2020-10-14  9:33     ` Maxime Coquelin
  0 siblings, 0 replies; 36+ messages in thread
From: Maxime Coquelin @ 2020-10-14  9:33 UTC (permalink / raw)
  To: Patrick Fu, dev, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang



On 10/13/20 3:45 AM, Patrick Fu wrote:
> Add check on the async vector buffer usage to prevent the buf overrun.
> If the unused vector buffer is not sufficient to prepare for next
> packet's iov creation, an async transfer will be triggered immediately
> to free the vector buffer.
> 
> Fixes: 78639d54563a ("vhost: introduce async enqueue registration API")
> 
> Signed-off-by: Patrick Fu <patrick.fu@intel.com>
> ---
>  lib/librte_vhost/vhost.h      |  2 +-
>  lib/librte_vhost/virtio_net.c | 13 ++++++++++++-
>  2 files changed, 13 insertions(+), 2 deletions(-)
> 

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [dpdk-dev] [PATCH v4 4/4] vhost: fix async unregister deadlock
  2020-10-13  1:45   ` [dpdk-dev] [PATCH v4 4/4] vhost: fix async unregister deadlock Patrick Fu
@ 2020-10-14  9:34     ` Maxime Coquelin
  0 siblings, 0 replies; 36+ messages in thread
From: Maxime Coquelin @ 2020-10-14  9:34 UTC (permalink / raw)
  To: Patrick Fu, dev, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang



On 10/13/20 3:45 AM, Patrick Fu wrote:
> When async unregister function is invoked in certain vhost event
> callbacks (e.g. vring state change), deadlock may occur due to
> recursive spinlock acquire. This patch uses trylock() primitive in
> the unregister API to avoid deadlock.
> 
> Fixes: 78639d54563a ("vhost: introduce async enqueue registration API")
> 
> Signed-off-by: Patrick Fu <patrick.fu@intel.com>
> ---
>  lib/librte_vhost/vhost.c      | 9 +++++++--
>  lib/librte_vhost/vhost_user.c | 4 ++--
>  2 files changed, 9 insertions(+), 4 deletions(-)
> 
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index 323565898..6068c38ec 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -1633,10 +1633,15 @@ int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
>  		return ret;
>  
>  	ret = 0;
> -	rte_spinlock_lock(&vq->access_lock);
>  
>  	if (!vq->async_registered)
> -		goto out;
> +		return ret;
> +
> +	if (!rte_spinlock_trylock(&vq->access_lock)) {
> +		VHOST_LOG_CONFIG(ERR, "Failed to unregister async channel. "
> +			"virt queue busy.\n");
> +		return -1;
> +	}
>  
>  	if (vq->async_pkts_inflight_n) {
>  		VHOST_LOG_CONFIG(ERR, "Failed to unregister async channel. "
> diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
> index 1656ec736..d20c8c57a 100644
> --- a/lib/librte_vhost/vhost_user.c
> +++ b/lib/librte_vhost/vhost_user.c
> @@ -1980,9 +1980,9 @@ vhost_user_set_vring_enable(struct virtio_net **pdev,
>  		"set queue enable: %d to qp idx: %d\n",
>  		enable, index);
>  
> -	if (!enable && dev->virtqueue[index]->async_registered) {
> +	if (enable && dev->virtqueue[index]->async_registered) {
>  		if (dev->virtqueue[index]->async_pkts_inflight_n) {
> -			VHOST_LOG_CONFIG(ERR, "failed to disable vring. "
> +			VHOST_LOG_CONFIG(ERR, "failed to enable vring. "
>  			"async inflight packets must be completed first\n");
>  			return RTE_VHOST_MSG_RESULT_ERR;
>  		}
> 

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [dpdk-dev] [PATCH v4 0/4] optimize async data path
  2020-10-13  1:45 ` [dpdk-dev] [PATCH v4 0/4] optimize async data path Patrick Fu
                     ` (3 preceding siblings ...)
  2020-10-13  1:45   ` [dpdk-dev] [PATCH v4 4/4] vhost: fix async unregister deadlock Patrick Fu
@ 2020-10-15 15:40   ` Maxime Coquelin
  4 siblings, 0 replies; 36+ messages in thread
From: Maxime Coquelin @ 2020-10-15 15:40 UTC (permalink / raw)
  To: Patrick Fu, dev, chenbo.xia; +Cc: zhihong.wang, cheng1.jiang



On 10/13/20 3:45 AM, Patrick Fu wrote:
> This series applies optimization and fixes to the vhost
> async data path.
> 
> v4:
>  - fix transfer error handling in async submit function (patch 1/4)
>  - add spinlock in async register function (patch 4/4)
>  - no changes in patch 2/4 & 3/4
> 
> v3:
>  - fix a typo in vhost error log (checkpatch warning)
>  - fix travis-robot ci build warning on aarch64
> 
> v2:
>  - minor rewordings on commit message
>  - minor fix on poll_enqueue_completion to correct a packet
>    number calculation issue
>  - allocate async buffer memory on the same NUMA node as the vq
>  - add some comments in data path to improve readability
> 
> Patrick Fu (4):
>   vhost: simplify async copy completion
>   vhost: dynamically allocate async memory
>   vhost: fix async vector buffer overrun
>   vhost: fix async unregister deadlock
> 
>  lib/librte_vhost/rte_vhost_async.h |  15 ++-
>  lib/librte_vhost/vhost.c           |  80 +++++++++-----
>  lib/librte_vhost/vhost.h           |  14 +--
>  lib/librte_vhost/vhost_user.c      |  10 +-
>  lib/librte_vhost/virtio_net.c      | 162 +++++++++++++----------------
>  5 files changed, 148 insertions(+), 133 deletions(-)
> 

Applied to dpdk-next-virtio/main

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2020-10-15 15:40 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-11  1:53 [dpdk-dev] [PATCH v1 0/4] optimize async data path Patrick Fu
2020-09-11  1:53 ` [dpdk-dev] [PATCH v1 1/4] vhost: simplify async copy completion Patrick Fu
2020-09-23  9:07   ` Maxime Coquelin
2020-09-11  1:53 ` [dpdk-dev] [PATCH v1 2/4] vhost: dynamically alloc async memory Patrick Fu
2020-09-23  9:15   ` Maxime Coquelin
2020-09-29  5:55     ` Fu, Patrick
2020-09-11  1:53 ` [dpdk-dev] [PATCH v1 3/4] vhost: fix async vec buf overrun Patrick Fu
2020-09-23  9:21   ` Maxime Coquelin
2020-09-29  2:23     ` Fu, Patrick
2020-09-11  1:53 ` [dpdk-dev] [PATCH v1 4/4] vhost: fix async register/unregister deadlock Patrick Fu
2020-09-29  6:29 ` [dpdk-dev] [PATCH v2 0/4] optimize async data path Patrick Fu
2020-09-29  6:29   ` [dpdk-dev] [PATCH v2 1/4] vhost: simplify async copy completion Patrick Fu
2020-09-29  6:29   ` [dpdk-dev] [PATCH v2 2/4] vhost: dynamically allocate async memory Patrick Fu
2020-09-29  6:29   ` [dpdk-dev] [PATCH v2 3/4] vhost: fix async vector buffer overrun Patrick Fu
2020-09-29  6:29   ` [dpdk-dev] [PATCH v2 4/4] vhost: fix async register/unregister deadlock Patrick Fu
2020-09-29  9:29 ` [dpdk-dev] [PATCH v3 0/4] optimize async data path Patrick Fu
2020-09-29  9:29   ` [dpdk-dev] [PATCH v3 1/4] vhost: simplify async copy completion Patrick Fu
2020-10-05 13:46     ` Maxime Coquelin
2020-10-09 11:16       ` Fu, Patrick
2020-09-29  9:29   ` [dpdk-dev] [PATCH v3 2/4] vhost: dynamically allocate async memory Patrick Fu
2020-10-05 13:50     ` Maxime Coquelin
2020-09-29  9:29   ` [dpdk-dev] [PATCH v3 3/4] vhost: fix async vector buffer overrun Patrick Fu
2020-10-05 14:19     ` Maxime Coquelin
2020-09-29  9:29   ` [dpdk-dev] [PATCH v3 4/4] vhost: fix async register/unregister deadlock Patrick Fu
2020-10-05 14:25     ` Maxime Coquelin
2020-10-09 10:54       ` Fu, Patrick
2020-10-13  1:45 ` [dpdk-dev] [PATCH v4 0/4] optimize async data path Patrick Fu
2020-10-13  1:45   ` [dpdk-dev] [PATCH v4 1/4] vhost: simplify async copy completion Patrick Fu
2020-10-14  9:28     ` Maxime Coquelin
2020-10-13  1:45   ` [dpdk-dev] [PATCH v4 2/4] vhost: dynamically allocate async memory Patrick Fu
2020-10-14  9:30     ` Maxime Coquelin
2020-10-13  1:45   ` [dpdk-dev] [PATCH v4 3/4] vhost: fix async vector buffer overrun Patrick Fu
2020-10-14  9:33     ` Maxime Coquelin
2020-10-13  1:45   ` [dpdk-dev] [PATCH v4 4/4] vhost: fix async unregister deadlock Patrick Fu
2020-10-14  9:34     ` Maxime Coquelin
2020-10-15 15:40   ` [dpdk-dev] [PATCH v4 0/4] optimize async data path Maxime Coquelin

DPDK patches and discussions

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://inbox.dpdk.org/dev/0 dev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 dev dev/ https://inbox.dpdk.org/dev \
		dev@dpdk.org
	public-inbox-index dev

Example config snippet for mirrors.
Newsgroup available over NNTP:
	nntp://inbox.dpdk.org/inbox.dpdk.dev


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git