DPDK patches and discussions
 help / color / Atom feed
* Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast enqueue function
       [not found]   ` <CGME20190708113801eucas1p25d89717d8b298790326077852c9933c8@eucas1p2.samsung.com>
@ 2019-07-08 11:37     ` Ilya Maximets
  2019-07-09  1:15       ` Liu, Yong
  0 siblings, 1 reply; 25+ messages in thread
From: Ilya Maximets @ 2019-07-08 11:37 UTC (permalink / raw)
  To: Marvin Liu, tiwei.bie, maxime.coquelin, dev

On 08.07.2019 20:13, Marvin Liu wrote:
> In fast enqueue function, will first check whether descriptors are
> cache aligned. Fast enqueue function will check prerequisites in the
> beginning. Fast enqueue function do not support chained mbufs, normal
> function will handle that.
> 
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
> 
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 884befa85..f24026acd 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -39,6 +39,8 @@
>  
>  #define VHOST_LOG_CACHE_NR 32
>  
> +/* Used in fast packed ring functions */
> +#define PACKED_DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_packed_desc))
>  /**
>   * Structure contains buffer address, length and descriptor index
>   * from vring to do scatter RX.
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 003aec1d4..b877510da 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -897,6 +897,115 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	return pkt_idx;
>  }
>  
> +static __rte_always_inline uint16_t
> +virtio_dev_rx_fast_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +		struct rte_mbuf **pkts)
> +{
> +	bool wrap_counter = vq->avail_wrap_counter;
> +	struct vring_packed_desc *descs = vq->desc_packed;
> +	uint16_t avail_idx = vq->last_avail_idx;
> +	uint64_t desc_addr, desc_addr1, desc_addr2, desc_addr3, len, len1,
> +		len2, len3;
> +	struct virtio_net_hdr_mrg_rxbuf *hdr, *hdr1, *hdr2, *hdr3;
> +	uint32_t buf_offset = dev->vhost_hlen;
> +
> +	if (unlikely(avail_idx & 0x3))
> +		return -1;
> +
> +	if (unlikely(avail_idx < (vq->size - PACKED_DESC_PER_CACHELINE)))

Doe it makes sense to check this?
If this condition is not 'true', all the code below will access incorrect memory.

> +		rte_prefetch0((void *)(uintptr_t)&descs[avail_idx +
> +			PACKED_DESC_PER_CACHELINE]);
> +	else
> +		rte_prefetch0((void *)(uintptr_t)&descs[0]);
> +
> +	if (unlikely((pkts[0]->next != NULL) |
> +		(pkts[1]->next != NULL) |
> +		(pkts[2]->next != NULL) |
> +		(pkts[3]->next != NULL)))
> +		return -1;
> +
> +	if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)) |
> +		unlikely(!desc_is_avail(&descs[avail_idx + 1], wrap_counter)) |
> +		unlikely(!desc_is_avail(&descs[avail_idx + 2], wrap_counter)) |
> +		unlikely(!desc_is_avail(&descs[avail_idx + 3], wrap_counter)))
> +		return 1;
> +
> +	rte_smp_rmb();
> +
> +	len = descs[avail_idx].len;
> +	len1 = descs[avail_idx + 1].len;
> +	len2 = descs[avail_idx + 2].len;
> +	len3 = descs[avail_idx + 3].len;
> +
> +	if (unlikely((pkts[0]->pkt_len > (len - buf_offset)) |
> +		(pkts[1]->pkt_len > (len1 - buf_offset)) |
> +		(pkts[2]->pkt_len > (len2 - buf_offset)) |
> +		(pkts[3]->pkt_len > (len3 - buf_offset))))
> +		return -1;
> +
> +	desc_addr = vhost_iova_to_vva(dev, vq,
> +			descs[avail_idx].addr,
> +			&len,
> +			VHOST_ACCESS_RW);
> +
> +	desc_addr1 = vhost_iova_to_vva(dev, vq,
> +			descs[avail_idx + 1].addr,
> +			&len1,
> +			VHOST_ACCESS_RW);
> +
> +	desc_addr2 = vhost_iova_to_vva(dev, vq,
> +			descs[avail_idx + 2].addr,
> +			&len2,
> +			VHOST_ACCESS_RW);
> +
> +	desc_addr3 = vhost_iova_to_vva(dev, vq,
> +			descs[avail_idx + 3].addr,
> +			&len3,
> +			VHOST_ACCESS_RW);
> +
> +	if (unlikely((len != descs[avail_idx].len) |
> +		(len1 != descs[avail_idx + 1].len) |
> +		(len2 != descs[avail_idx + 2].len) |
> +		(len3 != descs[avail_idx + 3].len)))
> +		return -1;
> +
> +	hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
> +	hdr1 = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr1;
> +	hdr2 = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr2;
> +	hdr3 = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr3;
> +
> +	virtio_enqueue_offload(pkts[0], &hdr->hdr);
> +	virtio_enqueue_offload(pkts[1], &hdr1->hdr);
> +	virtio_enqueue_offload(pkts[2], &hdr2->hdr);
> +	virtio_enqueue_offload(pkts[3], &hdr3->hdr);
> +
> +	len = pkts[0]->pkt_len + dev->vhost_hlen;
> +	len1 = pkts[1]->pkt_len + dev->vhost_hlen;
> +	len2 = pkts[2]->pkt_len + dev->vhost_hlen;
> +	len3 = pkts[3]->pkt_len + dev->vhost_hlen;
> +
> +	vq->last_avail_idx += PACKED_DESC_PER_CACHELINE;

The whole function assumes that PACKED_DESC_PER_CACHELINE equals to 4,
but if it's not, it will not work correctly.

> +	if (vq->last_avail_idx >= vq->size) {
> +		vq->last_avail_idx -= vq->size;
> +		vq->avail_wrap_counter ^= 1;
> +	}
> +
> +	rte_memcpy((void *)(uintptr_t)(desc_addr + buf_offset),
> +		rte_pktmbuf_mtod_offset(pkts[0], void *, 0),
> +		pkts[0]->pkt_len);
> +	rte_memcpy((void *)(uintptr_t)(desc_addr1 + buf_offset),
> +		rte_pktmbuf_mtod_offset(pkts[1], void *, 0),
> +		pkts[1]->pkt_len);
> +	rte_memcpy((void *)(uintptr_t)(desc_addr2 + buf_offset),
> +		rte_pktmbuf_mtod_offset(pkts[2], void *, 0),
> +		pkts[2]->pkt_len);
> +	rte_memcpy((void *)(uintptr_t)(desc_addr3 + buf_offset),
> +		rte_pktmbuf_mtod_offset(pkts[3], void *, 0),
> +		pkts[3]->pkt_len);
> +
> +	return 0;
> +}
> +
>  static __rte_always_inline uint16_t
>  virtio_dev_rx_normal_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  			struct rte_mbuf *pkt)
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [RFC] vhost packed ring performance optimization
@ 2019-07-08 17:13 Marvin Liu
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 01/13] add vhost normal enqueue function Marvin Liu
                   ` (12 more replies)
  0 siblings, 13 replies; 25+ messages in thread
From: Marvin Liu @ 2019-07-08 17:13 UTC (permalink / raw)
  To: tiwei.bie, maxime.coquelin, dev; +Cc: Marvin Liu

Packed ring has more compact ring format and thus can significantly
reduced the number of cache miss. It can lead to better performance.
This has been approved in virtio user driver, on normal E5 Xeon cpu
single core performance can raise 12%.

http://mails.dpdk.org/archives/dev/2018-April/095470.html

However vhost performance with packed ring performance was decreased.
Through analysis, mostly extra cost was from the calculating of each
descriptor flag which depended on ring wrap counter. Moreover, both
frontend and backend need to write same descriptors which will cause
cache contention. Especially when doing vhost enqueue function, virtio
refill packed ring function may write same cache line when vhost doing
enqueue function. This kind of extra cache cost will neutralize the
benefit of reducing cache misses. 

For optimizing vhost packed ring performance, vhost enqueue and dequeue
function will be divided into fast and normal parts.

Several methods will be taken in fast path:
	Uroll burst loop function into more pieces.
	Handle descriptors in one cache line simultaneously.
	Prerequisite check that whether I/O space can copy directly
	into mbuf space and vice versa.
	Prerequisite check that whether descriptor mapping is
	successful.
	Distinguish vhost descriptor update function by enqueue and
	dequeue function.
	Buffer dequeue used descriptors as many as possible.
	Update enqueue used descriptors by cache line.
	Cached memory region structure for fast conversion.
	Defined macros for pre-calculating packed descriptors flag.

Indirect and merged packets will be handled in normal path, as most-likely
they are large packets and most of costs are in memory copy.

After all these methods done, single core vhost PvP performance with 64B
packet on Xeon 8180 can boost 35%, loopback performance measured by virtio
user pmd can boost over 45%.

Marvin Liu (13):
  add vhost normal enqueue function
  add vhost packed ring fast enqueue function
  add vhost packed ring normal dequeue function
  add vhost packed ring fast dequeue function
  add enqueue shadow used descs update and flush functions
  add vhost fast enqueue flush function
  add vhost dequeue shadow descs update function
  add vhost fast dequeue flush function
  replace vhost enqueue packed ring function
  add vhost fast zero copy dequeue packed ring function
  replace vhost dequeue packed ring function
  support inorder in vhost dequeue path
  remove useless vhost functions

 lib/librte_vhost/vhost.h      |   16 +
 lib/librte_vhost/virtio_net.c | 1048 +++++++++++++++++++++++++++------
 2 files changed, 883 insertions(+), 181 deletions(-)

-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [RFC PATCH 01/13] add vhost normal enqueue function
  2019-07-08 17:13 [dpdk-dev] [RFC] vhost packed ring performance optimization Marvin Liu
@ 2019-07-08 17:13 ` Marvin Liu
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast " Marvin Liu
                   ` (11 subsequent siblings)
  12 siblings, 0 replies; 25+ messages in thread
From: Marvin Liu @ 2019-07-08 17:13 UTC (permalink / raw)
  To: tiwei.bie, maxime.coquelin, dev; +Cc: Marvin Liu

Rewrite vhost enqueue function and meanwhile left space for later flush
enqeueue shadow used descriptors changes.

Signed-off-by: Marvin Liu <yong.liu@intel.com>

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 5b85b832d..003aec1d4 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -774,6 +774,72 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	return error;
 }
 
+/*
+ * Returns -1 on fail, 0 on success
+ */
+static inline int
+vhost_enqueue_normal_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
+	struct rte_mbuf *pkt, struct buf_vector *buf_vec,
+	uint16_t *nr_descs)
+{
+	uint16_t nr_vec = 0;
+
+	uint16_t avail_idx;
+	uint16_t max_tries, tries = 0;
+
+	uint16_t buf_id = 0;
+	uint32_t len = 0;
+	uint16_t desc_count;
+
+	uint32_t size = pkt->pkt_len + dev->vhost_hlen;
+	avail_idx = vq->last_avail_idx;
+
+	if (rxvq_is_mergeable(dev))
+		max_tries = vq->size - 1;
+	else
+		max_tries = 1;
+
+	uint16_t num_buffers = 0;
+
+	while (size > 0) {
+		/*
+		 * if we tried all available ring items, and still
+		 * can't get enough buf, it means something abnormal
+		 * happened.
+		 */
+		if (unlikely(++tries > max_tries))
+			return -1;
+
+		if (unlikely(fill_vec_buf_packed(dev, vq,
+						avail_idx, &desc_count,
+						buf_vec, &nr_vec,
+						&buf_id, &len,
+						VHOST_ACCESS_RW) < 0)) {
+			return -1;
+		}
+
+		len = RTE_MIN(len, size);
+
+		size -= len;
+
+		avail_idx += desc_count;
+		if (avail_idx >= vq->size)
+			avail_idx -= vq->size;
+
+		*nr_descs += desc_count;
+		num_buffers += 1;
+	}
+
+	if (copy_mbuf_to_desc(dev, vq, pkt,
+					buf_vec, nr_vec,
+					num_buffers) < 0) {
+		return 0;
+	}
+
+	return 0;
+}
+
+
 static __rte_noinline uint32_t
 virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	struct rte_mbuf **pkts, uint32_t count)
@@ -831,6 +897,35 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	return pkt_idx;
 }
 
+static __rte_always_inline uint16_t
+virtio_dev_rx_normal_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
+			struct rte_mbuf *pkt)
+{
+	struct buf_vector buf_vec[BUF_VECTOR_MAX];
+	uint16_t nr_descs = 0;
+
+	rte_smp_rmb();
+	if (unlikely(vhost_enqueue_normal_packed(dev, vq,
+					pkt, buf_vec, &nr_descs) < 0)) {
+		VHOST_LOG_DEBUG(VHOST_DATA,
+			"(%d) failed to get enough desc from vring\n",
+			dev->vid);
+		return 0;
+	}
+
+	VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
+		dev->vid, vq->last_avail_idx,
+		vq->last_avail_idx + nr_descs);
+
+	vq->last_avail_idx += nr_descs;
+	if (vq->last_avail_idx >= vq->size) {
+		vq->last_avail_idx -= vq->size;
+		vq->avail_wrap_counter ^= 1;
+	}
+
+	return 1;
+}
+
 static __rte_noinline uint32_t
 virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	struct rte_mbuf **pkts, uint32_t count)
-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast enqueue function
  2019-07-08 17:13 [dpdk-dev] [RFC] vhost packed ring performance optimization Marvin Liu
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 01/13] add vhost normal enqueue function Marvin Liu
@ 2019-07-08 17:13 ` " Marvin Liu
       [not found]   ` <CGME20190708113801eucas1p25d89717d8b298790326077852c9933c8@eucas1p2.samsung.com>
                     ` (2 more replies)
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 03/13] add vhost packed ring normal dequeue function Marvin Liu
                   ` (10 subsequent siblings)
  12 siblings, 3 replies; 25+ messages in thread
From: Marvin Liu @ 2019-07-08 17:13 UTC (permalink / raw)
  To: tiwei.bie, maxime.coquelin, dev; +Cc: Marvin Liu

In fast enqueue function, will first check whether descriptors are
cache aligned. Fast enqueue function will check prerequisites in the
beginning. Fast enqueue function do not support chained mbufs, normal
function will handle that.

Signed-off-by: Marvin Liu <yong.liu@intel.com>

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 884befa85..f24026acd 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -39,6 +39,8 @@
 
 #define VHOST_LOG_CACHE_NR 32
 
+/* Used in fast packed ring functions */
+#define PACKED_DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_packed_desc))
 /**
  * Structure contains buffer address, length and descriptor index
  * from vring to do scatter RX.
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 003aec1d4..b877510da 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -897,6 +897,115 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	return pkt_idx;
 }
 
+static __rte_always_inline uint16_t
+virtio_dev_rx_fast_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		struct rte_mbuf **pkts)
+{
+	bool wrap_counter = vq->avail_wrap_counter;
+	struct vring_packed_desc *descs = vq->desc_packed;
+	uint16_t avail_idx = vq->last_avail_idx;
+	uint64_t desc_addr, desc_addr1, desc_addr2, desc_addr3, len, len1,
+		len2, len3;
+	struct virtio_net_hdr_mrg_rxbuf *hdr, *hdr1, *hdr2, *hdr3;
+	uint32_t buf_offset = dev->vhost_hlen;
+
+	if (unlikely(avail_idx & 0x3))
+		return -1;
+
+	if (unlikely(avail_idx < (vq->size - PACKED_DESC_PER_CACHELINE)))
+		rte_prefetch0((void *)(uintptr_t)&descs[avail_idx +
+			PACKED_DESC_PER_CACHELINE]);
+	else
+		rte_prefetch0((void *)(uintptr_t)&descs[0]);
+
+	if (unlikely((pkts[0]->next != NULL) |
+		(pkts[1]->next != NULL) |
+		(pkts[2]->next != NULL) |
+		(pkts[3]->next != NULL)))
+		return -1;
+
+	if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)) |
+		unlikely(!desc_is_avail(&descs[avail_idx + 1], wrap_counter)) |
+		unlikely(!desc_is_avail(&descs[avail_idx + 2], wrap_counter)) |
+		unlikely(!desc_is_avail(&descs[avail_idx + 3], wrap_counter)))
+		return 1;
+
+	rte_smp_rmb();
+
+	len = descs[avail_idx].len;
+	len1 = descs[avail_idx + 1].len;
+	len2 = descs[avail_idx + 2].len;
+	len3 = descs[avail_idx + 3].len;
+
+	if (unlikely((pkts[0]->pkt_len > (len - buf_offset)) |
+		(pkts[1]->pkt_len > (len1 - buf_offset)) |
+		(pkts[2]->pkt_len > (len2 - buf_offset)) |
+		(pkts[3]->pkt_len > (len3 - buf_offset))))
+		return -1;
+
+	desc_addr = vhost_iova_to_vva(dev, vq,
+			descs[avail_idx].addr,
+			&len,
+			VHOST_ACCESS_RW);
+
+	desc_addr1 = vhost_iova_to_vva(dev, vq,
+			descs[avail_idx + 1].addr,
+			&len1,
+			VHOST_ACCESS_RW);
+
+	desc_addr2 = vhost_iova_to_vva(dev, vq,
+			descs[avail_idx + 2].addr,
+			&len2,
+			VHOST_ACCESS_RW);
+
+	desc_addr3 = vhost_iova_to_vva(dev, vq,
+			descs[avail_idx + 3].addr,
+			&len3,
+			VHOST_ACCESS_RW);
+
+	if (unlikely((len != descs[avail_idx].len) |
+		(len1 != descs[avail_idx + 1].len) |
+		(len2 != descs[avail_idx + 2].len) |
+		(len3 != descs[avail_idx + 3].len)))
+		return -1;
+
+	hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
+	hdr1 = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr1;
+	hdr2 = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr2;
+	hdr3 = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr3;
+
+	virtio_enqueue_offload(pkts[0], &hdr->hdr);
+	virtio_enqueue_offload(pkts[1], &hdr1->hdr);
+	virtio_enqueue_offload(pkts[2], &hdr2->hdr);
+	virtio_enqueue_offload(pkts[3], &hdr3->hdr);
+
+	len = pkts[0]->pkt_len + dev->vhost_hlen;
+	len1 = pkts[1]->pkt_len + dev->vhost_hlen;
+	len2 = pkts[2]->pkt_len + dev->vhost_hlen;
+	len3 = pkts[3]->pkt_len + dev->vhost_hlen;
+
+	vq->last_avail_idx += PACKED_DESC_PER_CACHELINE;
+	if (vq->last_avail_idx >= vq->size) {
+		vq->last_avail_idx -= vq->size;
+		vq->avail_wrap_counter ^= 1;
+	}
+
+	rte_memcpy((void *)(uintptr_t)(desc_addr + buf_offset),
+		rte_pktmbuf_mtod_offset(pkts[0], void *, 0),
+		pkts[0]->pkt_len);
+	rte_memcpy((void *)(uintptr_t)(desc_addr1 + buf_offset),
+		rte_pktmbuf_mtod_offset(pkts[1], void *, 0),
+		pkts[1]->pkt_len);
+	rte_memcpy((void *)(uintptr_t)(desc_addr2 + buf_offset),
+		rte_pktmbuf_mtod_offset(pkts[2], void *, 0),
+		pkts[2]->pkt_len);
+	rte_memcpy((void *)(uintptr_t)(desc_addr3 + buf_offset),
+		rte_pktmbuf_mtod_offset(pkts[3], void *, 0),
+		pkts[3]->pkt_len);
+
+	return 0;
+}
+
 static __rte_always_inline uint16_t
 virtio_dev_rx_normal_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			struct rte_mbuf *pkt)
-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [RFC PATCH 03/13] add vhost packed ring normal dequeue function
  2019-07-08 17:13 [dpdk-dev] [RFC] vhost packed ring performance optimization Marvin Liu
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 01/13] add vhost normal enqueue function Marvin Liu
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast " Marvin Liu
@ 2019-07-08 17:13 ` Marvin Liu
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 04/13] add vhost packed ring fast " Marvin Liu
                   ` (9 subsequent siblings)
  12 siblings, 0 replies; 25+ messages in thread
From: Marvin Liu @ 2019-07-08 17:13 UTC (permalink / raw)
  To: tiwei.bie, maxime.coquelin, dev; +Cc: Marvin Liu

Rewrite vhost dequeue function which without used ring update.

Signed-off-by: Marvin Liu <yong.liu@intel.com>

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index b877510da..410837122 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1613,6 +1613,62 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	return i;
 }
 
+static __rte_always_inline int
+vhost_dequeue_normal_packed(struct virtio_net *dev,
+		struct vhost_virtqueue *vq,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint16_t *buf_id, uint16_t *desc_count)
+{
+	struct buf_vector buf_vec[BUF_VECTOR_MAX];
+	uint32_t dummy_len;
+	uint16_t nr_vec = 0;
+	int err;
+
+	if (unlikely(fill_vec_buf_packed(dev, vq,
+					vq->last_avail_idx, desc_count,
+					buf_vec, &nr_vec,
+					buf_id, &dummy_len,
+					VHOST_ACCESS_RO) < 0)) {
+		return -1;
+	}
+
+	*pkts = rte_pktmbuf_alloc(mbuf_pool);
+	if (unlikely(*pkts == NULL)) {
+		RTE_LOG(ERR, VHOST_DATA,
+			"Failed to allocate memory for mbuf.\n");
+		return -1;
+	}
+
+	err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, *pkts,
+			mbuf_pool);
+	if (unlikely(err)) {
+		rte_pktmbuf_free(*pkts);
+		return -1;
+	}
+
+	return 0;
+}
+
+static __rte_always_inline int
+virtio_dev_tx_normal_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts)
+{
+
+	uint16_t buf_id, desc_count;
+
+	if (vhost_dequeue_normal_packed(dev, vq, mbuf_pool, pkts, &buf_id,
+		&desc_count))
+			return -1;
+
+	vq->last_avail_idx += desc_count;
+	if (vq->last_avail_idx >= vq->size) {
+		vq->last_avail_idx -= vq->size;
+		vq->avail_wrap_counter ^= 1;
+	}
+
+	return 0;
+}
+
 static __rte_noinline uint16_t
 virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [RFC PATCH 04/13] add vhost packed ring fast dequeue function
  2019-07-08 17:13 [dpdk-dev] [RFC] vhost packed ring performance optimization Marvin Liu
                   ` (2 preceding siblings ...)
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 03/13] add vhost packed ring normal dequeue function Marvin Liu
@ 2019-07-08 17:13 ` " Marvin Liu
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 05/13] add enqueue shadow used descs update and flush functions Marvin Liu
                   ` (8 subsequent siblings)
  12 siblings, 0 replies; 25+ messages in thread
From: Marvin Liu @ 2019-07-08 17:13 UTC (permalink / raw)
  To: tiwei.bie, maxime.coquelin, dev; +Cc: Marvin Liu

Add fast dequeue function just like enqueue function, fast dequeue
function will not support chained nor indirect descriptors, normal
function will handle that.

Signed-off-by: Marvin Liu <yong.liu@intel.com>

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index f24026acd..329a7658b 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -41,6 +41,10 @@
 
 /* Used in fast packed ring functions */
 #define PACKED_DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_packed_desc))
+
+/* Indicated that normal path will handle */
+#define VIRTIO_DESC_NORMAL_FLAG (VRING_DESC_F_NEXT | VRING_DESC_F_INDIRECT)
+
 /**
  * Structure contains buffer address, length and descriptor index
  * from vring to do scatter RX.
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 410837122..a62e0feda 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1613,6 +1613,158 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	return i;
 }
 
+static __rte_always_inline int
+vhost_dequeue_fast_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t avail_idx,
+	uintptr_t *desc_addr, uint16_t *ids)
+{
+	bool wrap_counter = vq->avail_wrap_counter;
+	struct vring_packed_desc *descs = vq->desc_packed;
+	uint64_t len, len1, len2, len3;
+	uint32_t buf_offset = dev->vhost_hlen;
+
+	// check whether desc is cache aligned
+	if (unlikely(avail_idx & 0x3))
+		return -1;
+
+	// prefetch next cache line
+	if (unlikely(avail_idx  < (vq->size - PACKED_DESC_PER_CACHELINE)))
+		rte_prefetch0((void *)(uintptr_t)&descs[avail_idx +
+			PACKED_DESC_PER_CACHELINE]);
+	else
+		rte_prefetch0((void *)(uintptr_t)&descs[0]);
+
+	if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)) |
+		unlikely(!desc_is_avail(&descs[avail_idx + 1], wrap_counter)) |
+		unlikely(!desc_is_avail(&descs[avail_idx + 2], wrap_counter)) |
+		unlikely(!desc_is_avail(&descs[avail_idx + 3], wrap_counter)))
+		return 1;
+
+	if (unlikely((descs[avail_idx].flags & VIRTIO_DESC_NORMAL_FLAG) |
+		(descs[avail_idx + 1].flags & VIRTIO_DESC_NORMAL_FLAG) |
+		(descs[avail_idx + 2].flags & VIRTIO_DESC_NORMAL_FLAG) |
+		(descs[avail_idx + 3].flags & VIRTIO_DESC_NORMAL_FLAG)))
+		return -1;
+
+	rte_smp_rmb();
+
+	len = descs[avail_idx].len;
+	len1 = descs[avail_idx + 1].len;
+	len2 = descs[avail_idx + 2].len;
+	len3 = descs[avail_idx + 3].len;
+
+	ids[0] = descs[avail_idx].id;
+	ids[1] = descs[avail_idx + 1].id;
+	ids[2] = descs[avail_idx + 2].id;
+	ids[3] = descs[avail_idx + 3].id;
+
+	desc_addr[0] = vhost_iova_to_vva(dev, vq,
+			descs[avail_idx].addr,
+			&len,
+			VHOST_ACCESS_RW);
+
+	desc_addr[1] = vhost_iova_to_vva(dev, vq,
+			descs[avail_idx + 1].addr,
+			&len1,
+			VHOST_ACCESS_RW);
+
+	desc_addr[2] = vhost_iova_to_vva(dev, vq,
+			descs[avail_idx + 2].addr,
+			&len2,
+			VHOST_ACCESS_RW);
+
+	desc_addr[3] = vhost_iova_to_vva(dev, vq,
+			descs[avail_idx + 3].addr,
+			&len3,
+			VHOST_ACCESS_RW);
+
+	if (unlikely((len != descs[avail_idx].len) |
+		(len1 != descs[avail_idx + 1].len) |
+		(len2 != descs[avail_idx + 2].len) |
+		(len3 != descs[avail_idx + 3].len))) {
+		return -1;
+	}
+
+	if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts ,4))
+		return -1;
+
+	if (unlikely(((uint64_t)(pkts[0]->buf_len - pkts[0]->data_off) <
+				(len + buf_offset)) |
+			((uint64_t)(pkts[1]->buf_len - pkts[1]->data_off) <
+				(len1 + buf_offset)) |
+			((uint64_t)(pkts[2]->buf_len - pkts[2]->data_off) <
+				(len2 + buf_offset)) |
+			((uint64_t)(pkts[3]->buf_len - pkts[3]->data_off) <
+				(len3 + buf_offset)))) {
+		rte_pktmbuf_free(pkts[0]);
+		rte_pktmbuf_free(pkts[1]);
+		rte_pktmbuf_free(pkts[2]);
+		rte_pktmbuf_free(pkts[3]);
+		return -1;
+	}
+
+	pkts[0]->pkt_len = descs[avail_idx].len - buf_offset;
+	pkts[1]->pkt_len = descs[avail_idx + 1].len - buf_offset;
+	pkts[2]->pkt_len = descs[avail_idx + 2].len - buf_offset;
+	pkts[3]->pkt_len = descs[avail_idx + 3].len - buf_offset;
+
+	pkts[0]->data_len = pkts[0]->pkt_len;
+	pkts[1]->data_len = pkts[1]->pkt_len;
+	pkts[2]->data_len = pkts[2]->pkt_len;
+	pkts[3]->data_len = pkts[3]->pkt_len;
+
+	return 0;
+}
+
+static __rte_always_inline int
+virtio_dev_tx_fast_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts)
+{
+	uint16_t avail_idx = vq->last_avail_idx;
+	uint32_t buf_offset = dev->vhost_hlen;
+	uintptr_t desc_addr[4];
+	uint16_t ids[4];
+	int ret;
+	struct virtio_net_hdr *hdr, *hdr1, *hdr2, *hdr3;
+
+	ret = vhost_dequeue_fast_packed(dev, vq, mbuf_pool, pkts, avail_idx,
+				desc_addr, ids);
+
+	if (ret)
+		return ret;
+
+	rte_memcpy(rte_pktmbuf_mtod_offset(pkts[0], void *, 0),
+		(void *)(uintptr_t)(desc_addr[0] + buf_offset),
+		pkts[0]->pkt_len);
+	rte_memcpy(rte_pktmbuf_mtod_offset(pkts[1], void *, 0),
+		(void *)(uintptr_t)(desc_addr[1] + buf_offset),
+		pkts[1]->pkt_len);
+	rte_memcpy(rte_pktmbuf_mtod_offset(pkts[2], void *, 0),
+		(void *)(uintptr_t)(desc_addr[2] + buf_offset),
+		pkts[2]->pkt_len);
+	rte_memcpy(rte_pktmbuf_mtod_offset(pkts[3], void *, 0),
+		(void *)(uintptr_t)(desc_addr[3] + buf_offset),
+		pkts[3]->pkt_len);
+
+	if (virtio_net_with_host_offload(dev)) {
+		hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr[0]);
+		hdr1 = (struct virtio_net_hdr *)((uintptr_t)desc_addr[1]);
+		hdr2 = (struct virtio_net_hdr *)((uintptr_t)desc_addr[2]);
+		hdr3 = (struct virtio_net_hdr *)((uintptr_t)desc_addr[3]);
+		vhost_dequeue_offload(hdr, pkts[0]);
+		vhost_dequeue_offload(hdr1, pkts[1]);
+		vhost_dequeue_offload(hdr2, pkts[2]);
+		vhost_dequeue_offload(hdr3, pkts[3]);
+	}
+
+	vq->last_avail_idx += PACKED_DESC_PER_CACHELINE;
+	if (vq->last_avail_idx >= vq->size) {
+		vq->last_avail_idx -= vq->size;
+		vq->avail_wrap_counter ^= 1;
+	}
+	return 0;
+}
+
 static __rte_always_inline int
 vhost_dequeue_normal_packed(struct virtio_net *dev,
 		struct vhost_virtqueue *vq,
-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [RFC PATCH 05/13] add enqueue shadow used descs update and flush functions
  2019-07-08 17:13 [dpdk-dev] [RFC] vhost packed ring performance optimization Marvin Liu
                   ` (3 preceding siblings ...)
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 04/13] add vhost packed ring fast " Marvin Liu
@ 2019-07-08 17:13 ` Marvin Liu
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 06/13] add vhost fast enqueue flush function Marvin Liu
                   ` (7 subsequent siblings)
  12 siblings, 0 replies; 25+ messages in thread
From: Marvin Liu @ 2019-07-08 17:13 UTC (permalink / raw)
  To: tiwei.bie, maxime.coquelin, dev; +Cc: Marvin Liu

Split vhost enqueue and dequeue shadow used ring update function.
Vhost enqueue shadow used descs update will be calculated by cache
line. Enqueue sshadow used descs update will be buffered until exceed
one cache line.

Signed-off-by: Marvin Liu <yong.liu@intel.com>

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 329a7658b..b8198747e 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -145,6 +145,7 @@ struct vhost_virtqueue {
 		struct vring_used_elem_packed *shadow_used_packed;
 	};
 	uint16_t                shadow_used_idx;
+	uint16_t                enqueue_shadow_count;
 	struct vhost_vring_addr ring_addrs;
 
 	struct batch_copy_elem	*batch_copy_elems;
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index a62e0feda..96f7a8bec 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -158,6 +158,90 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
 	vhost_log_cache_sync(dev, vq);
 }
 
+static __rte_always_inline void
+flush_enqueue_used_packed(struct virtio_net *dev,
+			struct vhost_virtqueue *vq)
+{
+	int i;
+	uint16_t used_idx = vq->last_used_idx;
+	uint16_t head_idx = vq->last_used_idx;
+	uint16_t head_flags = 0;
+
+	/* Split loop in two to save memory barriers */
+	for (i = 0; i < vq->shadow_used_idx; i++) {
+		vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id;
+		vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len;
+
+		used_idx += vq->shadow_used_packed[i].count;
+		if (used_idx >= vq->size)
+			used_idx -= vq->size;
+	}
+
+	rte_smp_wmb();
+
+	for (i = 0; i < vq->shadow_used_idx; i++) {
+		uint16_t flags;
+
+		if (vq->shadow_used_packed[i].len)
+			flags = VRING_DESC_F_WRITE;
+		else
+			flags = 0;
+
+		if (vq->used_wrap_counter) {
+			flags |= VRING_DESC_F_USED;
+			flags |= VRING_DESC_F_AVAIL;
+		} else {
+			flags &= ~VRING_DESC_F_USED;
+			flags &= ~VRING_DESC_F_AVAIL;
+		}
+
+		if (i > 0) {
+			vq->desc_packed[vq->last_used_idx].flags = flags;
+
+			vhost_log_cache_used_vring(dev, vq,
+					vq->last_used_idx *
+					sizeof(struct vring_packed_desc),
+					sizeof(struct vring_packed_desc));
+		} else {
+			head_idx = vq->last_used_idx;
+			head_flags = flags;
+		}
+
+		vq->last_used_idx += vq->shadow_used_packed[i].count;
+		if (vq->last_used_idx >= vq->size) {
+			vq->used_wrap_counter ^= 1;
+			vq->last_used_idx -= vq->size;
+		}
+	}
+
+	vq->desc_packed[head_idx].flags = head_flags;
+
+	vhost_log_cache_used_vring(dev, vq,
+				head_idx *
+				sizeof(struct vring_packed_desc),
+				sizeof(struct vring_packed_desc));
+
+	vq->shadow_used_idx = 0;
+	vhost_log_cache_sync(dev, vq);
+}
+
+static __rte_always_inline void
+update_enqueue_shadow_used_ring_packed(struct vhost_virtqueue *vq,
+				uint16_t desc_idx, uint32_t len,
+				uint16_t count)
+{
+	if (!vq->shadow_used_idx)
+		vq->enqueue_shadow_count = vq->last_used_idx & 0x3;
+
+	uint16_t i = vq->shadow_used_idx++;
+
+	vq->shadow_used_packed[i].id  = desc_idx;
+	vq->shadow_used_packed[i].len = len;
+	vq->shadow_used_packed[i].count = count;
+
+	vq->enqueue_shadow_count += count;
+}
+
 static __rte_always_inline void
 update_shadow_used_ring_packed(struct vhost_virtqueue *vq,
 			 uint16_t desc_idx, uint32_t len, uint16_t count)
@@ -198,6 +282,24 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq)
 	vq->batch_copy_nb_elems = 0;
 }
 
+static __rte_always_inline void
+flush_enqueue_shadow_used_packed(struct virtio_net *dev,
+		struct vhost_virtqueue *vq, uint32_t len[],
+		uint16_t id[], uint16_t count[], uint16_t num_buffers)
+{
+	int i;
+	for (i = 0; i < num_buffers; i++) {
+		update_enqueue_shadow_used_ring_packed(vq, id[i], len[i],
+			count[i]);
+
+		if (vq->enqueue_shadow_count >= PACKED_DESC_PER_CACHELINE) {
+			do_data_copy_enqueue(dev, vq);
+			flush_enqueue_used_packed(dev, vq);
+		}
+	}
+}
+
+
 /* avoid write operation when necessary, to lessen cache issues */
 #define ASSIGN_UNLESS_EQUAL(var, val) do {	\
 	if ((var) != (val))			\
@@ -800,6 +902,9 @@ vhost_enqueue_normal_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		max_tries = 1;
 
 	uint16_t num_buffers = 0;
+	uint32_t buffer_len[max_tries];
+	uint16_t buffer_buf_id[max_tries];
+	uint16_t buffer_desc_count[max_tries];
 
 	while (size > 0) {
 		/*
@@ -822,6 +927,10 @@ vhost_enqueue_normal_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 		size -= len;
 
+		buffer_len[num_buffers] = len;
+		buffer_buf_id[num_buffers] = buf_id;
+		buffer_desc_count[num_buffers] = desc_count;
+
 		avail_idx += desc_count;
 		if (avail_idx >= vq->size)
 			avail_idx -= vq->size;
@@ -836,6 +945,9 @@ vhost_enqueue_normal_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		return 0;
 	}
 
+	flush_enqueue_shadow_used_packed(dev, vq, buffer_len, buffer_buf_id,
+			buffer_desc_count, num_buffers);
+
 	return 0;
 }
 
-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [RFC PATCH 06/13] add vhost fast enqueue flush function
  2019-07-08 17:13 [dpdk-dev] [RFC] vhost packed ring performance optimization Marvin Liu
                   ` (4 preceding siblings ...)
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 05/13] add enqueue shadow used descs update and flush functions Marvin Liu
@ 2019-07-08 17:13 ` Marvin Liu
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 07/13] add vhost dequeue shadow descs update function Marvin Liu
                   ` (6 subsequent siblings)
  12 siblings, 0 replies; 25+ messages in thread
From: Marvin Liu @ 2019-07-08 17:13 UTC (permalink / raw)
  To: tiwei.bie, maxime.coquelin, dev; +Cc: Marvin Liu

Vhost fast enqueue function will flush used ring immediately.
Descriptor's flag is pre-calculated by macro.

Signed-off-by: Marvin Liu <yong.liu@intel.com>

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index b8198747e..d084fe364 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -39,6 +39,10 @@
 
 #define VHOST_LOG_CACHE_NR 32
 
+/* Pre-calculated packed ring flags */
+#define VIRTIO_RX_FLAG_PACKED  (0ULL | VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE)
+#define VIRTIO_RX_WRAP_FLAG_PACKED (VRING_DESC_F_WRITE)
+
 /* Used in fast packed ring functions */
 #define PACKED_DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_packed_desc))
 
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 96f7a8bec..9eeebe642 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -225,6 +225,63 @@ flush_enqueue_used_packed(struct virtio_net *dev,
 	vhost_log_cache_sync(dev, vq);
 }
 
+/* flags are same when flushing used ring in fast path */
+static __rte_always_inline void
+flush_used_fast_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint64_t len, uint64_t len1, uint64_t len2, uint64_t len3,
+		uint16_t id, uint16_t id1, uint16_t id2, uint16_t id3,
+		uint16_t flags)
+{
+	vq->desc_packed[vq->last_used_idx].id = id;
+	vq->desc_packed[vq->last_used_idx].len = len;
+	vq->desc_packed[vq->last_used_idx + 1].id = id1;
+	vq->desc_packed[vq->last_used_idx + 1].len = len1;
+
+	vq->desc_packed[vq->last_used_idx + 2].id = id2;
+	vq->desc_packed[vq->last_used_idx + 2].len = len2;
+
+	vq->desc_packed[vq->last_used_idx + 3].id = id3;
+	vq->desc_packed[vq->last_used_idx + 3].len = len3;
+
+	rte_smp_wmb();
+	vq->desc_packed[vq->last_used_idx].flags = flags;
+	rte_smp_wmb();
+	vq->desc_packed[vq->last_used_idx + 1].flags = flags;
+	rte_smp_wmb();
+	vq->desc_packed[vq->last_used_idx + 2].flags = flags;
+	rte_smp_wmb();
+	vq->desc_packed[vq->last_used_idx + 3].flags = flags;
+
+	vhost_log_cache_used_vring(dev, vq,
+				vq->last_used_idx *
+				sizeof(struct vring_packed_desc),
+				RTE_CACHE_LINE_SIZE);
+	vhost_log_cache_sync(dev, vq);
+
+	vq->last_used_idx += PACKED_DESC_PER_CACHELINE;
+	if (vq->last_used_idx >= vq->size) {
+		vq->used_wrap_counter ^= 1;
+		vq->last_used_idx -= vq->size;
+	}
+}
+
+static __rte_always_inline void
+flush_enqueue_fast_used_packed(struct virtio_net *dev,
+			struct vhost_virtqueue *vq, uint64_t len,
+			uint64_t len1, uint64_t len2, uint64_t len3,
+			uint16_t id, uint16_t id1, uint16_t id2, uint16_t id3)
+{
+	uint16_t flags = 0;
+
+	if (vq->used_wrap_counter)
+		flags = VIRTIO_RX_FLAG_PACKED;
+	else
+		flags = VIRTIO_RX_WRAP_FLAG_PACKED;
+
+	flush_used_fast_packed(dev, vq, len, len1, len2, len3, id, id1, id2,
+			id3, flags);
+}
+
 static __rte_always_inline void
 update_enqueue_shadow_used_ring_packed(struct vhost_virtqueue *vq,
 				uint16_t desc_idx, uint32_t len,
@@ -1020,6 +1077,7 @@ virtio_dev_rx_fast_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		len2, len3;
 	struct virtio_net_hdr_mrg_rxbuf *hdr, *hdr1, *hdr2, *hdr3;
 	uint32_t buf_offset = dev->vhost_hlen;
+	uint16_t id, id1, id2, id3;
 
 	if (unlikely(avail_idx & 0x3))
 		return -1;
@@ -1055,6 +1113,11 @@ virtio_dev_rx_fast_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		(pkts[3]->pkt_len > (len3 - buf_offset))))
 		return -1;
 
+	id = descs[avail_idx].id;
+	id1 = descs[avail_idx + 1].id;
+	id2 = descs[avail_idx + 2].id;
+	id3 = descs[avail_idx + 3].id;
+
 	desc_addr = vhost_iova_to_vva(dev, vq,
 			descs[avail_idx].addr,
 			&len,
@@ -1115,6 +1178,9 @@ virtio_dev_rx_fast_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		rte_pktmbuf_mtod_offset(pkts[3], void *, 0),
 		pkts[3]->pkt_len);
 
+	flush_enqueue_fast_used_packed(dev, vq, len, len1, len2, len3, id,
+		id1, id2, id3);
+
 	return 0;
 }
 
-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [RFC PATCH 07/13] add vhost dequeue shadow descs update function
  2019-07-08 17:13 [dpdk-dev] [RFC] vhost packed ring performance optimization Marvin Liu
                   ` (5 preceding siblings ...)
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 06/13] add vhost fast enqueue flush function Marvin Liu
@ 2019-07-08 17:13 ` Marvin Liu
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 08/13] add vhost fast dequeue flush function Marvin Liu
                   ` (5 subsequent siblings)
  12 siblings, 0 replies; 25+ messages in thread
From: Marvin Liu @ 2019-07-08 17:13 UTC (permalink / raw)
  To: tiwei.bie, maxime.coquelin, dev; +Cc: Marvin Liu

Vhost dequeue function will buffer used descriptors as many as possible,
so that shadow used element should itself contain descriptor index and
wrap counter. First shadowed ring index also be recorded.

Signed-off-by: Marvin Liu <yong.liu@intel.com>

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index d084fe364..5ccbe67b5 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -42,6 +42,8 @@
 /* Pre-calculated packed ring flags */
 #define VIRTIO_RX_FLAG_PACKED  (0ULL | VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE)
 #define VIRTIO_RX_WRAP_FLAG_PACKED (VRING_DESC_F_WRITE)
+#define VIRTIO_TX_FLAG_PACKED  (0ULL | VRING_DESC_F_AVAIL | VRING_DESC_F_USED)
+#define VIRTIO_TX_WRAP_FLAG_PACKED (0x0)
 
 /* Used in fast packed ring functions */
 #define PACKED_DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_packed_desc))
@@ -93,9 +95,11 @@ struct log_cache_entry {
 };
 
 struct vring_used_elem_packed {
+	uint16_t used_idx;
 	uint16_t id;
 	uint32_t len;
 	uint32_t count;
+	uint16_t used_wrap_counter;
 };
 
 /**
@@ -150,6 +154,7 @@ struct vhost_virtqueue {
 	};
 	uint16_t                shadow_used_idx;
 	uint16_t                enqueue_shadow_count;
+	uint16_t                dequeue_shadow_head;
 	struct vhost_vring_addr ring_addrs;
 
 	struct batch_copy_elem	*batch_copy_elems;
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 9eeebe642..83ed2d599 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -299,6 +299,28 @@ update_enqueue_shadow_used_ring_packed(struct vhost_virtqueue *vq,
 	vq->enqueue_shadow_count += count;
 }
 
+static __rte_always_inline void
+update_dequeue_shadow_used_ring_packed(struct vhost_virtqueue *vq,
+				 uint16_t buf_id, uint16_t count)
+{
+	if (!vq->shadow_used_idx)
+		vq->dequeue_shadow_head = vq->last_used_idx;
+
+	uint16_t i = vq->shadow_used_idx++;
+
+	vq->shadow_used_packed[i].id  = buf_id;
+	vq->shadow_used_packed[i].len = 0;
+	vq->shadow_used_packed[i].count = count;
+	vq->shadow_used_packed[i].used_idx = vq->last_used_idx;
+	vq->shadow_used_packed[i].used_wrap_counter = vq->used_wrap_counter;
+
+	vq->last_used_idx += count;
+	if (vq->last_used_idx >= vq->size) {
+		vq->used_wrap_counter ^= 1;
+		vq->last_used_idx -= vq->size;
+	}
+}
+
 static __rte_always_inline void
 update_shadow_used_ring_packed(struct vhost_virtqueue *vq,
 			 uint16_t desc_idx, uint32_t len, uint16_t count)
@@ -1990,6 +2012,8 @@ virtio_dev_tx_normal_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		&desc_count))
 			return -1;
 
+	update_dequeue_shadow_used_ring_packed(vq, buf_id, desc_count);
+
 	vq->last_avail_idx += desc_count;
 	if (vq->last_avail_idx >= vq->size) {
 		vq->last_avail_idx -= vq->size;
-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [RFC PATCH 08/13] add vhost fast dequeue flush function
  2019-07-08 17:13 [dpdk-dev] [RFC] vhost packed ring performance optimization Marvin Liu
                   ` (6 preceding siblings ...)
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 07/13] add vhost dequeue shadow descs update function Marvin Liu
@ 2019-07-08 17:13 ` Marvin Liu
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 09/13] replace vhost enqueue packed ring function Marvin Liu
                   ` (4 subsequent siblings)
  12 siblings, 0 replies; 25+ messages in thread
From: Marvin Liu @ 2019-07-08 17:13 UTC (permalink / raw)
  To: tiwei.bie, maxime.coquelin, dev; +Cc: Marvin Liu

Vhost fast dequeue function will flush used ring immediately.
Descriptor's flag is pre-calculated by macro.

Signed-off-by: Marvin Liu <yong.liu@intel.com>

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 83ed2d599..cd51ed47a 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -265,6 +265,21 @@ flush_used_fast_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	}
 }
 
+static __rte_always_inline void
+flush_dequeue_fast_used_packed(struct virtio_net *dev,
+			struct vhost_virtqueue *vq, uint16_t id,
+			uint16_t id1, uint16_t id2, uint16_t id3)
+{
+	uint16_t flags = 0;
+
+	if (vq->used_wrap_counter)
+		flags = VIRTIO_TX_FLAG_PACKED;
+	else
+		flags = VIRTIO_TX_WRAP_FLAG_PACKED;
+
+	flush_used_fast_packed(dev, vq, 0, 0, 0, 0, id, id1, id2, id3, flags);
+}
+
 static __rte_always_inline void
 flush_enqueue_fast_used_packed(struct virtio_net *dev,
 			struct vhost_virtqueue *vq, uint64_t len,
@@ -1946,6 +1961,8 @@ virtio_dev_tx_fast_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		(void *)(uintptr_t)(desc_addr[3] + buf_offset),
 		pkts[3]->pkt_len);
 
+	flush_dequeue_fast_used_packed(dev, vq, ids[0], ids[1], ids[2],
+				ids[3]);
 	if (virtio_net_with_host_offload(dev)) {
 		hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr[0]);
 		hdr1 = (struct virtio_net_hdr *)((uintptr_t)desc_addr[1]);
-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [RFC PATCH 09/13] replace vhost enqueue packed ring function
  2019-07-08 17:13 [dpdk-dev] [RFC] vhost packed ring performance optimization Marvin Liu
                   ` (7 preceding siblings ...)
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 08/13] add vhost fast dequeue flush function Marvin Liu
@ 2019-07-08 17:13 ` Marvin Liu
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 10/13] add vhost fast zero copy dequeue " Marvin Liu
                   ` (3 subsequent siblings)
  12 siblings, 0 replies; 25+ messages in thread
From: Marvin Liu @ 2019-07-08 17:13 UTC (permalink / raw)
  To: tiwei.bie, maxime.coquelin, dev; +Cc: Marvin Liu

Vhost enqueue function will try to handle packed descriptors four by
four. If packed descriptors can't be handled by fast path, will try
normal path. Loop will skip when normal path can't emit packet.

Signed-off-by: Marvin Liu <yong.liu@intel.com>

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index cd51ed47a..a3b1e85fe 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1252,49 +1252,37 @@ virtio_dev_rx_normal_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 static __rte_noinline uint32_t
 virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
-	struct rte_mbuf **pkts, uint32_t count)
+		struct rte_mbuf **pkts, uint32_t count)
 {
 	uint32_t pkt_idx = 0;
-	uint16_t num_buffers;
-	struct buf_vector buf_vec[BUF_VECTOR_MAX];
+	uint32_t pkt_num;
+	uint32_t remained = count;
+	int ret;
 
-	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
-		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
-		uint16_t nr_vec = 0;
-		uint16_t nr_descs = 0;
+	for (pkt_idx = 0; pkt_idx < count; pkt_idx += pkt_num,
+		remained -= pkt_num) {
+		if (remained >= PACKED_DESC_PER_CACHELINE) {
+			ret = virtio_dev_rx_fast_packed(dev, vq, pkts);
 
-		if (unlikely(reserve_avail_buf_packed(dev, vq,
-						pkt_len, buf_vec, &nr_vec,
-						&num_buffers, &nr_descs) < 0)) {
-			VHOST_LOG_DEBUG(VHOST_DATA,
-				"(%d) failed to get enough desc from vring\n",
-				dev->vid);
-			vq->shadow_used_idx -= num_buffers;
-			break;
+			if (!ret) {
+				pkt_num = PACKED_DESC_PER_CACHELINE;
+				continue;
+			}
 		}
 
-		VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
-			dev->vid, vq->last_avail_idx,
-			vq->last_avail_idx + num_buffers);
+		pkt_num = virtio_dev_rx_normal_packed(dev, vq, pkts[pkt_idx]);
 
-		if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx],
-						buf_vec, nr_vec,
-						num_buffers) < 0) {
-			vq->shadow_used_idx -= num_buffers;
+		if (pkt_num == 0)
 			break;
-		}
 
-		vq->last_avail_idx += nr_descs;
-		if (vq->last_avail_idx >= vq->size) {
-			vq->last_avail_idx -= vq->size;
-			vq->avail_wrap_counter ^= 1;
-		}
 	}
 
-	do_data_copy_enqueue(dev, vq);
+	if (pkt_idx) {
+		if (vq->shadow_used_idx) {
+			do_data_copy_enqueue(dev, vq);
+			flush_enqueue_used_packed(dev, vq);
+		}
 
-	if (likely(vq->shadow_used_idx)) {
-		flush_shadow_used_ring_packed(dev, vq);
 		vhost_vring_call_packed(dev, vq);
 	}
 
-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [RFC PATCH 10/13] add vhost fast zero copy dequeue packed ring function
  2019-07-08 17:13 [dpdk-dev] [RFC] vhost packed ring performance optimization Marvin Liu
                   ` (8 preceding siblings ...)
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 09/13] replace vhost enqueue packed ring function Marvin Liu
@ 2019-07-08 17:13 ` " Marvin Liu
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 11/13] replace vhost " Marvin Liu
                   ` (2 subsequent siblings)
  12 siblings, 0 replies; 25+ messages in thread
From: Marvin Liu @ 2019-07-08 17:13 UTC (permalink / raw)
  To: tiwei.bie, maxime.coquelin, dev; +Cc: Marvin Liu

Modified vhost zero copy dequeue function which followed fast dequeue
function.

Signed-off-by: Marvin Liu <yong.liu@intel.com>

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index a3b1e85fe..7094944cf 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -2143,6 +2143,147 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	return i;
 }
 
+static __rte_always_inline void
+free_zmbuf(struct vhost_virtqueue *vq) {
+	struct zcopy_mbuf *next = NULL;
+	struct zcopy_mbuf *zmbuf;
+
+	for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
+	     zmbuf != NULL; zmbuf = next) {
+		next = TAILQ_NEXT(zmbuf, next);
+
+		uint16_t last_used_idx = vq->last_used_idx;
+
+		if (mbuf_is_consumed(zmbuf->mbuf)) {
+			uint16_t flags = 0;
+
+			if (vq->used_wrap_counter)
+				flags = VIRTIO_TX_FLAG_PACKED;
+			else
+				flags = VIRTIO_TX_WRAP_FLAG_PACKED;
+
+			vq->desc_packed[last_used_idx].id = zmbuf->desc_idx;
+			vq->desc_packed[last_used_idx].len = 0;
+
+			rte_smp_wmb();
+			vq->desc_packed[last_used_idx].flags = flags;
+
+			vq->last_used_idx += zmbuf->desc_count;
+			if (vq->last_used_idx >= vq->size) {
+				vq->used_wrap_counter ^= 1;
+				vq->last_used_idx -= vq->size;
+			}
+
+			TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
+			restore_mbuf(zmbuf->mbuf);
+			rte_pktmbuf_free(zmbuf->mbuf);
+			put_zmbuf(zmbuf);
+			vq->nr_zmbuf -= 1;
+		}
+	}
+}
+
+static __rte_always_inline int
+virtio_dev_tx_fast_packed_zmbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
+			struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts)
+{
+	struct zcopy_mbuf *zmbuf, *zmbuf1, *zmbuf2, *zmbuf3;
+	int ret;
+	uintptr_t desc_addr[4];
+	uint16_t ids[4];
+
+	uint16_t avail_idx = vq->last_avail_idx;
+
+	ret = vhost_dequeue_fast_packed(dev, vq, mbuf_pool, pkts, avail_idx,
+				desc_addr, ids);
+
+	if (ret)
+		return ret;
+
+	zmbuf = get_zmbuf(vq);
+	zmbuf1 = get_zmbuf(vq);
+	zmbuf2 = get_zmbuf(vq);
+	zmbuf3 = get_zmbuf(vq);
+
+	if (!zmbuf || !zmbuf1 || !zmbuf2 || !zmbuf3) {
+		rte_pktmbuf_free(pkts[0]);
+		rte_pktmbuf_free(pkts[1]);
+		rte_pktmbuf_free(pkts[2]);
+		rte_pktmbuf_free(pkts[3]);
+		return -1;
+	}
+
+	zmbuf->mbuf = pkts[0];
+	zmbuf->desc_idx = avail_idx;
+	zmbuf->desc_count = 1;
+
+	zmbuf1->mbuf = pkts[1];
+	zmbuf1->desc_idx = avail_idx + 1;
+	zmbuf1->desc_count = 1;
+
+	zmbuf2->mbuf = pkts[2];
+	zmbuf2->desc_idx = avail_idx + 2;
+	zmbuf2->desc_count = 1;
+
+	zmbuf3->mbuf = pkts[3];
+	zmbuf3->desc_idx = avail_idx + 3;
+	zmbuf3->desc_count = 1;
+
+	rte_mbuf_refcnt_update(pkts[0], 1);
+	rte_mbuf_refcnt_update(pkts[1], 1);
+	rte_mbuf_refcnt_update(pkts[2], 1);
+	rte_mbuf_refcnt_update(pkts[3], 1);
+
+	vq->nr_zmbuf += PACKED_DESC_PER_CACHELINE;
+	TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
+	TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf1, next);
+	TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf2, next);
+	TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf3, next);
+
+	vq->last_avail_idx += PACKED_DESC_PER_CACHELINE;
+	if (vq->last_avail_idx >= vq->size) {
+		vq->last_avail_idx -= vq->size;
+		vq->avail_wrap_counter ^= 1;
+	}
+
+	return 0;
+}
+
+static __rte_always_inline int
+virtio_dev_tx_normal_packed_zmbuf(struct virtio_net *dev,
+			struct vhost_virtqueue *vq,
+			struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts)
+{
+	uint16_t buf_id, desc_count;
+	struct zcopy_mbuf *zmbuf;
+
+	if (vhost_dequeue_normal_packed(dev, vq, mbuf_pool, pkts, &buf_id,
+					&desc_count))
+			return -1;
+
+	zmbuf = get_zmbuf(vq);
+	if (!zmbuf) {
+		rte_pktmbuf_free(*pkts);
+		return -1;
+	}
+	zmbuf->mbuf = *pkts;
+	zmbuf->desc_idx = vq->last_avail_idx;
+	zmbuf->desc_count = desc_count;
+
+	rte_mbuf_refcnt_update(*pkts, 1);
+
+	vq->nr_zmbuf += 1;
+	TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
+
+	vq->last_avail_idx += desc_count;
+	if (vq->last_avail_idx >= vq->size) {
+		vq->last_avail_idx -= vq->size;
+		vq->avail_wrap_counter ^= 1;
+	}
+
+	return 0;
+}
+
 uint16_t
 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [RFC PATCH 11/13] replace vhost dequeue packed ring function
  2019-07-08 17:13 [dpdk-dev] [RFC] vhost packed ring performance optimization Marvin Liu
                   ` (9 preceding siblings ...)
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 10/13] add vhost fast zero copy dequeue " Marvin Liu
@ 2019-07-08 17:13 ` " Marvin Liu
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 12/13] support inorder in vhost dequeue path Marvin Liu
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 13/13] remove useless vhost functions Marvin Liu
  12 siblings, 0 replies; 25+ messages in thread
From: Marvin Liu @ 2019-07-08 17:13 UTC (permalink / raw)
  To: tiwei.bie, maxime.coquelin, dev; +Cc: Marvin Liu

Vhost dequeue function will try to handle packed descriptors four by
four. If packed descriptors can't be handled by fast path, will try
normal path. Loop will skip skip when normal path can't receive packet.
Zero copy function is following same logic.

Signed-off-by: Marvin Liu <yong.liu@intel.com>

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 7094944cf..0f9292eb0 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -225,6 +225,50 @@ flush_enqueue_used_packed(struct virtio_net *dev,
 	vhost_log_cache_sync(dev, vq);
 }
 
+static __rte_always_inline void
+flush_dequeue_shadow_used_packed(struct virtio_net *dev,
+			struct vhost_virtqueue *vq)
+{
+	int i;
+	uint16_t used_idx;
+	uint16_t head_idx = vq->dequeue_shadow_head;
+	uint16_t head_flags;
+	uint16_t flags;
+
+	if (vq->shadow_used_packed[0].used_wrap_counter)
+		head_flags = VIRTIO_TX_FLAG_PACKED;
+	else
+		head_flags = VIRTIO_TX_WRAP_FLAG_PACKED;
+
+	if (vq->shadow_used_packed[0].len)
+		head_flags |= VRING_DESC_F_WRITE;
+
+	for (i = 1; i < vq->shadow_used_idx; i++) {
+		used_idx = vq->shadow_used_packed[i].used_idx;
+		if (vq->shadow_used_packed[i].used_wrap_counter)
+			flags = VIRTIO_TX_FLAG_PACKED;
+		else
+			flags = VIRTIO_TX_WRAP_FLAG_PACKED;
+
+		vq->desc_packed[used_idx].flags = flags;
+		vhost_log_cache_used_vring(dev, vq,
+					used_idx *
+					sizeof(struct vring_packed_desc),
+					sizeof(struct vring_packed_desc));
+	}
+
+	rte_smp_wmb();
+	vq->desc_packed[head_idx].flags = head_flags;
+
+	vhost_log_cache_used_vring(dev, vq,
+				head_idx *
+				sizeof(struct vring_packed_desc),
+				sizeof(struct vring_packed_desc));
+
+	vq->shadow_used_idx = 0;
+	vhost_log_cache_sync(dev, vq);
+}
+
 /* flags are same when flushing used ring in fast path */
 static __rte_always_inline void
 flush_used_fast_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
@@ -393,6 +437,24 @@ flush_enqueue_shadow_used_packed(struct virtio_net *dev,
 	}
 }
 
+static __rte_always_inline void
+flush_dequeue_shadow_used(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+	if (!vq->shadow_used_idx)
+		return;
+
+	int16_t shadow_count = vq->last_used_idx - vq->dequeue_shadow_head;
+	if (shadow_count <= 0)
+		shadow_count += vq->size;
+
+	/* buffer shadow used ring as many as possible when doing dequeue */
+	shadow_count += vq->last_used_idx & 0x3;
+	if ((uint16_t)shadow_count >= (vq->size >> 1)) {
+		do_data_copy_dequeue(vq);
+		flush_dequeue_shadow_used_packed(dev, vq);
+		vhost_vring_call_packed(dev, vq);
+	}
+}
 
 /* avoid write operation when necessary, to lessen cache issues */
 #define ASSIGN_UNLESS_EQUAL(var, val) do {	\
@@ -2027,120 +2089,49 @@ virtio_dev_tx_normal_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 	return 0;
 }
-
 static __rte_noinline uint16_t
 virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
-	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+		uint32_t count)
 {
-	uint16_t i;
-
-	if (unlikely(dev->dequeue_zero_copy)) {
-		struct zcopy_mbuf *zmbuf, *next;
-
-		for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
-		     zmbuf != NULL; zmbuf = next) {
-			next = TAILQ_NEXT(zmbuf, next);
+	uint32_t pkt_idx = 0;
+	uint32_t pkt_num;
+	uint32_t remained = count;
+	int ret;
 
-			if (mbuf_is_consumed(zmbuf->mbuf)) {
-				update_shadow_used_ring_packed(vq,
-						zmbuf->desc_idx,
-						0,
-						zmbuf->desc_count);
+	for (pkt_idx = 0; remained; pkt_idx += pkt_num, remained -= pkt_num) {
+		if (remained >= PACKED_DESC_PER_CACHELINE) {
+			ret = virtio_dev_tx_fast_packed(dev, vq, mbuf_pool,
+							&pkts[pkt_idx]);
 
-				TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
-				restore_mbuf(zmbuf->mbuf);
-				rte_pktmbuf_free(zmbuf->mbuf);
-				put_zmbuf(zmbuf);
-				vq->nr_zmbuf -= 1;
+			if (!ret) {
+				pkt_num = PACKED_DESC_PER_CACHELINE;
+				flush_dequeue_shadow_used(dev, vq);
+				continue;
 			}
 		}
 
-		if (likely(vq->shadow_used_idx)) {
-			flush_shadow_used_ring_packed(dev, vq);
-			vhost_vring_call_packed(dev, vq);
-		}
-	}
-
-	VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-
-	count = RTE_MIN(count, MAX_PKT_BURST);
-	VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
-			dev->vid, count);
-
-	for (i = 0; i < count; i++) {
-		struct buf_vector buf_vec[BUF_VECTOR_MAX];
-		uint16_t buf_id;
-		uint32_t dummy_len;
-		uint16_t desc_count, nr_vec = 0;
-		int err;
-
-		if (unlikely(fill_vec_buf_packed(dev, vq,
-						vq->last_avail_idx, &desc_count,
-						buf_vec, &nr_vec,
-						&buf_id, &dummy_len,
-						VHOST_ACCESS_RO) < 0))
-			break;
-
-		if (likely(dev->dequeue_zero_copy == 0))
-			update_shadow_used_ring_packed(vq, buf_id, 0,
-					desc_count);
-
-		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
-		if (unlikely(pkts[i] == NULL)) {
-			RTE_LOG(ERR, VHOST_DATA,
-				"Failed to allocate memory for mbuf.\n");
-			break;
-		}
-
-		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
-				mbuf_pool);
-		if (unlikely(err)) {
-			rte_pktmbuf_free(pkts[i]);
+		/* if remained desc not cache aligned, skip to next round */
+		if (((vq->last_avail_idx & 0x3) + remained) <
+			PACKED_DESC_PER_CACHELINE)
 			break;
-		}
 
-		if (unlikely(dev->dequeue_zero_copy)) {
-			struct zcopy_mbuf *zmbuf;
-
-			zmbuf = get_zmbuf(vq);
-			if (!zmbuf) {
-				rte_pktmbuf_free(pkts[i]);
+		if (virtio_dev_tx_normal_packed(dev, vq, mbuf_pool,
+						&pkts[pkt_idx]))
 				break;
-			}
-			zmbuf->mbuf = pkts[i];
-			zmbuf->desc_idx = buf_id;
-			zmbuf->desc_count = desc_count;
-
-			/*
-			 * Pin lock the mbuf; we will check later to see
-			 * whether the mbuf is freed (when we are the last
-			 * user) or not. If that's the case, we then could
-			 * update the used ring safely.
-			 */
-			rte_mbuf_refcnt_update(pkts[i], 1);
 
-			vq->nr_zmbuf += 1;
-			TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
-		}
-
-		vq->last_avail_idx += desc_count;
-		if (vq->last_avail_idx >= vq->size) {
-			vq->last_avail_idx -= vq->size;
-			vq->avail_wrap_counter ^= 1;
-		}
+		pkt_num = 1;
+		flush_dequeue_shadow_used(dev, vq);
 	}
 
-	if (likely(dev->dequeue_zero_copy == 0)) {
-		do_data_copy_dequeue(vq);
-		if (unlikely(i < count))
-			vq->shadow_used_idx = i;
-		if (likely(vq->shadow_used_idx)) {
-			flush_shadow_used_ring_packed(dev, vq);
-			vhost_vring_call_packed(dev, vq);
-		}
+	if (pkt_idx) {
+		if (vq->shadow_used_idx)
+			do_data_copy_dequeue(vq);
+
+		vhost_vring_call_packed(dev, vq);
 	}
 
-	return i;
+	return pkt_idx;
 }
 
 static __rte_always_inline void
-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [RFC PATCH 12/13] support inorder in vhost dequeue path
  2019-07-08 17:13 [dpdk-dev] [RFC] vhost packed ring performance optimization Marvin Liu
                   ` (10 preceding siblings ...)
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 11/13] replace vhost " Marvin Liu
@ 2019-07-08 17:13 ` Marvin Liu
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 13/13] remove useless vhost functions Marvin Liu
  12 siblings, 0 replies; 25+ messages in thread
From: Marvin Liu @ 2019-07-08 17:13 UTC (permalink / raw)
  To: tiwei.bie, maxime.coquelin, dev; +Cc: Marvin Liu

When inorder feature bit is negotiated, just update first used
descriptor with last descriptor index. It can reflect all used
descriptors.

Signed-off-by: Marvin Liu <yong.liu@intel.com>

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 0f9292eb0..6bcf565f0 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -31,6 +31,12 @@ rxvq_is_mergeable(struct virtio_net *dev)
 	return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF);
 }
 
+static  __rte_always_inline bool
+virtio_net_is_inorder(struct virtio_net *dev)
+{
+	return dev->features & (1ULL << VIRTIO_F_IN_ORDER);
+}
+
 static bool
 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
 {
@@ -158,6 +164,35 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
 	vhost_log_cache_sync(dev, vq);
 }
 
+static __rte_always_inline void
+flush_dequeue_shadow_used_packed_inorder(struct virtio_net *dev,
+				struct vhost_virtqueue *vq)
+{
+	uint16_t head_idx = vq->dequeue_shadow_head;
+	uint16_t head_flags = 0;
+	struct vring_used_elem_packed *last_elem;
+
+	last_elem = &vq->shadow_used_packed[vq->shadow_used_idx - 1];
+	vq->desc_packed[head_idx].id = last_elem->id + last_elem->count - 1;
+
+	if (vq->shadow_used_packed[0].used_wrap_counter)
+		head_flags = VIRTIO_TX_FLAG_PACKED;
+	else
+		head_flags = VIRTIO_TX_WRAP_FLAG_PACKED;
+
+	rte_smp_wmb();
+
+	vq->desc_packed[head_idx].flags = head_flags;
+
+	vhost_log_cache_used_vring(dev, vq,
+				head_idx *
+				sizeof(struct vring_packed_desc),
+				sizeof(struct vring_packed_desc));
+
+	vq->shadow_used_idx = 0;
+	vhost_log_cache_sync(dev, vq);
+}
+
 static __rte_always_inline void
 flush_enqueue_used_packed(struct virtio_net *dev,
 			struct vhost_virtqueue *vq)
@@ -269,6 +304,34 @@ flush_dequeue_shadow_used_packed(struct virtio_net *dev,
 	vhost_log_cache_sync(dev, vq);
 }
 
+static __rte_always_inline void
+flush_used_fast_packed_inorder(struct virtio_net *dev,
+			struct vhost_virtqueue *vq, uint64_t len,
+			uint64_t len1, uint64_t len2, uint64_t len3,
+			uint16_t id, uint16_t flags)
+{
+	vq->desc_packed[vq->last_used_idx].id = id;
+	vq->desc_packed[vq->last_used_idx].len = len;
+	vq->desc_packed[vq->last_used_idx + 1].len = len1;
+	vq->desc_packed[vq->last_used_idx + 2].len = len2;
+	vq->desc_packed[vq->last_used_idx + 3].len = len3;
+
+	rte_smp_wmb();
+	vq->desc_packed[vq->last_used_idx].flags = flags;
+
+	vhost_log_cache_used_vring(dev, vq,
+				vq->last_used_idx *
+				sizeof(struct vring_packed_desc),
+				RTE_CACHE_LINE_SIZE);
+	vhost_log_cache_sync(dev, vq);
+
+	vq->last_used_idx += PACKED_DESC_PER_CACHELINE;
+	if (vq->last_used_idx >= vq->size) {
+		vq->used_wrap_counter ^= 1;
+		vq->last_used_idx -= vq->size;
+	}
+}
+
 /* flags are same when flushing used ring in fast path */
 static __rte_always_inline void
 flush_used_fast_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
@@ -320,8 +383,12 @@ flush_dequeue_fast_used_packed(struct virtio_net *dev,
 		flags = VIRTIO_TX_FLAG_PACKED;
 	else
 		flags = VIRTIO_TX_WRAP_FLAG_PACKED;
-
-	flush_used_fast_packed(dev, vq, 0, 0, 0, 0, id, id1, id2, id3, flags);
+	if (virtio_net_is_inorder(dev))
+		flush_used_fast_packed_inorder(dev, vq, 0, 0, 0, 0, id3,
+					flags);
+	else
+		flush_used_fast_packed(dev, vq, 0, 0, 0, 0, id, id1, id2, id3,
+				flags);
 }
 
 static __rte_always_inline void
@@ -451,7 +518,10 @@ flush_dequeue_shadow_used(struct virtio_net *dev, struct vhost_virtqueue *vq)
 	shadow_count += vq->last_used_idx & 0x3;
 	if ((uint16_t)shadow_count >= (vq->size >> 1)) {
 		do_data_copy_dequeue(vq);
-		flush_dequeue_shadow_used_packed(dev, vq);
+		if (virtio_net_is_inorder(dev))
+			flush_dequeue_shadow_used_packed_inorder(dev, vq);
+		else
+			flush_dequeue_shadow_used_packed(dev, vq);
 		vhost_vring_call_packed(dev, vq);
 	}
 }
-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [RFC PATCH 13/13] remove useless vhost functions
  2019-07-08 17:13 [dpdk-dev] [RFC] vhost packed ring performance optimization Marvin Liu
                   ` (11 preceding siblings ...)
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 12/13] support inorder in vhost dequeue path Marvin Liu
@ 2019-07-08 17:13 ` Marvin Liu
  12 siblings, 0 replies; 25+ messages in thread
From: Marvin Liu @ 2019-07-08 17:13 UTC (permalink / raw)
  To: tiwei.bie, maxime.coquelin, dev; +Cc: Marvin Liu

Signed-off-by: Marvin Liu <yong.liu@intel.com>

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 6bcf565f0..df8dcbe1f 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -97,72 +97,6 @@ update_shadow_used_ring_split(struct vhost_virtqueue *vq,
 	vq->shadow_used_split[i].len = len;
 }
 
-static __rte_always_inline void
-flush_shadow_used_ring_packed(struct virtio_net *dev,
-			struct vhost_virtqueue *vq)
-{
-	int i;
-	uint16_t used_idx = vq->last_used_idx;
-	uint16_t head_idx = vq->last_used_idx;
-	uint16_t head_flags = 0;
-
-	/* Split loop in two to save memory barriers */
-	for (i = 0; i < vq->shadow_used_idx; i++) {
-		vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id;
-		vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len;
-
-		used_idx += vq->shadow_used_packed[i].count;
-		if (used_idx >= vq->size)
-			used_idx -= vq->size;
-	}
-
-	rte_smp_wmb();
-
-	for (i = 0; i < vq->shadow_used_idx; i++) {
-		uint16_t flags;
-
-		if (vq->shadow_used_packed[i].len)
-			flags = VRING_DESC_F_WRITE;
-		else
-			flags = 0;
-
-		if (vq->used_wrap_counter) {
-			flags |= VRING_DESC_F_USED;
-			flags |= VRING_DESC_F_AVAIL;
-		} else {
-			flags &= ~VRING_DESC_F_USED;
-			flags &= ~VRING_DESC_F_AVAIL;
-		}
-
-		if (i > 0) {
-			vq->desc_packed[vq->last_used_idx].flags = flags;
-
-			vhost_log_cache_used_vring(dev, vq,
-					vq->last_used_idx *
-					sizeof(struct vring_packed_desc),
-					sizeof(struct vring_packed_desc));
-		} else {
-			head_idx = vq->last_used_idx;
-			head_flags = flags;
-		}
-
-		vq->last_used_idx += vq->shadow_used_packed[i].count;
-		if (vq->last_used_idx >= vq->size) {
-			vq->used_wrap_counter ^= 1;
-			vq->last_used_idx -= vq->size;
-		}
-	}
-
-	vq->desc_packed[head_idx].flags = head_flags;
-
-	vhost_log_cache_used_vring(dev, vq,
-				head_idx *
-				sizeof(struct vring_packed_desc),
-				sizeof(struct vring_packed_desc));
-
-	vq->shadow_used_idx = 0;
-	vhost_log_cache_sync(dev, vq);
-}
 
 static __rte_always_inline void
 flush_dequeue_shadow_used_packed_inorder(struct virtio_net *dev,
@@ -447,17 +381,6 @@ update_dequeue_shadow_used_ring_packed(struct vhost_virtqueue *vq,
 	}
 }
 
-static __rte_always_inline void
-update_shadow_used_ring_packed(struct vhost_virtqueue *vq,
-			 uint16_t desc_idx, uint32_t len, uint16_t count)
-{
-	uint16_t i = vq->shadow_used_idx++;
-
-	vq->shadow_used_packed[i].id  = desc_idx;
-	vq->shadow_used_packed[i].len = len;
-	vq->shadow_used_packed[i].count = count;
-}
-
 static inline void
 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
 {
@@ -883,64 +806,6 @@ fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	return 0;
 }
 
-/*
- * Returns -1 on fail, 0 on success
- */
-static inline int
-reserve_avail_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
-				uint32_t size, struct buf_vector *buf_vec,
-				uint16_t *nr_vec, uint16_t *num_buffers,
-				uint16_t *nr_descs)
-{
-	uint16_t avail_idx;
-	uint16_t vec_idx = 0;
-	uint16_t max_tries, tries = 0;
-
-	uint16_t buf_id = 0;
-	uint32_t len = 0;
-	uint16_t desc_count;
-
-	*num_buffers = 0;
-	avail_idx = vq->last_avail_idx;
-
-	if (rxvq_is_mergeable(dev))
-		max_tries = vq->size - 1;
-	else
-		max_tries = 1;
-
-	while (size > 0) {
-		/*
-		 * if we tried all available ring items, and still
-		 * can't get enough buf, it means something abnormal
-		 * happened.
-		 */
-		if (unlikely(++tries > max_tries))
-			return -1;
-
-		if (unlikely(fill_vec_buf_packed(dev, vq,
-						avail_idx, &desc_count,
-						buf_vec, &vec_idx,
-						&buf_id, &len,
-						VHOST_ACCESS_RW) < 0))
-			return -1;
-
-		len = RTE_MIN(len, size);
-		update_shadow_used_ring_packed(vq, buf_id, len, desc_count);
-		size -= len;
-
-		avail_idx += desc_count;
-		if (avail_idx >= vq->size)
-			avail_idx -= vq->size;
-
-		*nr_descs += desc_count;
-		*num_buffers += 1;
-	}
-
-	*nr_vec = vec_idx;
-
-	return 0;
-}
-
 static __rte_noinline void
 copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		struct buf_vector *buf_vec,
-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast enqueue function
  2019-07-08 11:37     ` Ilya Maximets
@ 2019-07-09  1:15       ` Liu, Yong
  0 siblings, 0 replies; 25+ messages in thread
From: Liu, Yong @ 2019-07-09  1:15 UTC (permalink / raw)
  To: Ilya Maximets, dev; +Cc: Bie, Tiwei, maxime.coquelin

Thank, Ilya.

> -----Original Message-----
> From: Ilya Maximets [mailto:i.maximets@samsung.com]
> Sent: Monday, July 08, 2019 7:38 PM
> To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>;
> maxime.coquelin@redhat.com; dev@dpdk.org
> Subject: Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast enqueue
> function
> 
> On 08.07.2019 20:13, Marvin Liu wrote:
> > In fast enqueue function, will first check whether descriptors are
> > cache aligned. Fast enqueue function will check prerequisites in the
> > beginning. Fast enqueue function do not support chained mbufs, normal
> > function will handle that.
> >
> > Signed-off-by: Marvin Liu <yong.liu@intel.com>
> >
> > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> > index 884befa85..f24026acd 100644
> > --- a/lib/librte_vhost/vhost.h
> > +++ b/lib/librte_vhost/vhost.h
> > @@ -39,6 +39,8 @@
> >
> >  #define VHOST_LOG_CACHE_NR 32
> >
> > +/* Used in fast packed ring functions */
> > +#define PACKED_DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct
> vring_packed_desc))
> >  /**
> >   * Structure contains buffer address, length and descriptor index
> >   * from vring to do scatter RX.
> > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> > index 003aec1d4..b877510da 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -897,6 +897,115 @@ virtio_dev_rx_split(struct virtio_net *dev, struct
> vhost_virtqueue *vq,
> >  	return pkt_idx;
> >  }
> >
> > +static __rte_always_inline uint16_t
> > +virtio_dev_rx_fast_packed(struct virtio_net *dev, struct vhost_virtqueue
> *vq,
> > +		struct rte_mbuf **pkts)
> > +{
> > +	bool wrap_counter = vq->avail_wrap_counter;
> > +	struct vring_packed_desc *descs = vq->desc_packed;
> > +	uint16_t avail_idx = vq->last_avail_idx;
> > +	uint64_t desc_addr, desc_addr1, desc_addr2, desc_addr3, len, len1,
> > +		len2, len3;
> > +	struct virtio_net_hdr_mrg_rxbuf *hdr, *hdr1, *hdr2, *hdr3;
> > +	uint32_t buf_offset = dev->vhost_hlen;
> > +
> > +	if (unlikely(avail_idx & 0x3))
> > +		return -1;
> > +
> > +	if (unlikely(avail_idx < (vq->size - PACKED_DESC_PER_CACHELINE)))
> 
> Doe it makes sense to check this?
> If this condition is not 'true', all the code below will access incorrect
> memory.
> 
Sorry, this condition should be likely:)

> > +		rte_prefetch0((void *)(uintptr_t)&descs[avail_idx +
> > +			PACKED_DESC_PER_CACHELINE]);
> > +	else
> > +		rte_prefetch0((void *)(uintptr_t)&descs[0]);
> > +
> > +	if (unlikely((pkts[0]->next != NULL) |
> > +		(pkts[1]->next != NULL) |
> > +		(pkts[2]->next != NULL) |
> > +		(pkts[3]->next != NULL)))
> > +		return -1;
> > +
> > +	if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)) |
> > +		unlikely(!desc_is_avail(&descs[avail_idx + 1], wrap_counter)) |
> > +		unlikely(!desc_is_avail(&descs[avail_idx + 2], wrap_counter)) |
> > +		unlikely(!desc_is_avail(&descs[avail_idx + 3], wrap_counter)))
> > +		return 1;
> > +
> > +	rte_smp_rmb();
> > +
> > +	len = descs[avail_idx].len;
> > +	len1 = descs[avail_idx + 1].len;
> > +	len2 = descs[avail_idx + 2].len;
> > +	len3 = descs[avail_idx + 3].len;
> > +
> > +	if (unlikely((pkts[0]->pkt_len > (len - buf_offset)) |
> > +		(pkts[1]->pkt_len > (len1 - buf_offset)) |
> > +		(pkts[2]->pkt_len > (len2 - buf_offset)) |
> > +		(pkts[3]->pkt_len > (len3 - buf_offset))))
> > +		return -1;
> > +
> > +	desc_addr = vhost_iova_to_vva(dev, vq,
> > +			descs[avail_idx].addr,
> > +			&len,
> > +			VHOST_ACCESS_RW);
> > +
> > +	desc_addr1 = vhost_iova_to_vva(dev, vq,
> > +			descs[avail_idx + 1].addr,
> > +			&len1,
> > +			VHOST_ACCESS_RW);
> > +
> > +	desc_addr2 = vhost_iova_to_vva(dev, vq,
> > +			descs[avail_idx + 2].addr,
> > +			&len2,
> > +			VHOST_ACCESS_RW);
> > +
> > +	desc_addr3 = vhost_iova_to_vva(dev, vq,
> > +			descs[avail_idx + 3].addr,
> > +			&len3,
> > +			VHOST_ACCESS_RW);
> > +
> > +	if (unlikely((len != descs[avail_idx].len) |
> > +		(len1 != descs[avail_idx + 1].len) |
> > +		(len2 != descs[avail_idx + 2].len) |
> > +		(len3 != descs[avail_idx + 3].len)))
> > +		return -1;
> > +
> > +	hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
> > +	hdr1 = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr1;
> > +	hdr2 = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr2;
> > +	hdr3 = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr3;
> > +
> > +	virtio_enqueue_offload(pkts[0], &hdr->hdr);
> > +	virtio_enqueue_offload(pkts[1], &hdr1->hdr);
> > +	virtio_enqueue_offload(pkts[2], &hdr2->hdr);
> > +	virtio_enqueue_offload(pkts[3], &hdr3->hdr);
> > +
> > +	len = pkts[0]->pkt_len + dev->vhost_hlen;
> > +	len1 = pkts[1]->pkt_len + dev->vhost_hlen;
> > +	len2 = pkts[2]->pkt_len + dev->vhost_hlen;
> > +	len3 = pkts[3]->pkt_len + dev->vhost_hlen;
> > +
> > +	vq->last_avail_idx += PACKED_DESC_PER_CACHELINE;
> 
> The whole function assumes that PACKED_DESC_PER_CACHELINE equals to 4,
> but if it's not, it will not work correctly.

Thanks for point out this, I just noticed that power8 has 128 bytes cache line.
Main idea here is to unroll the loop and check conditions first.
PACKED_DESC_PER_CACHELINE can be removed and renamed like FAST_BURST_NUM which hardcoded to four.

Best regards,
Marvin

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast enqueue function
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast " Marvin Liu
       [not found]   ` <CGME20190708113801eucas1p25d89717d8b298790326077852c9933c8@eucas1p2.samsung.com>
@ 2019-07-10  4:28   ` Jason Wang
  2019-07-10  7:30     ` Liu, Yong
  2019-07-11  8:35   ` Jason Wang
  2 siblings, 1 reply; 25+ messages in thread
From: Jason Wang @ 2019-07-10  4:28 UTC (permalink / raw)
  To: Marvin Liu, tiwei.bie, maxime.coquelin, dev


On 2019/7/9 上午1:13, Marvin Liu wrote:
> In fast enqueue function, will first check whether descriptors are
> cache aligned. Fast enqueue function will check prerequisites in the
> beginning. Fast enqueue function do not support chained mbufs, normal
> function will handle that.
>
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
>
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 884befa85..f24026acd 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -39,6 +39,8 @@
>   
>   #define VHOST_LOG_CACHE_NR 32
>   
> +/* Used in fast packed ring functions */
> +#define PACKED_DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_packed_desc))
>   /**
>    * Structure contains buffer address, length and descriptor index
>    * from vring to do scatter RX.
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 003aec1d4..b877510da 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -897,6 +897,115 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>   	return pkt_idx;
>   }
>   
> +static __rte_always_inline uint16_t
> +virtio_dev_rx_fast_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +		struct rte_mbuf **pkts)
> +{
> +	bool wrap_counter = vq->avail_wrap_counter;
> +	struct vring_packed_desc *descs = vq->desc_packed;
> +	uint16_t avail_idx = vq->last_avail_idx;
> +	uint64_t desc_addr, desc_addr1, desc_addr2, desc_addr3, len, len1,
> +		len2, len3;
> +	struct virtio_net_hdr_mrg_rxbuf *hdr, *hdr1, *hdr2, *hdr3;
> +	uint32_t buf_offset = dev->vhost_hlen;
> +
> +	if (unlikely(avail_idx & 0x3))
> +		return -1;
> +
> +	if (unlikely(avail_idx < (vq->size - PACKED_DESC_PER_CACHELINE)))
> +		rte_prefetch0((void *)(uintptr_t)&descs[avail_idx +
> +			PACKED_DESC_PER_CACHELINE]);
> +	else
> +		rte_prefetch0((void *)(uintptr_t)&descs[0]);
> +
> +	if (unlikely((pkts[0]->next != NULL) |
> +		(pkts[1]->next != NULL) |
> +		(pkts[2]->next != NULL) |
> +		(pkts[3]->next != NULL)))
> +		return -1;
> +
> +	if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)) |
> +		unlikely(!desc_is_avail(&descs[avail_idx + 1], wrap_counter)) |
> +		unlikely(!desc_is_avail(&descs[avail_idx + 2], wrap_counter)) |
> +		unlikely(!desc_is_avail(&descs[avail_idx + 3], wrap_counter)))
> +		return 1;


Any reason for not letting compiler to unroll the loops?

Thanks



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast enqueue function
  2019-07-10  4:28   ` Jason Wang
@ 2019-07-10  7:30     ` Liu, Yong
  2019-07-11  4:11       ` Jason Wang
  0 siblings, 1 reply; 25+ messages in thread
From: Liu, Yong @ 2019-07-10  7:30 UTC (permalink / raw)
  To: Jason Wang, Bie, Tiwei, maxime.coquelin, dev



> -----Original Message-----
> From: Jason Wang [mailto:jasowang@redhat.com]
> Sent: Wednesday, July 10, 2019 12:28 PM
> To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>;
> maxime.coquelin@redhat.com; dev@dpdk.org
> Subject: Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast enqueue
> function
> 
> 
> On 2019/7/9 上午1:13, Marvin Liu wrote:
> > In fast enqueue function, will first check whether descriptors are
> > cache aligned. Fast enqueue function will check prerequisites in the
> > beginning. Fast enqueue function do not support chained mbufs, normal
> > function will handle that.
> >
> > Signed-off-by: Marvin Liu <yong.liu@intel.com>
> 
> Any reason for not letting compiler to unroll the loops?
> 

Hi Jason,
I'm not sure about how much compiler can help on unrolling loops as it can't know how much loops will create in one call.
After force not using unroll-loop optimization by "-fno-unroll-loops", virtio_dev_rx_packed function size remained the same.
So look like gcc unroll-loop optimization do not help here. 

And fast enqueue function not only did unroll loop, it also checked cache alignment which can help performance in another side.

Regards,
Marvin

> Thanks
> 


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast enqueue function
  2019-07-10  7:30     ` Liu, Yong
@ 2019-07-11  4:11       ` Jason Wang
  2019-07-11  9:49         ` Liu, Yong
  0 siblings, 1 reply; 25+ messages in thread
From: Jason Wang @ 2019-07-11  4:11 UTC (permalink / raw)
  To: Liu, Yong, Bie, Tiwei, maxime.coquelin, dev


On 2019/7/10 下午3:30, Liu, Yong wrote:
>
>> -----Original Message-----
>> From: Jason Wang [mailto:jasowang@redhat.com]
>> Sent: Wednesday, July 10, 2019 12:28 PM
>> To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>;
>> maxime.coquelin@redhat.com; dev@dpdk.org
>> Subject: Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast enqueue
>> function
>>
>>
>> On 2019/7/9 上午1:13, Marvin Liu wrote:
>>> In fast enqueue function, will first check whether descriptors are
>>> cache aligned. Fast enqueue function will check prerequisites in the
>>> beginning. Fast enqueue function do not support chained mbufs, normal
>>> function will handle that.
>>>
>>> Signed-off-by: Marvin Liu <yong.liu@intel.com>
>> Any reason for not letting compiler to unroll the loops?
>>
> Hi Jason,
> I'm not sure about how much compiler can help on unrolling loops as it can't know how much loops will create in one call.
> After force not using unroll-loop optimization by "-fno-unroll-loops", virtio_dev_rx_packed function size remained the same.
> So look like gcc unroll-loop optimization do not help here.


I meant something like "pragma GCC unroll N" just before the loop you 
want unrolled.

Thanks


>
> And fast enqueue function not only did unroll loop, it also checked cache alignment which can help performance in another side.
>
> Regards,
> Marvin
>
>> Thanks
>>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast enqueue function
  2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast " Marvin Liu
       [not found]   ` <CGME20190708113801eucas1p25d89717d8b298790326077852c9933c8@eucas1p2.samsung.com>
  2019-07-10  4:28   ` Jason Wang
@ 2019-07-11  8:35   ` Jason Wang
  2019-07-11  9:37     ` Liu, Yong
  2 siblings, 1 reply; 25+ messages in thread
From: Jason Wang @ 2019-07-11  8:35 UTC (permalink / raw)
  To: Marvin Liu, tiwei.bie, maxime.coquelin, dev


On 2019/7/9 上午1:13, Marvin Liu wrote:
> In fast enqueue function, will first check whether descriptors are
> cache aligned. Fast enqueue function will check prerequisites in the
> beginning. Fast enqueue function do not support chained mbufs, normal
> function will handle that.
>
> Signed-off-by: Marvin Liu<yong.liu@intel.com>
>
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 884befa85..f24026acd 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -39,6 +39,8 @@
>   
>   #define VHOST_LOG_CACHE_NR 32
>   
> +/* Used in fast packed ring functions */
> +#define PACKED_DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_packed_desc))
>   /**
>    * Structure contains buffer address, length and descriptor index
>    * from vring to do scatter RX.
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 003aec1d4..b877510da 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -897,6 +897,115 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>   	return pkt_idx;
>   }
>   
> +static __rte_always_inline uint16_t
> +virtio_dev_rx_fast_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +		struct rte_mbuf **pkts)
> +{
> +	bool wrap_counter = vq->avail_wrap_counter;
> +	struct vring_packed_desc *descs = vq->desc_packed;
> +	uint16_t avail_idx = vq->last_avail_idx;
> +	uint64_t desc_addr, desc_addr1, desc_addr2, desc_addr3, len, len1,
> +		len2, len3;
> +	struct virtio_net_hdr_mrg_rxbuf *hdr, *hdr1, *hdr2, *hdr3;
> +	uint32_t buf_offset = dev->vhost_hlen;
> +
> +	if (unlikely(avail_idx & 0x3))
> +		return -1;
> +
> +	if (unlikely(avail_idx < (vq->size - PACKED_DESC_PER_CACHELINE)))
> +		rte_prefetch0((void *)(uintptr_t)&descs[avail_idx +
> +			PACKED_DESC_PER_CACHELINE]);
> +	else
> +		rte_prefetch0((void *)(uintptr_t)&descs[0]);
> +
> +	if (unlikely((pkts[0]->next != NULL) |
> +		(pkts[1]->next != NULL) |
> +		(pkts[2]->next != NULL) |
> +		(pkts[3]->next != NULL)))
> +		return -1;
> +
> +	if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)) |
> +		unlikely(!desc_is_avail(&descs[avail_idx + 1], wrap_counter)) |
> +		unlikely(!desc_is_avail(&descs[avail_idx + 2], wrap_counter)) |
> +		unlikely(!desc_is_avail(&descs[avail_idx + 3], wrap_counter)))
> +		return 1;
> +
> +	rte_smp_rmb();
> +
> +	len = descs[avail_idx].len;
> +	len1 = descs[avail_idx + 1].len;
> +	len2 = descs[avail_idx + 2].len;
> +	len3 = descs[avail_idx + 3].len;
> +
> +	if (unlikely((pkts[0]->pkt_len > (len - buf_offset)) |
> +		(pkts[1]->pkt_len > (len1 - buf_offset)) |
> +		(pkts[2]->pkt_len > (len2 - buf_offset)) |
> +		(pkts[3]->pkt_len > (len3 - buf_offset))))
> +		return -1;
> +
> +	desc_addr = vhost_iova_to_vva(dev, vq,
> +			descs[avail_idx].addr,
> +			&len,
> +			VHOST_ACCESS_RW);
> +
> +	desc_addr1 = vhost_iova_to_vva(dev, vq,
> +			descs[avail_idx + 1].addr,
> +			&len1,
> +			VHOST_ACCESS_RW);
> +
> +	desc_addr2 = vhost_iova_to_vva(dev, vq,
> +			descs[avail_idx + 2].addr,
> +			&len2,
> +			VHOST_ACCESS_RW);
> +
> +	desc_addr3 = vhost_iova_to_vva(dev, vq,
> +			descs[avail_idx + 3].addr,
> +			&len3,
> +			VHOST_ACCESS_RW);


How can you guarantee that len3 is zero after this?

Thanks


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast enqueue function
  2019-07-11  8:35   ` Jason Wang
@ 2019-07-11  9:37     ` Liu, Yong
  2019-07-11  9:44       ` Jason Wang
  0 siblings, 1 reply; 25+ messages in thread
From: Liu, Yong @ 2019-07-11  9:37 UTC (permalink / raw)
  To: Jason Wang, Bie, Tiwei, maxime.coquelin, dev



> -----Original Message-----
> From: Jason Wang [mailto:jasowang@redhat.com]
> Sent: Thursday, July 11, 2019 4:35 PM
> To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>;
> maxime.coquelin@redhat.com; dev@dpdk.org
> Subject: Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast enqueue
> function
> 
> 
> On 2019/7/9 上午1:13, Marvin Liu wrote:
> > In fast enqueue function, will first check whether descriptors are
> > cache aligned. Fast enqueue function will check prerequisites in the
> > beginning. Fast enqueue function do not support chained mbufs, normal
> > function will handle that.
> >
> > Signed-off-by: Marvin Liu<yong.liu@intel.com>
> >
> > +	len = descs[avail_idx].len;
> > +	len1 = descs[avail_idx + 1].len;
> > +	len2 = descs[avail_idx + 2].len;
> > +	len3 = descs[avail_idx + 3].len;
> > +
> > +	if (unlikely((pkts[0]->pkt_len > (len - buf_offset)) |
> > +		(pkts[1]->pkt_len > (len1 - buf_offset)) |
> > +		(pkts[2]->pkt_len > (len2 - buf_offset)) |
> > +		(pkts[3]->pkt_len > (len3 - buf_offset))))
> > +		return -1;
> > +
> > +	desc_addr = vhost_iova_to_vva(dev, vq,
> > +			descs[avail_idx].addr,
> > +			&len,
> > +			VHOST_ACCESS_RW);
> > +
> > +	desc_addr1 = vhost_iova_to_vva(dev, vq,
> > +			descs[avail_idx + 1].addr,
> > +			&len1,
> > +			VHOST_ACCESS_RW);
> > +
> > +	desc_addr2 = vhost_iova_to_vva(dev, vq,
> > +			descs[avail_idx + 2].addr,
> > +			&len2,
> > +			VHOST_ACCESS_RW);
> > +
> > +	desc_addr3 = vhost_iova_to_vva(dev, vq,
> > +			descs[avail_idx + 3].addr,
> > +			&len3,
> > +			VHOST_ACCESS_RW);
> 
> 
> How can you guarantee that len3 is zero after this?
> 

Jason,
Here just guarantee host mapped length of desc is same as value in desc.
If value of len matched, data can be directly copied. 

If anything wrong in address conversion, value of len will be mismatched. 
This case will be handled by normal path.

Thanks,
Marvin

> Thanks


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast enqueue function
  2019-07-11  9:37     ` Liu, Yong
@ 2019-07-11  9:44       ` Jason Wang
  0 siblings, 0 replies; 25+ messages in thread
From: Jason Wang @ 2019-07-11  9:44 UTC (permalink / raw)
  To: Liu, Yong, Bie, Tiwei, maxime.coquelin, dev


On 2019/7/11 下午5:37, Liu, Yong wrote:
>
>> -----Original Message-----
>> From: Jason Wang [mailto:jasowang@redhat.com]
>> Sent: Thursday, July 11, 2019 4:35 PM
>> To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>;
>> maxime.coquelin@redhat.com; dev@dpdk.org
>> Subject: Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast enqueue
>> function
>>
>>
>> On 2019/7/9 上午1:13, Marvin Liu wrote:
>>> In fast enqueue function, will first check whether descriptors are
>>> cache aligned. Fast enqueue function will check prerequisites in the
>>> beginning. Fast enqueue function do not support chained mbufs, normal
>>> function will handle that.
>>>
>>> Signed-off-by: Marvin Liu<yong.liu@intel.com>
>>>
>>> +	len = descs[avail_idx].len;
>>> +	len1 = descs[avail_idx + 1].len;
>>> +	len2 = descs[avail_idx + 2].len;
>>> +	len3 = descs[avail_idx + 3].len;
>>> +
>>> +	if (unlikely((pkts[0]->pkt_len > (len - buf_offset)) |
>>> +		(pkts[1]->pkt_len > (len1 - buf_offset)) |
>>> +		(pkts[2]->pkt_len > (len2 - buf_offset)) |
>>> +		(pkts[3]->pkt_len > (len3 - buf_offset))))
>>> +		return -1;
>>> +
>>> +	desc_addr = vhost_iova_to_vva(dev, vq,
>>> +			descs[avail_idx].addr,
>>> +			&len,
>>> +			VHOST_ACCESS_RW);
>>> +
>>> +	desc_addr1 = vhost_iova_to_vva(dev, vq,
>>> +			descs[avail_idx + 1].addr,
>>> +			&len1,
>>> +			VHOST_ACCESS_RW);
>>> +
>>> +	desc_addr2 = vhost_iova_to_vva(dev, vq,
>>> +			descs[avail_idx + 2].addr,
>>> +			&len2,
>>> +			VHOST_ACCESS_RW);
>>> +
>>> +	desc_addr3 = vhost_iova_to_vva(dev, vq,
>>> +			descs[avail_idx + 3].addr,
>>> +			&len3,
>>> +			VHOST_ACCESS_RW);
>>
>> How can you guarantee that len3 is zero after this?
>>
> Jason,
> Here just guarantee host mapped length of desc is same as value in desc.
> If value of len matched, data can be directly copied.
>
> If anything wrong in address conversion, value of len will be mismatched.
> This case will be handled by normal path.
>
> Thanks,
> Marvin


Right, I miss-read the code.

Thanks


>
>> Thanks

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast enqueue function
  2019-07-11  4:11       ` Jason Wang
@ 2019-07-11  9:49         ` Liu, Yong
  2019-07-11  9:54           ` Jason Wang
  0 siblings, 1 reply; 25+ messages in thread
From: Liu, Yong @ 2019-07-11  9:49 UTC (permalink / raw)
  To: Jason Wang, Bie, Tiwei, maxime.coquelin, dev



> -----Original Message-----
> From: Jason Wang [mailto:jasowang@redhat.com]
> Sent: Thursday, July 11, 2019 12:11 PM
> To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>;
> maxime.coquelin@redhat.com; dev@dpdk.org
> Subject: Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast enqueue
> function
> 
> 
> On 2019/7/10 下午3:30, Liu, Yong wrote:
> >
> >> -----Original Message-----
> >> From: Jason Wang [mailto:jasowang@redhat.com]
> >> Sent: Wednesday, July 10, 2019 12:28 PM
> >> To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>;
> >> maxime.coquelin@redhat.com; dev@dpdk.org
> >> Subject: Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast
> enqueue
> >> function
> >>
> >>
> >> On 2019/7/9 上午1:13, Marvin Liu wrote:
> >>> In fast enqueue function, will first check whether descriptors are
> >>> cache aligned. Fast enqueue function will check prerequisites in the
> >>> beginning. Fast enqueue function do not support chained mbufs, normal
> >>> function will handle that.
> >>>
> >>> Signed-off-by: Marvin Liu <yong.liu@intel.com>
> >> Any reason for not letting compiler to unroll the loops?
> >>
> > Hi Jason,
> > I'm not sure about how much compiler can help on unrolling loops as it
> can't know how much loops will create in one call.
> > After force not using unroll-loop optimization by "-fno-unroll-loops",
> virtio_dev_rx_packed function size remained the same.
> > So look like gcc unroll-loop optimization do not help here.
> 
> 
> I meant something like "pragma GCC unroll N" just before the loop you
> want unrolled.
> 
> Thanks
> 

Hi Jason,
Just tired with gcc8.3.0 and master code, only 0.1Mpps performance gain with "#pragma GCC unroll". 
I think this compiler pragma is not helpful in the big loop which contained so much functions. 

Thanks,
Marvin

> 
> >
> > And fast enqueue function not only did unroll loop, it also checked cache
> alignment which can help performance in another side.
> >
> > Regards,
> > Marvin
> >
> >> Thanks
> >>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast enqueue function
  2019-07-11  9:49         ` Liu, Yong
@ 2019-07-11  9:54           ` Jason Wang
  2019-08-13  9:02             ` Liu, Yong
  0 siblings, 1 reply; 25+ messages in thread
From: Jason Wang @ 2019-07-11  9:54 UTC (permalink / raw)
  To: Liu, Yong, Bie, Tiwei, maxime.coquelin, dev


On 2019/7/11 下午5:49, Liu, Yong wrote:
>
>> -----Original Message-----
>> From: Jason Wang [mailto:jasowang@redhat.com]
>> Sent: Thursday, July 11, 2019 12:11 PM
>> To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>;
>> maxime.coquelin@redhat.com; dev@dpdk.org
>> Subject: Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast enqueue
>> function
>>
>>
>> On 2019/7/10 下午3:30, Liu, Yong wrote:
>>>> -----Original Message-----
>>>> From: Jason Wang [mailto:jasowang@redhat.com]
>>>> Sent: Wednesday, July 10, 2019 12:28 PM
>>>> To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>;
>>>> maxime.coquelin@redhat.com; dev@dpdk.org
>>>> Subject: Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast
>> enqueue
>>>> function
>>>>
>>>>
>>>> On 2019/7/9 上午1:13, Marvin Liu wrote:
>>>>> In fast enqueue function, will first check whether descriptors are
>>>>> cache aligned. Fast enqueue function will check prerequisites in the
>>>>> beginning. Fast enqueue function do not support chained mbufs, normal
>>>>> function will handle that.
>>>>>
>>>>> Signed-off-by: Marvin Liu <yong.liu@intel.com>
>>>> Any reason for not letting compiler to unroll the loops?
>>>>
>>> Hi Jason,
>>> I'm not sure about how much compiler can help on unrolling loops as it
>> can't know how much loops will create in one call.
>>> After force not using unroll-loop optimization by "-fno-unroll-loops",
>> virtio_dev_rx_packed function size remained the same.
>>> So look like gcc unroll-loop optimization do not help here.
>>
>> I meant something like "pragma GCC unroll N" just before the loop you
>> want unrolled.
>>
>> Thanks
>>
> Hi Jason,
> Just tired with gcc8.3.0 and master code, only 0.1Mpps performance gain with "#pragma GCC unroll".
> I think this compiler pragma is not helpful in the big loop which contained so much functions.
>
> Thanks,
> Marvin


Yes, it probably need some trick e.g break the big loop into small ones. 
What I want do here is unroll the loop based on 
PACKED_DESC_PER_CACHELINE instead of a hard-coded 4.

Thanks


>>> And fast enqueue function not only did unroll loop, it also checked cache
>> alignment which can help performance in another side.
>>> Regards,
>>> Marvin
>>>
>>>> Thanks
>>>>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast enqueue function
  2019-07-11  9:54           ` Jason Wang
@ 2019-08-13  9:02             ` Liu, Yong
  0 siblings, 0 replies; 25+ messages in thread
From: Liu, Yong @ 2019-08-13  9:02 UTC (permalink / raw)
  To: Jason Wang; +Cc: dev, Bie, Tiwei, maxime.coquelin

Hi Jason,
Unrolled option effect is highly dependent on compilers. Just tried some compilers around my side.
Vhost en-queue/de-queue path is separated into small parts which can assure compilers can do unroll optimization.
Since only GCC8 support unroll program, only GCC8 added "#pragma GCC unroll".

GCC8 and Clang shown much less performance gap than ICC and elder GCC. 
Now we have one better performance with fixed batch version code and another less performance with auto unrolled version.
What's your option on the choice? Thanks in advance. 

|----------------|---------------|-------------|------|
| Compiler       | Auto unrolled | Fixed batch | Gap  |
|----------------|---------------|-------------|------|
| Clang6.0.0     | 13.1M         | 13.5M       | 0.4M |
|----------------|---------------|-------------|------|
| GCC 8.3.0      | 13.9M         | 14.4M       | 0.5M |
|----------------|---------------|-------------|------|
| GCC 7.4.0      | 12.6M         | 13.5M       | 0.9M |
|----------------|---------------|-------------|------|
| ICC 19.0.4.243 | 11.0M         | 12.3M       | 1.3M |
|----------------|---------------|-------------|------|

Regards,
Marvin

> -----Original Message-----
> From: Jason Wang [mailto:jasowang@redhat.com]
> Sent: Thursday, July 11, 2019 5:55 PM
> To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>;
> maxime.coquelin@redhat.com; dev@dpdk.org
> Subject: Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast
> enqueue function
> 
> 
> On 2019/7/11 下午5:49, Liu, Yong wrote:
> >
> >> -----Original Message-----
> >> From: Jason Wang [mailto:jasowang@redhat.com]
> >> Sent: Thursday, July 11, 2019 12:11 PM
> >> To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>;
> >> maxime.coquelin@redhat.com; dev@dpdk.org
> >> Subject: Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast
> enqueue
> >> function
> >>
> >>
> >> On 2019/7/10 下午3:30, Liu, Yong wrote:
> >>>> -----Original Message-----
> >>>> From: Jason Wang [mailto:jasowang@redhat.com]
> >>>> Sent: Wednesday, July 10, 2019 12:28 PM
> >>>> To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>;
> >>>> maxime.coquelin@redhat.com; dev@dpdk.org
> >>>> Subject: Re: [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast
> >> enqueue
> >>>> function
> >>>>
> >>>>
> >>>> On 2019/7/9 上午1:13, Marvin Liu wrote:
> >>>>> In fast enqueue function, will first check whether descriptors are
> >>>>> cache aligned. Fast enqueue function will check prerequisites in the
> >>>>> beginning. Fast enqueue function do not support chained mbufs, normal
> >>>>> function will handle that.
> >>>>>
> >>>>> Signed-off-by: Marvin Liu <yong.liu@intel.com>
> >>>> Any reason for not letting compiler to unroll the loops?
> >>>>
> >>> Hi Jason,
> >>> I'm not sure about how much compiler can help on unrolling loops as it
> >> can't know how much loops will create in one call.
> >>> After force not using unroll-loop optimization by "-fno-unroll-loops",
> >> virtio_dev_rx_packed function size remained the same.
> >>> So look like gcc unroll-loop optimization do not help here.
> >>
> >> I meant something like "pragma GCC unroll N" just before the loop you
> >> want unrolled.
> >>
> >> Thanks
> >>
> > Hi Jason,
> > Just tired with gcc8.3.0 and master code, only 0.1Mpps performance gain
> with "#pragma GCC unroll".
> > I think this compiler pragma is not helpful in the big loop which
> contained so much functions.
> >
> > Thanks,
> > Marvin
> 
> 
> Yes, it probably need some trick e.g break the big loop into small ones.
> What I want do here is unroll the loop based on
> PACKED_DESC_PER_CACHELINE instead of a hard-coded 4.
> 
> Thanks
> 
> 
> >>> And fast enqueue function not only did unroll loop, it also checked
> cache
> >> alignment which can help performance in another side.
> >>> Regards,
> >>> Marvin
> >>>
> >>>> Thanks
> >>>>

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, back to index

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-08 17:13 [dpdk-dev] [RFC] vhost packed ring performance optimization Marvin Liu
2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 01/13] add vhost normal enqueue function Marvin Liu
2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 02/13] add vhost packed ring fast " Marvin Liu
     [not found]   ` <CGME20190708113801eucas1p25d89717d8b298790326077852c9933c8@eucas1p2.samsung.com>
2019-07-08 11:37     ` Ilya Maximets
2019-07-09  1:15       ` Liu, Yong
2019-07-10  4:28   ` Jason Wang
2019-07-10  7:30     ` Liu, Yong
2019-07-11  4:11       ` Jason Wang
2019-07-11  9:49         ` Liu, Yong
2019-07-11  9:54           ` Jason Wang
2019-08-13  9:02             ` Liu, Yong
2019-07-11  8:35   ` Jason Wang
2019-07-11  9:37     ` Liu, Yong
2019-07-11  9:44       ` Jason Wang
2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 03/13] add vhost packed ring normal dequeue function Marvin Liu
2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 04/13] add vhost packed ring fast " Marvin Liu
2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 05/13] add enqueue shadow used descs update and flush functions Marvin Liu
2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 06/13] add vhost fast enqueue flush function Marvin Liu
2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 07/13] add vhost dequeue shadow descs update function Marvin Liu
2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 08/13] add vhost fast dequeue flush function Marvin Liu
2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 09/13] replace vhost enqueue packed ring function Marvin Liu
2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 10/13] add vhost fast zero copy dequeue " Marvin Liu
2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 11/13] replace vhost " Marvin Liu
2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 12/13] support inorder in vhost dequeue path Marvin Liu
2019-07-08 17:13 ` [dpdk-dev] [RFC PATCH 13/13] remove useless vhost functions Marvin Liu

DPDK patches and discussions

Archives are clonable:
	git clone --mirror http://inbox.dpdk.org/dev/0 dev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 dev dev/ http://inbox.dpdk.org/dev \
		dev@dpdk.org
	public-inbox-index dev


Newsgroup available over NNTP:
	nntp://inbox.dpdk.org/inbox.dpdk.dev


AGPL code for this site: git clone https://public-inbox.org/ public-inbox