DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [RFC PATCH 0/2] virtio: one way barrier for packed vring flags
@ 2019-08-27  8:19 Joyce Kong
  2019-08-27  8:19 ` [dpdk-dev] [RFC PATCH 1/2] virtio: one way barrier for packed vring desc avail flags Joyce Kong
                   ` (10 more replies)
  0 siblings, 11 replies; 30+ messages in thread
From: Joyce Kong @ 2019-08-27  8:19 UTC (permalink / raw)
  To: dev; +Cc: nd, maxime.coquelin, thomas, honnappa.nagarahalli, gavin.hu

This patch set replaces the two-way barriers with C11 one-way barriers
for packed vring flags, when the frontend and backend are implemented
in software.

By doing vhost-user + virtio-user case benchmarking, 9% performance gain
in the RFC2544 test was measured on Thunderx2 platform.[1]

[1]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html
   PVP test with virtio 1.1 mergeable path

Joyce Kong (2):
  virtio: one way barrier for packed vring desc avail flags
  virtio: one way barrier for packed vring desc used flags

 drivers/net/virtio/virtio_rxtx.c                 | 38 ++++++++++++++++++------
 drivers/net/virtio/virtio_user/virtio_user_dev.c | 10 +++++--
 drivers/net/virtio/virtqueue.h                   |  7 ++++-
 lib/librte_vhost/vhost.h                         |  2 +-
 lib/librte_vhost/virtio_net.c                    | 16 +++++-----
 5 files changed, 50 insertions(+), 23 deletions(-)

-- 
2.7.4


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [RFC PATCH 1/2] virtio: one way barrier for packed vring desc avail flags
  2019-08-27  8:19 [dpdk-dev] [RFC PATCH 0/2] virtio: one way barrier for packed vring flags Joyce Kong
@ 2019-08-27  8:19 ` Joyce Kong
  2019-08-27  8:19 ` [dpdk-dev] [RFC PATCH 2/2] virtio: one way barrier for packed vring desc used flags Joyce Kong
                   ` (9 subsequent siblings)
  10 siblings, 0 replies; 30+ messages in thread
From: Joyce Kong @ 2019-08-27  8:19 UTC (permalink / raw)
  To: dev; +Cc: nd, maxime.coquelin, thomas, honnappa.nagarahalli, gavin.hu

In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend
and backend are assumed to be implemented in software, that is they can
run on identical CPUs in an SMP configuration. Thus a weak form of memory
barriers like rte_smp_r/wmb, other than rte_cio_r/wmb, is sufficient for
this case(vq->hw->weak_barriers == 1) and yields better performance.
For the above case, this patch helps yielding even better performance
by replacing the two-way barriers with C11 one-way barriers.

Meanwhile, a read barrier is required to ensure ordering between
descriptor's flags and content reads[1]. With C11, load-acquire can
enforce the ordering instead of rmb barrier.

[1]https://patchwork.dpdk.org/patch/49109/

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
---
 drivers/net/virtio/virtio_rxtx.c                 | 26 ++++++++++++++++++------
 drivers/net/virtio/virtio_user/virtio_user_dev.c |  6 +++++-
 lib/librte_vhost/vhost.h                         |  2 +-
 lib/librte_vhost/virtio_net.c                    | 11 +++++-----
 4 files changed, 31 insertions(+), 14 deletions(-)

diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index 27ead19..2a2153c 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -456,8 +456,14 @@ virtqueue_enqueue_recv_refill_packed(struct virtqueue *vq,
 		vq->vq_desc_head_idx = dxp->next;
 		if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
 			vq->vq_desc_tail_idx = vq->vq_desc_head_idx;
-		virtio_wmb(hw->weak_barriers);
-		start_dp[idx].flags = flags;
+
+		if (hw->weak_barriers)
+			__atomic_store_n(&start_dp[idx].flags, flags,
+					 __ATOMIC_RELEASE);
+		else {
+			rte_cio_wmb();
+			start_dp[idx].flags = flags;
+		}
 		if (++vq->vq_avail_idx >= vq->vq_nentries) {
 			vq->vq_avail_idx -= vq->vq_nentries;
 			vq->vq_packed.cached_flags ^=
@@ -671,8 +677,12 @@ virtqueue_enqueue_xmit_packed_fast(struct virtnet_tx *txvq,
 			vq->vq_desc_tail_idx = VQ_RING_DESC_CHAIN_END;
 	}
 
-	virtio_wmb(vq->hw->weak_barriers);
-	dp->flags = flags;
+	if (vq->hw->weak_barriers)
+		__atomic_store_n(&dp->flags, flags, __ATOMIC_RELEASE);
+	else {
+		rte_cio_wmb();
+		dp->flags = flags;
+	}
 }
 
 static inline void
@@ -763,8 +773,12 @@ virtqueue_enqueue_xmit_packed(struct virtnet_tx *txvq, struct rte_mbuf *cookie,
 			vq->vq_desc_tail_idx = VQ_RING_DESC_CHAIN_END;
 	}
 
-	virtio_wmb(vq->hw->weak_barriers);
-	head_dp->flags = head_flags;
+	if (vq->hw->weak_barriers)
+		__atomic_store_n(&head_dp->flags, head_flags, __ATOMIC_RELEASE);
+	else {
+		rte_cio_wmb();
+		head_dp->flags = head_flags;
+	}
 }
 
 static inline void
diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c
index fab87eb..7911c39 100644
--- a/drivers/net/virtio/virtio_user/virtio_user_dev.c
+++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c
@@ -624,7 +624,7 @@ virtio_user_handle_ctrl_msg(struct virtio_user_dev *dev, struct vring *vring,
 static inline int
 desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter)
 {
-	uint16_t flags = desc->flags;
+	uint16_t flags = __atomic_load_n(&desc->flags, __ATOMIC_ACQUIRE);
 
 	return wrap_counter == !!(flags & VRING_PACKED_DESC_F_AVAIL) &&
 		wrap_counter != !!(flags & VRING_PACKED_DESC_F_USED);
@@ -684,6 +684,10 @@ virtio_user_handle_cq_packed(struct virtio_user_dev *dev, uint16_t queue_idx)
 	struct vring_packed *vring = &dev->packed_vrings[queue_idx];
 	uint16_t n_descs, flags;
 
+	/* Perform a load-acquire barrier in desc_is_avail to
+	 * enforce the ordering between desc flags and desc
+	 * content.
+	 */
 	while (desc_is_avail(&vring->desc[vq->used_idx],
 			     vq->used_wrap_counter)) {
 
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 884befa..d294ed1 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -344,7 +344,7 @@ vq_is_packed(struct virtio_net *dev)
 static inline bool
 desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter)
 {
-	uint16_t flags = *((volatile uint16_t *) &desc->flags);
+	uint16_t flags = __atomic_load_n(&desc->flags, __ATOMIC_ACQUIRE);
 
 	return wrap_counter == !!(flags & VRING_DESC_F_AVAIL) &&
 		wrap_counter != !!(flags & VRING_DESC_F_USED);
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 5b85b83..e7463ff 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -503,14 +503,13 @@ fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	if (avail_idx < vq->last_avail_idx)
 		wrap_counter ^= 1;
 
-	if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)))
-		return -1;
-
 	/*
-	 * The ordering between desc flags and desc
-	 * content reads need to be enforced.
+	 * Perform a load-acquire barrier in desc_is_avail to
+	 * enforce the ordering between desc flags and desc
+	 * content.
 	 */
-	rte_smp_rmb();
+	if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)))
+		return -1;
 
 	*desc_count = 0;
 	*len = 0;
-- 
2.7.4


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [RFC PATCH 2/2] virtio: one way barrier for packed vring desc used flags
  2019-08-27  8:19 [dpdk-dev] [RFC PATCH 0/2] virtio: one way barrier for packed vring flags Joyce Kong
  2019-08-27  8:19 ` [dpdk-dev] [RFC PATCH 1/2] virtio: one way barrier for packed vring desc avail flags Joyce Kong
@ 2019-08-27  8:19 ` Joyce Kong
  2019-09-06 11:34 ` [dpdk-dev] [PATCH v2 0/2] virtio: one way barrier for packed vring flags Joyce Kong
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 30+ messages in thread
From: Joyce Kong @ 2019-08-27  8:19 UTC (permalink / raw)
  To: dev; +Cc: nd, maxime.coquelin, thomas, honnappa.nagarahalli, gavin.hu

In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the
frontend and backend are assumed to be implemented in software,
that is they can run on identical CPUs in an SMP configuration.
Thus a weak form of memory barriers like rte_smp_r/wmb, other
than rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers
== 1) and yields better performance.
For the above case, this patch helps yielding even better performance
by replacing the two-way barriers with C11 one-way barriers.

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
---
 drivers/net/virtio/virtio_rxtx.c                 | 12 +++++++++---
 drivers/net/virtio/virtio_user/virtio_user_dev.c |  4 ++--
 drivers/net/virtio/virtqueue.h                   |  7 ++++++-
 lib/librte_vhost/virtio_net.c                    |  5 ++---
 4 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index 2a2153c..1d818c8 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -122,9 +122,11 @@ virtqueue_dequeue_burst_rx_packed(struct virtqueue *vq,
 
 	for (i = 0; i < num; i++) {
 		used_idx = vq->vq_used_cons_idx;
+		/* desc_is_used has a load-acquire or rte_cio_rmb inside
+		 * and wait for used desc in virtqueue.
+		 */
 		if (!desc_is_used(&desc[used_idx], vq))
 			return i;
-		virtio_rmb(vq->hw->weak_barriers);
 		len[i] = desc[used_idx].len;
 		id = desc[used_idx].id;
 		cookie = (struct rte_mbuf *)vq->vq_descx[id].cookie;
@@ -233,8 +235,10 @@ virtio_xmit_cleanup_inorder_packed(struct virtqueue *vq, int num)
 	struct vq_desc_extra *dxp;
 
 	used_idx = vq->vq_used_cons_idx;
+	/* desc_is_used has a load-acquire or rte_cio_rmb inside
+	 * and wait for used desc in virtqueue.
+	 */
 	while (num > 0 && desc_is_used(&desc[used_idx], vq)) {
-		virtio_rmb(vq->hw->weak_barriers);
 		id = desc[used_idx].id;
 		do {
 			curr_id = used_idx;
@@ -265,8 +269,10 @@ virtio_xmit_cleanup_normal_packed(struct virtqueue *vq, int num)
 	struct vq_desc_extra *dxp;
 
 	used_idx = vq->vq_used_cons_idx;
+	/* desc_is_used has a load-acquire or rte_cio_rmb inside
+	 * and wait for used desc in virtqueue.
+	 */
 	while (num-- && desc_is_used(&desc[used_idx], vq)) {
-		virtio_rmb(vq->hw->weak_barriers);
 		id = desc[used_idx].id;
 		dxp = &vq->vq_descx[id];
 		vq->vq_used_cons_idx += dxp->ndescs;
diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c
index 7911c39..1c575d0 100644
--- a/drivers/net/virtio/virtio_user/virtio_user_dev.c
+++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c
@@ -698,8 +698,8 @@ virtio_user_handle_cq_packed(struct virtio_user_dev *dev, uint16_t queue_idx)
 		if (vq->used_wrap_counter)
 			flags |= VRING_PACKED_DESC_F_AVAIL_USED;
 
-		rte_smp_wmb();
-		vring->desc[vq->used_idx].flags = flags;
+		__atomic_store_n(&vring->desc[vq->used_idx].flags, flags,
+				 __ATOMIC_RELEASE);
 
 		vq->used_idx += n_descs;
 		if (vq->used_idx >= dev->queue_size) {
diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
index c6dd4a3..ee6fcbb 100644
--- a/drivers/net/virtio/virtqueue.h
+++ b/drivers/net/virtio/virtqueue.h
@@ -286,7 +286,12 @@ desc_is_used(struct vring_packed_desc *desc, struct virtqueue *vq)
 {
 	uint16_t used, avail, flags;
 
-	flags = desc->flags;
+	if (vq->hw->weak_barriers)
+		flags = __atomic_load_n(&desc->flags, __ATOMIC_ACQUIRE);
+	else {
+		flags = desc->flags;
+		rte_cio_rmb();
+	}
 	used = !!(flags & VRING_PACKED_DESC_F_USED);
 	avail = !!(flags & VRING_PACKED_DESC_F_AVAIL);
 
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index e7463ff..241d467 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -110,8 +110,6 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
 			used_idx -= vq->size;
 	}
 
-	rte_smp_wmb();
-
 	for (i = 0; i < vq->shadow_used_idx; i++) {
 		uint16_t flags;
 
@@ -147,7 +145,8 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
 		}
 	}
 
-	vq->desc_packed[head_idx].flags = head_flags;
+	__atomic_store_n(&vq->desc_packed[head_idx].flags, head_flags,
+			 __ATOMIC_RELEASE);
 
 	vhost_log_cache_used_vring(dev, vq,
 				head_idx *
-- 
2.7.4


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v2 0/2] virtio: one way barrier for packed vring flags
  2019-08-27  8:19 [dpdk-dev] [RFC PATCH 0/2] virtio: one way barrier for packed vring flags Joyce Kong
  2019-08-27  8:19 ` [dpdk-dev] [RFC PATCH 1/2] virtio: one way barrier for packed vring desc avail flags Joyce Kong
  2019-08-27  8:19 ` [dpdk-dev] [RFC PATCH 2/2] virtio: one way barrier for packed vring desc used flags Joyce Kong
@ 2019-09-06 11:34 ` Joyce Kong
  2019-09-06 11:34 ` [dpdk-dev] [PATCH v2 1/2] virtio: one way barrier for packed vring desc avail flags Joyce Kong
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 30+ messages in thread
From: Joyce Kong @ 2019-09-06 11:34 UTC (permalink / raw)
  To: dev
  Cc: nd, maxime.coquelin, tiwei.bie, zhihong.wang, amorenoz,
	xiao.w.wang, yong.liu, jfreimann, honnappa.nagarahalli, gavin.hu

This patch set replaces the two-way barriers with C11 one-way barriers
for packed vring flags, when the frontend and backend are implemented
in software.

By doing vhost-user + virtio-user case benchmarking, 9% performance gain
in the RFC2544 test was measured on Thunderx2 platform.[1]

[1]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html
   PVP test with virtio 1.1 mergeable path

v2: convert RFC to patch

Joyce Kong (2):
  virtio: one way barrier for packed vring desc avail flags
  virtio: one way barrier for packed vring desc used flags

 drivers/net/virtio/virtio_rxtx.c                 | 38 ++++++++++++++++++------
 drivers/net/virtio/virtio_user/virtio_user_dev.c | 10 +++++--
 drivers/net/virtio/virtqueue.h                   |  7 ++++-
 lib/librte_vhost/vhost.h                         |  2 +-
 lib/librte_vhost/virtio_net.c                    | 16 +++++-----
 5 files changed, 50 insertions(+), 23 deletions(-)

-- 
2.7.4


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v2 1/2] virtio: one way barrier for packed vring desc avail flags
  2019-08-27  8:19 [dpdk-dev] [RFC PATCH 0/2] virtio: one way barrier for packed vring flags Joyce Kong
                   ` (2 preceding siblings ...)
  2019-09-06 11:34 ` [dpdk-dev] [PATCH v2 0/2] virtio: one way barrier for packed vring flags Joyce Kong
@ 2019-09-06 11:34 ` Joyce Kong
  2019-09-06 16:01   ` Maxime Coquelin
  2019-09-06 11:34 ` [dpdk-dev] [PATCH v2 2/2] virtio: one way barrier for packed vring desc used flags Joyce Kong
                   ` (6 subsequent siblings)
  10 siblings, 1 reply; 30+ messages in thread
From: Joyce Kong @ 2019-09-06 11:34 UTC (permalink / raw)
  To: dev
  Cc: nd, maxime.coquelin, tiwei.bie, zhihong.wang, amorenoz,
	xiao.w.wang, yong.liu, jfreimann, honnappa.nagarahalli, gavin.hu

In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend
and backend are assumed to be implemented in software, that is they can
run on identical CPUs in an SMP configuration. Thus a weak form of memory
barriers like rte_smp_r/wmb, other than rte_cio_r/wmb, is sufficient for
this case(vq->hw->weak_barriers == 1) and yields better performance.
For the above case, this patch helps yielding even better performance
by replacing the two-way barriers with C11 one-way barriers.

Meanwhile, a read barrier is required to ensure ordering between
descriptor's flags and content reads[1]. With C11, load-acquire can
enforce the ordering instead of rmb barrier.

[1]https://patchwork.dpdk.org/patch/49109/

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
Reviewed-by: Phil Yang <phil.yang@arm.com>
---
 drivers/net/virtio/virtio_rxtx.c                 | 26 ++++++++++++++++++------
 drivers/net/virtio/virtio_user/virtio_user_dev.c |  6 +++++-
 lib/librte_vhost/vhost.h                         |  2 +-
 lib/librte_vhost/virtio_net.c                    | 11 +++++-----
 4 files changed, 31 insertions(+), 14 deletions(-)

diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index 27ead19..2a2153c 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -456,8 +456,14 @@ virtqueue_enqueue_recv_refill_packed(struct virtqueue *vq,
 		vq->vq_desc_head_idx = dxp->next;
 		if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
 			vq->vq_desc_tail_idx = vq->vq_desc_head_idx;
-		virtio_wmb(hw->weak_barriers);
-		start_dp[idx].flags = flags;
+
+		if (hw->weak_barriers)
+			__atomic_store_n(&start_dp[idx].flags, flags,
+					 __ATOMIC_RELEASE);
+		else {
+			rte_cio_wmb();
+			start_dp[idx].flags = flags;
+		}
 		if (++vq->vq_avail_idx >= vq->vq_nentries) {
 			vq->vq_avail_idx -= vq->vq_nentries;
 			vq->vq_packed.cached_flags ^=
@@ -671,8 +677,12 @@ virtqueue_enqueue_xmit_packed_fast(struct virtnet_tx *txvq,
 			vq->vq_desc_tail_idx = VQ_RING_DESC_CHAIN_END;
 	}
 
-	virtio_wmb(vq->hw->weak_barriers);
-	dp->flags = flags;
+	if (vq->hw->weak_barriers)
+		__atomic_store_n(&dp->flags, flags, __ATOMIC_RELEASE);
+	else {
+		rte_cio_wmb();
+		dp->flags = flags;
+	}
 }
 
 static inline void
@@ -763,8 +773,12 @@ virtqueue_enqueue_xmit_packed(struct virtnet_tx *txvq, struct rte_mbuf *cookie,
 			vq->vq_desc_tail_idx = VQ_RING_DESC_CHAIN_END;
 	}
 
-	virtio_wmb(vq->hw->weak_barriers);
-	head_dp->flags = head_flags;
+	if (vq->hw->weak_barriers)
+		__atomic_store_n(&head_dp->flags, head_flags, __ATOMIC_RELEASE);
+	else {
+		rte_cio_wmb();
+		head_dp->flags = head_flags;
+	}
 }
 
 static inline void
diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c
index fab87eb..7911c39 100644
--- a/drivers/net/virtio/virtio_user/virtio_user_dev.c
+++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c
@@ -624,7 +624,7 @@ virtio_user_handle_ctrl_msg(struct virtio_user_dev *dev, struct vring *vring,
 static inline int
 desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter)
 {
-	uint16_t flags = desc->flags;
+	uint16_t flags = __atomic_load_n(&desc->flags, __ATOMIC_ACQUIRE);
 
 	return wrap_counter == !!(flags & VRING_PACKED_DESC_F_AVAIL) &&
 		wrap_counter != !!(flags & VRING_PACKED_DESC_F_USED);
@@ -684,6 +684,10 @@ virtio_user_handle_cq_packed(struct virtio_user_dev *dev, uint16_t queue_idx)
 	struct vring_packed *vring = &dev->packed_vrings[queue_idx];
 	uint16_t n_descs, flags;
 
+	/* Perform a load-acquire barrier in desc_is_avail to
+	 * enforce the ordering between desc flags and desc
+	 * content.
+	 */
 	while (desc_is_avail(&vring->desc[vq->used_idx],
 			     vq->used_wrap_counter)) {
 
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 884befa..d294ed1 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -344,7 +344,7 @@ vq_is_packed(struct virtio_net *dev)
 static inline bool
 desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter)
 {
-	uint16_t flags = *((volatile uint16_t *) &desc->flags);
+	uint16_t flags = __atomic_load_n(&desc->flags, __ATOMIC_ACQUIRE);
 
 	return wrap_counter == !!(flags & VRING_DESC_F_AVAIL) &&
 		wrap_counter != !!(flags & VRING_DESC_F_USED);
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 5b85b83..e7463ff 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -503,14 +503,13 @@ fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	if (avail_idx < vq->last_avail_idx)
 		wrap_counter ^= 1;
 
-	if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)))
-		return -1;
-
 	/*
-	 * The ordering between desc flags and desc
-	 * content reads need to be enforced.
+	 * Perform a load-acquire barrier in desc_is_avail to
+	 * enforce the ordering between desc flags and desc
+	 * content.
 	 */
-	rte_smp_rmb();
+	if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)))
+		return -1;
 
 	*desc_count = 0;
 	*len = 0;
-- 
2.7.4


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v2 2/2] virtio: one way barrier for packed vring desc used flags
  2019-08-27  8:19 [dpdk-dev] [RFC PATCH 0/2] virtio: one way barrier for packed vring flags Joyce Kong
                   ` (3 preceding siblings ...)
  2019-09-06 11:34 ` [dpdk-dev] [PATCH v2 1/2] virtio: one way barrier for packed vring desc avail flags Joyce Kong
@ 2019-09-06 11:34 ` Joyce Kong
  2019-09-09  9:14 ` [dpdk-dev] [PATCH v3 0/2] virtio: one way barrier for packed vring flags Joyce Kong
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 30+ messages in thread
From: Joyce Kong @ 2019-09-06 11:34 UTC (permalink / raw)
  To: dev
  Cc: nd, maxime.coquelin, tiwei.bie, zhihong.wang, amorenoz,
	xiao.w.wang, yong.liu, jfreimann, honnappa.nagarahalli, gavin.hu

In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the
frontend and backend are assumed to be implemented in software,
that is they can run on identical CPUs in an SMP configuration.
Thus a weak form of memory barriers like rte_smp_r/wmb, other
than rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers
== 1) and yields better performance.
For the above case, this patch helps yielding even better performance
by replacing the two-way barriers with C11 one-way barriers.

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
Reviewed-by: Phil Yang <phil.yang@arm.com>
---
 drivers/net/virtio/virtio_rxtx.c                 | 12 +++++++++---
 drivers/net/virtio/virtio_user/virtio_user_dev.c |  4 ++--
 drivers/net/virtio/virtqueue.h                   |  7 ++++++-
 lib/librte_vhost/virtio_net.c                    |  5 ++---
 4 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index 2a2153c..1d818c8 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -122,9 +122,11 @@ virtqueue_dequeue_burst_rx_packed(struct virtqueue *vq,
 
 	for (i = 0; i < num; i++) {
 		used_idx = vq->vq_used_cons_idx;
+		/* desc_is_used has a load-acquire or rte_cio_rmb inside
+		 * and wait for used desc in virtqueue.
+		 */
 		if (!desc_is_used(&desc[used_idx], vq))
 			return i;
-		virtio_rmb(vq->hw->weak_barriers);
 		len[i] = desc[used_idx].len;
 		id = desc[used_idx].id;
 		cookie = (struct rte_mbuf *)vq->vq_descx[id].cookie;
@@ -233,8 +235,10 @@ virtio_xmit_cleanup_inorder_packed(struct virtqueue *vq, int num)
 	struct vq_desc_extra *dxp;
 
 	used_idx = vq->vq_used_cons_idx;
+	/* desc_is_used has a load-acquire or rte_cio_rmb inside
+	 * and wait for used desc in virtqueue.
+	 */
 	while (num > 0 && desc_is_used(&desc[used_idx], vq)) {
-		virtio_rmb(vq->hw->weak_barriers);
 		id = desc[used_idx].id;
 		do {
 			curr_id = used_idx;
@@ -265,8 +269,10 @@ virtio_xmit_cleanup_normal_packed(struct virtqueue *vq, int num)
 	struct vq_desc_extra *dxp;
 
 	used_idx = vq->vq_used_cons_idx;
+	/* desc_is_used has a load-acquire or rte_cio_rmb inside
+	 * and wait for used desc in virtqueue.
+	 */
 	while (num-- && desc_is_used(&desc[used_idx], vq)) {
-		virtio_rmb(vq->hw->weak_barriers);
 		id = desc[used_idx].id;
 		dxp = &vq->vq_descx[id];
 		vq->vq_used_cons_idx += dxp->ndescs;
diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c
index 7911c39..1c575d0 100644
--- a/drivers/net/virtio/virtio_user/virtio_user_dev.c
+++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c
@@ -698,8 +698,8 @@ virtio_user_handle_cq_packed(struct virtio_user_dev *dev, uint16_t queue_idx)
 		if (vq->used_wrap_counter)
 			flags |= VRING_PACKED_DESC_F_AVAIL_USED;
 
-		rte_smp_wmb();
-		vring->desc[vq->used_idx].flags = flags;
+		__atomic_store_n(&vring->desc[vq->used_idx].flags, flags,
+				 __ATOMIC_RELEASE);
 
 		vq->used_idx += n_descs;
 		if (vq->used_idx >= dev->queue_size) {
diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
index c6dd4a3..ee6fcbb 100644
--- a/drivers/net/virtio/virtqueue.h
+++ b/drivers/net/virtio/virtqueue.h
@@ -286,7 +286,12 @@ desc_is_used(struct vring_packed_desc *desc, struct virtqueue *vq)
 {
 	uint16_t used, avail, flags;
 
-	flags = desc->flags;
+	if (vq->hw->weak_barriers)
+		flags = __atomic_load_n(&desc->flags, __ATOMIC_ACQUIRE);
+	else {
+		flags = desc->flags;
+		rte_cio_rmb();
+	}
 	used = !!(flags & VRING_PACKED_DESC_F_USED);
 	avail = !!(flags & VRING_PACKED_DESC_F_AVAIL);
 
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index e7463ff..241d467 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -110,8 +110,6 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
 			used_idx -= vq->size;
 	}
 
-	rte_smp_wmb();
-
 	for (i = 0; i < vq->shadow_used_idx; i++) {
 		uint16_t flags;
 
@@ -147,7 +145,8 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
 		}
 	}
 
-	vq->desc_packed[head_idx].flags = head_flags;
+	__atomic_store_n(&vq->desc_packed[head_idx].flags, head_flags,
+			 __ATOMIC_RELEASE);
 
 	vhost_log_cache_used_vring(dev, vq,
 				head_idx *
-- 
2.7.4


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/2] virtio: one way barrier for packed vring desc avail flags
  2019-09-06 11:34 ` [dpdk-dev] [PATCH v2 1/2] virtio: one way barrier for packed vring desc avail flags Joyce Kong
@ 2019-09-06 16:01   ` Maxime Coquelin
  2019-09-09  9:24     ` Joyce Kong (Arm Technology China)
  0 siblings, 1 reply; 30+ messages in thread
From: Maxime Coquelin @ 2019-09-06 16:01 UTC (permalink / raw)
  To: Joyce Kong, dev
  Cc: nd, tiwei.bie, zhihong.wang, amorenoz, xiao.w.wang, yong.liu,
	jfreimann, honnappa.nagarahalli, gavin.hu

Hi Joyce,

On 9/6/19 1:34 PM, Joyce Kong wrote:
> In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend
> and backend are assumed to be implemented in software, that is they can
> run on identical CPUs in an SMP configuration. Thus a weak form of memory
> barriers like rte_smp_r/wmb, other than rte_cio_r/wmb, is sufficient for
> this case(vq->hw->weak_barriers == 1) and yields better performance.
> For the above case, this patch helps yielding even better performance
> by replacing the two-way barriers with C11 one-way barriers.
> 
> Meanwhile, a read barrier is required to ensure ordering between
> descriptor's flags and content reads[1]. With C11, load-acquire can
> enforce the ordering instead of rmb barrier.
> 
> [1]https://patchwork.dpdk.org/patch/49109/
> 
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> Reviewed-by: Phil Yang <phil.yang@arm.com>
> ---
>  drivers/net/virtio/virtio_rxtx.c                 | 26 ++++++++++++++++++------
>  drivers/net/virtio/virtio_user/virtio_user_dev.c |  6 +++++-
>  lib/librte_vhost/vhost.h                         |  2 +-
>  lib/librte_vhost/virtio_net.c                    | 11 +++++-----
>  4 files changed, 31 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
> index 27ead19..2a2153c 100644
> --- a/drivers/net/virtio/virtio_rxtx.c
> +++ b/drivers/net/virtio/virtio_rxtx.c
> @@ -456,8 +456,14 @@ virtqueue_enqueue_recv_refill_packed(struct virtqueue *vq,
>  		vq->vq_desc_head_idx = dxp->next;
>  		if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
>  			vq->vq_desc_tail_idx = vq->vq_desc_head_idx;
> -		virtio_wmb(hw->weak_barriers);
> -		start_dp[idx].flags = flags;
> +
> +		if (hw->weak_barriers)
> +			__atomic_store_n(&start_dp[idx].flags, flags,
> +					 __ATOMIC_RELEASE);
> +		else {
> +			rte_cio_wmb();
> +			start_dp[idx].flags = flags;
> +		}
It looks good to me.
I just wonder whether it would be cleaner to put that in an inline
function:

static inline void
virtqueue_store_flags_packed()

Same for the fetch.


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v3 0/2] virtio: one way barrier for packed vring flags
  2019-08-27  8:19 [dpdk-dev] [RFC PATCH 0/2] virtio: one way barrier for packed vring flags Joyce Kong
                   ` (4 preceding siblings ...)
  2019-09-06 11:34 ` [dpdk-dev] [PATCH v2 2/2] virtio: one way barrier for packed vring desc used flags Joyce Kong
@ 2019-09-09  9:14 ` Joyce Kong
  2019-09-09  9:14 ` [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring desc avail flags Joyce Kong
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 30+ messages in thread
From: Joyce Kong @ 2019-09-09  9:14 UTC (permalink / raw)
  To: dev
  Cc: nd, maxime.coquelin, tiwei.bie, zhihong.wang, amorenoz,
	xiao.w.wang, yong.liu, jfreimann, honnappa.nagarahalli, gavin.hu

This patch set replaces the two-way barriers with C11 one-way barriers
for packed vring flags, when the frontend and backend are implemented
in software.

By doing vhost-user + virtio-user case benchmarking, 9% performance gain
in the RFC2544 test was measured on Thunderx2 platform.[1]

[1]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html
   PVP test with virtio 1.1 mergeable path

v3:
Wrap C11 one-way barriers and DMA barriers(rte_cio_*) together with an inline fuction.

v2:
Convert RFC to patch.

Joyce Kong (2):
  virtio: one way barrier for packed vring desc avail flags
  virtio: one way barrier for packed vring desc used flags

 drivers/net/virtio/virtio_rxtx.c                 | 25 +++++++++++++--------
 drivers/net/virtio/virtio_user/virtio_user_dev.c | 10 ++++++---
 drivers/net/virtio/virtqueue.h                   | 28 +++++++++++++++++++++++-
 lib/librte_vhost/vhost.h                         |  2 +-
 lib/librte_vhost/virtio_net.c                    | 16 ++++++--------
 5 files changed, 58 insertions(+), 23 deletions(-)

-- 
2.7.4


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring desc avail flags
  2019-08-27  8:19 [dpdk-dev] [RFC PATCH 0/2] virtio: one way barrier for packed vring flags Joyce Kong
                   ` (5 preceding siblings ...)
  2019-09-09  9:14 ` [dpdk-dev] [PATCH v3 0/2] virtio: one way barrier for packed vring flags Joyce Kong
@ 2019-09-09  9:14 ` Joyce Kong
  2019-09-09 10:10   ` Maxime Coquelin
  2019-09-09  9:14 ` [dpdk-dev] [PATCH v3 2/2] virtio: one way barrier for packed vring desc used flags Joyce Kong
                   ` (3 subsequent siblings)
  10 siblings, 1 reply; 30+ messages in thread
From: Joyce Kong @ 2019-09-09  9:14 UTC (permalink / raw)
  To: dev
  Cc: nd, maxime.coquelin, tiwei.bie, zhihong.wang, amorenoz,
	xiao.w.wang, yong.liu, jfreimann, honnappa.nagarahalli, gavin.hu

In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend
and backend are assumed to be implemented in software, that is they can
run on identical CPUs in an SMP configuration.
Thus a weak form of memory barriers like rte_smp_r/wmb, other than
rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
and yields better performance.
For the above case, this patch helps yielding even better performance
by replacing the two-way barriers with C11 one-way barriers for avail
flags in packed ring.

Meanwhile, a read barrier is required to ensure ordering between
descriptor's flags and content reads[1]. With C11, load-acquire can
enforce the ordering instead of rmb barrier.

[1]https://patchwork.dpdk.org/patch/49109/

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
Reviewed-by: Phil Yang <phil.yang@arm.com>
---
 drivers/net/virtio/virtio_rxtx.c                 | 13 +++++++------
 drivers/net/virtio/virtio_user/virtio_user_dev.c |  6 +++++-
 drivers/net/virtio/virtqueue.h                   | 11 +++++++++++
 lib/librte_vhost/vhost.h                         |  2 +-
 lib/librte_vhost/virtio_net.c                    | 11 +++++------
 5 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index 27ead19..a87ffe1 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -456,8 +456,10 @@ virtqueue_enqueue_recv_refill_packed(struct virtqueue *vq,
 		vq->vq_desc_head_idx = dxp->next;
 		if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
 			vq->vq_desc_tail_idx = vq->vq_desc_head_idx;
-		virtio_wmb(hw->weak_barriers);
-		start_dp[idx].flags = flags;
+
+		virtqueue_store_flags_packed(&start_dp[idx], flags,
+					     hw->weak_barriers);
+
 		if (++vq->vq_avail_idx >= vq->vq_nentries) {
 			vq->vq_avail_idx -= vq->vq_nentries;
 			vq->vq_packed.cached_flags ^=
@@ -671,8 +673,7 @@ virtqueue_enqueue_xmit_packed_fast(struct virtnet_tx *txvq,
 			vq->vq_desc_tail_idx = VQ_RING_DESC_CHAIN_END;
 	}
 
-	virtio_wmb(vq->hw->weak_barriers);
-	dp->flags = flags;
+	virtqueue_store_flags_packed(dp, flags, vq->hw->weak_barriers);
 }
 
 static inline void
@@ -763,8 +764,8 @@ virtqueue_enqueue_xmit_packed(struct virtnet_tx *txvq, struct rte_mbuf *cookie,
 			vq->vq_desc_tail_idx = VQ_RING_DESC_CHAIN_END;
 	}
 
-	virtio_wmb(vq->hw->weak_barriers);
-	head_dp->flags = head_flags;
+	virtqueue_store_flags_packed(head_dp, head_flags,
+				     vq->hw->weak_barriers);
 }
 
 static inline void
diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c
index fab87eb..7911c39 100644
--- a/drivers/net/virtio/virtio_user/virtio_user_dev.c
+++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c
@@ -624,7 +624,7 @@ virtio_user_handle_ctrl_msg(struct virtio_user_dev *dev, struct vring *vring,
 static inline int
 desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter)
 {
-	uint16_t flags = desc->flags;
+	uint16_t flags = __atomic_load_n(&desc->flags, __ATOMIC_ACQUIRE);
 
 	return wrap_counter == !!(flags & VRING_PACKED_DESC_F_AVAIL) &&
 		wrap_counter != !!(flags & VRING_PACKED_DESC_F_USED);
@@ -684,6 +684,10 @@ virtio_user_handle_cq_packed(struct virtio_user_dev *dev, uint16_t queue_idx)
 	struct vring_packed *vring = &dev->packed_vrings[queue_idx];
 	uint16_t n_descs, flags;
 
+	/* Perform a load-acquire barrier in desc_is_avail to
+	 * enforce the ordering between desc flags and desc
+	 * content.
+	 */
 	while (desc_is_avail(&vring->desc[vq->used_idx],
 			     vq->used_wrap_counter)) {
 
diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
index c6dd4a3..8d93ccb 100644
--- a/drivers/net/virtio/virtqueue.h
+++ b/drivers/net/virtio/virtqueue.h
@@ -54,6 +54,17 @@ virtio_wmb(uint8_t weak_barriers)
 		rte_cio_wmb();
 }
 
+static inline void
+virtqueue_store_flags_packed(struct vring_packed_desc *dp,
+			      uint16_t flags, uint8_t weak_barriers)
+{
+	if (weak_barriers) {
+		__atomic_store_n(&dp->flags, flags, __ATOMIC_RELEASE);
+	} else {
+		rte_cio_wmb();
+		dp->flags = flags;
+	}
+}
 #ifdef RTE_PMD_PACKET_PREFETCH
 #define rte_packet_prefetch(p)  rte_prefetch1(p)
 #else
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 884befa..d294ed1 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -344,7 +344,7 @@ vq_is_packed(struct virtio_net *dev)
 static inline bool
 desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter)
 {
-	uint16_t flags = *((volatile uint16_t *) &desc->flags);
+	uint16_t flags = __atomic_load_n(&desc->flags, __ATOMIC_ACQUIRE);
 
 	return wrap_counter == !!(flags & VRING_DESC_F_AVAIL) &&
 		wrap_counter != !!(flags & VRING_DESC_F_USED);
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 5b85b83..e7463ff 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -503,14 +503,13 @@ fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	if (avail_idx < vq->last_avail_idx)
 		wrap_counter ^= 1;
 
-	if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)))
-		return -1;
-
 	/*
-	 * The ordering between desc flags and desc
-	 * content reads need to be enforced.
+	 * Perform a load-acquire barrier in desc_is_avail to
+	 * enforce the ordering between desc flags and desc
+	 * content.
 	 */
-	rte_smp_rmb();
+	if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)))
+		return -1;
 
 	*desc_count = 0;
 	*len = 0;
-- 
2.7.4


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v3 2/2] virtio: one way barrier for packed vring desc used flags
  2019-08-27  8:19 [dpdk-dev] [RFC PATCH 0/2] virtio: one way barrier for packed vring flags Joyce Kong
                   ` (6 preceding siblings ...)
  2019-09-09  9:14 ` [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring desc avail flags Joyce Kong
@ 2019-09-09  9:14 ` Joyce Kong
  2019-09-09 10:11   ` Maxime Coquelin
  2019-09-17  5:28 ` [dpdk-dev] [PATCH v4 0/2] virtio: one way barrier for packed vring flags Joyce Kong
                   ` (2 subsequent siblings)
  10 siblings, 1 reply; 30+ messages in thread
From: Joyce Kong @ 2019-09-09  9:14 UTC (permalink / raw)
  To: dev
  Cc: nd, maxime.coquelin, tiwei.bie, zhihong.wang, amorenoz,
	xiao.w.wang, yong.liu, jfreimann, honnappa.nagarahalli, gavin.hu

In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend
and backend are assumed to be implemented in software, that is they can
run on identical CPUs in an SMP configuration.
Thus a weak form of memory barriers like rte_smp_r/wmb, other than
rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
and yields better performance.
For the above case, this patch helps yielding even better performance
by replacing the two-way barriers with C11 one-way barriers for used
flags in packed ring.

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
Reviewed-by: Phil Yang <phil.yang@arm.com>
---
 drivers/net/virtio/virtio_rxtx.c                 | 12 +++++++++---
 drivers/net/virtio/virtio_user/virtio_user_dev.c |  4 ++--
 drivers/net/virtio/virtqueue.h                   | 17 ++++++++++++++++-
 lib/librte_vhost/virtio_net.c                    |  5 ++---
 4 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index a87ffe1..2f0879c 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -122,9 +122,11 @@ virtqueue_dequeue_burst_rx_packed(struct virtqueue *vq,
 
 	for (i = 0; i < num; i++) {
 		used_idx = vq->vq_used_cons_idx;
+		/* desc_is_used has a load-acquire or rte_cio_rmb inside
+		 * and wait for used desc in virtqueue.
+		 */
 		if (!desc_is_used(&desc[used_idx], vq))
 			return i;
-		virtio_rmb(vq->hw->weak_barriers);
 		len[i] = desc[used_idx].len;
 		id = desc[used_idx].id;
 		cookie = (struct rte_mbuf *)vq->vq_descx[id].cookie;
@@ -233,8 +235,10 @@ virtio_xmit_cleanup_inorder_packed(struct virtqueue *vq, int num)
 	struct vq_desc_extra *dxp;
 
 	used_idx = vq->vq_used_cons_idx;
+	/* desc_is_used has a load-acquire or rte_cio_rmb inside
+	 * and wait for used desc in virtqueue.
+	 */
 	while (num > 0 && desc_is_used(&desc[used_idx], vq)) {
-		virtio_rmb(vq->hw->weak_barriers);
 		id = desc[used_idx].id;
 		do {
 			curr_id = used_idx;
@@ -265,8 +269,10 @@ virtio_xmit_cleanup_normal_packed(struct virtqueue *vq, int num)
 	struct vq_desc_extra *dxp;
 
 	used_idx = vq->vq_used_cons_idx;
+	/* desc_is_used has a load-acquire or rte_cio_rmb inside
+	 * and wait for used desc in virtqueue.
+	 */
 	while (num-- && desc_is_used(&desc[used_idx], vq)) {
-		virtio_rmb(vq->hw->weak_barriers);
 		id = desc[used_idx].id;
 		dxp = &vq->vq_descx[id];
 		vq->vq_used_cons_idx += dxp->ndescs;
diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c
index 7911c39..1c575d0 100644
--- a/drivers/net/virtio/virtio_user/virtio_user_dev.c
+++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c
@@ -698,8 +698,8 @@ virtio_user_handle_cq_packed(struct virtio_user_dev *dev, uint16_t queue_idx)
 		if (vq->used_wrap_counter)
 			flags |= VRING_PACKED_DESC_F_AVAIL_USED;
 
-		rte_smp_wmb();
-		vring->desc[vq->used_idx].flags = flags;
+		__atomic_store_n(&vring->desc[vq->used_idx].flags, flags,
+				 __ATOMIC_RELEASE);
 
 		vq->used_idx += n_descs;
 		if (vq->used_idx >= dev->queue_size) {
diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
index 8d93ccb..bac52e3 100644
--- a/drivers/net/virtio/virtqueue.h
+++ b/drivers/net/virtio/virtqueue.h
@@ -54,6 +54,21 @@ virtio_wmb(uint8_t weak_barriers)
 		rte_cio_wmb();
 }
 
+static inline uint16_t
+virtqueue_fetch_flags_packed(struct vring_packed_desc *dp,
+			      uint8_t weak_barriers)
+{
+	uint16_t flags;
+
+	if (weak_barriers) {
+		flags = __atomic_load_n(&dp->flags, __ATOMIC_ACQUIRE);
+	} else {
+		flags = dp->flags;
+		rte_cio_rmb();
+	}
+	return flags;
+}
+
 static inline void
 virtqueue_store_flags_packed(struct vring_packed_desc *dp,
 			      uint16_t flags, uint8_t weak_barriers)
@@ -297,7 +312,7 @@ desc_is_used(struct vring_packed_desc *desc, struct virtqueue *vq)
 {
 	uint16_t used, avail, flags;
 
-	flags = desc->flags;
+	flags = virtqueue_fetch_flags_packed(desc, vq->hw->weak_barriers);
 	used = !!(flags & VRING_PACKED_DESC_F_USED);
 	avail = !!(flags & VRING_PACKED_DESC_F_AVAIL);
 
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index e7463ff..241d467 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -110,8 +110,6 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
 			used_idx -= vq->size;
 	}
 
-	rte_smp_wmb();
-
 	for (i = 0; i < vq->shadow_used_idx; i++) {
 		uint16_t flags;
 
@@ -147,7 +145,8 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
 		}
 	}
 
-	vq->desc_packed[head_idx].flags = head_flags;
+	__atomic_store_n(&vq->desc_packed[head_idx].flags, head_flags,
+			 __ATOMIC_RELEASE);
 
 	vhost_log_cache_used_vring(dev, vq,
 				head_idx *
-- 
2.7.4


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/2] virtio: one way barrier for packed vring desc avail flags
  2019-09-06 16:01   ` Maxime Coquelin
@ 2019-09-09  9:24     ` Joyce Kong (Arm Technology China)
  0 siblings, 0 replies; 30+ messages in thread
From: Joyce Kong (Arm Technology China) @ 2019-09-09  9:24 UTC (permalink / raw)
  To: Maxime Coquelin, dev
  Cc: nd, tiwei.bie, zhihong.wang, amorenoz, xiao.w.wang, yong.liu,
	jfreimann, Honnappa Nagarahalli, Gavin Hu (Arm Technology China)

Hi Maxime,

> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Saturday, September 7, 2019 12:02 AM
> To: Joyce Kong (Arm Technology China) <Joyce.Kong@arm.com>;
> dev@dpdk.org
> Cc: nd <nd@arm.com>; tiwei.bie@intel.com; zhihong.wang@intel.com;
> amorenoz@redhat.com; xiao.w.wang@intel.com; yong.liu@intel.com;
> jfreimann@redhat.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>
> Subject: Re: [PATCH v2 1/2] virtio: one way barrier for packed vring desc avail
> flags
> 
> Hi Joyce,
> 
> On 9/6/19 1:34 PM, Joyce Kong wrote:
> > In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the
> > frontend and backend are assumed to be implemented in software, that
> > is they can run on identical CPUs in an SMP configuration. Thus a weak
> > form of memory barriers like rte_smp_r/wmb, other than rte_cio_r/wmb,
> > is sufficient for this case(vq->hw->weak_barriers == 1) and yields better
> performance.
> > For the above case, this patch helps yielding even better performance
> > by replacing the two-way barriers with C11 one-way barriers.
> >
> > Meanwhile, a read barrier is required to ensure ordering between
> > descriptor's flags and content reads[1]. With C11, load-acquire can
> > enforce the ordering instead of rmb barrier.
> >
> > [1]https://patchwork.dpdk.org/patch/49109/
> >
> > Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> > Reviewed-by: Phil Yang <phil.yang@arm.com>
> > ---
> >  drivers/net/virtio/virtio_rxtx.c                 | 26 ++++++++++++++++++------
> >  drivers/net/virtio/virtio_user/virtio_user_dev.c |  6 +++++-
> >  lib/librte_vhost/vhost.h                         |  2 +-
> >  lib/librte_vhost/virtio_net.c                    | 11 +++++-----
> >  4 files changed, 31 insertions(+), 14 deletions(-)
> >
> > diff --git a/drivers/net/virtio/virtio_rxtx.c
> > b/drivers/net/virtio/virtio_rxtx.c
> > index 27ead19..2a2153c 100644
> > --- a/drivers/net/virtio/virtio_rxtx.c
> > +++ b/drivers/net/virtio/virtio_rxtx.c
> > @@ -456,8 +456,14 @@ virtqueue_enqueue_recv_refill_packed(struct
> virtqueue *vq,
> >  		vq->vq_desc_head_idx = dxp->next;
> >  		if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
> >  			vq->vq_desc_tail_idx = vq->vq_desc_head_idx;
> > -		virtio_wmb(hw->weak_barriers);
> > -		start_dp[idx].flags = flags;
> > +
> > +		if (hw->weak_barriers)
> > +			__atomic_store_n(&start_dp[idx].flags, flags,
> > +					 __ATOMIC_RELEASE);
> > +		else {
> > +			rte_cio_wmb();
> > +			start_dp[idx].flags = flags;
> > +		}
> It looks good to me.
> I just wonder whether it would be cleaner to put that in an inline
> function:
> 
> static inline void
> virtqueue_store_flags_packed()
> 
> Same for the fetch.

Have wrapped the store/fetch operation in inline functions in v3.

Best Regards,
Joyce


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring desc avail flags
  2019-09-09  9:14 ` [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring desc avail flags Joyce Kong
@ 2019-09-09 10:10   ` Maxime Coquelin
  2019-09-10  3:54     ` Wang, Yinan
  0 siblings, 1 reply; 30+ messages in thread
From: Maxime Coquelin @ 2019-09-09 10:10 UTC (permalink / raw)
  To: Joyce Kong, dev
  Cc: nd, tiwei.bie, zhihong.wang, amorenoz, xiao.w.wang, yong.liu,
	jfreimann, honnappa.nagarahalli, gavin.hu



On 9/9/19 11:14 AM, Joyce Kong wrote:
> In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend
> and backend are assumed to be implemented in software, that is they can
> run on identical CPUs in an SMP configuration.
> Thus a weak form of memory barriers like rte_smp_r/wmb, other than
> rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
> and yields better performance.
> For the above case, this patch helps yielding even better performance
> by replacing the two-way barriers with C11 one-way barriers for avail
> flags in packed ring.
> 
> Meanwhile, a read barrier is required to ensure ordering between
> descriptor's flags and content reads[1]. With C11, load-acquire can
> enforce the ordering instead of rmb barrier.
> 
> [1]https://patchwork.dpdk.org/patch/49109/
> 
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> Reviewed-by: Phil Yang <phil.yang@arm.com>
> ---
>  drivers/net/virtio/virtio_rxtx.c                 | 13 +++++++------
>  drivers/net/virtio/virtio_user/virtio_user_dev.c |  6 +++++-
>  drivers/net/virtio/virtqueue.h                   | 11 +++++++++++
>  lib/librte_vhost/vhost.h                         |  2 +-
>  lib/librte_vhost/virtio_net.c                    | 11 +++++------
>  5 files changed, 29 insertions(+), 14 deletions(-)

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v3 2/2] virtio: one way barrier for packed vring desc used flags
  2019-09-09  9:14 ` [dpdk-dev] [PATCH v3 2/2] virtio: one way barrier for packed vring desc used flags Joyce Kong
@ 2019-09-09 10:11   ` Maxime Coquelin
  0 siblings, 0 replies; 30+ messages in thread
From: Maxime Coquelin @ 2019-09-09 10:11 UTC (permalink / raw)
  To: Joyce Kong, dev
  Cc: nd, tiwei.bie, zhihong.wang, amorenoz, xiao.w.wang, yong.liu,
	jfreimann, honnappa.nagarahalli, gavin.hu



On 9/9/19 11:14 AM, Joyce Kong wrote:
> In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend
> and backend are assumed to be implemented in software, that is they can
> run on identical CPUs in an SMP configuration.
> Thus a weak form of memory barriers like rte_smp_r/wmb, other than
> rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
> and yields better performance.
> For the above case, this patch helps yielding even better performance
> by replacing the two-way barriers with C11 one-way barriers for used
> flags in packed ring.
> 
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> Reviewed-by: Phil Yang <phil.yang@arm.com>
> ---
>  drivers/net/virtio/virtio_rxtx.c                 | 12 +++++++++---
>  drivers/net/virtio/virtio_user/virtio_user_dev.c |  4 ++--
>  drivers/net/virtio/virtqueue.h                   | 17 ++++++++++++++++-
>  lib/librte_vhost/virtio_net.c                    |  5 ++---
>  4 files changed, 29 insertions(+), 9 deletions(-)

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring desc avail flags
  2019-09-09 10:10   ` Maxime Coquelin
@ 2019-09-10  3:54     ` Wang, Yinan
  2019-09-10  9:48       ` Gavin Hu (Arm Technology China)
  0 siblings, 1 reply; 30+ messages in thread
From: Wang, Yinan @ 2019-09-10  3:54 UTC (permalink / raw)
  To: Maxime Coquelin, Joyce Kong, dev
  Cc: nd, Bie, Tiwei, Wang, Zhihong, amorenoz, Wang, Xiao W, Liu, Yong,
	jfreimann, honnappa.nagarahalli, gavin.hu


Hi Joyce,

I just test performance impact of your patch set with code base commit id: d03d8622db48918d14bfe805641b1766ecc40088, after applying your v3 patch set , seven paths of vhost/virtio pvp test shows performance drop as below:

PVP vhost/virtio 1c1q test	         before apply patch	apply patch
test_perf_pvp_inorder_mergeable     	 7.603	           7.474
test_perf_pvp_inorder_no_mergeable	     7.642	           7.525
test_perf_pvp_mergeable	              7.556	           7.431
test_perf_pvp_normal	                   7.554	           7.478
test_perf_pvp_vector_rx	               7.581	           7.469
test_perf_pvp_virtio11_mergeable	           7.068	           6.905
test_perf_pvp_virtio11_normal	           7.088	           6.888

Thanks,
Yinan

> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Maxime Coquelin
> Sent: 2019年9月9日 18:10
> To: Joyce Kong <joyce.kong@arm.com>; dev@dpdk.org
> Cc: nd@arm.com; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> <xiao.w.wang@intel.com>; Liu, Yong <yong.liu@intel.com>;
> jfreimann@redhat.com; honnappa.nagarahalli@arm.com; gavin.hu@arm.com
> Subject: Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring
> desc avail flags
> 
> 
> 
> On 9/9/19 11:14 AM, Joyce Kong wrote:
> > In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the
> > frontend and backend are assumed to be implemented in software, that
> > is they can run on identical CPUs in an SMP configuration.
> > Thus a weak form of memory barriers like rte_smp_r/wmb, other than
> > rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
> > and yields better performance.
> > For the above case, this patch helps yielding even better performance
> > by replacing the two-way barriers with C11 one-way barriers for avail
> > flags in packed ring.
> >
> > Meanwhile, a read barrier is required to ensure ordering between
> > descriptor's flags and content reads[1]. With C11, load-acquire can
> > enforce the ordering instead of rmb barrier.
> >
> > [1]https://patchwork.dpdk.org/patch/49109/
> >
> > Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> > Reviewed-by: Phil Yang <phil.yang@arm.com>
> > ---
> >  drivers/net/virtio/virtio_rxtx.c                 | 13 +++++++------
> >  drivers/net/virtio/virtio_user/virtio_user_dev.c |  6 +++++-
> >  drivers/net/virtio/virtqueue.h                   | 11 +++++++++++
> >  lib/librte_vhost/vhost.h                         |  2 +-
> >  lib/librte_vhost/virtio_net.c                    | 11 +++++------
> >  5 files changed, 29 insertions(+), 14 deletions(-)
> 
> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> 
> Thanks,
> Maxime

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring desc avail flags
  2019-09-10  3:54     ` Wang, Yinan
@ 2019-09-10  9:48       ` Gavin Hu (Arm Technology China)
  2019-09-10 10:17         ` Maxime Coquelin
  2019-09-11  2:39         ` Liu, Yong
  0 siblings, 2 replies; 30+ messages in thread
From: Gavin Hu (Arm Technology China) @ 2019-09-10  9:48 UTC (permalink / raw)
  To: Wang, Yinan, Maxime Coquelin, Joyce Kong (Arm Technology China), dev
  Cc: nd, Bie, Tiwei, Wang, Zhihong, amorenoz, Wang,  Xiao W, Liu,
	Yong, jfreimann, Honnappa Nagarahalli, Steve Capper

Hi Yinan,

We have done a comparative analysis and found with the old code the if(weak_barriers) and else branches were saved on x86 as rte_smp_wmb and rte_cio_wmb are identical.  
http://git.dpdk.org/dpdk/tree/drivers/net/virtio/virtqueue.h#n49 
For the new code, with Joyce's patches applied, the branches were not saved, which requir additional cpu cycles, this caused slight degradation on x86.

The patches uplifted the performance on aarch64 about 9% as indicated in the cover letter. While I am thinking over a solution to the degradation on x86,could you help answer:
1. Is rte_cio_wmb is sufficient for the non weak-barrier case(HW offloading)?
 I got this question because I see in Intel NIC PMDs, it is almost never used, it is rte_wmb that is more widely used to notify the NIC device, any difference between the virtio ring compatible smartNIC device(or vDPA?) and i40e like devices? 
2. If the rte_cio_wmb is not sufficient for this case and replaced by stronger barriers, like sfence,  then the branches will not be saved by the compiler, then the problem becomes with the correct use of barriers, other than the degradation.

Any comments are welcome!

Best Regards,
Gavin

> -----Original Message-----
> From: Wang, Yinan <yinan.wang@intel.com>
> Sent: Tuesday, September 10, 2019 11:54 AM
> To: Maxime Coquelin <maxime.coquelin@redhat.com>; Joyce Kong (Arm
> Technology China) <Joyce.Kong@arm.com>; dev@dpdk.org
> Cc: nd <nd@arm.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> <xiao.w.wang@intel.com>; Liu, Yong <yong.liu@intel.com>;
> jfreimann@redhat.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>
> Subject: RE: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring
> desc avail flags
> 
> 
> Hi Joyce,
> 
> I just test performance impact of your patch set with code base commit id:
> d03d8622db48918d14bfe805641b1766ecc40088, after applying your v3 patch
> set , seven paths of vhost/virtio pvp test shows performance drop as below:
> 
> PVP vhost/virtio 1c1q test	         before apply patch	apply patch
> test_perf_pvp_inorder_mergeable     	 7.603	           7.474
> test_perf_pvp_inorder_no_mergeable	     7.642	           7.525
> test_perf_pvp_mergeable	              7.556	           7.431
> test_perf_pvp_normal	                   7.554	           7.478
> test_perf_pvp_vector_rx	               7.581	           7.469
> test_perf_pvp_virtio11_mergeable	           7.068	           6.905
> test_perf_pvp_virtio11_normal	           7.088	           6.888
> 
> Thanks,
> Yinan
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Maxime Coquelin
> > Sent: 2019年9月9日 18:10
> > To: Joyce Kong <joyce.kong@arm.com>; dev@dpdk.org
> > Cc: nd@arm.com; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> > <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> > <xiao.w.wang@intel.com>; Liu, Yong <yong.liu@intel.com>;
> > jfreimann@redhat.com; honnappa.nagarahalli@arm.com;
> gavin.hu@arm.com
> > Subject: Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed
> vring
> > desc avail flags
> >
> >
> >
> > On 9/9/19 11:14 AM, Joyce Kong wrote:
> > > In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the
> > > frontend and backend are assumed to be implemented in software, that
> > > is they can run on identical CPUs in an SMP configuration.
> > > Thus a weak form of memory barriers like rte_smp_r/wmb, other than
> > > rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
> > > and yields better performance.
> > > For the above case, this patch helps yielding even better performance
> > > by replacing the two-way barriers with C11 one-way barriers for avail
> > > flags in packed ring.
> > >
> > > Meanwhile, a read barrier is required to ensure ordering between
> > > descriptor's flags and content reads[1]. With C11, load-acquire can
> > > enforce the ordering instead of rmb barrier.
> > >
> > > [1]https://patchwork.dpdk.org/patch/49109/
> > >
> > > Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> > > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> > > Reviewed-by: Phil Yang <phil.yang@arm.com>
> > > ---
> > >  drivers/net/virtio/virtio_rxtx.c                 | 13 +++++++------
> > >  drivers/net/virtio/virtio_user/virtio_user_dev.c |  6 +++++-
> > >  drivers/net/virtio/virtqueue.h                   | 11 +++++++++++
> > >  lib/librte_vhost/vhost.h                         |  2 +-
> > >  lib/librte_vhost/virtio_net.c                    | 11 +++++------
> > >  5 files changed, 29 insertions(+), 14 deletions(-)
> >
> > Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> >
> > Thanks,
> > Maxime

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring desc avail flags
  2019-09-10  9:48       ` Gavin Hu (Arm Technology China)
@ 2019-09-10 10:17         ` Maxime Coquelin
  2019-09-11  2:39         ` Liu, Yong
  1 sibling, 0 replies; 30+ messages in thread
From: Maxime Coquelin @ 2019-09-10 10:17 UTC (permalink / raw)
  To: Gavin Hu (Arm Technology China),
	Wang, Yinan, Joyce Kong (Arm Technology China),
	dev
  Cc: nd, Bie, Tiwei, Wang, Zhihong, amorenoz, Wang, Xiao W, Liu, Yong,
	jfreimann, Honnappa Nagarahalli, Steve Capper

Thanks Yinan for reporting the regresion and Gavin for the analysis.

On 9/10/19 11:48 AM, Gavin Hu (Arm Technology China) wrote:
> Hi Yinan,
> 
> We have done a comparative analysis and found with the old code the if(weak_barriers) and else branches were saved on x86 as rte_smp_wmb and rte_cio_wmb are identical.  
> http://git.dpdk.org/dpdk/tree/drivers/net/virtio/virtqueue.h#n49 
> For the new code, with Joyce's patches applied, the branches were not saved, which requir additional cpu cycles, this caused slight degradation on x86.
> 
> The patches uplifted the performance on aarch64 about 9% as indicated in the cover letter. While I am thinking over a solution to the degradation on x86,could you help answer:
> 1. Is rte_cio_wmb is sufficient for the non weak-barrier case(HW offloading)?
>  I got this question because I see in Intel NIC PMDs, it is almost never used, it is rte_wmb that is more widely used to notify the NIC device, any difference between the virtio ring compatible smartNIC device(or vDPA?) and i40e like devices? 
> 2. If the rte_cio_wmb is not sufficient for this case and replaced by stronger barriers, like sfence,  then the branches will not be saved by the compiler, then the problem becomes with the correct use of barriers, other than the degradation.
> 
> Any comments are welcome!

It may we worth that Yinan tries with rte_wmb instead of rte_cio_wmb
without the series applied, just to confirm this is caused by the etra
branch.

Maxime

> Best Regards,
> Gavin
> 
>> -----Original Message-----
>> From: Wang, Yinan <yinan.wang@intel.com>
>> Sent: Tuesday, September 10, 2019 11:54 AM
>> To: Maxime Coquelin <maxime.coquelin@redhat.com>; Joyce Kong (Arm
>> Technology China) <Joyce.Kong@arm.com>; dev@dpdk.org
>> Cc: nd <nd@arm.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
>> <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
>> <xiao.w.wang@intel.com>; Liu, Yong <yong.liu@intel.com>;
>> jfreimann@redhat.com; Honnappa Nagarahalli
>> <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
>> <Gavin.Hu@arm.com>
>> Subject: RE: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring
>> desc avail flags
>>
>>
>> Hi Joyce,
>>
>> I just test performance impact of your patch set with code base commit id:
>> d03d8622db48918d14bfe805641b1766ecc40088, after applying your v3 patch
>> set , seven paths of vhost/virtio pvp test shows performance drop as below:
>>
>> PVP vhost/virtio 1c1q test	         before apply patch	apply patch
>> test_perf_pvp_inorder_mergeable     	 7.603	           7.474
>> test_perf_pvp_inorder_no_mergeable	     7.642	           7.525
>> test_perf_pvp_mergeable	              7.556	           7.431
>> test_perf_pvp_normal	                   7.554	           7.478
>> test_perf_pvp_vector_rx	               7.581	           7.469
>> test_perf_pvp_virtio11_mergeable	           7.068	           6.905
>> test_perf_pvp_virtio11_normal	           7.088	           6.888
>>
>> Thanks,
>> Yinan
>>
>>> -----Original Message-----
>>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Maxime Coquelin
>>> Sent: 2019年9月9日 18:10
>>> To: Joyce Kong <joyce.kong@arm.com>; dev@dpdk.org
>>> Cc: nd@arm.com; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
>>> <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
>>> <xiao.w.wang@intel.com>; Liu, Yong <yong.liu@intel.com>;
>>> jfreimann@redhat.com; honnappa.nagarahalli@arm.com;
>> gavin.hu@arm.com
>>> Subject: Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed
>> vring
>>> desc avail flags
>>>
>>>
>>>
>>> On 9/9/19 11:14 AM, Joyce Kong wrote:
>>>> In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the
>>>> frontend and backend are assumed to be implemented in software, that
>>>> is they can run on identical CPUs in an SMP configuration.
>>>> Thus a weak form of memory barriers like rte_smp_r/wmb, other than
>>>> rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
>>>> and yields better performance.
>>>> For the above case, this patch helps yielding even better performance
>>>> by replacing the two-way barriers with C11 one-way barriers for avail
>>>> flags in packed ring.
>>>>
>>>> Meanwhile, a read barrier is required to ensure ordering between
>>>> descriptor's flags and content reads[1]. With C11, load-acquire can
>>>> enforce the ordering instead of rmb barrier.
>>>>
>>>> [1]https://patchwork.dpdk.org/patch/49109/
>>>>
>>>> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
>>>> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
>>>> Reviewed-by: Phil Yang <phil.yang@arm.com>
>>>> ---
>>>>  drivers/net/virtio/virtio_rxtx.c                 | 13 +++++++------
>>>>  drivers/net/virtio/virtio_user/virtio_user_dev.c |  6 +++++-
>>>>  drivers/net/virtio/virtqueue.h                   | 11 +++++++++++
>>>>  lib/librte_vhost/vhost.h                         |  2 +-
>>>>  lib/librte_vhost/virtio_net.c                    | 11 +++++------
>>>>  5 files changed, 29 insertions(+), 14 deletions(-)
>>>
>>> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
>>>
>>> Thanks,
>>> Maxime

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring desc avail flags
  2019-09-10  9:48       ` Gavin Hu (Arm Technology China)
  2019-09-10 10:17         ` Maxime Coquelin
@ 2019-09-11  2:39         ` Liu, Yong
  2019-09-11  3:35           ` Gavin Hu (Arm Technology China)
  1 sibling, 1 reply; 30+ messages in thread
From: Liu, Yong @ 2019-09-11  2:39 UTC (permalink / raw)
  To: Gavin Hu (Arm Technology China),
	Wang, Yinan, Maxime Coquelin, Joyce Kong (Arm Technology China),
	dev
  Cc: nd, Bie, Tiwei, Wang, Zhihong, amorenoz, Wang, Xiao W, jfreimann,
	Honnappa Nagarahalli, Steve Capper



> -----Original Message-----
> From: Gavin Hu (Arm Technology China) [mailto:Gavin.Hu@arm.com]
> Sent: Tuesday, September 10, 2019 5:49 PM
> To: Wang, Yinan <yinan.wang@intel.com>; Maxime Coquelin
> <maxime.coquelin@redhat.com>; Joyce Kong (Arm Technology China)
> <Joyce.Kong@arm.com>; dev@dpdk.org
> Cc: nd <nd@arm.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> <xiao.w.wang@intel.com>; Liu, Yong <yong.liu@intel.com>;
> jfreimann@redhat.com; Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>;
> Steve Capper <Steve.Capper@arm.com>
> Subject: RE: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed
> vring desc avail flags
> 
> Hi Yinan,
> 
> We have done a comparative analysis and found with the old code the
> if(weak_barriers) and else branches were saved on x86 as rte_smp_wmb and
> rte_cio_wmb are identical.
> http://git.dpdk.org/dpdk/tree/drivers/net/virtio/virtqueue.h#n49
> For the new code, with Joyce's patches applied, the branches were not saved,
> which requir additional cpu cycles, this caused slight degradation on x86.
> 
> The patches uplifted the performance on aarch64 about 9% as indicated in
> the cover letter. While I am thinking over a solution to the degradation on
> x86,could you help answer:
> 1. Is rte_cio_wmb is sufficient for the non weak-barrier case(HW
> offloading)?
>  I got this question because I see in Intel NIC PMDs, it is almost never
> used, it is rte_wmb that is more widely used to notify the NIC device, any
> difference between the virtio ring compatible smartNIC device(or vDPA?) and
> i40e like devices?

Hi Gavin,
X86 architecture can guarantee that young store happen later than old store.
So rte_cio_wmb is just compiler memory barrier in x86. 

I think compiler barrier is also enough in pmd, rte_wmb is in pmd because of it was inherit from first implementation :)

Thanks,
Marvin

> 2. If the rte_cio_wmb is not sufficient for this case and replaced by
> stronger barriers, like sfence,  then the branches will not be saved by the
> compiler, then the problem becomes with the correct use of barriers, other
> than the degradation.
> 
> Any comments are welcome!
> 
> Best Regards,
> Gavin
> 
> > -----Original Message-----
> > From: Wang, Yinan <yinan.wang@intel.com>
> > Sent: Tuesday, September 10, 2019 11:54 AM
> > To: Maxime Coquelin <maxime.coquelin@redhat.com>; Joyce Kong (Arm
> > Technology China) <Joyce.Kong@arm.com>; dev@dpdk.org
> > Cc: nd <nd@arm.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> > <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> > <xiao.w.wang@intel.com>; Liu, Yong <yong.liu@intel.com>;
> > jfreimann@redhat.com; Honnappa Nagarahalli
> > <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> > <Gavin.Hu@arm.com>
> > Subject: RE: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed
> vring
> > desc avail flags
> >
> >
> > Hi Joyce,
> >
> > I just test performance impact of your patch set with code base commit id:
> > d03d8622db48918d14bfe805641b1766ecc40088, after applying your v3 patch
> > set , seven paths of vhost/virtio pvp test shows performance drop as
> below:
> >
> > PVP vhost/virtio 1c1q test	         before apply patch	apply patch
> > test_perf_pvp_inorder_mergeable     	 7.603	           7.474
> > test_perf_pvp_inorder_no_mergeable	     7.642	           7.525
> > test_perf_pvp_mergeable	              7.556	           7.431
> > test_perf_pvp_normal	                   7.554	           7.478
> > test_perf_pvp_vector_rx	               7.581	           7.469
> > test_perf_pvp_virtio11_mergeable	           7.068	           6.905
> > test_perf_pvp_virtio11_normal	           7.088	           6.888
> >
> > Thanks,
> > Yinan
> >
> > > -----Original Message-----
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Maxime Coquelin
> > > Sent: 2019年9月9日 18:10
> > > To: Joyce Kong <joyce.kong@arm.com>; dev@dpdk.org
> > > Cc: nd@arm.com; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> > > <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> > > <xiao.w.wang@intel.com>; Liu, Yong <yong.liu@intel.com>;
> > > jfreimann@redhat.com; honnappa.nagarahalli@arm.com;
> > gavin.hu@arm.com
> > > Subject: Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for
> packed
> > vring
> > > desc avail flags
> > >
> > >
> > >
> > > On 9/9/19 11:14 AM, Joyce Kong wrote:
> > > > In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the
> > > > frontend and backend are assumed to be implemented in software, that
> > > > is they can run on identical CPUs in an SMP configuration.
> > > > Thus a weak form of memory barriers like rte_smp_r/wmb, other than
> > > > rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
> > > > and yields better performance.
> > > > For the above case, this patch helps yielding even better performance
> > > > by replacing the two-way barriers with C11 one-way barriers for avail
> > > > flags in packed ring.
> > > >
> > > > Meanwhile, a read barrier is required to ensure ordering between
> > > > descriptor's flags and content reads[1]. With C11, load-acquire can
> > > > enforce the ordering instead of rmb barrier.
> > > >
> > > > [1]https://patchwork.dpdk.org/patch/49109/
> > > >
> > > > Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> > > > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> > > > Reviewed-by: Phil Yang <phil.yang@arm.com>
> > > > ---
> > > >  drivers/net/virtio/virtio_rxtx.c                 | 13 +++++++------
> > > >  drivers/net/virtio/virtio_user/virtio_user_dev.c |  6 +++++-
> > > >  drivers/net/virtio/virtqueue.h                   | 11 +++++++++++
> > > >  lib/librte_vhost/vhost.h                         |  2 +-
> > > >  lib/librte_vhost/virtio_net.c                    | 11 +++++------
> > > >  5 files changed, 29 insertions(+), 14 deletions(-)
> > >
> > > Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> > >
> > > Thanks,
> > > Maxime

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring desc avail flags
  2019-09-11  2:39         ` Liu, Yong
@ 2019-09-11  3:35           ` Gavin Hu (Arm Technology China)
  2019-09-11  6:29             ` Liu, Yong
  0 siblings, 1 reply; 30+ messages in thread
From: Gavin Hu (Arm Technology China) @ 2019-09-11  3:35 UTC (permalink / raw)
  To: Liu, Yong, Wang, Yinan, Maxime Coquelin,
	Joyce Kong (Arm Technology China),
	dev
  Cc: nd, Bie, Tiwei, Wang, Zhihong, amorenoz, Wang,  Xiao W,
	jfreimann, Honnappa Nagarahalli, Steve Capper

Hi Marvin,

Thanks for your answers, one more question for x86:
1. For CIO memory alone or MMIO memory(eg PCI BAR) alone, the compiler barrier is enough to keep ordering, that's why both rte_io_mb and rte_cio_mb are defined as compiler barriers, right?
2. How about the ordering of interleaved CIO and MMIO accesses, for example, a young store to MMIO can be reordered before an older store to CIO? CIO may be faster than devices, but store buffers or caching may cause the CIO update not visible to the device(in a common doorbell case)? 

Best regards,
Gavin

> -----Original Message-----
> From: Liu, Yong <yong.liu@intel.com>
> Sent: Wednesday, September 11, 2019 10:39 AM
> To: Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; Wang, Yinan
> <yinan.wang@intel.com>; Maxime Coquelin <maxime.coquelin@redhat.com>;
> Joyce Kong (Arm Technology China) <Joyce.Kong@arm.com>; dev@dpdk.org
> Cc: nd <nd@arm.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> <xiao.w.wang@intel.com>; jfreimann@redhat.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Steve Capper <Steve.Capper@arm.com>
> Subject: RE: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring
> desc avail flags
> 
> 
> 
> > -----Original Message-----
> > From: Gavin Hu (Arm Technology China) [mailto:Gavin.Hu@arm.com]
> > Sent: Tuesday, September 10, 2019 5:49 PM
> > To: Wang, Yinan <yinan.wang@intel.com>; Maxime Coquelin
> > <maxime.coquelin@redhat.com>; Joyce Kong (Arm Technology China)
> > <Joyce.Kong@arm.com>; dev@dpdk.org
> > Cc: nd <nd@arm.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> > <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> > <xiao.w.wang@intel.com>; Liu, Yong <yong.liu@intel.com>;
> > jfreimann@redhat.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>;
> > Steve Capper <Steve.Capper@arm.com>
> > Subject: RE: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed
> > vring desc avail flags
> >
> > Hi Yinan,
> >
> > We have done a comparative analysis and found with the old code the
> > if(weak_barriers) and else branches were saved on x86 as rte_smp_wmb
> and
> > rte_cio_wmb are identical.
> > http://git.dpdk.org/dpdk/tree/drivers/net/virtio/virtqueue.h#n49
> > For the new code, with Joyce's patches applied, the branches were not
> saved,
> > which requir additional cpu cycles, this caused slight degradation on x86.
> >
> > The patches uplifted the performance on aarch64 about 9% as indicated in
> > the cover letter. While I am thinking over a solution to the degradation on
> > x86,could you help answer:
> > 1. Is rte_cio_wmb is sufficient for the non weak-barrier case(HW
> > offloading)?
> >  I got this question because I see in Intel NIC PMDs, it is almost never
> > used, it is rte_wmb that is more widely used to notify the NIC device, any
> > difference between the virtio ring compatible smartNIC device(or vDPA?)
> and
> > i40e like devices?
> 
> Hi Gavin,
> X86 architecture can guarantee that young store happen later than old store.
> So rte_cio_wmb is just compiler memory barrier in x86.
> 
> I think compiler barrier is also enough in pmd, rte_wmb is in pmd because of
> it was inherit from first implementation :)
> 
> Thanks,
> Marvin
> 
> > 2. If the rte_cio_wmb is not sufficient for this case and replaced by
> > stronger barriers, like sfence,  then the branches will not be saved by the
> > compiler, then the problem becomes with the correct use of barriers, other
> > than the degradation.
> >
> > Any comments are welcome!
> >
> > Best Regards,
> > Gavin
> >
> > > -----Original Message-----
> > > From: Wang, Yinan <yinan.wang@intel.com>
> > > Sent: Tuesday, September 10, 2019 11:54 AM
> > > To: Maxime Coquelin <maxime.coquelin@redhat.com>; Joyce Kong (Arm
> > > Technology China) <Joyce.Kong@arm.com>; dev@dpdk.org
> > > Cc: nd <nd@arm.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> > > <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> > > <xiao.w.wang@intel.com>; Liu, Yong <yong.liu@intel.com>;
> > > jfreimann@redhat.com; Honnappa Nagarahalli
> > > <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> > > <Gavin.Hu@arm.com>
> > > Subject: RE: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed
> > vring
> > > desc avail flags
> > >
> > >
> > > Hi Joyce,
> > >
> > > I just test performance impact of your patch set with code base commit id:
> > > d03d8622db48918d14bfe805641b1766ecc40088, after applying your v3
> patch
> > > set , seven paths of vhost/virtio pvp test shows performance drop as
> > below:
> > >
> > > PVP vhost/virtio 1c1q test	         before apply patch	apply patch
> > > test_perf_pvp_inorder_mergeable     	 7.603	           7.474
> > > test_perf_pvp_inorder_no_mergeable	     7.642	           7.525
> > > test_perf_pvp_mergeable	              7.556	           7.431
> > > test_perf_pvp_normal	                   7.554	           7.478
> > > test_perf_pvp_vector_rx	               7.581	           7.469
> > > test_perf_pvp_virtio11_mergeable	           7.068	           6.905
> > > test_perf_pvp_virtio11_normal	           7.088	           6.888
> > >
> > > Thanks,
> > > Yinan
> > >
> > > > -----Original Message-----
> > > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Maxime
> Coquelin
> > > > Sent: 2019年9月9日 18:10
> > > > To: Joyce Kong <joyce.kong@arm.com>; dev@dpdk.org
> > > > Cc: nd@arm.com; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> > > > <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> > > > <xiao.w.wang@intel.com>; Liu, Yong <yong.liu@intel.com>;
> > > > jfreimann@redhat.com; honnappa.nagarahalli@arm.com;
> > > gavin.hu@arm.com
> > > > Subject: Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for
> > packed
> > > vring
> > > > desc avail flags
> > > >
> > > >
> > > >
> > > > On 9/9/19 11:14 AM, Joyce Kong wrote:
> > > > > In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the
> > > > > frontend and backend are assumed to be implemented in software,
> that
> > > > > is they can run on identical CPUs in an SMP configuration.
> > > > > Thus a weak form of memory barriers like rte_smp_r/wmb, other than
> > > > > rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
> > > > > and yields better performance.
> > > > > For the above case, this patch helps yielding even better performance
> > > > > by replacing the two-way barriers with C11 one-way barriers for avail
> > > > > flags in packed ring.
> > > > >
> > > > > Meanwhile, a read barrier is required to ensure ordering between
> > > > > descriptor's flags and content reads[1]. With C11, load-acquire can
> > > > > enforce the ordering instead of rmb barrier.
> > > > >
> > > > > [1]https://patchwork.dpdk.org/patch/49109/
> > > > >
> > > > > Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> > > > > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> > > > > Reviewed-by: Phil Yang <phil.yang@arm.com>
> > > > > ---
> > > > >  drivers/net/virtio/virtio_rxtx.c                 | 13 +++++++------
> > > > >  drivers/net/virtio/virtio_user/virtio_user_dev.c |  6 +++++-
> > > > >  drivers/net/virtio/virtqueue.h                   | 11 +++++++++++
> > > > >  lib/librte_vhost/vhost.h                         |  2 +-
> > > > >  lib/librte_vhost/virtio_net.c                    | 11 +++++------
> > > > >  5 files changed, 29 insertions(+), 14 deletions(-)
> > > >
> > > > Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> > > >
> > > > Thanks,
> > > > Maxime

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring desc avail flags
  2019-09-11  3:35           ` Gavin Hu (Arm Technology China)
@ 2019-09-11  6:29             ` Liu, Yong
  2019-09-11  8:32               ` Gavin Hu (Arm Technology China)
  0 siblings, 1 reply; 30+ messages in thread
From: Liu, Yong @ 2019-09-11  6:29 UTC (permalink / raw)
  To: Gavin Hu (Arm Technology China),
	Wang, Yinan, Maxime Coquelin, Joyce Kong (Arm Technology China),
	dev
  Cc: nd, Bie, Tiwei, Wang, Zhihong, amorenoz, Wang, Xiao W, jfreimann,
	Honnappa Nagarahalli, Steve Capper

Thanks Gavin, my answers are inline.

> -----Original Message-----
> From: Gavin Hu (Arm Technology China) [mailto:Gavin.Hu@arm.com]
> Sent: Wednesday, September 11, 2019 11:35 AM
> To: Liu, Yong <yong.liu@intel.com>; Wang, Yinan <yinan.wang@intel.com>;
> Maxime Coquelin <maxime.coquelin@redhat.com>; Joyce Kong (Arm Technology
> China) <Joyce.Kong@arm.com>; dev@dpdk.org
> Cc: nd <nd@arm.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> <xiao.w.wang@intel.com>; jfreimann@redhat.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Steve Capper <Steve.Capper@arm.com>
> Subject: RE: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed
> vring desc avail flags
> 
> Hi Marvin,
> 
> Thanks for your answers, one more question for x86:
> 1. For CIO memory alone or MMIO memory(eg PCI BAR) alone, the compiler
> barrier is enough to keep ordering, that's why both rte_io_mb and
> rte_cio_mb are defined as compiler barriers, right?

Yes, that's right for x86.

> 2. How about the ordering of interleaved CIO and MMIO accesses, for example,
> a young store to MMIO can be reordered before an older store to CIO? CIO
> may be faster than devices, but store buffers or caching may cause the CIO
> update not visible to the device(in a common doorbell case)?
> 

There's always one kind of cache coherent engine in x86 uncore sub-system.
When CIO write instruction was retried, data will be in CPU LLC.
When device doing inbound read, request will go to cache engine first and then check memory state and retrieve latest value.

> Best regards,
> Gavin
> 
> > -----Original Message-----
> > From: Liu, Yong <yong.liu@intel.com>
> > Sent: Wednesday, September 11, 2019 10:39 AM
> > To: Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; Wang, Yinan
> > <yinan.wang@intel.com>; Maxime Coquelin <maxime.coquelin@redhat.com>;
> > Joyce Kong (Arm Technology China) <Joyce.Kong@arm.com>; dev@dpdk.org
> > Cc: nd <nd@arm.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> > <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> > <xiao.w.wang@intel.com>; jfreimann@redhat.com; Honnappa Nagarahalli
> > <Honnappa.Nagarahalli@arm.com>; Steve Capper <Steve.Capper@arm.com>
> > Subject: RE: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed
> vring
> > desc avail flags
> >
> >
> >
> > > -----Original Message-----
> > > From: Gavin Hu (Arm Technology China) [mailto:Gavin.Hu@arm.com]
> > > Sent: Tuesday, September 10, 2019 5:49 PM
> > > To: Wang, Yinan <yinan.wang@intel.com>; Maxime Coquelin
> > > <maxime.coquelin@redhat.com>; Joyce Kong (Arm Technology China)
> > > <Joyce.Kong@arm.com>; dev@dpdk.org
> > > Cc: nd <nd@arm.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> > > <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> > > <xiao.w.wang@intel.com>; Liu, Yong <yong.liu@intel.com>;
> > > jfreimann@redhat.com; Honnappa Nagarahalli
> > <Honnappa.Nagarahalli@arm.com>;
> > > Steve Capper <Steve.Capper@arm.com>
> > > Subject: RE: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for
> packed
> > > vring desc avail flags
> > >
> > > Hi Yinan,
> > >
> > > We have done a comparative analysis and found with the old code the
> > > if(weak_barriers) and else branches were saved on x86 as rte_smp_wmb
> > and
> > > rte_cio_wmb are identical.
> > > http://git.dpdk.org/dpdk/tree/drivers/net/virtio/virtqueue.h#n49
> > > For the new code, with Joyce's patches applied, the branches were not
> > saved,
> > > which requir additional cpu cycles, this caused slight degradation on
> x86.
> > >
> > > The patches uplifted the performance on aarch64 about 9% as indicated
> in
> > > the cover letter. While I am thinking over a solution to the
> degradation on
> > > x86,could you help answer:
> > > 1. Is rte_cio_wmb is sufficient for the non weak-barrier case(HW
> > > offloading)?
> > >  I got this question because I see in Intel NIC PMDs, it is almost
> never
> > > used, it is rte_wmb that is more widely used to notify the NIC device,
> any
> > > difference between the virtio ring compatible smartNIC device(or vDPA?)
> > and
> > > i40e like devices?
> >
> > Hi Gavin,
> > X86 architecture can guarantee that young store happen later than old
> store.
> > So rte_cio_wmb is just compiler memory barrier in x86.
> >
> > I think compiler barrier is also enough in pmd, rte_wmb is in pmd because
> of
> > it was inherit from first implementation :)
> >
> > Thanks,
> > Marvin
> >
> > > 2. If the rte_cio_wmb is not sufficient for this case and replaced by
> > > stronger barriers, like sfence,  then the branches will not be saved by
> the
> > > compiler, then the problem becomes with the correct use of barriers,
> other
> > > than the degradation.
> > >
> > > Any comments are welcome!
> > >
> > > Best Regards,
> > > Gavin
> > >
> > > > -----Original Message-----
> > > > From: Wang, Yinan <yinan.wang@intel.com>
> > > > Sent: Tuesday, September 10, 2019 11:54 AM
> > > > To: Maxime Coquelin <maxime.coquelin@redhat.com>; Joyce Kong (Arm
> > > > Technology China) <Joyce.Kong@arm.com>; dev@dpdk.org
> > > > Cc: nd <nd@arm.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> > > > <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> > > > <xiao.w.wang@intel.com>; Liu, Yong <yong.liu@intel.com>;
> > > > jfreimann@redhat.com; Honnappa Nagarahalli
> > > > <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> > > > <Gavin.Hu@arm.com>
> > > > Subject: RE: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for
> packed
> > > vring
> > > > desc avail flags
> > > >
> > > >
> > > > Hi Joyce,
> > > >
> > > > I just test performance impact of your patch set with code base
> commit id:
> > > > d03d8622db48918d14bfe805641b1766ecc40088, after applying your v3
> > patch
> > > > set , seven paths of vhost/virtio pvp test shows performance drop as
> > > below:
> > > >
> > > > PVP vhost/virtio 1c1q test	         before apply patch	apply
> patch
> > > > test_perf_pvp_inorder_mergeable     	 7.603	           7.474
> > > > test_perf_pvp_inorder_no_mergeable	     7.642	           7.525
> > > > test_perf_pvp_mergeable	              7.556	           7.431
> > > > test_perf_pvp_normal	                   7.554	           7.478
> > > > test_perf_pvp_vector_rx	               7.581	           7.469
> > > > test_perf_pvp_virtio11_mergeable	           7.068
> 6.905
> > > > test_perf_pvp_virtio11_normal	           7.088	           6.888
> > > >
> > > > Thanks,
> > > > Yinan
> > > >
> > > > > -----Original Message-----
> > > > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Maxime
> > Coquelin
> > > > > Sent: 2019年9月9日 18:10
> > > > > To: Joyce Kong <joyce.kong@arm.com>; dev@dpdk.org
> > > > > Cc: nd@arm.com; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> > > > > <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> > > > > <xiao.w.wang@intel.com>; Liu, Yong <yong.liu@intel.com>;
> > > > > jfreimann@redhat.com; honnappa.nagarahalli@arm.com;
> > > > gavin.hu@arm.com
> > > > > Subject: Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for
> > > packed
> > > > vring
> > > > > desc avail flags
> > > > >
> > > > >
> > > > >
> > > > > On 9/9/19 11:14 AM, Joyce Kong wrote:
> > > > > > In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the
> > > > > > frontend and backend are assumed to be implemented in software,
> > that
> > > > > > is they can run on identical CPUs in an SMP configuration.
> > > > > > Thus a weak form of memory barriers like rte_smp_r/wmb, other
> than
> > > > > > rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers
> == 1)
> > > > > > and yields better performance.
> > > > > > For the above case, this patch helps yielding even better
> performance
> > > > > > by replacing the two-way barriers with C11 one-way barriers for
> avail
> > > > > > flags in packed ring.
> > > > > >
> > > > > > Meanwhile, a read barrier is required to ensure ordering between
> > > > > > descriptor's flags and content reads[1]. With C11, load-acquire
> can
> > > > > > enforce the ordering instead of rmb barrier.
> > > > > >
> > > > > > [1]https://patchwork.dpdk.org/patch/49109/
> > > > > >
> > > > > > Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> > > > > > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> > > > > > Reviewed-by: Phil Yang <phil.yang@arm.com>
> > > > > > ---
> > > > > >  drivers/net/virtio/virtio_rxtx.c                 | 13 +++++++---
> ---
> > > > > >  drivers/net/virtio/virtio_user/virtio_user_dev.c |  6 +++++-
> > > > > >  drivers/net/virtio/virtqueue.h                   | 11
> +++++++++++
> > > > > >  lib/librte_vhost/vhost.h                         |  2 +-
> > > > > >  lib/librte_vhost/virtio_net.c                    | 11 +++++-----
> -
> > > > > >  5 files changed, 29 insertions(+), 14 deletions(-)
> > > > >
> > > > > Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> > > > >
> > > > > Thanks,
> > > > > Maxime

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring desc avail flags
  2019-09-11  6:29             ` Liu, Yong
@ 2019-09-11  8:32               ` Gavin Hu (Arm Technology China)
  2019-09-11 10:02                 ` Bruce Richardson
  0 siblings, 1 reply; 30+ messages in thread
From: Gavin Hu (Arm Technology China) @ 2019-09-11  8:32 UTC (permalink / raw)
  To: Liu, Yong, Wang, Yinan, Maxime Coquelin,
	Joyce Kong (Arm Technology China),
	dev
  Cc: nd, Bie, Tiwei, Wang, Zhihong, amorenoz, Wang,  Xiao W,
	jfreimann, Honnappa Nagarahalli, Steve Capper

Thanks Marvin, my inline comments.

> -----Original Message-----
> From: Liu, Yong <yong.liu@intel.com>
> Sent: Wednesday, September 11, 2019 2:30 PM
> To: Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; Wang, Yinan
> <yinan.wang@intel.com>; Maxime Coquelin <maxime.coquelin@redhat.com>;
> Joyce Kong (Arm Technology China) <Joyce.Kong@arm.com>; dev@dpdk.org
> Cc: nd <nd@arm.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> <xiao.w.wang@intel.com>; jfreimann@redhat.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Steve Capper <Steve.Capper@arm.com>
> Subject: RE: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring
> desc avail flags
> 
> Thanks Gavin, my answers are inline.
> 
> > -----Original Message-----
> > From: Gavin Hu (Arm Technology China) [mailto:Gavin.Hu@arm.com]
> > Sent: Wednesday, September 11, 2019 11:35 AM
> > To: Liu, Yong <yong.liu@intel.com>; Wang, Yinan <yinan.wang@intel.com>;
> > Maxime Coquelin <maxime.coquelin@redhat.com>; Joyce Kong (Arm
> Technology
> > China) <Joyce.Kong@arm.com>; dev@dpdk.org
> > Cc: nd <nd@arm.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> > <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> > <xiao.w.wang@intel.com>; jfreimann@redhat.com; Honnappa Nagarahalli
> > <Honnappa.Nagarahalli@arm.com>; Steve Capper <Steve.Capper@arm.com>
> > Subject: RE: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed
> > vring desc avail flags
> >
> > Hi Marvin,
> >
> > Thanks for your answers, one more question for x86:
> > 1. For CIO memory alone or MMIO memory(eg PCI BAR) alone, the compiler
> > barrier is enough to keep ordering, that's why both rte_io_mb and
> > rte_cio_mb are defined as compiler barriers, right?
> 
> Yes, that's right for x86.
> 
> > 2. How about the ordering of interleaved CIO and MMIO accesses, for
> example,
> > a young store to MMIO can be reordered before an older store to CIO? CIO
> > may be faster than devices, but store buffers or caching may cause the CIO
> > update not visible to the device(in a common doorbell case)?
> >
> 
> There's always one kind of cache coherent engine in x86 uncore sub-system.
> When CIO write instruction was retried, data will be in CPU LLC.
> When device doing inbound read, request will go to cache engine first and
> then check memory state and retrieve latest value.
I understand your words that the cache coherent engine is working like a hub/coordinator/arbiter for all the accesses to three types of memory: 1 - normal memory, 2 - CIO memory, 3 - MMIO memory, and the ordering behaviors are no different?   
Then in what scenarios mfence/sfence/lfence should be used?  Maybe just mfence is enough to keep orderings of store/load(which is the only one might reordered on x86)? 
> 
> > Best regards,
> > Gavin
> >
> > > -----Original Message-----
> > > From: Liu, Yong <yong.liu@intel.com>
> > > Sent: Wednesday, September 11, 2019 10:39 AM
> > > To: Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; Wang, Yinan
> > > <yinan.wang@intel.com>; Maxime Coquelin
> <maxime.coquelin@redhat.com>;
> > > Joyce Kong (Arm Technology China) <Joyce.Kong@arm.com>;
> dev@dpdk.org
> > > Cc: nd <nd@arm.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> > > <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> > > <xiao.w.wang@intel.com>; jfreimann@redhat.com; Honnappa Nagarahalli
> > > <Honnappa.Nagarahalli@arm.com>; Steve Capper
> <Steve.Capper@arm.com>
> > > Subject: RE: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed
> > vring
> > > desc avail flags
> > >
> > >
> > >
> > > > -----Original Message-----
> > > > From: Gavin Hu (Arm Technology China) [mailto:Gavin.Hu@arm.com]
> > > > Sent: Tuesday, September 10, 2019 5:49 PM
> > > > To: Wang, Yinan <yinan.wang@intel.com>; Maxime Coquelin
> > > > <maxime.coquelin@redhat.com>; Joyce Kong (Arm Technology China)
> > > > <Joyce.Kong@arm.com>; dev@dpdk.org
> > > > Cc: nd <nd@arm.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang,
> Zhihong
> > > > <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> > > > <xiao.w.wang@intel.com>; Liu, Yong <yong.liu@intel.com>;
> > > > jfreimann@redhat.com; Honnappa Nagarahalli
> > > <Honnappa.Nagarahalli@arm.com>;
> > > > Steve Capper <Steve.Capper@arm.com>
> > > > Subject: RE: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for
> > packed
> > > > vring desc avail flags
> > > >
> > > > Hi Yinan,
> > > >
> > > > We have done a comparative analysis and found with the old code the
> > > > if(weak_barriers) and else branches were saved on x86 as rte_smp_wmb
> > > and
> > > > rte_cio_wmb are identical.
> > > > http://git.dpdk.org/dpdk/tree/drivers/net/virtio/virtqueue.h#n49
> > > > For the new code, with Joyce's patches applied, the branches were not
> > > saved,
> > > > which requir additional cpu cycles, this caused slight degradation on
> > x86.
> > > >
> > > > The patches uplifted the performance on aarch64 about 9% as indicated
> > in
> > > > the cover letter. While I am thinking over a solution to the
> > degradation on
> > > > x86,could you help answer:
> > > > 1. Is rte_cio_wmb is sufficient for the non weak-barrier case(HW
> > > > offloading)?
> > > >  I got this question because I see in Intel NIC PMDs, it is almost
> > never
> > > > used, it is rte_wmb that is more widely used to notify the NIC device,
> > any
> > > > difference between the virtio ring compatible smartNIC device(or vDPA?)
> > > and
> > > > i40e like devices?
> > >
> > > Hi Gavin,
> > > X86 architecture can guarantee that young store happen later than old
> > store.
> > > So rte_cio_wmb is just compiler memory barrier in x86.
> > >
> > > I think compiler barrier is also enough in pmd, rte_wmb is in pmd because
> > of
> > > it was inherit from first implementation :)
> > >
> > > Thanks,
> > > Marvin
> > >
> > > > 2. If the rte_cio_wmb is not sufficient for this case and replaced by
> > > > stronger barriers, like sfence,  then the branches will not be saved by
> > the
> > > > compiler, then the problem becomes with the correct use of barriers,
> > other
> > > > than the degradation.
> > > >
> > > > Any comments are welcome!
> > > >
> > > > Best Regards,
> > > > Gavin
> > > >
> > > > > -----Original Message-----
> > > > > From: Wang, Yinan <yinan.wang@intel.com>
> > > > > Sent: Tuesday, September 10, 2019 11:54 AM
> > > > > To: Maxime Coquelin <maxime.coquelin@redhat.com>; Joyce Kong
> (Arm
> > > > > Technology China) <Joyce.Kong@arm.com>; dev@dpdk.org
> > > > > Cc: nd <nd@arm.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang,
> Zhihong
> > > > > <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> > > > > <xiao.w.wang@intel.com>; Liu, Yong <yong.liu@intel.com>;
> > > > > jfreimann@redhat.com; Honnappa Nagarahalli
> > > > > <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> > > > > <Gavin.Hu@arm.com>
> > > > > Subject: RE: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for
> > packed
> > > > vring
> > > > > desc avail flags
> > > > >
> > > > >
> > > > > Hi Joyce,
> > > > >
> > > > > I just test performance impact of your patch set with code base
> > commit id:
> > > > > d03d8622db48918d14bfe805641b1766ecc40088, after applying your
> v3
> > > patch
> > > > > set , seven paths of vhost/virtio pvp test shows performance drop as
> > > > below:
> > > > >
> > > > > PVP vhost/virtio 1c1q test	         before apply patch	apply
> > patch
> > > > > test_perf_pvp_inorder_mergeable     	 7.603	           7.474
> > > > > test_perf_pvp_inorder_no_mergeable	     7.642	           7.525
> > > > > test_perf_pvp_mergeable	              7.556	           7.431
> > > > > test_perf_pvp_normal	                   7.554	           7.478
> > > > > test_perf_pvp_vector_rx	               7.581	           7.469
> > > > > test_perf_pvp_virtio11_mergeable	           7.068
> > 6.905
> > > > > test_perf_pvp_virtio11_normal	           7.088	           6.888
> > > > >
> > > > > Thanks,
> > > > > Yinan
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Maxime
> > > Coquelin
> > > > > > Sent: 2019年9月9日 18:10
> > > > > > To: Joyce Kong <joyce.kong@arm.com>; dev@dpdk.org
> > > > > > Cc: nd@arm.com; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> > > > > > <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> > > > > > <xiao.w.wang@intel.com>; Liu, Yong <yong.liu@intel.com>;
> > > > > > jfreimann@redhat.com; honnappa.nagarahalli@arm.com;
> > > > > gavin.hu@arm.com
> > > > > > Subject: Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for
> > > > packed
> > > > > vring
> > > > > > desc avail flags
> > > > > >
> > > > > >
> > > > > >
> > > > > > On 9/9/19 11:14 AM, Joyce Kong wrote:
> > > > > > > In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then
> the
> > > > > > > frontend and backend are assumed to be implemented in software,
> > > that
> > > > > > > is they can run on identical CPUs in an SMP configuration.
> > > > > > > Thus a weak form of memory barriers like rte_smp_r/wmb, other
> > than
> > > > > > > rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers
> > == 1)
> > > > > > > and yields better performance.
> > > > > > > For the above case, this patch helps yielding even better
> > performance
> > > > > > > by replacing the two-way barriers with C11 one-way barriers for
> > avail
> > > > > > > flags in packed ring.
> > > > > > >
> > > > > > > Meanwhile, a read barrier is required to ensure ordering between
> > > > > > > descriptor's flags and content reads[1]. With C11, load-acquire
> > can
> > > > > > > enforce the ordering instead of rmb barrier.
> > > > > > >
> > > > > > > [1]https://patchwork.dpdk.org/patch/49109/
> > > > > > >
> > > > > > > Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> > > > > > > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> > > > > > > Reviewed-by: Phil Yang <phil.yang@arm.com>
> > > > > > > ---
> > > > > > >  drivers/net/virtio/virtio_rxtx.c                 | 13 +++++++---
> > ---
> > > > > > >  drivers/net/virtio/virtio_user/virtio_user_dev.c |  6 +++++-
> > > > > > >  drivers/net/virtio/virtqueue.h                   | 11
> > +++++++++++
> > > > > > >  lib/librte_vhost/vhost.h                         |  2 +-
> > > > > > >  lib/librte_vhost/virtio_net.c                    | 11 +++++-----
> > -
> > > > > > >  5 files changed, 29 insertions(+), 14 deletions(-)
> > > > > >
> > > > > > Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> > > > > >
> > > > > > Thanks,
> > > > > > Maxime

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring desc avail flags
  2019-09-11  8:32               ` Gavin Hu (Arm Technology China)
@ 2019-09-11 10:02                 ` Bruce Richardson
  2019-09-12  8:21                   ` Gavin Hu (Arm Technology China)
  0 siblings, 1 reply; 30+ messages in thread
From: Bruce Richardson @ 2019-09-11 10:02 UTC (permalink / raw)
  To: Gavin Hu (Arm Technology China)
  Cc: Liu, Yong, Wang, Yinan, Maxime Coquelin,
	Joyce Kong (Arm Technology China),
	dev, nd, Bie, Tiwei, Wang, Zhihong, amorenoz, Wang,  Xiao W,
	jfreimann, Honnappa Nagarahalli, Steve Capper

On Wed, Sep 11, 2019 at 08:32:16AM +0000, Gavin Hu (Arm Technology China) wrote:
> Thanks Marvin, my inline comments.
> 
> > -----Original Message-----
> > From: Liu, Yong <yong.liu@intel.com>
> > Sent: Wednesday, September 11, 2019 2:30 PM
> > To: Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; Wang, Yinan
> > <yinan.wang@intel.com>; Maxime Coquelin <maxime.coquelin@redhat.com>;
> > Joyce Kong (Arm Technology China) <Joyce.Kong@arm.com>; dev@dpdk.org
> > Cc: nd <nd@arm.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> > <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> > <xiao.w.wang@intel.com>; jfreimann@redhat.com; Honnappa Nagarahalli
> > <Honnappa.Nagarahalli@arm.com>; Steve Capper <Steve.Capper@arm.com>
> > Subject: RE: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring
> > desc avail flags
> > 
> > Thanks Gavin, my answers are inline.
> > 
> > > -----Original Message-----
> > > From: Gavin Hu (Arm Technology China) [mailto:Gavin.Hu@arm.com]
> > > Sent: Wednesday, September 11, 2019 11:35 AM
> > > To: Liu, Yong <yong.liu@intel.com>; Wang, Yinan <yinan.wang@intel.com>;
> > > Maxime Coquelin <maxime.coquelin@redhat.com>; Joyce Kong (Arm
> > Technology
> > > China) <Joyce.Kong@arm.com>; dev@dpdk.org
> > > Cc: nd <nd@arm.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> > > <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> > > <xiao.w.wang@intel.com>; jfreimann@redhat.com; Honnappa Nagarahalli
> > > <Honnappa.Nagarahalli@arm.com>; Steve Capper <Steve.Capper@arm.com>
> > > Subject: RE: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed
> > > vring desc avail flags
> > >
> > > Hi Marvin,
> > >
> > > Thanks for your answers, one more question for x86:
> > > 1. For CIO memory alone or MMIO memory(eg PCI BAR) alone, the compiler
> > > barrier is enough to keep ordering, that's why both rte_io_mb and
> > > rte_cio_mb are defined as compiler barriers, right?
> > 
> > Yes, that's right for x86.
> > 
> > > 2. How about the ordering of interleaved CIO and MMIO accesses, for
> > example,
> > > a young store to MMIO can be reordered before an older store to CIO? CIO
> > > may be faster than devices, but store buffers or caching may cause the CIO
> > > update not visible to the device(in a common doorbell case)?
> > >
> > 
> > There's always one kind of cache coherent engine in x86 uncore sub-system.
> > When CIO write instruction was retried, data will be in CPU LLC.
> > When device doing inbound read, request will go to cache engine first and
> > then check memory state and retrieve latest value.
> I understand your words that the cache coherent engine is working like a hub/coordinator/arbiter for all the accesses to three types of memory: 1 - normal memory, 2 - CIO memory, 3 - MMIO memory, and the ordering behaviors are no different?   
> Then in what scenarios mfence/sfence/lfence should be used?  Maybe just mfence is enough to keep orderings of store/load(which is the only one might reordered on x86)? 
> > 

The fence types needed will depend on the memory types used, for example,
any memory mapped as write-combining will have different behaviour and
need different fences to the regular write-back memory we are most familiar
with. For the situations we deal with in DPDK, for regular memory writes
and MMIO writes, reads won't be reordered with other reads, and writes
won't be reordered with other writes, so therefore, as you point out, the
mfence instruction is only rarely needed, and barriers to prevent compiler
reordering are sufficient in nearly all cases.

/Bruce

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring desc avail flags
  2019-09-11 10:02                 ` Bruce Richardson
@ 2019-09-12  8:21                   ` Gavin Hu (Arm Technology China)
  0 siblings, 0 replies; 30+ messages in thread
From: Gavin Hu (Arm Technology China) @ 2019-09-12  8:21 UTC (permalink / raw)
  To: Bruce Richardson
  Cc: Liu, Yong, Wang, Yinan, Maxime Coquelin,
	Joyce Kong (Arm Technology China),
	dev, nd, Bie, Tiwei, Wang, Zhihong, amorenoz, Wang,  Xiao W,
	jfreimann, Honnappa Nagarahalli, Steve Capper

Hi Bruce,
> -----Original Message-----
> From: Bruce Richardson <bruce.richardson@intel.com>
> Sent: Wednesday, September 11, 2019 6:03 PM
> To: Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>
> Cc: Liu, Yong <yong.liu@intel.com>; Wang, Yinan <yinan.wang@intel.com>;
> Maxime Coquelin <maxime.coquelin@redhat.com>; Joyce Kong (Arm
> Technology China) <Joyce.Kong@arm.com>; dev@dpdk.org; nd
> <nd@arm.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> <xiao.w.wang@intel.com>; jfreimann@redhat.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Steve Capper <Steve.Capper@arm.com>
> Subject: Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring
> desc avail flags
> 
> On Wed, Sep 11, 2019 at 08:32:16AM +0000, Gavin Hu (Arm Technology China)
> wrote:
> > Thanks Marvin, my inline comments.
> >
> > > -----Original Message-----
> > > From: Liu, Yong <yong.liu@intel.com>
> > > Sent: Wednesday, September 11, 2019 2:30 PM
> > > To: Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; Wang, Yinan
> > > <yinan.wang@intel.com>; Maxime Coquelin
> <maxime.coquelin@redhat.com>;
> > > Joyce Kong (Arm Technology China) <Joyce.Kong@arm.com>;
> dev@dpdk.org
> > > Cc: nd <nd@arm.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong
> > > <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> > > <xiao.w.wang@intel.com>; jfreimann@redhat.com; Honnappa Nagarahalli
> > > <Honnappa.Nagarahalli@arm.com>; Steve Capper
> <Steve.Capper@arm.com>
> > > Subject: RE: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed
> vring
> > > desc avail flags
> > >
> > > Thanks Gavin, my answers are inline.
> > >
> > > > -----Original Message-----
> > > > From: Gavin Hu (Arm Technology China) [mailto:Gavin.Hu@arm.com]
> > > > Sent: Wednesday, September 11, 2019 11:35 AM
> > > > To: Liu, Yong <yong.liu@intel.com>; Wang, Yinan
> <yinan.wang@intel.com>;
> > > > Maxime Coquelin <maxime.coquelin@redhat.com>; Joyce Kong (Arm
> > > Technology
> > > > China) <Joyce.Kong@arm.com>; dev@dpdk.org
> > > > Cc: nd <nd@arm.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang,
> Zhihong
> > > > <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> > > > <xiao.w.wang@intel.com>; jfreimann@redhat.com; Honnappa
> Nagarahalli
> > > > <Honnappa.Nagarahalli@arm.com>; Steve Capper
> <Steve.Capper@arm.com>
> > > > Subject: RE: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed
> > > > vring desc avail flags
> > > >
> > > > Hi Marvin,
> > > >
> > > > Thanks for your answers, one more question for x86:
> > > > 1. For CIO memory alone or MMIO memory(eg PCI BAR) alone, the
> compiler
> > > > barrier is enough to keep ordering, that's why both rte_io_mb and
> > > > rte_cio_mb are defined as compiler barriers, right?
> > >
> > > Yes, that's right for x86.
> > >
> > > > 2. How about the ordering of interleaved CIO and MMIO accesses, for
> > > example,
> > > > a young store to MMIO can be reordered before an older store to CIO?
> CIO
> > > > may be faster than devices, but store buffers or caching may cause the
> CIO
> > > > update not visible to the device(in a common doorbell case)?
> > > >
> > >
> > > There's always one kind of cache coherent engine in x86 uncore sub-
> system.
> > > When CIO write instruction was retried, data will be in CPU LLC.
> > > When device doing inbound read, request will go to cache engine first and
> > > then check memory state and retrieve latest value.
> > I understand your words that the cache coherent engine is working like a
> hub/coordinator/arbiter for all the accesses to three types of memory: 1 -
> normal memory, 2 - CIO memory, 3 - MMIO memory, and the ordering
> behaviors are no different?
> > Then in what scenarios mfence/sfence/lfence should be used?  Maybe just
> mfence is enough to keep orderings of store/load(which is the only one might
> reordered on x86)?
> > >
> 
> The fence types needed will depend on the memory types used, for example,
> any memory mapped as write-combining will have different behaviour and
> need different fences to the regular write-back memory we are most familiar
> with. For the situations we deal with in DPDK, for regular memory writes
> and MMIO writes, reads won't be reordered with other reads, and writes
> won't be reordered with other writes, so therefore, as you point out, the
> mfence instruction is only rarely needed, and barriers to prevent compiler
> reordering are sufficient in nearly all cases.
Thanks for your explanation about the barriers on x86, it is really helpful for us
to optimize PMDs for aarch64 by using less restrictive barriers while not breaking x86 platforms.

> 
> /Bruce

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v4 0/2] virtio: one way barrier for packed vring flags
  2019-08-27  8:19 [dpdk-dev] [RFC PATCH 0/2] virtio: one way barrier for packed vring flags Joyce Kong
                   ` (7 preceding siblings ...)
  2019-09-09  9:14 ` [dpdk-dev] [PATCH v3 2/2] virtio: one way barrier for packed vring desc used flags Joyce Kong
@ 2019-09-17  5:28 ` Joyce Kong
  2019-10-16 11:07   ` Maxime Coquelin
  2019-09-17  5:28 ` [dpdk-dev] [PATCH v4 1/2] virtio: one way barrier for packed vring desc avail flags Joyce Kong
  2019-09-17  5:28 ` [dpdk-dev] [PATCH v4 2/2] virtio: one way barrier for packed vring desc used flags Joyce Kong
  10 siblings, 1 reply; 30+ messages in thread
From: Joyce Kong @ 2019-09-17  5:28 UTC (permalink / raw)
  To: dev
  Cc: nd, maxime.coquelin, yinan.wang, bruce.richardson, tiwei.bie,
	zhihong.wang, amorenoz, xiao.w.wang, yong.liu, jfreimann,
	honnappa.nagarahalli, gavin.hu

This patch set replaces the two-way barriers with C11 one-way barriers
for packed vring flags, when the frontend and backend are implemented
in software.

By doing vhost-user + virtio-user case benchmarking, 9% performance gain
in the RFC2544 test was measured on Thunderx2 platform.[1] And by doing
VM2VM case benchmarking, 11% perf gain was measured on Ampere platform.

[1]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html
   PVP test with virtio 1.1 mergeable path

v4:
Use rte_smp_rmb/wmb instead of __atomic_load/store_n on x86 as it reports a
better perf(~1.5%), which comes from the saved branch by the compiler. The
if and else branch are identical with the smp and cio barriers both defined
as compiler barriers on x86.
http://inbox.dpdk.org/dev/E0CBA5A1980F1F408E1F28F9991B5B1D50EFF246@SHSMSX104.ccr.corp.intel.com/

v3:
Wrap C11 one-way barriers and DMA barriers(rte_cio_*) together with an inline fuction.

v2:
Convert RFC to patch.

Joyce Kong (2):
  virtio: one way barrier for packed vring desc avail flags
  virtio: one way barrier for packed vring desc used flags

 drivers/net/virtio/virtio_rxtx.c                 | 25 +++++++-----
 drivers/net/virtio/virtio_user/virtio_user_dev.c | 10 +++--
 drivers/net/virtio/virtqueue.h                   | 49 +++++++++++++++++++++++-
 lib/librte_vhost/vhost.h                         |  2 +-
 lib/librte_vhost/virtio_net.c                    | 16 ++++----
 5 files changed, 79 insertions(+), 23 deletions(-)

-- 
2.7.4


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v4 1/2] virtio: one way barrier for packed vring desc avail flags
  2019-08-27  8:19 [dpdk-dev] [RFC PATCH 0/2] virtio: one way barrier for packed vring flags Joyce Kong
                   ` (8 preceding siblings ...)
  2019-09-17  5:28 ` [dpdk-dev] [PATCH v4 0/2] virtio: one way barrier for packed vring flags Joyce Kong
@ 2019-09-17  5:28 ` Joyce Kong
  2019-10-14  7:42   ` Maxime Coquelin
  2019-09-17  5:28 ` [dpdk-dev] [PATCH v4 2/2] virtio: one way barrier for packed vring desc used flags Joyce Kong
  10 siblings, 1 reply; 30+ messages in thread
From: Joyce Kong @ 2019-09-17  5:28 UTC (permalink / raw)
  To: dev
  Cc: nd, maxime.coquelin, yinan.wang, bruce.richardson, tiwei.bie,
	zhihong.wang, amorenoz, xiao.w.wang, yong.liu, jfreimann,
	honnappa.nagarahalli, gavin.hu

In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend
and backend are assumed to be implemented in software, that is they can
run on identical CPUs in an SMP configuration.
Thus a weak form of memory barriers like rte_smp_r/wmb, other than
rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
and yields better performance.
For the above case, this patch helps yielding even better performance
by replacing the two-way barriers with C11 one-way barriers for avail
flags in packed ring.

Meanwhile, a read barrier is required to ensure ordering between
descriptor's flags and content reads[1]. With C11, load-acquire can
enforce the ordering instead of rmb barrier.

[1]https://patchwork.dpdk.org/patch/49109/

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
Reviewed-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
 drivers/net/virtio/virtio_rxtx.c                 | 13 +++++++------
 drivers/net/virtio/virtio_user/virtio_user_dev.c |  6 +++++-
 drivers/net/virtio/virtqueue.h                   | 21 +++++++++++++++++++++
 lib/librte_vhost/vhost.h                         |  2 +-
 lib/librte_vhost/virtio_net.c                    | 11 +++++------
 5 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index 27ead19..a87ffe1 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -456,8 +456,10 @@ virtqueue_enqueue_recv_refill_packed(struct virtqueue *vq,
 		vq->vq_desc_head_idx = dxp->next;
 		if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
 			vq->vq_desc_tail_idx = vq->vq_desc_head_idx;
-		virtio_wmb(hw->weak_barriers);
-		start_dp[idx].flags = flags;
+
+		virtqueue_store_flags_packed(&start_dp[idx], flags,
+					     hw->weak_barriers);
+
 		if (++vq->vq_avail_idx >= vq->vq_nentries) {
 			vq->vq_avail_idx -= vq->vq_nentries;
 			vq->vq_packed.cached_flags ^=
@@ -671,8 +673,7 @@ virtqueue_enqueue_xmit_packed_fast(struct virtnet_tx *txvq,
 			vq->vq_desc_tail_idx = VQ_RING_DESC_CHAIN_END;
 	}
 
-	virtio_wmb(vq->hw->weak_barriers);
-	dp->flags = flags;
+	virtqueue_store_flags_packed(dp, flags, vq->hw->weak_barriers);
 }
 
 static inline void
@@ -763,8 +764,8 @@ virtqueue_enqueue_xmit_packed(struct virtnet_tx *txvq, struct rte_mbuf *cookie,
 			vq->vq_desc_tail_idx = VQ_RING_DESC_CHAIN_END;
 	}
 
-	virtio_wmb(vq->hw->weak_barriers);
-	head_dp->flags = head_flags;
+	virtqueue_store_flags_packed(head_dp, head_flags,
+				     vq->hw->weak_barriers);
 }
 
 static inline void
diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c
index fab87eb..7911c39 100644
--- a/drivers/net/virtio/virtio_user/virtio_user_dev.c
+++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c
@@ -624,7 +624,7 @@ virtio_user_handle_ctrl_msg(struct virtio_user_dev *dev, struct vring *vring,
 static inline int
 desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter)
 {
-	uint16_t flags = desc->flags;
+	uint16_t flags = __atomic_load_n(&desc->flags, __ATOMIC_ACQUIRE);
 
 	return wrap_counter == !!(flags & VRING_PACKED_DESC_F_AVAIL) &&
 		wrap_counter != !!(flags & VRING_PACKED_DESC_F_USED);
@@ -684,6 +684,10 @@ virtio_user_handle_cq_packed(struct virtio_user_dev *dev, uint16_t queue_idx)
 	struct vring_packed *vring = &dev->packed_vrings[queue_idx];
 	uint16_t n_descs, flags;
 
+	/* Perform a load-acquire barrier in desc_is_avail to
+	 * enforce the ordering between desc flags and desc
+	 * content.
+	 */
 	while (desc_is_avail(&vring->desc[vq->used_idx],
 			     vq->used_wrap_counter)) {
 
diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
index c6dd4a3..b728ff8 100644
--- a/drivers/net/virtio/virtqueue.h
+++ b/drivers/net/virtio/virtqueue.h
@@ -54,6 +54,27 @@ virtio_wmb(uint8_t weak_barriers)
 		rte_cio_wmb();
 }
 
+static inline void
+virtqueue_store_flags_packed(struct vring_packed_desc *dp,
+			      uint16_t flags, uint8_t weak_barriers)
+{
+	if (weak_barriers) {
+/* x86 prefers to using rte_smp_wmb over __atomic_store_n as it reports
+ * a better perf(~1.5%), which comes from the saved branch by the compiler.
+ * The if and else branch are identical with the smp and cio barriers both
+ * defined as compiler barriers on x86.
+ */
+#ifdef RTE_ARCH_X86_64
+		rte_smp_wmb();
+		dp->flags = flags;
+#else
+		__atomic_store_n(&dp->flags, flags, __ATOMIC_RELEASE);
+#endif
+	} else {
+		rte_cio_wmb();
+		dp->flags = flags;
+	}
+}
 #ifdef RTE_PMD_PACKET_PREFETCH
 #define rte_packet_prefetch(p)  rte_prefetch1(p)
 #else
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 884befa..d294ed1 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -344,7 +344,7 @@ vq_is_packed(struct virtio_net *dev)
 static inline bool
 desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter)
 {
-	uint16_t flags = *((volatile uint16_t *) &desc->flags);
+	uint16_t flags = __atomic_load_n(&desc->flags, __ATOMIC_ACQUIRE);
 
 	return wrap_counter == !!(flags & VRING_DESC_F_AVAIL) &&
 		wrap_counter != !!(flags & VRING_DESC_F_USED);
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 5b85b83..e7463ff 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -503,14 +503,13 @@ fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	if (avail_idx < vq->last_avail_idx)
 		wrap_counter ^= 1;
 
-	if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)))
-		return -1;
-
 	/*
-	 * The ordering between desc flags and desc
-	 * content reads need to be enforced.
+	 * Perform a load-acquire barrier in desc_is_avail to
+	 * enforce the ordering between desc flags and desc
+	 * content.
 	 */
-	rte_smp_rmb();
+	if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)))
+		return -1;
 
 	*desc_count = 0;
 	*len = 0;
-- 
2.7.4


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v4 2/2] virtio: one way barrier for packed vring desc used flags
  2019-08-27  8:19 [dpdk-dev] [RFC PATCH 0/2] virtio: one way barrier for packed vring flags Joyce Kong
                   ` (9 preceding siblings ...)
  2019-09-17  5:28 ` [dpdk-dev] [PATCH v4 1/2] virtio: one way barrier for packed vring desc avail flags Joyce Kong
@ 2019-09-17  5:28 ` Joyce Kong
  2019-09-18  5:20   ` Wang, Yinan
  2019-10-14  7:43   ` Maxime Coquelin
  10 siblings, 2 replies; 30+ messages in thread
From: Joyce Kong @ 2019-09-17  5:28 UTC (permalink / raw)
  To: dev
  Cc: nd, maxime.coquelin, yinan.wang, bruce.richardson, tiwei.bie,
	zhihong.wang, amorenoz, xiao.w.wang, yong.liu, jfreimann,
	honnappa.nagarahalli, gavin.hu

In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend
and backend are assumed to be implemented in software, that is they can
run on identical CPUs in an SMP configuration.
Thus a weak form of memory barriers like rte_smp_r/wmb, other than
rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
and yields better performance.
For the above case, this patch helps yielding even better performance
by replacing the two-way barriers with C11 one-way barriers for used
flags in packed ring.

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
Reviewed-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
 drivers/net/virtio/virtio_rxtx.c                 | 12 +++++++---
 drivers/net/virtio/virtio_user/virtio_user_dev.c |  4 ++--
 drivers/net/virtio/virtqueue.h                   | 28 +++++++++++++++++++++++-
 lib/librte_vhost/virtio_net.c                    |  5 ++---
 4 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index a87ffe1..2f0879c 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -122,9 +122,11 @@ virtqueue_dequeue_burst_rx_packed(struct virtqueue *vq,
 
 	for (i = 0; i < num; i++) {
 		used_idx = vq->vq_used_cons_idx;
+		/* desc_is_used has a load-acquire or rte_cio_rmb inside
+		 * and wait for used desc in virtqueue.
+		 */
 		if (!desc_is_used(&desc[used_idx], vq))
 			return i;
-		virtio_rmb(vq->hw->weak_barriers);
 		len[i] = desc[used_idx].len;
 		id = desc[used_idx].id;
 		cookie = (struct rte_mbuf *)vq->vq_descx[id].cookie;
@@ -233,8 +235,10 @@ virtio_xmit_cleanup_inorder_packed(struct virtqueue *vq, int num)
 	struct vq_desc_extra *dxp;
 
 	used_idx = vq->vq_used_cons_idx;
+	/* desc_is_used has a load-acquire or rte_cio_rmb inside
+	 * and wait for used desc in virtqueue.
+	 */
 	while (num > 0 && desc_is_used(&desc[used_idx], vq)) {
-		virtio_rmb(vq->hw->weak_barriers);
 		id = desc[used_idx].id;
 		do {
 			curr_id = used_idx;
@@ -265,8 +269,10 @@ virtio_xmit_cleanup_normal_packed(struct virtqueue *vq, int num)
 	struct vq_desc_extra *dxp;
 
 	used_idx = vq->vq_used_cons_idx;
+	/* desc_is_used has a load-acquire or rte_cio_rmb inside
+	 * and wait for used desc in virtqueue.
+	 */
 	while (num-- && desc_is_used(&desc[used_idx], vq)) {
-		virtio_rmb(vq->hw->weak_barriers);
 		id = desc[used_idx].id;
 		dxp = &vq->vq_descx[id];
 		vq->vq_used_cons_idx += dxp->ndescs;
diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c
index 7911c39..1c575d0 100644
--- a/drivers/net/virtio/virtio_user/virtio_user_dev.c
+++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c
@@ -698,8 +698,8 @@ virtio_user_handle_cq_packed(struct virtio_user_dev *dev, uint16_t queue_idx)
 		if (vq->used_wrap_counter)
 			flags |= VRING_PACKED_DESC_F_AVAIL_USED;
 
-		rte_smp_wmb();
-		vring->desc[vq->used_idx].flags = flags;
+		__atomic_store_n(&vring->desc[vq->used_idx].flags, flags,
+				 __ATOMIC_RELEASE);
 
 		vq->used_idx += n_descs;
 		if (vq->used_idx >= dev->queue_size) {
diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
index b728ff8..8d7f197 100644
--- a/drivers/net/virtio/virtqueue.h
+++ b/drivers/net/virtio/virtqueue.h
@@ -54,6 +54,32 @@ virtio_wmb(uint8_t weak_barriers)
 		rte_cio_wmb();
 }
 
+static inline uint16_t
+virtqueue_fetch_flags_packed(struct vring_packed_desc *dp,
+			      uint8_t weak_barriers)
+{
+	uint16_t flags;
+
+	if (weak_barriers) {
+/* x86 prefers to using rte_smp_rmb over __atomic_load_n as it reports
+ * a better perf(~1.5%), which comes from the saved branch by the compiler.
+ * The if and else branch are identical with the smp and cio barriers both
+ * defined as compiler barriers on x86.
+ */
+#ifdef RTE_ARCH_X86_64
+		flags = dp->flags;
+		rte_smp_rmb();
+#else
+		flags = __atomic_load_n(&dp->flags, __ATOMIC_ACQUIRE);
+#endif
+	} else {
+		flags = dp->flags;
+		rte_cio_rmb();
+	}
+
+	return flags;
+}
+
 static inline void
 virtqueue_store_flags_packed(struct vring_packed_desc *dp,
 			      uint16_t flags, uint8_t weak_barriers)
@@ -307,7 +333,7 @@ desc_is_used(struct vring_packed_desc *desc, struct virtqueue *vq)
 {
 	uint16_t used, avail, flags;
 
-	flags = desc->flags;
+	flags = virtqueue_fetch_flags_packed(desc, vq->hw->weak_barriers);
 	used = !!(flags & VRING_PACKED_DESC_F_USED);
 	avail = !!(flags & VRING_PACKED_DESC_F_AVAIL);
 
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index e7463ff..241d467 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -110,8 +110,6 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
 			used_idx -= vq->size;
 	}
 
-	rte_smp_wmb();
-
 	for (i = 0; i < vq->shadow_used_idx; i++) {
 		uint16_t flags;
 
@@ -147,7 +145,8 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
 		}
 	}
 
-	vq->desc_packed[head_idx].flags = head_flags;
+	__atomic_store_n(&vq->desc_packed[head_idx].flags, head_flags,
+			 __ATOMIC_RELEASE);
 
 	vhost_log_cache_used_vring(dev, vq,
 				head_idx *
-- 
2.7.4


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v4 2/2] virtio: one way barrier for packed vring desc used flags
  2019-09-17  5:28 ` [dpdk-dev] [PATCH v4 2/2] virtio: one way barrier for packed vring desc used flags Joyce Kong
@ 2019-09-18  5:20   ` Wang, Yinan
  2019-09-19  4:04     ` Gavin Hu (Arm Technology China)
  2019-10-14  7:43   ` Maxime Coquelin
  1 sibling, 1 reply; 30+ messages in thread
From: Wang, Yinan @ 2019-09-18  5:20 UTC (permalink / raw)
  To: Joyce Kong, dev
  Cc: nd, maxime.coquelin, Richardson, Bruce, Bie, Tiwei, Wang,
	Zhihong, amorenoz, Wang, Xiao W, Liu, Yong, jfreimann,
	honnappa.nagarahalli, gavin.hu


Hi Joyce,

I test performance impact of your patch set with code base commit id: d03d8622db48918d14bfe805641b1766ecc40088, after applying your v4 patch set , packed ring shows small performance drop as below:

PVP vhost/virtio 1c1q test	       commit:d03d8622db48918	apply v4 patch set

pvp_virtio11_mergeable	                   7.218	           7.147
pvp_virtio11_normal	                   7.217	           7.182


Thanks,
Yinan




> -----Original Message-----
> From: Joyce Kong [mailto:joyce.kong@arm.com]
> Sent: 2019年9月17日 13:28
> To: dev@dpdk.org
> Cc: nd@arm.com; maxime.coquelin@redhat.com; Wang, Yinan
> <yinan.wang@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>; Bie,
> Tiwei <tiwei.bie@intel.com>; Wang, Zhihong <zhihong.wang@intel.com>;
> amorenoz@redhat.com; Wang, Xiao W <xiao.w.wang@intel.com>; Liu, Yong
> <yong.liu@intel.com>; jfreimann@redhat.com; honnappa.nagarahalli@arm.com;
> gavin.hu@arm.com
> Subject: [PATCH v4 2/2] virtio: one way barrier for packed vring desc used flags
> 
> In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend
> and backend are assumed to be implemented in software, that is they can run on
> identical CPUs in an SMP configuration.
> Thus a weak form of memory barriers like rte_smp_r/wmb, other than
> rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1) and
> yields better performance.
> For the above case, this patch helps yielding even better performance by
> replacing the two-way barriers with C11 one-way barriers for used flags in packed
> ring.
> 
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> Reviewed-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> ---
>  drivers/net/virtio/virtio_rxtx.c                 | 12 +++++++---
>  drivers/net/virtio/virtio_user/virtio_user_dev.c |  4 ++--
>  drivers/net/virtio/virtqueue.h                   | 28
> +++++++++++++++++++++++-
>  lib/librte_vhost/virtio_net.c                    |  5 ++---
>  4 files changed, 40 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
> index a87ffe1..2f0879c 100644
> --- a/drivers/net/virtio/virtio_rxtx.c
> +++ b/drivers/net/virtio/virtio_rxtx.c
> @@ -122,9 +122,11 @@ virtqueue_dequeue_burst_rx_packed(struct virtqueue
> *vq,
> 
>  	for (i = 0; i < num; i++) {
>  		used_idx = vq->vq_used_cons_idx;
> +		/* desc_is_used has a load-acquire or rte_cio_rmb inside
> +		 * and wait for used desc in virtqueue.
> +		 */
>  		if (!desc_is_used(&desc[used_idx], vq))
>  			return i;
> -		virtio_rmb(vq->hw->weak_barriers);
>  		len[i] = desc[used_idx].len;
>  		id = desc[used_idx].id;
>  		cookie = (struct rte_mbuf *)vq->vq_descx[id].cookie; @@ -233,8
> +235,10 @@ virtio_xmit_cleanup_inorder_packed(struct virtqueue *vq, int num)
>  	struct vq_desc_extra *dxp;
> 
>  	used_idx = vq->vq_used_cons_idx;
> +	/* desc_is_used has a load-acquire or rte_cio_rmb inside
> +	 * and wait for used desc in virtqueue.
> +	 */
>  	while (num > 0 && desc_is_used(&desc[used_idx], vq)) {
> -		virtio_rmb(vq->hw->weak_barriers);
>  		id = desc[used_idx].id;
>  		do {
>  			curr_id = used_idx;
> @@ -265,8 +269,10 @@ virtio_xmit_cleanup_normal_packed(struct virtqueue
> *vq, int num)
>  	struct vq_desc_extra *dxp;
> 
>  	used_idx = vq->vq_used_cons_idx;
> +	/* desc_is_used has a load-acquire or rte_cio_rmb inside
> +	 * and wait for used desc in virtqueue.
> +	 */
>  	while (num-- && desc_is_used(&desc[used_idx], vq)) {
> -		virtio_rmb(vq->hw->weak_barriers);
>  		id = desc[used_idx].id;
>  		dxp = &vq->vq_descx[id];
>  		vq->vq_used_cons_idx += dxp->ndescs;
> diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c
> b/drivers/net/virtio/virtio_user/virtio_user_dev.c
> index 7911c39..1c575d0 100644
> --- a/drivers/net/virtio/virtio_user/virtio_user_dev.c
> +++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c
> @@ -698,8 +698,8 @@ virtio_user_handle_cq_packed(struct virtio_user_dev
> *dev, uint16_t queue_idx)
>  		if (vq->used_wrap_counter)
>  			flags |= VRING_PACKED_DESC_F_AVAIL_USED;
> 
> -		rte_smp_wmb();
> -		vring->desc[vq->used_idx].flags = flags;
> +		__atomic_store_n(&vring->desc[vq->used_idx].flags, flags,
> +				 __ATOMIC_RELEASE);
> 
>  		vq->used_idx += n_descs;
>  		if (vq->used_idx >= dev->queue_size) { diff --git
> a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h index
> b728ff8..8d7f197 100644
> --- a/drivers/net/virtio/virtqueue.h
> +++ b/drivers/net/virtio/virtqueue.h
> @@ -54,6 +54,32 @@ virtio_wmb(uint8_t weak_barriers)
>  		rte_cio_wmb();
>  }
> 
> +static inline uint16_t
> +virtqueue_fetch_flags_packed(struct vring_packed_desc *dp,
> +			      uint8_t weak_barriers)
> +{
> +	uint16_t flags;
> +
> +	if (weak_barriers) {
> +/* x86 prefers to using rte_smp_rmb over __atomic_load_n as it reports
> + * a better perf(~1.5%), which comes from the saved branch by the compiler.
> + * The if and else branch are identical with the smp and cio barriers
> +both
> + * defined as compiler barriers on x86.
> + */
> +#ifdef RTE_ARCH_X86_64
> +		flags = dp->flags;
> +		rte_smp_rmb();
> +#else
> +		flags = __atomic_load_n(&dp->flags, __ATOMIC_ACQUIRE); #endif
> +	} else {
> +		flags = dp->flags;
> +		rte_cio_rmb();
> +	}
> +
> +	return flags;
> +}
> +
>  static inline void
>  virtqueue_store_flags_packed(struct vring_packed_desc *dp,
>  			      uint16_t flags, uint8_t weak_barriers) @@ -307,7 +333,7
> @@ desc_is_used(struct vring_packed_desc *desc, struct virtqueue *vq)  {
>  	uint16_t used, avail, flags;
> 
> -	flags = desc->flags;
> +	flags = virtqueue_fetch_flags_packed(desc, vq->hw->weak_barriers);
>  	used = !!(flags & VRING_PACKED_DESC_F_USED);
>  	avail = !!(flags & VRING_PACKED_DESC_F_AVAIL);
> 
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index
> e7463ff..241d467 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -110,8 +110,6 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
>  			used_idx -= vq->size;
>  	}
> 
> -	rte_smp_wmb();
> -
>  	for (i = 0; i < vq->shadow_used_idx; i++) {
>  		uint16_t flags;
> 
> @@ -147,7 +145,8 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
>  		}
>  	}
> 
> -	vq->desc_packed[head_idx].flags = head_flags;
> +	__atomic_store_n(&vq->desc_packed[head_idx].flags, head_flags,
> +			 __ATOMIC_RELEASE);
> 
>  	vhost_log_cache_used_vring(dev, vq,
>  				head_idx *
> --
> 2.7.4


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v4 2/2] virtio: one way barrier for packed vring desc used flags
  2019-09-18  5:20   ` Wang, Yinan
@ 2019-09-19  4:04     ` Gavin Hu (Arm Technology China)
  0 siblings, 0 replies; 30+ messages in thread
From: Gavin Hu (Arm Technology China) @ 2019-09-19  4:04 UTC (permalink / raw)
  To: Wang, Yinan, Joyce Kong (Arm Technology China), dev
  Cc: nd, maxime.coquelin, Richardson, Bruce, Bie, Tiwei, Wang,
	Zhihong, amorenoz, Wang, Xiao W, Liu, Yong, jfreimann,
	Honnappa Nagarahalli

Hi Yinan,

Thanks for verification and reporting this, I will dump the assembly code and make an analysis after I am back from travel.
From my understanding the assembly code for x86 should be same, but I am not sure if anything we were missing. 

Is this <1 perf drop coming from run to run variances, or system noise? 

/Gavin

> -----Original Message-----
> From: Wang, Yinan <yinan.wang@intel.com>
> Sent: Wednesday, September 18, 2019 7:21 AM
> To: Joyce Kong (Arm Technology China) <Joyce.Kong@arm.com>;
> dev@dpdk.org
> Cc: nd <nd@arm.com>; maxime.coquelin@redhat.com; Richardson, Bruce
> <bruce.richardson@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang,
> Zhihong <zhihong.wang@intel.com>; amorenoz@redhat.com; Wang, Xiao W
> <xiao.w.wang@intel.com>; Liu, Yong <yong.liu@intel.com>;
> jfreimann@redhat.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>
> Subject: RE: [PATCH v4 2/2] virtio: one way barrier for packed vring desc used
> flags
> 
> 
> Hi Joyce,
> 
> I test performance impact of your patch set with code base commit id:
> d03d8622db48918d14bfe805641b1766ecc40088, after applying your v4 patch
> set , packed ring shows small performance drop as below:
> 
> PVP vhost/virtio 1c1q test	       commit:d03d8622db48918	apply v4
> patch set
> 
> pvp_virtio11_mergeable	                   7.218	           7.147
> pvp_virtio11_normal	                   7.217	           7.182
> 
> 
> Thanks,
> Yinan
> 
> 
> 
> 
> > -----Original Message-----
> > From: Joyce Kong [mailto:joyce.kong@arm.com]
> > Sent: 2019年9月17日 13:28
> > To: dev@dpdk.org
> > Cc: nd@arm.com; maxime.coquelin@redhat.com; Wang, Yinan
> > <yinan.wang@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>;
> Bie,
> > Tiwei <tiwei.bie@intel.com>; Wang, Zhihong <zhihong.wang@intel.com>;
> > amorenoz@redhat.com; Wang, Xiao W <xiao.w.wang@intel.com>; Liu, Yong
> > <yong.liu@intel.com>; jfreimann@redhat.com;
> honnappa.nagarahalli@arm.com;
> > gavin.hu@arm.com
> > Subject: [PATCH v4 2/2] virtio: one way barrier for packed vring desc used
> flags
> >
> > In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the
> frontend
> > and backend are assumed to be implemented in software, that is they can
> run on
> > identical CPUs in an SMP configuration.
> > Thus a weak form of memory barriers like rte_smp_r/wmb, other than
> > rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1) and
> > yields better performance.
> > For the above case, this patch helps yielding even better performance by
> > replacing the two-way barriers with C11 one-way barriers for used flags in
> packed
> > ring.
> >
> > Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> > Reviewed-by: Phil Yang <phil.yang@arm.com>
> > Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> > ---
> >  drivers/net/virtio/virtio_rxtx.c                 | 12 +++++++---
> >  drivers/net/virtio/virtio_user/virtio_user_dev.c |  4 ++--
> >  drivers/net/virtio/virtqueue.h                   | 28
> > +++++++++++++++++++++++-
> >  lib/librte_vhost/virtio_net.c                    |  5 ++---
> >  4 files changed, 40 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
> > index a87ffe1..2f0879c 100644
> > --- a/drivers/net/virtio/virtio_rxtx.c
> > +++ b/drivers/net/virtio/virtio_rxtx.c
> > @@ -122,9 +122,11 @@ virtqueue_dequeue_burst_rx_packed(struct
> virtqueue
> > *vq,
> >
> >  	for (i = 0; i < num; i++) {
> >  		used_idx = vq->vq_used_cons_idx;
> > +		/* desc_is_used has a load-acquire or rte_cio_rmb inside
> > +		 * and wait for used desc in virtqueue.
> > +		 */
> >  		if (!desc_is_used(&desc[used_idx], vq))
> >  			return i;
> > -		virtio_rmb(vq->hw->weak_barriers);
> >  		len[i] = desc[used_idx].len;
> >  		id = desc[used_idx].id;
> >  		cookie = (struct rte_mbuf *)vq->vq_descx[id].cookie; @@ -
> 233,8
> > +235,10 @@ virtio_xmit_cleanup_inorder_packed(struct virtqueue *vq, int
> num)
> >  	struct vq_desc_extra *dxp;
> >
> >  	used_idx = vq->vq_used_cons_idx;
> > +	/* desc_is_used has a load-acquire or rte_cio_rmb inside
> > +	 * and wait for used desc in virtqueue.
> > +	 */
> >  	while (num > 0 && desc_is_used(&desc[used_idx], vq)) {
> > -		virtio_rmb(vq->hw->weak_barriers);
> >  		id = desc[used_idx].id;
> >  		do {
> >  			curr_id = used_idx;
> > @@ -265,8 +269,10 @@ virtio_xmit_cleanup_normal_packed(struct
> virtqueue
> > *vq, int num)
> >  	struct vq_desc_extra *dxp;
> >
> >  	used_idx = vq->vq_used_cons_idx;
> > +	/* desc_is_used has a load-acquire or rte_cio_rmb inside
> > +	 * and wait for used desc in virtqueue.
> > +	 */
> >  	while (num-- && desc_is_used(&desc[used_idx], vq)) {
> > -		virtio_rmb(vq->hw->weak_barriers);
> >  		id = desc[used_idx].id;
> >  		dxp = &vq->vq_descx[id];
> >  		vq->vq_used_cons_idx += dxp->ndescs;
> > diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c
> > b/drivers/net/virtio/virtio_user/virtio_user_dev.c
> > index 7911c39..1c575d0 100644
> > --- a/drivers/net/virtio/virtio_user/virtio_user_dev.c
> > +++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c
> > @@ -698,8 +698,8 @@ virtio_user_handle_cq_packed(struct
> virtio_user_dev
> > *dev, uint16_t queue_idx)
> >  		if (vq->used_wrap_counter)
> >  			flags |= VRING_PACKED_DESC_F_AVAIL_USED;
> >
> > -		rte_smp_wmb();
> > -		vring->desc[vq->used_idx].flags = flags;
> > +		__atomic_store_n(&vring->desc[vq->used_idx].flags, flags,
> > +				 __ATOMIC_RELEASE);
> >
> >  		vq->used_idx += n_descs;
> >  		if (vq->used_idx >= dev->queue_size) { diff --git
> > a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h index
> > b728ff8..8d7f197 100644
> > --- a/drivers/net/virtio/virtqueue.h
> > +++ b/drivers/net/virtio/virtqueue.h
> > @@ -54,6 +54,32 @@ virtio_wmb(uint8_t weak_barriers)
> >  		rte_cio_wmb();
> >  }
> >
> > +static inline uint16_t
> > +virtqueue_fetch_flags_packed(struct vring_packed_desc *dp,
> > +			      uint8_t weak_barriers)
> > +{
> > +	uint16_t flags;
> > +
> > +	if (weak_barriers) {
> > +/* x86 prefers to using rte_smp_rmb over __atomic_load_n as it reports
> > + * a better perf(~1.5%), which comes from the saved branch by the
> compiler.
> > + * The if and else branch are identical with the smp and cio barriers
> > +both
> > + * defined as compiler barriers on x86.
> > + */
> > +#ifdef RTE_ARCH_X86_64
> > +		flags = dp->flags;
> > +		rte_smp_rmb();
> > +#else
> > +		flags = __atomic_load_n(&dp->flags, __ATOMIC_ACQUIRE);
> #endif
> > +	} else {
> > +		flags = dp->flags;
> > +		rte_cio_rmb();
> > +	}
> > +
> > +	return flags;
> > +}
> > +
> >  static inline void
> >  virtqueue_store_flags_packed(struct vring_packed_desc *dp,
> >  			      uint16_t flags, uint8_t weak_barriers) @@ -307,7
> +333,7
> > @@ desc_is_used(struct vring_packed_desc *desc, struct virtqueue *vq)  {
> >  	uint16_t used, avail, flags;
> >
> > -	flags = desc->flags;
> > +	flags = virtqueue_fetch_flags_packed(desc, vq->hw->weak_barriers);
> >  	used = !!(flags & VRING_PACKED_DESC_F_USED);
> >  	avail = !!(flags & VRING_PACKED_DESC_F_AVAIL);
> >
> > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index
> > e7463ff..241d467 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -110,8 +110,6 @@ flush_shadow_used_ring_packed(struct virtio_net
> *dev,
> >  			used_idx -= vq->size;
> >  	}
> >
> > -	rte_smp_wmb();
> > -
> >  	for (i = 0; i < vq->shadow_used_idx; i++) {
> >  		uint16_t flags;
> >
> > @@ -147,7 +145,8 @@ flush_shadow_used_ring_packed(struct virtio_net
> *dev,
> >  		}
> >  	}
> >
> > -	vq->desc_packed[head_idx].flags = head_flags;
> > +	__atomic_store_n(&vq->desc_packed[head_idx].flags, head_flags,
> > +			 __ATOMIC_RELEASE);
> >
> >  	vhost_log_cache_used_vring(dev, vq,
> >  				head_idx *
> > --
> > 2.7.4


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v4 1/2] virtio: one way barrier for packed vring desc avail flags
  2019-09-17  5:28 ` [dpdk-dev] [PATCH v4 1/2] virtio: one way barrier for packed vring desc avail flags Joyce Kong
@ 2019-10-14  7:42   ` Maxime Coquelin
  0 siblings, 0 replies; 30+ messages in thread
From: Maxime Coquelin @ 2019-10-14  7:42 UTC (permalink / raw)
  To: Joyce Kong, dev
  Cc: nd, yinan.wang, bruce.richardson, tiwei.bie, zhihong.wang,
	amorenoz, xiao.w.wang, yong.liu, jfreimann, honnappa.nagarahalli,
	gavin.hu



On 9/17/19 7:28 AM, Joyce Kong wrote:
> In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend
> and backend are assumed to be implemented in software, that is they can
> run on identical CPUs in an SMP configuration.
> Thus a weak form of memory barriers like rte_smp_r/wmb, other than
> rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
> and yields better performance.
> For the above case, this patch helps yielding even better performance
> by replacing the two-way barriers with C11 one-way barriers for avail
> flags in packed ring.
> 
> Meanwhile, a read barrier is required to ensure ordering between
> descriptor's flags and content reads[1]. With C11, load-acquire can
> enforce the ordering instead of rmb barrier.
> 
> [1]https://patchwork.dpdk.org/patch/49109/
> 
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> Reviewed-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> ---
>  drivers/net/virtio/virtio_rxtx.c                 | 13 +++++++------
>  drivers/net/virtio/virtio_user/virtio_user_dev.c |  6 +++++-
>  drivers/net/virtio/virtqueue.h                   | 21 +++++++++++++++++++++
>  lib/librte_vhost/vhost.h                         |  2 +-
>  lib/librte_vhost/virtio_net.c                    | 11 +++++------
>  5 files changed, 39 insertions(+), 14 deletions(-)
> 

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v4 2/2] virtio: one way barrier for packed vring desc used flags
  2019-09-17  5:28 ` [dpdk-dev] [PATCH v4 2/2] virtio: one way barrier for packed vring desc used flags Joyce Kong
  2019-09-18  5:20   ` Wang, Yinan
@ 2019-10-14  7:43   ` Maxime Coquelin
  1 sibling, 0 replies; 30+ messages in thread
From: Maxime Coquelin @ 2019-10-14  7:43 UTC (permalink / raw)
  To: Joyce Kong, dev
  Cc: nd, yinan.wang, bruce.richardson, tiwei.bie, zhihong.wang,
	amorenoz, xiao.w.wang, yong.liu, jfreimann, honnappa.nagarahalli,
	gavin.hu



On 9/17/19 7:28 AM, Joyce Kong wrote:
> In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend
> and backend are assumed to be implemented in software, that is they can
> run on identical CPUs in an SMP configuration.
> Thus a weak form of memory barriers like rte_smp_r/wmb, other than
> rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
> and yields better performance.
> For the above case, this patch helps yielding even better performance
> by replacing the two-way barriers with C11 one-way barriers for used
> flags in packed ring.
> 
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> Reviewed-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> ---
>  drivers/net/virtio/virtio_rxtx.c                 | 12 +++++++---
>  drivers/net/virtio/virtio_user/virtio_user_dev.c |  4 ++--
>  drivers/net/virtio/virtqueue.h                   | 28 +++++++++++++++++++++++-
>  lib/librte_vhost/virtio_net.c                    |  5 ++---
>  4 files changed, 40 insertions(+), 9 deletions(-)

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v4 0/2] virtio: one way barrier for packed vring flags
  2019-09-17  5:28 ` [dpdk-dev] [PATCH v4 0/2] virtio: one way barrier for packed vring flags Joyce Kong
@ 2019-10-16 11:07   ` Maxime Coquelin
  0 siblings, 0 replies; 30+ messages in thread
From: Maxime Coquelin @ 2019-10-16 11:07 UTC (permalink / raw)
  To: Joyce Kong, dev
  Cc: nd, yinan.wang, bruce.richardson, tiwei.bie, zhihong.wang,
	amorenoz, xiao.w.wang, yong.liu, jfreimann, honnappa.nagarahalli,
	gavin.hu



On 9/17/19 7:28 AM, Joyce Kong wrote:
> This patch set replaces the two-way barriers with C11 one-way barriers
> for packed vring flags, when the frontend and backend are implemented
> in software.
> 
> By doing vhost-user + virtio-user case benchmarking, 9% performance gain
> in the RFC2544 test was measured on Thunderx2 platform.[1] And by doing
> VM2VM case benchmarking, 11% perf gain was measured on Ampere platform.
> 
> [1]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html
>    PVP test with virtio 1.1 mergeable path
> 
> v4:
> Use rte_smp_rmb/wmb instead of __atomic_load/store_n on x86 as it reports a
> better perf(~1.5%), which comes from the saved branch by the compiler. The
> if and else branch are identical with the smp and cio barriers both defined
> as compiler barriers on x86.
> http://inbox.dpdk.org/dev/E0CBA5A1980F1F408E1F28F9991B5B1D50EFF246@SHSMSX104.ccr.corp.intel.com/
> 
> v3:
> Wrap C11 one-way barriers and DMA barriers(rte_cio_*) together with an inline fuction.
> 
> v2:
> Convert RFC to patch.
> 
> Joyce Kong (2):
>   virtio: one way barrier for packed vring desc avail flags
>   virtio: one way barrier for packed vring desc used flags
> 
>  drivers/net/virtio/virtio_rxtx.c                 | 25 +++++++-----
>  drivers/net/virtio/virtio_user/virtio_user_dev.c | 10 +++--
>  drivers/net/virtio/virtqueue.h                   | 49 +++++++++++++++++++++++-
>  lib/librte_vhost/vhost.h                         |  2 +-
>  lib/librte_vhost/virtio_net.c                    | 16 ++++----
>  5 files changed, 79 insertions(+), 23 deletions(-)
> 

Applied to dpdk-next-virtio/master.

Thanks,
Maxime

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2019-10-16 11:08 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-27  8:19 [dpdk-dev] [RFC PATCH 0/2] virtio: one way barrier for packed vring flags Joyce Kong
2019-08-27  8:19 ` [dpdk-dev] [RFC PATCH 1/2] virtio: one way barrier for packed vring desc avail flags Joyce Kong
2019-08-27  8:19 ` [dpdk-dev] [RFC PATCH 2/2] virtio: one way barrier for packed vring desc used flags Joyce Kong
2019-09-06 11:34 ` [dpdk-dev] [PATCH v2 0/2] virtio: one way barrier for packed vring flags Joyce Kong
2019-09-06 11:34 ` [dpdk-dev] [PATCH v2 1/2] virtio: one way barrier for packed vring desc avail flags Joyce Kong
2019-09-06 16:01   ` Maxime Coquelin
2019-09-09  9:24     ` Joyce Kong (Arm Technology China)
2019-09-06 11:34 ` [dpdk-dev] [PATCH v2 2/2] virtio: one way barrier for packed vring desc used flags Joyce Kong
2019-09-09  9:14 ` [dpdk-dev] [PATCH v3 0/2] virtio: one way barrier for packed vring flags Joyce Kong
2019-09-09  9:14 ` [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for packed vring desc avail flags Joyce Kong
2019-09-09 10:10   ` Maxime Coquelin
2019-09-10  3:54     ` Wang, Yinan
2019-09-10  9:48       ` Gavin Hu (Arm Technology China)
2019-09-10 10:17         ` Maxime Coquelin
2019-09-11  2:39         ` Liu, Yong
2019-09-11  3:35           ` Gavin Hu (Arm Technology China)
2019-09-11  6:29             ` Liu, Yong
2019-09-11  8:32               ` Gavin Hu (Arm Technology China)
2019-09-11 10:02                 ` Bruce Richardson
2019-09-12  8:21                   ` Gavin Hu (Arm Technology China)
2019-09-09  9:14 ` [dpdk-dev] [PATCH v3 2/2] virtio: one way barrier for packed vring desc used flags Joyce Kong
2019-09-09 10:11   ` Maxime Coquelin
2019-09-17  5:28 ` [dpdk-dev] [PATCH v4 0/2] virtio: one way barrier for packed vring flags Joyce Kong
2019-10-16 11:07   ` Maxime Coquelin
2019-09-17  5:28 ` [dpdk-dev] [PATCH v4 1/2] virtio: one way barrier for packed vring desc avail flags Joyce Kong
2019-10-14  7:42   ` Maxime Coquelin
2019-09-17  5:28 ` [dpdk-dev] [PATCH v4 2/2] virtio: one way barrier for packed vring desc used flags Joyce Kong
2019-09-18  5:20   ` Wang, Yinan
2019-09-19  4:04     ` Gavin Hu (Arm Technology China)
2019-10-14  7:43   ` Maxime Coquelin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).