* [dpdk-dev] [PATCH v1 1/2] virtio: one way barrier for split vring used idx
2020-02-12 9:24 [dpdk-dev] [PATCH v1 0/2] one way barrier for split vring idx Joyce Kong
@ 2020-02-12 9:24 ` Joyce Kong
2020-02-12 9:24 ` [dpdk-dev] [PATCH v1 2/2] virtio: one way barrier for split vring avail idx Joyce Kong
` (12 subsequent siblings)
13 siblings, 0 replies; 32+ messages in thread
From: Joyce Kong @ 2020-02-12 9:24 UTC (permalink / raw)
To: dev
Cc: nd, maxime.coquelin, tiwei.bie, zhihong.wang, thomas, jerinj,
yinan.wang, honnappa.nagarahalli, gavin.hu
In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, the frontend
and backend are assumed to be implemented in software, that is, they
can run on identical CPUs in an SMP configuration.
Thus a weak form of memory barriers like rte_smp_r/wmb, rather than
rte_cio_r/wmb, is sufficient for this case (vq->hw->weak_barriers == 1)
and yields better performance.
For the above case, this patch helps yield even better performance
by replacing the two-way barriers with C11 one-way barriers for the
used index in the split ring.
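A minimal sketch of the pattern (the names below are illustrative,
not the driver's own):

#include <stdint.h>

struct used_ring {
	uint16_t flags;
	uint16_t idx;	/* written by the backend */
};

/* Consumer side: a load-acquire on used->idx prevents the subsequent
 * reads of the used ring entries from being hoisted above it, which is
 * all the split-ring consumer needs; no full read barrier is required.
 */
static inline uint16_t
nb_used_entries(const struct used_ring *used, uint16_t used_cons_idx)
{
	uint16_t idx = __atomic_load_n(&used->idx, __ATOMIC_ACQUIRE);

	return (uint16_t)(idx - used_cons_idx);
}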
Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
---
drivers/net/virtio/virtio_ethdev.c | 9 ++--
drivers/net/virtio/virtio_ring.h | 2 +-
drivers/net/virtio/virtio_rxtx.c | 46 +++++++++----------
drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +-
drivers/net/virtio/virtio_rxtx_simple_sse.c | 5 +-
.../net/virtio/virtio_user/virtio_user_dev.c | 8 ++--
drivers/net/virtio/virtqueue.c | 2 +-
drivers/net/virtio/virtqueue.h | 34 +++++++++++---
lib/librte_vhost/virtio_net.c | 5 +-
9 files changed, 68 insertions(+), 48 deletions(-)
diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
index f9d0ea70d..a4a865bfa 100644
--- a/drivers/net/virtio/virtio_ethdev.c
+++ b/drivers/net/virtio/virtio_ethdev.c
@@ -285,13 +285,12 @@ virtio_send_command_split(struct virtnet_ctl *cvq,
virtqueue_notify(vq);
- rte_rmb();
- while (VIRTQUEUE_NUSED(vq) == 0) {
- rte_rmb();
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ while (virtqueue_nused(vq) == 0)
usleep(100);
- }
- while (VIRTQUEUE_NUSED(vq)) {
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ while (virtqueue_nused(vq)) {
uint32_t idx, desc_idx, used_idx;
struct vring_used_elem *uep;
diff --git a/drivers/net/virtio/virtio_ring.h b/drivers/net/virtio/virtio_ring.h
index 7ba34662e..0f6574f68 100644
--- a/drivers/net/virtio/virtio_ring.h
+++ b/drivers/net/virtio/virtio_ring.h
@@ -59,7 +59,7 @@ struct vring_used_elem {
struct vring_used {
uint16_t flags;
- volatile uint16_t idx;
+ uint16_t idx;
struct vring_used_elem ring[0];
};
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index 752faa0f6..9ba26fd95 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -45,7 +45,7 @@ virtio_dev_rx_queue_done(void *rxq, uint16_t offset)
struct virtnet_rx *rxvq = rxq;
struct virtqueue *vq = rxvq->vq;
- return VIRTQUEUE_NUSED(vq) >= offset;
+ return virtqueue_nused(vq) >= offset;
}
void
@@ -1243,9 +1243,8 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
if (unlikely(hw->started == 0))
return nb_rx;
- nb_used = VIRTQUEUE_NUSED(vq);
-
- virtio_rmb(hw->weak_barriers);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
num = likely(nb_used <= nb_pkts) ? nb_used : nb_pkts;
if (unlikely(num > VIRTIO_MBUF_BURST_SZ))
@@ -1458,12 +1457,11 @@ virtio_recv_pkts_inorder(void *rx_queue,
if (unlikely(hw->started == 0))
return nb_rx;
- nb_used = VIRTQUEUE_NUSED(vq);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
nb_used = RTE_MIN(nb_used, nb_pkts);
nb_used = RTE_MIN(nb_used, VIRTIO_MBUF_BURST_SZ);
- virtio_rmb(hw->weak_barriers);
-
PMD_RX_LOG(DEBUG, "used:%d", nb_used);
nb_enqueued = 0;
@@ -1552,8 +1550,8 @@ virtio_recv_pkts_inorder(void *rx_queue,
uint16_t rcv_cnt = RTE_MIN((uint16_t)seg_res,
VIRTIO_MBUF_BURST_SZ);
- if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) {
- virtio_rmb(hw->weak_barriers);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ if (likely(virtqueue_nused(vq) >= rcv_cnt)) {
num = virtqueue_dequeue_rx_inorder(vq, rcv_pkts, len,
rcv_cnt);
uint16_t extra_idx = 0;
@@ -1644,9 +1642,8 @@ virtio_recv_mergeable_pkts(void *rx_queue,
if (unlikely(hw->started == 0))
return nb_rx;
- nb_used = VIRTQUEUE_NUSED(vq);
-
- virtio_rmb(hw->weak_barriers);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
PMD_RX_LOG(DEBUG, "used:%d", nb_used);
@@ -1734,8 +1731,8 @@ virtio_recv_mergeable_pkts(void *rx_queue,
uint16_t rcv_cnt = RTE_MIN((uint16_t)seg_res,
VIRTIO_MBUF_BURST_SZ);
- if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) {
- virtio_rmb(hw->weak_barriers);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ if (likely(virtqueue_nused(vq) >= rcv_cnt)) {
num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len,
rcv_cnt);
uint16_t extra_idx = 0;
@@ -2108,9 +2105,10 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
return nb_pkts;
PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
- nb_used = VIRTQUEUE_NUSED(vq);
- virtio_rmb(hw->weak_barriers);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
+
if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
virtio_xmit_cleanup(vq, nb_used);
@@ -2142,8 +2140,11 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
/* Positive value indicates it need free vring descriptors */
if (unlikely(need > 0)) {
- nb_used = VIRTQUEUE_NUSED(vq);
- virtio_rmb(hw->weak_barriers);
+ /* virtqueue_nused has a load-acquire or
+ * rte_cio_rmb inside
+ */
+ nb_used = virtqueue_nused(vq);
+
need = RTE_MIN(need, (int)nb_used);
virtio_xmit_cleanup(vq, need);
@@ -2180,11 +2181,10 @@ static __rte_always_inline int
virtio_xmit_try_cleanup_inorder(struct virtqueue *vq, uint16_t need)
{
uint16_t nb_used, nb_clean, nb_descs;
- struct virtio_hw *hw = vq->hw;
nb_descs = vq->vq_free_cnt + need;
- nb_used = VIRTQUEUE_NUSED(vq);
- virtio_rmb(hw->weak_barriers);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
nb_clean = RTE_MIN(need, (int)nb_used);
virtio_xmit_cleanup_inorder(vq, nb_clean);
@@ -2213,9 +2213,9 @@ virtio_xmit_pkts_inorder(void *tx_queue,
VIRTQUEUE_DUMP(vq);
PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
- nb_used = VIRTQUEUE_NUSED(vq);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
- virtio_rmb(hw->weak_barriers);
if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
virtio_xmit_cleanup_inorder(vq, nb_used);
diff --git a/drivers/net/virtio/virtio_rxtx_simple_neon.c b/drivers/net/virtio/virtio_rxtx_simple_neon.c
index 992e71f01..363e2b330 100644
--- a/drivers/net/virtio/virtio_rxtx_simple_neon.c
+++ b/drivers/net/virtio/virtio_rxtx_simple_neon.c
@@ -83,9 +83,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
return 0;
- nb_used = VIRTQUEUE_NUSED(vq);
-
- rte_rmb();
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
if (unlikely(nb_used == 0))
return 0;
diff --git a/drivers/net/virtio/virtio_rxtx_simple_sse.c b/drivers/net/virtio/virtio_rxtx_simple_sse.c
index f9ec4ae69..45a45e6f4 100644
--- a/drivers/net/virtio/virtio_rxtx_simple_sse.c
+++ b/drivers/net/virtio/virtio_rxtx_simple_sse.c
@@ -85,9 +85,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
return 0;
- nb_used = VIRTQUEUE_NUSED(vq);
-
- rte_compiler_barrier();
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
if (unlikely(nb_used == 0))
return 0;
diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c
index 1c6b26f8d..7fb135f49 100644
--- a/drivers/net/virtio/virtio_user/virtio_user_dev.c
+++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c
@@ -730,8 +730,10 @@ virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx)
struct vring *vring = &dev->vrings[queue_idx];
/* Consume avail ring, using used ring idx as first one */
- while (vring->used->idx != vring->avail->idx) {
- avail_idx = (vring->used->idx) & (vring->num - 1);
+ while (__atomic_load_n(&vring->used->idx, __ATOMIC_RELAXED)
+ != vring->avail->idx) {
+ avail_idx = __atomic_load_n(&vring->used->idx, __ATOMIC_RELAXED)
+ & (vring->num - 1);
desc_idx = vring->avail->ring[avail_idx];
n_descs = virtio_user_handle_ctrl_msg(dev, vring, desc_idx);
@@ -741,6 +743,6 @@ virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx)
uep->id = desc_idx;
uep->len = n_descs;
- vring->used->idx++;
+ __atomic_add_fetch(&vring->used->idx, 1, __ATOMIC_RELAXED);
}
}
diff --git a/drivers/net/virtio/virtqueue.c b/drivers/net/virtio/virtqueue.c
index 0b4e3bf3e..b0f61dabc 100644
--- a/drivers/net/virtio/virtqueue.c
+++ b/drivers/net/virtio/virtqueue.c
@@ -92,7 +92,7 @@ virtqueue_rxvq_flush_split(struct virtqueue *vq)
uint16_t used_idx, desc_idx;
uint16_t nb_used, i;
- nb_used = VIRTQUEUE_NUSED(vq);
+ nb_used = virtqueue_nused(vq);
for (i = 0; i < nb_used; i++) {
used_idx = vq->vq_used_cons_idx & (vq->vq_nentries - 1);
diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
index 58ad7309a..13fdcb13a 100644
--- a/drivers/net/virtio/virtqueue.h
+++ b/drivers/net/virtio/virtqueue.h
@@ -464,8 +464,29 @@ virtio_get_queue_type(struct virtio_hw *hw, uint16_t vtpci_queue_idx)
return VTNET_TQ;
}
-#define VIRTQUEUE_NUSED(vq) ((uint16_t)((vq)->vq_split.ring.used->idx - \
- (vq)->vq_used_cons_idx))
+static inline uint16_t
+virtqueue_nused(struct virtqueue *vq)
+{
+ uint16_t idx;
+ if (vq->hw->weak_barriers) {
+/* x86 prefers using rte_smp_rmb over __atomic_load_n as it reports
+ * a slightly better perf, which comes from the saved branch by the compiler.
+ * The if and else branches are identical with the smp and cio barriers both
+ * defined as compiler barriers on x86.
+ */
+#ifdef RTE_ARCH_X86_64
+ idx = vq->vq_split.ring.used->idx;
+ rte_smp_rmb();
+#else
+ idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx,
+ __ATOMIC_ACQUIRE);
+#endif
+ } else {
+ idx = vq->vq_split.ring.used->idx;
+ rte_cio_rmb();
+ }
+ return (idx - vq->vq_used_cons_idx);
+}
void vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx);
void vq_ring_free_chain_packed(struct virtqueue *vq, uint16_t used_idx);
@@ -534,7 +555,8 @@ virtqueue_notify(struct virtqueue *vq)
#ifdef RTE_LIBRTE_VIRTIO_DEBUG_DUMP
#define VIRTQUEUE_DUMP(vq) do { \
uint16_t used_idx, nused; \
- used_idx = (vq)->vq_split.ring.used->idx; \
+ used_idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx, \
+ __ATOMIC_RELAXED); \
nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \
if (vtpci_packed_queue((vq)->hw)) { \
PMD_INIT_LOG(DEBUG, \
@@ -549,9 +571,9 @@ virtqueue_notify(struct virtqueue *vq)
"VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \
" avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \
" avail.flags=0x%x; used.flags=0x%x", \
- (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \
- (vq)->vq_desc_head_idx, (vq)->vq_split.ring.avail->idx, \
- (vq)->vq_used_cons_idx, (vq)->vq_split.ring.used->idx, \
+ (vq)->vq_nentries, (vq)->vq_free_cnt, nused, (vq)->vq_desc_head_idx, \
+ (vq)->vq_split.ring.avail->idx, (vq)->vq_used_cons_idx, \
+ __atomic_load_n(&(vq)->vq_split.ring.used->idx, __ATOMIC_RELAXED), \
(vq)->vq_split.ring.avail->flags, (vq)->vq_split.ring.used->flags); \
} while (0)
#else
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 37c47c7dc..7f6e7f2c1 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -77,11 +77,10 @@ flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
}
vq->last_used_idx += vq->shadow_used_idx;
- rte_smp_wmb();
-
vhost_log_cache_sync(dev, vq);
- *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
+ __atomic_add_fetch(&vq->used->idx, vq->shadow_used_idx,
+ __ATOMIC_RELEASE);
vq->shadow_used_idx = 0;
vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
sizeof(vq->used->idx));
--
2.17.1
^ permalink raw reply [flat|nested] 32+ messages in thread
* [dpdk-dev] [PATCH v1 2/2] virtio: one way barrier for split vring avail idx
2020-02-12 9:24 [dpdk-dev] [PATCH v1 0/2] one way barrier for split vring idx Joyce Kong
2020-02-12 9:24 ` [dpdk-dev] [PATCH v1 1/2] virtio: one way barrier for split vring used idx Joyce Kong
@ 2020-02-12 9:24 ` Joyce Kong
2020-04-02 2:57 ` [dpdk-dev] [PATCH v2 0/2] one way barrier for split vring idx Joyce Kong
` (11 subsequent siblings)
13 siblings, 0 replies; 32+ messages in thread
From: Joyce Kong @ 2020-02-12 9:24 UTC (permalink / raw)
To: dev
Cc: nd, maxime.coquelin, tiwei.bie, zhihong.wang, thomas, jerinj,
yinan.wang, honnappa.nagarahalli, gavin.hu
In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, the frontend
and backend are assumed to be implemented in software, that is, they
can run on identical CPUs in an SMP configuration.
Thus a weak form of memory barriers like rte_smp_r/wmb, rather than
rte_cio_r/wmb, is sufficient for this case (vq->hw->weak_barriers == 1)
and yields better performance.
For the above case, this patch helps yield even better performance
by replacing the two-way barriers with C11 one-way barriers for the
avail index in the split ring.
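A minimal sketch of the producer-side counterpart (again, the names
are illustrative):

#include <stdint.h>

struct avail_ring {
	uint16_t flags;
	uint16_t idx;	/* written by the frontend */
	uint16_t ring[];
};

/* Producer side: a store-release on avail->idx prevents the earlier
 * writes of the ring entries from sinking below it, so the backend
 * never observes the new idx before the descriptors it refers to.
 */
static inline void
publish_avail_idx(struct avail_ring *avail, uint16_t new_idx)
{
	__atomic_store_n(&avail->idx, new_idx, __ATOMIC_RELEASE);
}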
Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
---
drivers/net/virtio/virtqueue.h | 19 +++++++++++++++++--
lib/librte_vhost/virtio_net.c | 14 +++++---------
2 files changed, 22 insertions(+), 11 deletions(-)
diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
index 13fdcb13a..bbe36c107 100644
--- a/drivers/net/virtio/virtqueue.h
+++ b/drivers/net/virtio/virtqueue.h
@@ -496,8 +496,23 @@ void vq_ring_free_inorder(struct virtqueue *vq, uint16_t desc_idx,
static inline void
vq_update_avail_idx(struct virtqueue *vq)
{
- virtio_wmb(vq->hw->weak_barriers);
- vq->vq_split.ring.avail->idx = vq->vq_avail_idx;
+ if (vq->hw->weak_barriers) {
+/* x86 prefers using rte_smp_wmb over __atomic_store_n as it reports
+ * a slightly better perf, which comes from the saved branch by the compiler.
+ * The if and else branches are identical with the smp and cio barriers both
+ * defined as compiler barriers on x86.
+ */
+#ifdef RTE_ARCH_X86_64
+ rte_smp_wmb();
+ vq->vq_split.ring.avail->idx = vq->vq_avail_idx;
+#else
+ __atomic_store_n(&vq->vq_split.ring.avail->idx,
+ vq->vq_avail_idx, __ATOMIC_RELEASE);
+#endif
+ } else {
+ rte_cio_wmb();
+ vq->vq_split.ring.avail->idx = vq->vq_avail_idx;
+ }
}
static inline void
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 7f6e7f2c1..4c5380bc1 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -991,13 +991,11 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
struct buf_vector buf_vec[BUF_VECTOR_MAX];
uint16_t avail_head;
- avail_head = *((volatile uint16_t *)&vq->avail->idx);
-
/*
* The ordering between avail index and
* desc reads needs to be enforced.
*/
- rte_smp_rmb();
+ avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
@@ -1712,16 +1710,14 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
}
}
- free_entries = *((volatile uint16_t *)&vq->avail->idx) -
- vq->last_avail_idx;
- if (free_entries == 0)
- return 0;
-
/*
* The ordering between avail index and
* desc reads needs to be enforced.
*/
- rte_smp_rmb();
+ free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
+ vq->last_avail_idx;
+ if (free_entries == 0)
+ return 0;
rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
--
2.17.1
^ permalink raw reply [flat|nested] 32+ messages in thread
* [dpdk-dev] [PATCH v2 0/2] one way barrier for split vring idx
2020-02-12 9:24 [dpdk-dev] [PATCH v1 0/2] one way barrier for split vring idx Joyce Kong
2020-02-12 9:24 ` [dpdk-dev] [PATCH v1 1/2] virtio: one way barrier for split vring used idx Joyce Kong
2020-02-12 9:24 ` [dpdk-dev] [PATCH v1 2/2] virtio: one way barrier for split vring avail idx Joyce Kong
@ 2020-04-02 2:57 ` Joyce Kong
2020-04-02 2:57 ` [dpdk-dev] [PATCH v2 1/2] virtio: one way barrier for split vring used idx Joyce Kong
` (10 subsequent siblings)
13 siblings, 0 replies; 32+ messages in thread
From: Joyce Kong @ 2020-04-02 2:57 UTC (permalink / raw)
To: maxime.coquelin, tiwei.bie, zhihong.wang, thomas, jerinj,
yinan.wang, honnappa.nagarahalli, gavin.hu
Cc: nd, dev
This patch set replaces the two-way barriers with C11 one-way barriers
for split vring idx, when the frontend and backend are implemented
in software.
In PVP benchmarking, the throughput increased by 20% with an acceptable
loss rate of 0.001% on the ThunderX2 platform.[1]
In the vhost-user + virtio-user case, a 4% performance gain was measured
in the RFC2544 test with an acceptable loss rate of 0.001% on the
ThunderX2 platform.[2]
[1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html
[2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html
PVP test with virtio 1.0 normal path
v2:
Add test performance statistics.
Joyce Kong (2):
virtio: one way barrier for split vring used idx
virtio: one way barrier for split vring avail idx
drivers/net/virtio/virtio_ethdev.c | 9 ++--
drivers/net/virtio/virtio_ring.h | 2 +-
drivers/net/virtio/virtio_rxtx.c | 46 ++++++++--------
drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +-
drivers/net/virtio/virtio_rxtx_simple_sse.c | 5 +-
.../net/virtio/virtio_user/virtio_user_dev.c | 8 +--
drivers/net/virtio/virtqueue.c | 2 +-
drivers/net/virtio/virtqueue.h | 53 ++++++++++++++++---
lib/librte_vhost/virtio_net.c | 19 +++----
9 files changed, 90 insertions(+), 59 deletions(-)
--
2.17.1
^ permalink raw reply [flat|nested] 32+ messages in thread
* [dpdk-dev] [PATCH v2 1/2] virtio: one way barrier for split vring used idx
2020-02-12 9:24 [dpdk-dev] [PATCH v1 0/2] one way barrier for split vring idx Joyce Kong
` (2 preceding siblings ...)
2020-04-02 2:57 ` [dpdk-dev] [PATCH v2 0/2] one way barrier for split vring idx Joyce Kong
@ 2020-04-02 2:57 ` Joyce Kong
2020-04-02 15:47 ` Stephen Hemminger
2020-04-02 2:57 ` [dpdk-dev] [PATCH v2 2/2] virtio: one way barrier for split vring avail idx Joyce Kong
` (9 subsequent siblings)
13 siblings, 1 reply; 32+ messages in thread
From: Joyce Kong @ 2020-04-02 2:57 UTC (permalink / raw)
To: maxime.coquelin, tiwei.bie, zhihong.wang, thomas, jerinj,
yinan.wang, honnappa.nagarahalli, gavin.hu
Cc: nd, dev
In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, the frontend
and backend are assumed to be implemented in software, that is, they
can run on identical CPUs in an SMP configuration.
Thus a weak form of memory barriers like rte_smp_r/wmb, rather than
rte_cio_r/wmb, is sufficient for this case (vq->hw->weak_barriers == 1)
and yields better performance.
For the above case, this patch helps yield even better performance
by replacing the two-way barriers with C11 one-way barriers for the
used index in the split ring.
Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
---
drivers/net/virtio/virtio_ethdev.c | 9 ++--
drivers/net/virtio/virtio_ring.h | 2 +-
drivers/net/virtio/virtio_rxtx.c | 46 +++++++++----------
drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +-
drivers/net/virtio/virtio_rxtx_simple_sse.c | 5 +-
.../net/virtio/virtio_user/virtio_user_dev.c | 8 ++--
drivers/net/virtio/virtqueue.c | 2 +-
drivers/net/virtio/virtqueue.h | 34 +++++++++++---
lib/librte_vhost/virtio_net.c | 5 +-
9 files changed, 68 insertions(+), 48 deletions(-)
diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
index f9d0ea70d..a4a865bfa 100644
--- a/drivers/net/virtio/virtio_ethdev.c
+++ b/drivers/net/virtio/virtio_ethdev.c
@@ -285,13 +285,12 @@ virtio_send_command_split(struct virtnet_ctl *cvq,
virtqueue_notify(vq);
- rte_rmb();
- while (VIRTQUEUE_NUSED(vq) == 0) {
- rte_rmb();
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ while (virtqueue_nused(vq) == 0)
usleep(100);
- }
- while (VIRTQUEUE_NUSED(vq)) {
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ while (virtqueue_nused(vq)) {
uint32_t idx, desc_idx, used_idx;
struct vring_used_elem *uep;
diff --git a/drivers/net/virtio/virtio_ring.h b/drivers/net/virtio/virtio_ring.h
index 7ba34662e..0f6574f68 100644
--- a/drivers/net/virtio/virtio_ring.h
+++ b/drivers/net/virtio/virtio_ring.h
@@ -59,7 +59,7 @@ struct vring_used_elem {
struct vring_used {
uint16_t flags;
- volatile uint16_t idx;
+ uint16_t idx;
struct vring_used_elem ring[0];
};
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index 752faa0f6..9ba26fd95 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -45,7 +45,7 @@ virtio_dev_rx_queue_done(void *rxq, uint16_t offset)
struct virtnet_rx *rxvq = rxq;
struct virtqueue *vq = rxvq->vq;
- return VIRTQUEUE_NUSED(vq) >= offset;
+ return virtqueue_nused(vq) >= offset;
}
void
@@ -1243,9 +1243,8 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
if (unlikely(hw->started == 0))
return nb_rx;
- nb_used = VIRTQUEUE_NUSED(vq);
-
- virtio_rmb(hw->weak_barriers);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
num = likely(nb_used <= nb_pkts) ? nb_used : nb_pkts;
if (unlikely(num > VIRTIO_MBUF_BURST_SZ))
@@ -1458,12 +1457,11 @@ virtio_recv_pkts_inorder(void *rx_queue,
if (unlikely(hw->started == 0))
return nb_rx;
- nb_used = VIRTQUEUE_NUSED(vq);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
nb_used = RTE_MIN(nb_used, nb_pkts);
nb_used = RTE_MIN(nb_used, VIRTIO_MBUF_BURST_SZ);
- virtio_rmb(hw->weak_barriers);
-
PMD_RX_LOG(DEBUG, "used:%d", nb_used);
nb_enqueued = 0;
@@ -1552,8 +1550,8 @@ virtio_recv_pkts_inorder(void *rx_queue,
uint16_t rcv_cnt = RTE_MIN((uint16_t)seg_res,
VIRTIO_MBUF_BURST_SZ);
- if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) {
- virtio_rmb(hw->weak_barriers);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ if (likely(virtqueue_nused(vq) >= rcv_cnt)) {
num = virtqueue_dequeue_rx_inorder(vq, rcv_pkts, len,
rcv_cnt);
uint16_t extra_idx = 0;
@@ -1644,9 +1642,8 @@ virtio_recv_mergeable_pkts(void *rx_queue,
if (unlikely(hw->started == 0))
return nb_rx;
- nb_used = VIRTQUEUE_NUSED(vq);
-
- virtio_rmb(hw->weak_barriers);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
PMD_RX_LOG(DEBUG, "used:%d", nb_used);
@@ -1734,8 +1731,8 @@ virtio_recv_mergeable_pkts(void *rx_queue,
uint16_t rcv_cnt = RTE_MIN((uint16_t)seg_res,
VIRTIO_MBUF_BURST_SZ);
- if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) {
- virtio_rmb(hw->weak_barriers);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ if (likely(virtqueue_nused(vq) >= rcv_cnt)) {
num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len,
rcv_cnt);
uint16_t extra_idx = 0;
@@ -2108,9 +2105,10 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
return nb_pkts;
PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
- nb_used = VIRTQUEUE_NUSED(vq);
- virtio_rmb(hw->weak_barriers);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
+
if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
virtio_xmit_cleanup(vq, nb_used);
@@ -2142,8 +2140,11 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
/* Positive value indicates it need free vring descriptors */
if (unlikely(need > 0)) {
- nb_used = VIRTQUEUE_NUSED(vq);
- virtio_rmb(hw->weak_barriers);
+ /* virtqueue_nused has a load-acquire or
+ * rte_cio_rmb inside
+ */
+ nb_used = virtqueue_nused(vq);
+
need = RTE_MIN(need, (int)nb_used);
virtio_xmit_cleanup(vq, need);
@@ -2180,11 +2181,10 @@ static __rte_always_inline int
virtio_xmit_try_cleanup_inorder(struct virtqueue *vq, uint16_t need)
{
uint16_t nb_used, nb_clean, nb_descs;
- struct virtio_hw *hw = vq->hw;
nb_descs = vq->vq_free_cnt + need;
- nb_used = VIRTQUEUE_NUSED(vq);
- virtio_rmb(hw->weak_barriers);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
nb_clean = RTE_MIN(need, (int)nb_used);
virtio_xmit_cleanup_inorder(vq, nb_clean);
@@ -2213,9 +2213,9 @@ virtio_xmit_pkts_inorder(void *tx_queue,
VIRTQUEUE_DUMP(vq);
PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
- nb_used = VIRTQUEUE_NUSED(vq);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
- virtio_rmb(hw->weak_barriers);
if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
virtio_xmit_cleanup_inorder(vq, nb_used);
diff --git a/drivers/net/virtio/virtio_rxtx_simple_neon.c b/drivers/net/virtio/virtio_rxtx_simple_neon.c
index 992e71f01..363e2b330 100644
--- a/drivers/net/virtio/virtio_rxtx_simple_neon.c
+++ b/drivers/net/virtio/virtio_rxtx_simple_neon.c
@@ -83,9 +83,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
return 0;
- nb_used = VIRTQUEUE_NUSED(vq);
-
- rte_rmb();
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
if (unlikely(nb_used == 0))
return 0;
diff --git a/drivers/net/virtio/virtio_rxtx_simple_sse.c b/drivers/net/virtio/virtio_rxtx_simple_sse.c
index f9ec4ae69..45a45e6f4 100644
--- a/drivers/net/virtio/virtio_rxtx_simple_sse.c
+++ b/drivers/net/virtio/virtio_rxtx_simple_sse.c
@@ -85,9 +85,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
return 0;
- nb_used = VIRTQUEUE_NUSED(vq);
-
- rte_compiler_barrier();
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
if (unlikely(nb_used == 0))
return 0;
diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c
index 1c6b26f8d..7fb135f49 100644
--- a/drivers/net/virtio/virtio_user/virtio_user_dev.c
+++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c
@@ -730,8 +730,10 @@ virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx)
struct vring *vring = &dev->vrings[queue_idx];
/* Consume avail ring, using used ring idx as first one */
- while (vring->used->idx != vring->avail->idx) {
- avail_idx = (vring->used->idx) & (vring->num - 1);
+ while (__atomic_load_n(&vring->used->idx, __ATOMIC_RELAXED)
+ != vring->avail->idx) {
+ avail_idx = __atomic_load_n(&vring->used->idx, __ATOMIC_RELAXED)
+ & (vring->num - 1);
desc_idx = vring->avail->ring[avail_idx];
n_descs = virtio_user_handle_ctrl_msg(dev, vring, desc_idx);
@@ -741,6 +743,6 @@ virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx)
uep->id = desc_idx;
uep->len = n_descs;
- vring->used->idx++;
+ __atomic_add_fetch(&vring->used->idx, 1, __ATOMIC_RELAXED);
}
}
diff --git a/drivers/net/virtio/virtqueue.c b/drivers/net/virtio/virtqueue.c
index 0b4e3bf3e..b0f61dabc 100644
--- a/drivers/net/virtio/virtqueue.c
+++ b/drivers/net/virtio/virtqueue.c
@@ -92,7 +92,7 @@ virtqueue_rxvq_flush_split(struct virtqueue *vq)
uint16_t used_idx, desc_idx;
uint16_t nb_used, i;
- nb_used = VIRTQUEUE_NUSED(vq);
+ nb_used = virtqueue_nused(vq);
for (i = 0; i < nb_used; i++) {
used_idx = vq->vq_used_cons_idx & (vq->vq_nentries - 1);
diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
index 58ad7309a..13fdcb13a 100644
--- a/drivers/net/virtio/virtqueue.h
+++ b/drivers/net/virtio/virtqueue.h
@@ -464,8 +464,29 @@ virtio_get_queue_type(struct virtio_hw *hw, uint16_t vtpci_queue_idx)
return VTNET_TQ;
}
-#define VIRTQUEUE_NUSED(vq) ((uint16_t)((vq)->vq_split.ring.used->idx - \
- (vq)->vq_used_cons_idx))
+static inline uint16_t
+virtqueue_nused(struct virtqueue *vq)
+{
+ uint16_t idx;
+ if (vq->hw->weak_barriers) {
+/* x86 prefers using rte_smp_rmb over __atomic_load_n as it reports
+ * a slightly better perf, which comes from the saved branch by the compiler.
+ * The if and else branches are identical with the smp and cio barriers both
+ * defined as compiler barriers on x86.
+ */
+#ifdef RTE_ARCH_X86_64
+ idx = vq->vq_split.ring.used->idx;
+ rte_smp_rmb();
+#else
+ idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx,
+ __ATOMIC_ACQUIRE);
+#endif
+ } else {
+ idx = vq->vq_split.ring.used->idx;
+ rte_cio_rmb();
+ }
+ return (idx - vq->vq_used_cons_idx);
+}
void vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx);
void vq_ring_free_chain_packed(struct virtqueue *vq, uint16_t used_idx);
@@ -534,7 +555,8 @@ virtqueue_notify(struct virtqueue *vq)
#ifdef RTE_LIBRTE_VIRTIO_DEBUG_DUMP
#define VIRTQUEUE_DUMP(vq) do { \
uint16_t used_idx, nused; \
- used_idx = (vq)->vq_split.ring.used->idx; \
+ used_idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx, \
+ __ATOMIC_RELAXED); \
nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \
if (vtpci_packed_queue((vq)->hw)) { \
PMD_INIT_LOG(DEBUG, \
@@ -549,9 +571,9 @@ virtqueue_notify(struct virtqueue *vq)
"VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \
" avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \
" avail.flags=0x%x; used.flags=0x%x", \
- (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \
- (vq)->vq_desc_head_idx, (vq)->vq_split.ring.avail->idx, \
- (vq)->vq_used_cons_idx, (vq)->vq_split.ring.used->idx, \
+ (vq)->vq_nentries, (vq)->vq_free_cnt, nused, (vq)->vq_desc_head_idx, \
+ (vq)->vq_split.ring.avail->idx, (vq)->vq_used_cons_idx, \
+ __atomic_load_n(&(vq)->vq_split.ring.used->idx, __ATOMIC_RELAXED), \
(vq)->vq_split.ring.avail->flags, (vq)->vq_split.ring.used->flags); \
} while (0)
#else
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 37c47c7dc..7f6e7f2c1 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -77,11 +77,10 @@ flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
}
vq->last_used_idx += vq->shadow_used_idx;
- rte_smp_wmb();
-
vhost_log_cache_sync(dev, vq);
- *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
+ __atomic_add_fetch(&vq->used->idx, vq->shadow_used_idx,
+ __ATOMIC_RELEASE);
vq->shadow_used_idx = 0;
vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
sizeof(vq->used->idx));
--
2.17.1
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [dpdk-dev] [PATCH v2 1/2] virtio: one way barrier for split vring used idx
2020-04-02 2:57 ` [dpdk-dev] [PATCH v2 1/2] virtio: one way barrier for split vring used idx Joyce Kong
@ 2020-04-02 15:47 ` Stephen Hemminger
2020-04-03 8:55 ` Gavin Hu
0 siblings, 1 reply; 32+ messages in thread
From: Stephen Hemminger @ 2020-04-02 15:47 UTC (permalink / raw)
To: Joyce Kong
Cc: maxime.coquelin, tiwei.bie, zhihong.wang, thomas, jerinj,
yinan.wang, honnappa.nagarahalli, gavin.hu, nd, dev
On Thu, 2 Apr 2020 10:57:52 +0800
Joyce Kong <joyce.kong@arm.com> wrote:
> - (vq)->vq_used_cons_idx))
> +static inline uint16_t
> +virtqueue_nused(struct virtqueue *vq)
vq is unmodified and should be const
> +{
> + uint16_t idx;
> + if (vq->hw->weak_barriers) {
Put blank line between declaration and if statement
> > +/* x86 prefers using rte_smp_rmb over __atomic_load_n as it reports
> + * a slightly better perf, which comes from the saved branch by the compiler.
> + * The if and else branches are identical with the smp and cio barriers both
> + * defined as compiler barriers on x86.
> + */
Do not put comments on left margin (except in function prolog).
> +#ifdef RTE_ARCH_X86_64
> + idx = vq->vq_split.ring.used->idx;
> + rte_smp_rmb();
> +#else
> + idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx,
> + __ATOMIC_ACQUIRE);
> +#endif
> + } else {
> + idx = vq->vq_split.ring.used->idx;
> + rte_cio_rmb();
> + }
> + return (idx - vq->vq_used_cons_idx);
Parenthesis around arguments to return are unnecessary.
BSD code likes it, Linux style does not.
> +}
This kind of arch-specific code is hard to maintain.
Does it really make that much difference?
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [dpdk-dev] [PATCH v2 1/2] virtio: one way barrier for split vring used idx
2020-04-02 15:47 ` Stephen Hemminger
@ 2020-04-03 8:55 ` Gavin Hu
2020-04-16 4:40 ` Honnappa Nagarahalli
0 siblings, 1 reply; 32+ messages in thread
From: Gavin Hu @ 2020-04-03 8:55 UTC (permalink / raw)
To: Stephen Hemminger, Joyce Kong
Cc: maxime.coquelin, tiwei.bie, zhihong.wang, thomas, jerinj,
yinan.wang, Honnappa Nagarahalli, nd, dev, nd
Hi Stephen,
> -----Original Message-----
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: Thursday, April 2, 2020 11:48 PM
> To: Joyce Kong <Joyce.Kong@arm.com>
> Cc: maxime.coquelin@redhat.com; tiwei.bie@intel.com;
> zhihong.wang@intel.com; thomas@monjalon.net; jerinj@marvell.com;
> yinan.wang@intel.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Gavin Hu <Gavin.Hu@arm.com>; nd
> <nd@arm.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 1/2] virtio: one way barrier for split vring
> used idx
>
> On Thu, 2 Apr 2020 10:57:52 +0800
> Joyce Kong <joyce.kong@arm.com> wrote:
>
> > - (vq)->vq_used_cons_idx))
> > +static inline uint16_t
> > +virtqueue_nused(struct virtqueue *vq)
> vq is unmodified and should be const
>
> > +{
> > + uint16_t idx;
> > + if (vq->hw->weak_barriers) {
> Put blank line between declaration and if statement
Will fix in v3.
>
> > +/* x86 prefers using rte_smp_rmb over __atomic_load_n as it reports
> > + * a slightly better perf, which comes from the saved branch by the compiler.
> > + * The if and else branches are identical with the smp and cio barriers both
> > + * defined as compiler barriers on x86.
> > + */
>
> Do not put comments on left margin (except in function prolog).
Will fix in v3.
>
> > +#ifdef RTE_ARCH_X86_64
> > + idx = vq->vq_split.ring.used->idx;
> > + rte_smp_rmb();
> > +#else
> > + idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx,
> > + __ATOMIC_ACQUIRE);
> > +#endif
> > + } else {
> > + idx = vq->vq_split.ring.used->idx;
> > + rte_cio_rmb();
> > + }
> > + return (idx - vq->vq_used_cons_idx);
>
> Parenthesis around arguments to return are unnecessary.
> BSD code likes it, Linux style does not.
Will fix in v3.
>
> > +}
>
> This kind of arch specific code is hard to maintain.
> Does it really make that much difference.
Yes, a stronger-than-required barrier is a performance killer, especially in the fast path.
The test was conducted on a ThunderX2 + Intel XL710 testbed, with the PVP test case.
/Gavin
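To make the difference concrete, here is a rough sketch (assuming
typical GCC codegen on AArch64 of this era; the exact instructions may
vary):

#include <stdint.h>
#include <rte_atomic.h>

/* Two-way: the plain load plus rte_rmb() orders all earlier loads
 * against all later ones; on AArch64 rte_rmb() expands to a costly
 * dsb instruction. */
static inline uint16_t
load_idx_two_way(const volatile uint16_t *idx)
{
	uint16_t v = *idx;

	rte_rmb();
	return v;
}

/* One-way: only later memory accesses are kept after this load; on
 * AArch64 this typically compiles to a single ldarh acquire load. */
static inline uint16_t
load_idx_one_way(const uint16_t *idx)
{
	return __atomic_load_n(idx, __ATOMIC_ACQUIRE);
}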
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [dpdk-dev] [PATCH v2 1/2] virtio: one way barrier for split vring used idx
2020-04-03 8:55 ` Gavin Hu
@ 2020-04-16 4:40 ` Honnappa Nagarahalli
2020-04-16 6:46 ` Joyce Kong
0 siblings, 1 reply; 32+ messages in thread
From: Honnappa Nagarahalli @ 2020-04-16 4:40 UTC (permalink / raw)
To: Gavin Hu, Stephen Hemminger, Joyce Kong
Cc: maxime.coquelin, tiwei.bie, zhihong.wang, thomas, jerinj,
yinan.wang, nd, dev, Honnappa Nagarahalli, nd
<snip>
> > Subject: Re: [dpdk-dev] [PATCH v2 1/2] virtio: one way barrier for
> > split vring used idx
> >
> > On Thu, 2 Apr 2020 10:57:52 +0800
> > Joyce Kong <joyce.kong@arm.com> wrote:
> >
> > > -(vq)->vq_used_cons_idx))
> > > +static inline uint16_t
> > > +virtqueue_nused(struct virtqueue *vq)
> > vq is unmodified and should be const
> >
> > > +{
> > > +uint16_t idx;
> > > +if (vq->hw->weak_barriers) {
> > Put blank line between declaration and if statement
> Will fix in v3.
> >
> > > +/* x86 prefers using rte_smp_rmb over __atomic_load_n as it reports
> > > + * a slightly better perf, which comes from the saved branch by the compiler.
> > > + * The if and else branches are identical with the smp and cio barriers both
> > > + * defined as compiler barriers on x86.
> > > + */
> >
> > Do not put comments on left margin (except in function prolog).
> Will fix in v3.
> >
> > > +#ifdef RTE_ARCH_X86_64
> > > +idx = vq->vq_split.ring.used->idx;
> > > +rte_smp_rmb();
> > > +#else
> > > +idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx,
> > > +__ATOMIC_ACQUIRE);
> > > +#endif
> > > +} else {
> > > +idx = vq->vq_split.ring.used->idx;
> > > +rte_cio_rmb();
> > > +}
> > > +return (idx - vq->vq_used_cons_idx);
> >
> > Parenthesis around arguments to return are unnecessary.
> > BSD code likes it, Linux style does not.
> Will fix in v3.
> >
> > > +}
> >
> > This kind of arch specific code is hard to maintain.
> > Does it really make that much difference.
> Yes, a stronger than required barrier is a performance killer, especially in the
> fast path.
> The test was conducted on the ThunderX2+Intel XL710 testbed, the PVP test
> case.
I think if the performance difference is not much, we should stay with C11 built-ins for x86. How much is the performance difference on x86?
> /Gavin
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [dpdk-dev] [PATCH v2 1/2] virtio: one way barrier for split vring used idx
2020-04-16 4:40 ` Honnappa Nagarahalli
@ 2020-04-16 6:46 ` Joyce Kong
0 siblings, 0 replies; 32+ messages in thread
From: Joyce Kong @ 2020-04-16 6:46 UTC (permalink / raw)
To: Honnappa Nagarahalli, Gavin Hu, Stephen Hemminger
Cc: maxime.coquelin, tiwei.bie, zhihong.wang, thomas, jerinj,
yinan.wang, nd, dev, nd
<snip>
> > > Subject: Re: [dpdk-dev] [PATCH v2 1/2] virtio: one way barrier for
> > > split vring used idx
> > >
> > > On Thu, 2 Apr 2020 10:57:52 +0800
> > > Joyce Kong <joyce.kong@arm.com> wrote:
> > >
> > > > -(vq)->vq_used_cons_idx))
> > > > +static inline uint16_t
> > > > +virtqueue_nused(struct virtqueue *vq)
> > > vq is unmodified and should be const
> > >
> > > > +{
> > > > +uint16_t idx;
> > > > +if (vq->hw->weak_barriers) {
> > > Put blank line between declaration and if statement
> > Will fix in v3.
> > >
> > > > +/* x86 prefers using rte_smp_rmb over __atomic_load_n as it reports
> > > > + * a slightly better perf, which comes from the saved branch by the compiler.
> > > > + * The if and else branches are identical with the smp and cio barriers both
> > > > + * defined as compiler barriers on x86.
> > > > + */
> > >
> > > Do not put comments on left margin (except in function prolog).
> > Will fix in v3.
> > >
> > > > +#ifdef RTE_ARCH_X86_64
> > > > +idx = vq->vq_split.ring.used->idx;
> > > > +rte_smp_rmb();
> > > > +#else
> > > > +idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx,
> > > > +__ATOMIC_ACQUIRE);
> > > > +#endif
> > > > +} else {
> > > > +idx = vq->vq_split.ring.used->idx;
> > > > +rte_cio_rmb();
> > > > +}
> > > > +return (idx - vq->vq_used_cons_idx);
> > >
> > > Parenthesis around arguments to return are unnecessary.
> > > BSD code likes it, Linux style does not.
> > Will fix in v3.
> > >
> > > > +}
> > >
> > > This kind of arch specific code is hard to maintain.
> > > Does it really make that much difference.
> > Yes, a stronger than required barrier is a performance killer,
> > especially in the fast path.
> > The test was conducted on the ThunderX2+Intel XL710 testbed, the PVP
> > test case.
> > /Gavin
> I think if the performance difference is not much, we should stay with C11 built-ins for x86. How much is the
> performance difference on x86?
According to Intel's PVP test for the packed ring, staying with __atomic_load_n would cause a ~1.5% perf loss on x86. So rte_smp_rmb is kept for x86 in the packed ring patches, which have already been merged, and the same change is made in the split ring patches.
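As an illustration (not part of the patch), on x86 both arms of the
weak_barriers check compile to the same code, which is why the plain
load plus compiler barrier lets the branch fold away:

/* Hypothetical x86-only equivalent of virtqueue_nused: rte_smp_rmb()
 * and rte_cio_rmb() are both compiler barriers on x86, so either
 * branch reduces to a plain load followed by a compiler barrier. */
static inline uint16_t
virtqueue_nused_x86(const struct virtqueue *vq)
{
	uint16_t idx = vq->vq_split.ring.used->idx;

	rte_compiler_barrier();
	return idx - vq->vq_used_cons_idx;
}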
^ permalink raw reply [flat|nested] 32+ messages in thread
* [dpdk-dev] [PATCH v2 2/2] virtio: one way barrier for split vring avail idx
2020-02-12 9:24 [dpdk-dev] [PATCH v1 0/2] one way barrier for split vring idx Joyce Kong
` (3 preceding siblings ...)
2020-04-02 2:57 ` [dpdk-dev] [PATCH v2 1/2] virtio: one way barrier for split vring used idx Joyce Kong
@ 2020-04-02 2:57 ` Joyce Kong
2020-04-06 15:26 ` [dpdk-dev] [PATCH v3 0/2] one way barrier for split vring idx Joyce Kong
` (8 subsequent siblings)
13 siblings, 0 replies; 32+ messages in thread
From: Joyce Kong @ 2020-04-02 2:57 UTC (permalink / raw)
To: maxime.coquelin, tiwei.bie, zhihong.wang, thomas, jerinj,
yinan.wang, honnappa.nagarahalli, gavin.hu
Cc: nd, dev
In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, the frontend
and backend are assumed to be implemented in software, that is, they
can run on identical CPUs in an SMP configuration.
Thus a weak form of memory barriers like rte_smp_r/wmb, rather than
rte_cio_r/wmb, is sufficient for this case (vq->hw->weak_barriers == 1)
and yields better performance.
For the above case, this patch helps yield even better performance
by replacing the two-way barriers with C11 one-way barriers for the
avail index in the split ring.
Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
---
drivers/net/virtio/virtqueue.h | 19 +++++++++++++++++--
lib/librte_vhost/virtio_net.c | 14 +++++---------
2 files changed, 22 insertions(+), 11 deletions(-)
diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
index 13fdcb13a..bbe36c107 100644
--- a/drivers/net/virtio/virtqueue.h
+++ b/drivers/net/virtio/virtqueue.h
@@ -496,8 +496,23 @@ void vq_ring_free_inorder(struct virtqueue *vq, uint16_t desc_idx,
static inline void
vq_update_avail_idx(struct virtqueue *vq)
{
- virtio_wmb(vq->hw->weak_barriers);
- vq->vq_split.ring.avail->idx = vq->vq_avail_idx;
+ if (vq->hw->weak_barriers) {
+/* x86 prefers using rte_smp_wmb over __atomic_store_n as it reports
+ * a slightly better perf, which comes from the saved branch by the compiler.
+ * The if and else branches are identical with the smp and cio barriers both
+ * defined as compiler barriers on x86.
+ */
+#ifdef RTE_ARCH_X86_64
+ rte_smp_wmb();
+ vq->vq_split.ring.avail->idx = vq->vq_avail_idx;
+#else
+ __atomic_store_n(&vq->vq_split.ring.avail->idx,
+ vq->vq_avail_idx, __ATOMIC_RELEASE);
+#endif
+ } else {
+ rte_cio_wmb();
+ vq->vq_split.ring.avail->idx = vq->vq_avail_idx;
+ }
}
static inline void
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 7f6e7f2c1..4c5380bc1 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -991,13 +991,11 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
struct buf_vector buf_vec[BUF_VECTOR_MAX];
uint16_t avail_head;
- avail_head = *((volatile uint16_t *)&vq->avail->idx);
-
/*
* The ordering between avail index and
* desc reads needs to be enforced.
*/
- rte_smp_rmb();
+ avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
@@ -1712,16 +1710,14 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
}
}
- free_entries = *((volatile uint16_t *)&vq->avail->idx) -
- vq->last_avail_idx;
- if (free_entries == 0)
- return 0;
-
/*
* The ordering between avail index and
* desc reads needs to be enforced.
*/
- rte_smp_rmb();
+ free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
+ vq->last_avail_idx;
+ if (free_entries == 0)
+ return 0;
rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
--
2.17.1
^ permalink raw reply [flat|nested] 32+ messages in thread
* [dpdk-dev] [PATCH v3 0/2] one way barrier for split vring idx
2020-02-12 9:24 [dpdk-dev] [PATCH v1 0/2] one way barrier for split vring idx Joyce Kong
` (4 preceding siblings ...)
2020-04-02 2:57 ` [dpdk-dev] [PATCH v2 2/2] virtio: one way barrier for split vring avail idx Joyce Kong
@ 2020-04-06 15:26 ` Joyce Kong
2020-04-16 9:08 ` Ye Xiaolong
2020-04-06 15:26 ` [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for split vring used idx Joyce Kong
` (7 subsequent siblings)
13 siblings, 1 reply; 32+ messages in thread
From: Joyce Kong @ 2020-04-06 15:26 UTC (permalink / raw)
To: maxime.coquelin, stephen, tiwei.bie, zhihong.wang, thomas,
jerinj, yinan.wang, honnappa.nagarahalli, gavin.hu
Cc: nd, dev
This patch set replaces the two-way barriers with C11 one-way barriers
for split vring idx, when the frontend and backend are implemented
in software.
In PVP benchmarking, the throughput increased by 20% with an acceptable
loss rate of 0.001% on the ThunderX2 platform.[1]
In the vhost-user + virtio-user case, a 4% performance gain was measured
in the RFC2544 test with an acceptable loss rate of 0.001% on the
ThunderX2 platform.[2]
[1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html
[2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html
PVP test with virtio 1.0 normal path
v3:
Fix some style errors.
v2:
Add test performance statistics.
Joyce Kong (2):
virtio: one way barrier for split vring used idx
virtio: one way barrier for split vring avail idx
drivers/net/virtio/virtio_ethdev.c | 9 ++-
drivers/net/virtio/virtio_ring.h | 2 +-
drivers/net/virtio/virtio_rxtx.c | 46 +++++++--------
drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +-
drivers/net/virtio/virtio_rxtx_simple_sse.c | 5 +-
.../net/virtio/virtio_user/virtio_user_dev.c | 8 ++-
drivers/net/virtio/virtqueue.c | 2 +-
drivers/net/virtio/virtqueue.h | 57 ++++++++++++++++---
lib/librte_vhost/virtio_net.c | 19 +++----
9 files changed, 94 insertions(+), 59 deletions(-)
--
2.17.1
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [dpdk-dev] [PATCH v3 0/2] one way barrier for split vring idx
2020-04-06 15:26 ` [dpdk-dev] [PATCH v3 0/2] one way barrier for split vring idx Joyce Kong
@ 2020-04-16 9:08 ` Ye Xiaolong
0 siblings, 0 replies; 32+ messages in thread
From: Ye Xiaolong @ 2020-04-16 9:08 UTC (permalink / raw)
To: Joyce Kong
Cc: maxime.coquelin, stephen, tiwei.bie, zhihong.wang, thomas,
jerinj, yinan.wang, honnappa.nagarahalli, gavin.hu, nd, dev
Hi, Joyce
Thanks for the patch.
On 04/06, Joyce Kong wrote:
>This patch set replaces the two-way barriers with C11 one-way barriers
>for split vring idx, when the frontend and backend are implemented
>in software.
>
>In PVP benchmarking, the throughput increased by 20% with an acceptable
>loss rate of 0.001% on the ThunderX2 platform.[1]
>
>In the vhost-user + virtio-user case, a 4% performance gain was measured
>in the RFC2544 test with an acceptable loss rate of 0.001% on the
>ThunderX2 platform.[2]
One quick question: do you have performance numbers on the IA platform, since
the code touches the common part of the virtio PMD?
Thanks,
Xiaolong
>
>[1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html
>[2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html
> PVP test with virtio 1.0 normal path
>
>v3:
>Modify some style error.
>
>v2:
>Add test performance statistics.
>
>Joyce Kong (2):
> virtio: one way barrier for split vring used idx
> virtio: one way barrier for split vring avail idx
>
> drivers/net/virtio/virtio_ethdev.c | 9 ++-
> drivers/net/virtio/virtio_ring.h | 2 +-
> drivers/net/virtio/virtio_rxtx.c | 46 +++++++--------
> drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +-
> drivers/net/virtio/virtio_rxtx_simple_sse.c | 5 +-
> .../net/virtio/virtio_user/virtio_user_dev.c | 8 ++-
> drivers/net/virtio/virtqueue.c | 2 +-
> drivers/net/virtio/virtqueue.h | 57 ++++++++++++++++---
> lib/librte_vhost/virtio_net.c | 19 +++----
> 9 files changed, 94 insertions(+), 59 deletions(-)
>
>--
>2.17.1
>
^ permalink raw reply [flat|nested] 32+ messages in thread
* [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for split vring used idx
2020-02-12 9:24 [dpdk-dev] [PATCH v1 0/2] one way barrier for split vring idx Joyce Kong
` (5 preceding siblings ...)
2020-04-06 15:26 ` [dpdk-dev] [PATCH v3 0/2] one way barrier for split vring idx Joyce Kong
@ 2020-04-06 15:26 ` Joyce Kong
2020-04-17 6:51 ` Ye Xiaolong
2020-04-06 15:26 ` [dpdk-dev] [PATCH v3 2/2] virtio: one way barrier for split vring avail idx Joyce Kong
` (6 subsequent siblings)
13 siblings, 1 reply; 32+ messages in thread
From: Joyce Kong @ 2020-04-06 15:26 UTC (permalink / raw)
To: maxime.coquelin, stephen, tiwei.bie, zhihong.wang, thomas,
jerinj, yinan.wang, honnappa.nagarahalli, gavin.hu
Cc: nd, dev
In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, the frontend
and backend are assumed to be implemented in software, that is, they
can run on identical CPUs in an SMP configuration.
Thus a weak form of memory barriers like rte_smp_r/wmb, rather than
rte_cio_r/wmb, is sufficient for this case (vq->hw->weak_barriers == 1)
and yields better performance.
For the above case, this patch helps yield even better performance
by replacing the two-way barriers with C11 one-way barriers for the
used index in the split ring.
Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
---
drivers/net/virtio/virtio_ethdev.c | 9 ++--
drivers/net/virtio/virtio_ring.h | 2 +-
drivers/net/virtio/virtio_rxtx.c | 46 +++++++++----------
drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +-
drivers/net/virtio/virtio_rxtx_simple_sse.c | 5 +-
.../net/virtio/virtio_user/virtio_user_dev.c | 8 ++--
drivers/net/virtio/virtqueue.c | 2 +-
drivers/net/virtio/virtqueue.h | 37 ++++++++++++---
lib/librte_vhost/virtio_net.c | 5 +-
9 files changed, 71 insertions(+), 48 deletions(-)
diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
index f9d0ea70d..a4a865bfa 100644
--- a/drivers/net/virtio/virtio_ethdev.c
+++ b/drivers/net/virtio/virtio_ethdev.c
@@ -285,13 +285,12 @@ virtio_send_command_split(struct virtnet_ctl *cvq,
virtqueue_notify(vq);
- rte_rmb();
- while (VIRTQUEUE_NUSED(vq) == 0) {
- rte_rmb();
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ while (virtqueue_nused(vq) == 0)
usleep(100);
- }
- while (VIRTQUEUE_NUSED(vq)) {
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ while (virtqueue_nused(vq)) {
uint32_t idx, desc_idx, used_idx;
struct vring_used_elem *uep;
diff --git a/drivers/net/virtio/virtio_ring.h b/drivers/net/virtio/virtio_ring.h
index 7ba34662e..0f6574f68 100644
--- a/drivers/net/virtio/virtio_ring.h
+++ b/drivers/net/virtio/virtio_ring.h
@@ -59,7 +59,7 @@ struct vring_used_elem {
struct vring_used {
uint16_t flags;
- volatile uint16_t idx;
+ uint16_t idx;
struct vring_used_elem ring[0];
};
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index 752faa0f6..9ba26fd95 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -45,7 +45,7 @@ virtio_dev_rx_queue_done(void *rxq, uint16_t offset)
struct virtnet_rx *rxvq = rxq;
struct virtqueue *vq = rxvq->vq;
- return VIRTQUEUE_NUSED(vq) >= offset;
+ return virtqueue_nused(vq) >= offset;
}
void
@@ -1243,9 +1243,8 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
if (unlikely(hw->started == 0))
return nb_rx;
- nb_used = VIRTQUEUE_NUSED(vq);
-
- virtio_rmb(hw->weak_barriers);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
num = likely(nb_used <= nb_pkts) ? nb_used : nb_pkts;
if (unlikely(num > VIRTIO_MBUF_BURST_SZ))
@@ -1458,12 +1457,11 @@ virtio_recv_pkts_inorder(void *rx_queue,
if (unlikely(hw->started == 0))
return nb_rx;
- nb_used = VIRTQUEUE_NUSED(vq);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
nb_used = RTE_MIN(nb_used, nb_pkts);
nb_used = RTE_MIN(nb_used, VIRTIO_MBUF_BURST_SZ);
- virtio_rmb(hw->weak_barriers);
-
PMD_RX_LOG(DEBUG, "used:%d", nb_used);
nb_enqueued = 0;
@@ -1552,8 +1550,8 @@ virtio_recv_pkts_inorder(void *rx_queue,
uint16_t rcv_cnt = RTE_MIN((uint16_t)seg_res,
VIRTIO_MBUF_BURST_SZ);
- if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) {
- virtio_rmb(hw->weak_barriers);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ if (likely(virtqueue_nused(vq) >= rcv_cnt)) {
num = virtqueue_dequeue_rx_inorder(vq, rcv_pkts, len,
rcv_cnt);
uint16_t extra_idx = 0;
@@ -1644,9 +1642,8 @@ virtio_recv_mergeable_pkts(void *rx_queue,
if (unlikely(hw->started == 0))
return nb_rx;
- nb_used = VIRTQUEUE_NUSED(vq);
-
- virtio_rmb(hw->weak_barriers);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
PMD_RX_LOG(DEBUG, "used:%d", nb_used);
@@ -1734,8 +1731,8 @@ virtio_recv_mergeable_pkts(void *rx_queue,
uint16_t rcv_cnt = RTE_MIN((uint16_t)seg_res,
VIRTIO_MBUF_BURST_SZ);
- if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) {
- virtio_rmb(hw->weak_barriers);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ if (likely(virtqueue_nused(vq) >= rcv_cnt)) {
num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len,
rcv_cnt);
uint16_t extra_idx = 0;
@@ -2108,9 +2105,10 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
return nb_pkts;
PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
- nb_used = VIRTQUEUE_NUSED(vq);
- virtio_rmb(hw->weak_barriers);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
+
if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
virtio_xmit_cleanup(vq, nb_used);
@@ -2142,8 +2140,11 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
/* Positive value indicates it need free vring descriptors */
if (unlikely(need > 0)) {
- nb_used = VIRTQUEUE_NUSED(vq);
- virtio_rmb(hw->weak_barriers);
+ /* virtqueue_nused has a load-acquire or
+ * rte_cio_rmb inside
+ */
+ nb_used = virtqueue_nused(vq);
+
need = RTE_MIN(need, (int)nb_used);
virtio_xmit_cleanup(vq, need);
@@ -2180,11 +2181,10 @@ static __rte_always_inline int
virtio_xmit_try_cleanup_inorder(struct virtqueue *vq, uint16_t need)
{
uint16_t nb_used, nb_clean, nb_descs;
- struct virtio_hw *hw = vq->hw;
nb_descs = vq->vq_free_cnt + need;
- nb_used = VIRTQUEUE_NUSED(vq);
- virtio_rmb(hw->weak_barriers);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
nb_clean = RTE_MIN(need, (int)nb_used);
virtio_xmit_cleanup_inorder(vq, nb_clean);
@@ -2213,9 +2213,9 @@ virtio_xmit_pkts_inorder(void *tx_queue,
VIRTQUEUE_DUMP(vq);
PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
- nb_used = VIRTQUEUE_NUSED(vq);
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
- virtio_rmb(hw->weak_barriers);
if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
virtio_xmit_cleanup_inorder(vq, nb_used);
diff --git a/drivers/net/virtio/virtio_rxtx_simple_neon.c b/drivers/net/virtio/virtio_rxtx_simple_neon.c
index 992e71f01..363e2b330 100644
--- a/drivers/net/virtio/virtio_rxtx_simple_neon.c
+++ b/drivers/net/virtio/virtio_rxtx_simple_neon.c
@@ -83,9 +83,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
return 0;
- nb_used = VIRTQUEUE_NUSED(vq);
-
- rte_rmb();
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
if (unlikely(nb_used == 0))
return 0;
diff --git a/drivers/net/virtio/virtio_rxtx_simple_sse.c b/drivers/net/virtio/virtio_rxtx_simple_sse.c
index f9ec4ae69..45a45e6f4 100644
--- a/drivers/net/virtio/virtio_rxtx_simple_sse.c
+++ b/drivers/net/virtio/virtio_rxtx_simple_sse.c
@@ -85,9 +85,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
return 0;
- nb_used = VIRTQUEUE_NUSED(vq);
-
- rte_compiler_barrier();
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
if (unlikely(nb_used == 0))
return 0;
diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c
index 1c6b26f8d..7fb135f49 100644
--- a/drivers/net/virtio/virtio_user/virtio_user_dev.c
+++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c
@@ -730,8 +730,10 @@ virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx)
struct vring *vring = &dev->vrings[queue_idx];
/* Consume avail ring, using used ring idx as first one */
- while (vring->used->idx != vring->avail->idx) {
- avail_idx = (vring->used->idx) & (vring->num - 1);
+ while (__atomic_load_n(&vring->used->idx, __ATOMIC_RELAXED)
+ != vring->avail->idx) {
+ avail_idx = __atomic_load_n(&vring->used->idx, __ATOMIC_RELAXED)
+ & (vring->num - 1);
desc_idx = vring->avail->ring[avail_idx];
n_descs = virtio_user_handle_ctrl_msg(dev, vring, desc_idx);
@@ -741,6 +743,6 @@ virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx)
uep->id = desc_idx;
uep->len = n_descs;
- vring->used->idx++;
+ __atomic_add_fetch(&vring->used->idx, 1, __ATOMIC_RELAXED);
}
}
diff --git a/drivers/net/virtio/virtqueue.c b/drivers/net/virtio/virtqueue.c
index 0b4e3bf3e..b0f61dabc 100644
--- a/drivers/net/virtio/virtqueue.c
+++ b/drivers/net/virtio/virtqueue.c
@@ -92,7 +92,7 @@ virtqueue_rxvq_flush_split(struct virtqueue *vq)
uint16_t used_idx, desc_idx;
uint16_t nb_used, i;
- nb_used = VIRTQUEUE_NUSED(vq);
+ nb_used = virtqueue_nused(vq);
for (i = 0; i < nb_used; i++) {
used_idx = vq->vq_used_cons_idx & (vq->vq_nentries - 1);
diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
index 58ad7309a..54dc63c93 100644
--- a/drivers/net/virtio/virtqueue.h
+++ b/drivers/net/virtio/virtqueue.h
@@ -464,8 +464,32 @@ virtio_get_queue_type(struct virtio_hw *hw, uint16_t vtpci_queue_idx)
return VTNET_TQ;
}
-#define VIRTQUEUE_NUSED(vq) ((uint16_t)((vq)->vq_split.ring.used->idx - \
- (vq)->vq_used_cons_idx))
+static inline uint16_t
+virtqueue_nused(const struct virtqueue *vq)
+{
+ uint16_t idx;
+
+ if (vq->hw->weak_barriers) {
+ /**
+ * x86 prefers using rte_smp_rmb over __atomic_load_n as it
+ * reports a slightly better perf, which comes from the saved
+ * branch by the compiler.
+ * The if and else branches are identical with the smp and cio
+ * barriers both defined as compiler barriers on x86.
+ */
+#ifdef RTE_ARCH_X86_64
+ idx = vq->vq_split.ring.used->idx;
+ rte_smp_rmb();
+#else
+ idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx,
+ __ATOMIC_ACQUIRE);
+#endif
+ } else {
+ idx = vq->vq_split.ring.used->idx;
+ rte_cio_rmb();
+ }
+ return idx - vq->vq_used_cons_idx;
+}
void vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx);
void vq_ring_free_chain_packed(struct virtqueue *vq, uint16_t used_idx);
@@ -534,7 +558,8 @@ virtqueue_notify(struct virtqueue *vq)
#ifdef RTE_LIBRTE_VIRTIO_DEBUG_DUMP
#define VIRTQUEUE_DUMP(vq) do { \
uint16_t used_idx, nused; \
- used_idx = (vq)->vq_split.ring.used->idx; \
+ used_idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx, \
+ __ATOMIC_RELAXED);
nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \
if (vtpci_packed_queue((vq)->hw)) { \
PMD_INIT_LOG(DEBUG, \
@@ -549,9 +574,9 @@ virtqueue_notify(struct virtqueue *vq)
"VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \
" avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \
" avail.flags=0x%x; used.flags=0x%x", \
- (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \
- (vq)->vq_desc_head_idx, (vq)->vq_split.ring.avail->idx, \
- (vq)->vq_used_cons_idx, (vq)->vq_split.ring.used->idx, \
+ (vq)->vq_nentries, (vq)->vq_free_cnt, nused, (vq)->vq_desc_head_idx, \
+ (vq)->vq_split.ring.avail->idx, (vq)->vq_used_cons_idx, \
+ __atomic_load_n(&(vq)->vq_split.ring.used->idx, __ATOMIC_RELAXED), \
(vq)->vq_split.ring.avail->flags, (vq)->vq_split.ring.used->flags); \
} while (0)
#else
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 37c47c7dc..7f6e7f2c1 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -77,11 +77,10 @@ flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
}
vq->last_used_idx += vq->shadow_used_idx;
- rte_smp_wmb();
-
vhost_log_cache_sync(dev, vq);
- *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
+ __atomic_add_fetch(&vq->used->idx, vq->shadow_used_idx,
+ __ATOMIC_RELEASE);
vq->shadow_used_idx = 0;
vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
sizeof(vq->used->idx));
--
2.17.1
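To make the ordering contract concrete: patch 1/2 pairs a release-type update
of used->idx on the backend side with an acquire load on the frontend side.
Below is a minimal sketch of that pairing, with illustrative names rather than
the actual DPDK structures, assuming GCC/Clang __atomic builtins:

#include <stdint.h>

struct used_ring {
	uint16_t idx; /* index published by the backend */
};

/* Backend (vhost): descriptor and buffer writes happen-before this
 * release increment, so a consumer that observes the new index also
 * observes the completed entries.
 */
static inline void publish_used(struct used_ring *r, uint16_t n)
{
	__atomic_add_fetch(&r->idx, n, __ATOMIC_RELEASE);
}

/* Frontend (virtio PMD): the acquire load orders all subsequent reads
 * of the used ring after the index read, replacing the old two-way
 * rte_rmb()/virtio_rmb() pattern.
 */
static inline uint16_t used_entries(const struct used_ring *r,
		uint16_t cons_idx)
{
	uint16_t idx = __atomic_load_n(&r->idx, __ATOMIC_ACQUIRE);
	return (uint16_t)(idx - cons_idx);
}

The unsigned 16-bit subtraction wraps correctly on index overflow, which is
also why the old VIRTQUEUE_NUSED macro cast its result to uint16_t.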
* Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for split vring used idx
2020-04-06 15:26 ` [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for split vring used idx Joyce Kong
@ 2020-04-17 6:51 ` Ye Xiaolong
2020-04-17 8:14 ` Joyce Kong
0 siblings, 1 reply; 32+ messages in thread
From: Ye Xiaolong @ 2020-04-17 6:51 UTC (permalink / raw)
To: Joyce Kong
Cc: maxime.coquelin, stephen, tiwei.bie, zhihong.wang, thomas,
jerinj, yinan.wang, honnappa.nagarahalli, gavin.hu, nd, dev
On 04/06, Joyce Kong wrote:
>In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend
>and backend are assumed to be implemented in software, that is they can
>run on identical CPUs in an SMP configuration.
>Thus a weak form of memory barriers like rte_smp_r/wmb, other than
>rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
>and yields better performance.
>For the above case, this patch helps yielding even better performance
>by replacing the two-way barriers with C11 one-way barriers for used
>index in split ring.
>
>Signed-off-by: Joyce Kong <joyce.kong@arm.com>
>Reviewed-by: Gavin Hu <gavin.hu@arm.com>
>---
> drivers/net/virtio/virtio_ethdev.c | 9 ++--
> drivers/net/virtio/virtio_ring.h | 2 +-
> drivers/net/virtio/virtio_rxtx.c | 46 +++++++++----------
> drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +-
> drivers/net/virtio/virtio_rxtx_simple_sse.c | 5 +-
> .../net/virtio/virtio_user/virtio_user_dev.c | 8 ++--
> drivers/net/virtio/virtqueue.c | 2 +-
> drivers/net/virtio/virtqueue.h | 37 ++++++++++++---
> lib/librte_vhost/virtio_net.c | 5 +-
> 9 files changed, 71 insertions(+), 48 deletions(-)
>
>diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
>index f9d0ea70d..a4a865bfa 100644
>--- a/drivers/net/virtio/virtio_ethdev.c
>+++ b/drivers/net/virtio/virtio_ethdev.c
>@@ -285,13 +285,12 @@ virtio_send_command_split(struct virtnet_ctl *cvq,
>
> virtqueue_notify(vq);
>
>- rte_rmb();
>- while (VIRTQUEUE_NUSED(vq) == 0) {
>- rte_rmb();
>+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
>+ while (virtqueue_nused(vq) == 0)
> usleep(100);
>- }
>
>- while (VIRTQUEUE_NUSED(vq)) {
>+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
>+ while (virtqueue_nused(vq)) {
> uint32_t idx, desc_idx, used_idx;
> struct vring_used_elem *uep;
>
>diff --git a/drivers/net/virtio/virtio_ring.h b/drivers/net/virtio/virtio_ring.h
>index 7ba34662e..0f6574f68 100644
>--- a/drivers/net/virtio/virtio_ring.h
>+++ b/drivers/net/virtio/virtio_ring.h
>@@ -59,7 +59,7 @@ struct vring_used_elem {
>
> struct vring_used {
> uint16_t flags;
>- volatile uint16_t idx;
>+ uint16_t idx;
> struct vring_used_elem ring[0];
> };
>
>diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
>index 752faa0f6..9ba26fd95 100644
>--- a/drivers/net/virtio/virtio_rxtx.c
>+++ b/drivers/net/virtio/virtio_rxtx.c
>@@ -45,7 +45,7 @@ virtio_dev_rx_queue_done(void *rxq, uint16_t offset)
> struct virtnet_rx *rxvq = rxq;
> struct virtqueue *vq = rxvq->vq;
>
>- return VIRTQUEUE_NUSED(vq) >= offset;
>+ return virtqueue_nused(vq) >= offset;
> }
>
> void
>@@ -1243,9 +1243,8 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
> if (unlikely(hw->started == 0))
> return nb_rx;
>
>- nb_used = VIRTQUEUE_NUSED(vq);
>-
>- virtio_rmb(hw->weak_barriers);
>+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
Small nit: I don't think we need to add this comment to every occurrence of
virtqueue_nused. What about moving it to the definition of this function?
Thanks,
Xiaolong
* Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for split vring used idx
2020-04-17 6:51 ` Ye Xiaolong
@ 2020-04-17 8:14 ` Joyce Kong
0 siblings, 0 replies; 32+ messages in thread
From: Joyce Kong @ 2020-04-17 8:14 UTC (permalink / raw)
To: Ye Xiaolong
Cc: maxime.coquelin, stephen, tiwei.bie, zhihong.wang, thomas,
jerinj, yinan.wang, Honnappa Nagarahalli, Gavin Hu, nd, dev
> -----Original Message-----
> From: Ye Xiaolong <xiaolong.ye@intel.com>
> Sent: Friday, April 17, 2020 2:52 PM
> To: Joyce Kong <Joyce.Kong@arm.com>
> Cc: maxime.coquelin@redhat.com; stephen@networkplumber.org;
> tiwei.bie@intel.com; zhihong.wang@intel.com; thomas@monjalon.net;
> jerinj@marvell.com; yinan.wang@intel.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Gavin Hu <Gavin.Hu@arm.com>; nd
> <nd@arm.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for split vring
> used idx
>
> On 04/06, Joyce Kong wrote:
> >In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the
> >frontend and backend are assumed to be implemented in software, that is
> >they can run on identical CPUs in an SMP configuration.
> >Thus a weak form of memory barriers like rte_smp_r/wmb, other than
> >rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
> >and yields better performance.
> >For the above case, this patch helps yielding even better performance
> >by replacing the two-way barriers with C11 one-way barriers for used
> >index in split ring.
> >
> >Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> >Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> >---
> > drivers/net/virtio/virtio_ethdev.c | 9 ++--
> > drivers/net/virtio/virtio_ring.h | 2 +-
> > drivers/net/virtio/virtio_rxtx.c | 46 +++++++++----------
> > drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +-
> > drivers/net/virtio/virtio_rxtx_simple_sse.c | 5 +-
> > .../net/virtio/virtio_user/virtio_user_dev.c | 8 ++--
> > drivers/net/virtio/virtqueue.c | 2 +-
> > drivers/net/virtio/virtqueue.h | 37 ++++++++++++---
> > lib/librte_vhost/virtio_net.c | 5 +-
> > 9 files changed, 71 insertions(+), 48 deletions(-)
> >
> >diff --git a/drivers/net/virtio/virtio_ethdev.c
> >b/drivers/net/virtio/virtio_ethdev.c
> >index f9d0ea70d..a4a865bfa 100644
> >--- a/drivers/net/virtio/virtio_ethdev.c
> >+++ b/drivers/net/virtio/virtio_ethdev.c
> >@@ -285,13 +285,12 @@ virtio_send_command_split(struct virtnet_ctl
> >*cvq,
> >
> > virtqueue_notify(vq);
> >
> >- rte_rmb();
> >- while (VIRTQUEUE_NUSED(vq) == 0) {
> >- rte_rmb();
> >+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
> >+ while (virtqueue_nused(vq) == 0)
> > usleep(100);
> >- }
> >
> >- while (VIRTQUEUE_NUSED(vq)) {
> >+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
> >+ while (virtqueue_nused(vq)) {
> > uint32_t idx, desc_idx, used_idx;
> > struct vring_used_elem *uep;
> >
> >diff --git a/drivers/net/virtio/virtio_ring.h
> >b/drivers/net/virtio/virtio_ring.h
> >index 7ba34662e..0f6574f68 100644
> >--- a/drivers/net/virtio/virtio_ring.h
> >+++ b/drivers/net/virtio/virtio_ring.h
> >@@ -59,7 +59,7 @@ struct vring_used_elem {
> >
> > struct vring_used {
> > uint16_t flags;
> >- volatile uint16_t idx;
> >+ uint16_t idx;
> > struct vring_used_elem ring[0];
> > };
> >
> >diff --git a/drivers/net/virtio/virtio_rxtx.c
> >b/drivers/net/virtio/virtio_rxtx.c
> >index 752faa0f6..9ba26fd95 100644
> >--- a/drivers/net/virtio/virtio_rxtx.c
> >+++ b/drivers/net/virtio/virtio_rxtx.c
> >@@ -45,7 +45,7 @@ virtio_dev_rx_queue_done(void *rxq, uint16_t offset)
> > struct virtnet_rx *rxvq = rxq;
> > struct virtqueue *vq = rxvq->vq;
> >
> >- return VIRTQUEUE_NUSED(vq) >= offset;
> >+ return virtqueue_nused(vq) >= offset;
> > }
> >
> > void
> >@@ -1243,9 +1243,8 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf
> **rx_pkts, uint16_t nb_pkts)
> > if (unlikely(hw->started == 0))
> > return nb_rx;
> >
> >- nb_used = VIRTQUEUE_NUSED(vq);
> >-
> >- virtio_rmb(hw->weak_barriers);
> >+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
>
> Small nit, I don't think we need to add this comment to every occurrence of
> virtqueue_nused, what about moving it to the definition of this function?
>
> Thanks,
> Xiaolong
Will make this change in v4.
Thanks,
Joyce
* [dpdk-dev] [PATCH v3 2/2] virtio: one way barrier for split vring avail idx
2020-02-12 9:24 [dpdk-dev] [PATCH v1 0/2] one way barrier for split vring idx Joyce Kong
` (6 preceding siblings ...)
2020-04-06 15:26 ` [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for split vring used idx Joyce Kong
@ 2020-04-06 15:26 ` Joyce Kong
2020-04-24 3:39 ` [dpdk-dev] [PATCH v4 0/2] one way barrier for split vring idx Joyce Kong
` (5 subsequent siblings)
13 siblings, 0 replies; 32+ messages in thread
From: Joyce Kong @ 2020-04-06 15:26 UTC (permalink / raw)
To: maxime.coquelin, stephen, tiwei.bie, zhihong.wang, thomas,
jerinj, yinan.wang, honnappa.nagarahalli, gavin.hu
Cc: nd, dev
If VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, the frontend and
backend are assumed to be implemented in software, that is, they can
run on identical CPUs in an SMP configuration.
Thus a weak form of memory barriers, like rte_smp_r/wmb rather than
rte_cio_r/wmb, is sufficient for this case (vq->hw->weak_barriers == 1)
and yields better performance.
For this case, the patch helps yield even better performance by
replacing the two-way barriers with C11 one-way barriers for the avail
index in the split ring.
Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
---
drivers/net/virtio/virtqueue.h | 20 ++++++++++++++++++--
lib/librte_vhost/virtio_net.c | 14 +++++---------
2 files changed, 23 insertions(+), 11 deletions(-)
diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
index 54dc63c93..a33b078c4 100644
--- a/drivers/net/virtio/virtqueue.h
+++ b/drivers/net/virtio/virtqueue.h
@@ -499,8 +499,24 @@ void vq_ring_free_inorder(struct virtqueue *vq, uint16_t desc_idx,
static inline void
vq_update_avail_idx(struct virtqueue *vq)
{
- virtio_wmb(vq->hw->weak_barriers);
- vq->vq_split.ring.avail->idx = vq->vq_avail_idx;
+ if (vq->hw->weak_barriers) {
+ /* x86 prefers using rte_smp_wmb over __atomic_store_n as
+ * it reports a slightly better perf, which comes from the
+ * saved branch by the compiler.
+ * The if and else branches are identical with the smp and
+ * cio barriers both defined as compiler barriers on x86.
+ */
+#ifdef RTE_ARCH_X86_64
+ rte_smp_wmb();
+ vq->vq_split.ring.avail->idx = vq->vq_avail_idx;
+#else
+ __atomic_store_n(&vq->vq_split.ring.avail->idx,
+ vq->vq_avail_idx, __ATOMIC_RELEASE);
+#endif
+ } else {
+ rte_cio_wmb();
+ vq->vq_split.ring.avail->idx = vq->vq_avail_idx;
+ }
}
static inline void
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 7f6e7f2c1..4c5380bc1 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -991,13 +991,11 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
struct buf_vector buf_vec[BUF_VECTOR_MAX];
uint16_t avail_head;
- avail_head = *((volatile uint16_t *)&vq->avail->idx);
-
/*
* The ordering between avail index and
* desc reads needs to be enforced.
*/
- rte_smp_rmb();
+ avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
@@ -1712,16 +1710,14 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
}
}
- free_entries = *((volatile uint16_t *)&vq->avail->idx) -
- vq->last_avail_idx;
- if (free_entries == 0)
- return 0;
-
/*
* The ordering between avail index and
* desc reads needs to be enforced.
*/
- rte_smp_rmb();
+ free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
+ vq->last_avail_idx;
+ if (free_entries == 0)
+ return 0;
rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
--
2.17.1
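Patch 2/2 mirrors the same idea in the other direction: the driver publishes
new descriptors with a store-release on avail->idx, and vhost consumes them
with a load-acquire. A minimal sketch with illustrative names, again assuming
GCC/Clang __atomic builtins:

#include <stdint.h>

struct avail_ring {
	uint16_t idx; /* index published by the driver */
};

/* Driver: descriptor setup happens-before the release store,
 * replacing the explicit virtio_wmb() before a plain store.
 */
static inline void expose_avail(struct avail_ring *r, uint16_t new_idx)
{
	__atomic_store_n(&r->idx, new_idx, __ATOMIC_RELEASE);
}

/* Vhost: the acquire load enforces the avail-index/descriptor read
 * ordering that the removed rte_smp_rmb() used to provide.
 */
static inline uint16_t avail_entries(const struct avail_ring *r,
		uint16_t last_avail_idx)
{
	return (uint16_t)(__atomic_load_n(&r->idx, __ATOMIC_ACQUIRE) -
			last_avail_idx);
}

Note how virtio_dev_tx_split can only compute free_entries after the acquire
load; that is why the early-return check moves below the ordering comment in
the diff above.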
* [dpdk-dev] [PATCH v4 0/2] one way barrier for split vring idx
2020-02-12 9:24 [dpdk-dev] [PATCH v1 0/2] one way barrier for split vring idx Joyce Kong
` (7 preceding siblings ...)
2020-04-06 15:26 ` [dpdk-dev] [PATCH v3 2/2] virtio: one way barrier for split vring avail idx Joyce Kong
@ 2020-04-24 3:39 ` Joyce Kong
2020-04-28 16:06 ` Maxime Coquelin
2020-04-29 17:45 ` Ferruh Yigit
2020-04-24 3:39 ` [dpdk-dev] [PATCH v4 1/2] virtio: one way barrier for split vring used idx Joyce Kong
` (4 subsequent siblings)
13 siblings, 2 replies; 32+ messages in thread
From: Joyce Kong @ 2020-04-24 3:39 UTC (permalink / raw)
To: maxime.coquelin, stephen, xiaolong.ye, tiwei.bie, zhihong.wang,
thomas, jerinj, yinan.wang, honnappa.nagarahalli, gavin.hu
Cc: nd, dev
This patchset replaces the two-way barriers with C11 one-way barriers
for the split vring idx when the frontend and backend are implemented
in software.
In PVP benchmarking, the 2c1q test showed a 20% throughput increase
at an acceptable loss rate of 0.001% on the Thunderx2 platform.[1]
In vhost-user + virtio-user benchmarking, a 4% performance gain was
measured on the Thunderx2 platform in a 2c1q RFC2544 test with 0.001%
loss, and a 4.7% gain was measured on a Dell platform in a 1c1q
RFC2544 test with zero packet loss.[2]
[1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html
[2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html
PVP test with virtio 1.0 normal path
v4:
Remove some duplicated code comments.
v3:
Fix some style errors.
v2:
Add test performance statistics.
Joyce Kong (2):
virtio: one way barrier for split vring used idx
virtio: one way barrier for split vring avail idx
drivers/net/virtio/virtio_ethdev.c | 7 +--
drivers/net/virtio/virtio_ring.h | 2 +-
drivers/net/virtio/virtio_rxtx.c | 35 ++++-------
drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +-
drivers/net/virtio/virtio_rxtx_simple_sse.c | 4 +-
.../net/virtio/virtio_user/virtio_user_dev.c | 8 ++-
drivers/net/virtio/virtqueue.c | 2 +-
drivers/net/virtio/virtqueue.h | 58 ++++++++++++++++---
lib/librte_vhost/virtio_net.c | 19 +++---
9 files changed, 81 insertions(+), 59 deletions(-)
--
2.17.1
* Re: [dpdk-dev] [PATCH v4 0/2] one way barrier for split vring idx
2020-04-24 3:39 ` [dpdk-dev] [PATCH v4 0/2] one way barrier for split vring idx Joyce Kong
@ 2020-04-28 16:06 ` Maxime Coquelin
2020-04-29 17:45 ` Ferruh Yigit
1 sibling, 0 replies; 32+ messages in thread
From: Maxime Coquelin @ 2020-04-28 16:06 UTC (permalink / raw)
To: Joyce Kong, stephen, xiaolong.ye, tiwei.bie, zhihong.wang,
thomas, jerinj, yinan.wang, honnappa.nagarahalli, gavin.hu
Cc: nd, dev
On 4/24/20 5:39 AM, Joyce Kong wrote:
> This patchset replaces the two-way barriers with C11 one-way barriers
> for split vring idx, when the frontend and backend are implemented
> in software.
>
> By doing PVP benchmarking, the test result of 2c1q showed the throughput
> increased 20% with the 0.001% of acceptable loss rate on Thunderx2
> platform.[1]
>
> By doing vhost-user + virtio-user case benchmarking, 4% performance gain
> was measured on Thunderx2 platform by 2c1q RFC2544 test of 0.001% loss,
> and 4.7% performance was improved on Dell platform by 1c1q RFC2544 test
> of zero packet loss.[2]
>
> [1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html
> [2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html
> PVP test with virtio 1.0 normal path
>
> v4:
> Remove some duplicated code comment.
>
> v3:
> Modify some style error.
>
> v2:
> Add test performance statistics.
>
> Joyce Kong (2):
> virtio: one way barrier for split vring used idx
> virtio: one way barrier for split vring avail idx
>
> drivers/net/virtio/virtio_ethdev.c | 7 +--
> drivers/net/virtio/virtio_ring.h | 2 +-
> drivers/net/virtio/virtio_rxtx.c | 35 ++++-------
> drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +-
> drivers/net/virtio/virtio_rxtx_simple_sse.c | 4 +-
> .../net/virtio/virtio_user/virtio_user_dev.c | 8 ++-
> drivers/net/virtio/virtqueue.c | 2 +-
> drivers/net/virtio/virtqueue.h | 58 ++++++++++++++++---
> lib/librte_vhost/virtio_net.c | 19 +++---
> 9 files changed, 81 insertions(+), 59 deletions(-)
>
Applied to dpdk-next-virtio/master
Thanks,
Maxime
* Re: [dpdk-dev] [PATCH v4 0/2] one way barrier for split vring idx
2020-04-24 3:39 ` [dpdk-dev] [PATCH v4 0/2] one way barrier for split vring idx Joyce Kong
2020-04-28 16:06 ` Maxime Coquelin
@ 2020-04-29 17:45 ` Ferruh Yigit
2020-04-30 9:09 ` Maxime Coquelin
1 sibling, 1 reply; 32+ messages in thread
From: Ferruh Yigit @ 2020-04-29 17:45 UTC (permalink / raw)
To: Joyce Kong, maxime.coquelin, stephen, xiaolong.ye, tiwei.bie,
zhihong.wang, thomas, jerinj, yinan.wang, honnappa.nagarahalli,
gavin.hu
Cc: nd, dev
On 4/24/2020 4:39 AM, Joyce Kong wrote:
> This patchset replaces the two-way barriers with C11 one-way barriers
> for split vring idx, when the frontend and backend are implemented
> in software.
>
> By doing PVP benchmarking, the test result of 2c1q showed the throughput
> increased 20% with the 0.001% of acceptable loss rate on Thunderx2
> platform.[1]
>
> By doing vhost-user + virtio-user case benchmarking, 4% performance gain
> was measured on Thunderx2 platform by 2c1q RFC2544 test of 0.001% loss,
> and 4.7% performance was improved on Dell platform by 1c1q RFC2544 test
> of zero packet loss.[2]
>
> [1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html
> [2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html
> PVP test with virtio 1.0 normal path
>
> v4:
> Remove some duplicated code comment.
>
> v3:
> Modify some style error.
>
> v2:
> Add test performance statistics.
>
> Joyce Kong (2):
> virtio: one way barrier for split vring used idx
> virtio: one way barrier for split vring avail idx
>
Hi Joyce,
When 'CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=y' is enabled, the build fails.
Can you please check it?
Thanks,
ferruh
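The failure is most likely in the VIRTQUEUE_DUMP hunk of patch 1/2: the new
__atomic_load_n call is split across two lines, and the second line drops the
trailing line-continuation backslash, so the macro body ends early and the
remaining lines fail to parse whenever the debug-dump config is compiled in.
A sketch of the corrected lines, inferred from the v4 diff rather than taken
from the actual v5 hunk:

#define VIRTQUEUE_DUMP(vq) do { \
	uint16_t used_idx, nused; \
	used_idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx, \
			__ATOMIC_RELAXED); \
	nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \
	/* ... the PMD_INIT_LOG lines continue unchanged ... */ \
} while (0)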
* Re: [dpdk-dev] [PATCH v4 0/2] one way barrier for split vring idx
2020-04-29 17:45 ` Ferruh Yigit
@ 2020-04-30 9:09 ` Maxime Coquelin
2020-04-30 9:16 ` Joyce Kong
0 siblings, 1 reply; 32+ messages in thread
From: Maxime Coquelin @ 2020-04-30 9:09 UTC (permalink / raw)
To: Ferruh Yigit, Joyce Kong, stephen, xiaolong.ye, tiwei.bie,
zhihong.wang, thomas, jerinj, yinan.wang, honnappa.nagarahalli,
gavin.hu
Cc: nd, dev
On 4/29/20 7:45 PM, Ferruh Yigit wrote:
> On 4/24/2020 4:39 AM, Joyce Kong wrote:
>> This patchset replaces the two-way barriers with C11 one-way barriers
>> for split vring idx, when the frontend and backend are implemented
>> in software.
>>
>> By doing PVP benchmarking, the test result of 2c1q showed the throughput
>> increased 20% with the 0.001% of acceptable loss rate on Thunderx2
>> platform.[1]
>>
>> By doing vhost-user + virtio-user case benchmarking, 4% performance gain
>> was measured on Thunderx2 platform by 2c1q RFC2544 test of 0.001% loss,
>> and 4.7% performance was improved on Dell platform by 1c1q RFC2544 test
>> of zero packet loss.[2]
>>
>> [1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html
>> [2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html
>> PVP test with virtio 1.0 normal path
>>
>> v4:
>> Remove some duplicated code comment.
>>
>> v3:
>> Modify some style error.
>>
>> v2:
>> Add test performance statistics.
>>
>> Joyce Kong (2):
>> virtio: one way barrier for split vring used idx
>> virtio: one way barrier for split vring avail idx
>>
>
> Hi Joyce,
>
> When 'CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=y' enabled, build fails. Can you
> please check it?
Hi Joyce,
I will need the fix today; just send a v5 for the series.
If it is not possible to do it today, please let me know.
Maxime
> Thanks,
> ferruh
>
* Re: [dpdk-dev] [PATCH v4 0/2] one way barrier for split vring idx
2020-04-30 9:09 ` Maxime Coquelin
@ 2020-04-30 9:16 ` Joyce Kong
2020-04-30 9:24 ` Maxime Coquelin
0 siblings, 1 reply; 32+ messages in thread
From: Joyce Kong @ 2020-04-30 9:16 UTC (permalink / raw)
To: Maxime Coquelin, Ferruh Yigit, stephen, xiaolong.ye, tiwei.bie,
zhihong.wang, thomas, jerinj, yinan.wang, Honnappa Nagarahalli,
Gavin Hu
Cc: nd, dev
> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Thursday, April 30, 2020 5:09 PM
> To: Ferruh Yigit <ferruh.yigit@intel.com>; Joyce Kong
> <Joyce.Kong@arm.com>; stephen@networkplumber.org;
> xiaolong.ye@intel.com; tiwei.bie@intel.com; zhihong.wang@intel.com;
> thomas@monjalon.net; jerinj@marvell.com; yinan.wang@intel.com;
> Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gavin Hu
> <Gavin.Hu@arm.com>
> Cc: nd <nd@arm.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v4 0/2] one way barrier for split vring idx
>
>
>
> On 4/29/20 7:45 PM, Ferruh Yigit wrote:
> > On 4/24/2020 4:39 AM, Joyce Kong wrote:
> >> This patchset replaces the two-way barriers with C11 one-way barriers
> >> for split vring idx, when the frontend and backend are implemented in
> >> software.
> >>
> >> By doing PVP benchmarking, the test result of 2c1q showed the
> >> throughput increased 20% with the 0.001% of acceptable loss rate on
> >> Thunderx2 platform.[1]
> >>
> >> By doing vhost-user + virtio-user case benchmarking, 4% performance
> >> gain was measured on Thunderx2 platform by 2c1q RFC2544 test of
> >> 0.001% loss, and 4.7% performance was improved on Dell platform by
> >> 1c1q RFC2544 test of zero packet loss.[2]
> >>
> >> [1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html
> >>
> [2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test
> _plan.html
> >> PVP test with virtio 1.0 normal path
> >>
> >> v4:
> >> Remove some duplicated code comment.
> >>
> >> v3:
> >> Modify some style error.
> >>
> >> v2:
> >> Add test performance statistics.
> >>
> >> Joyce Kong (2):
> >> virtio: one way barrier for split vring used idx
> >> virtio: one way barrier for split vring avail idx
> >>
> >
> > Hi Joyce,
> >
> > When 'CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=y' enabled, build fails.
> Can
> > you please check it?
>
> Hi Joyce,
>
> I will need the fix today, just send a v5 for the series.
> If not possible to do it today, please let me know.
>
> Maxime
>
Hi Maxime,
I have just fixed the issue and sent out v5; please review it.
Thanks.
>
> > Thanks,
> > ferruh
> >
* Re: [dpdk-dev] [PATCH v4 0/2] one way barrier for split vring idx
2020-04-30 9:16 ` Joyce Kong
@ 2020-04-30 9:24 ` Maxime Coquelin
0 siblings, 0 replies; 32+ messages in thread
From: Maxime Coquelin @ 2020-04-30 9:24 UTC (permalink / raw)
To: Joyce Kong, Ferruh Yigit, stephen, xiaolong.ye, tiwei.bie,
zhihong.wang, thomas, jerinj, yinan.wang, Honnappa Nagarahalli,
Gavin Hu
Cc: nd, dev
On 4/30/20 11:16 AM, Joyce Kong wrote:
>> -----Original Message-----
>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>> Sent: Thursday, April 30, 2020 5:09 PM
>> To: Ferruh Yigit <ferruh.yigit@intel.com>; Joyce Kong
>> <Joyce.Kong@arm.com>; stephen@networkplumber.org;
>> xiaolong.ye@intel.com; tiwei.bie@intel.com; zhihong.wang@intel.com;
>> thomas@monjalon.net; jerinj@marvell.com; yinan.wang@intel.com;
>> Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gavin Hu
>> <Gavin.Hu@arm.com>
>> Cc: nd <nd@arm.com>; dev@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH v4 0/2] one way barrier for split vring idx
>>
>>
>>
>> On 4/29/20 7:45 PM, Ferruh Yigit wrote:
>>> On 4/24/2020 4:39 AM, Joyce Kong wrote:
>>>> This patchset replaces the two-way barriers with C11 one-way barriers
>>>> for split vring idx, when the frontend and backend are implemented in
>>>> software.
>>>>
>>>> By doing PVP benchmarking, the test result of 2c1q showed the
>>>> throughput increased 20% with the 0.001% of acceptable loss rate on
>>>> Thunderx2 platform.[1]
>>>>
>>>> By doing vhost-user + virtio-user case benchmarking, 4% performance
>>>> gain was measured on Thunderx2 platform by 2c1q RFC2544 test of
>>>> 0.001% loss, and 4.7% performance was improved on Dell platform by
>>>> 1c1q RFC2544 test of zero packet loss.[2]
>>>>
>>>> [1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html
>>>>
>> [2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test
>> _plan.html
>>>> PVP test with virtio 1.0 normal path
>>>>
>>>> v4:
>>>> Remove some duplicated code comment.
>>>>
>>>> v3:
>>>> Modify some style error.
>>>>
>>>> v2:
>>>> Add test performance statistics.
>>>>
>>>> Joyce Kong (2):
>>>> virtio: one way barrier for split vring used idx
>>>> virtio: one way barrier for split vring avail idx
>>>>
>>>
>>> Hi Joyce,
>>>
>>> When 'CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=y' enabled, build fails.
>> Can
>>> you please check it?
>>
>> Hi Joyce,
>>
>> I will need the fix today, just send a v5 for the series.
>> If not possible to do it today, please let me know.
>>
>> Maxime
>>
> Hi Maxime,
> I have just fixed the issue and sent out the v5, please have a review.
> Thanks.
Thanks! That was fast :)
Maxime
>>
>>> Thanks,
>>> ferruh
>>>
>
* [dpdk-dev] [PATCH v4 1/2] virtio: one way barrier for split vring used idx
2020-02-12 9:24 [dpdk-dev] [PATCH v1 0/2] one way barrier for split vring idx Joyce Kong
` (8 preceding siblings ...)
2020-04-24 3:39 ` [dpdk-dev] [PATCH v4 0/2] one way barrier for split vring idx Joyce Kong
@ 2020-04-24 3:39 ` Joyce Kong
2020-04-27 9:03 ` Maxime Coquelin
2020-04-24 3:39 ` [dpdk-dev] [PATCH v4 2/2] virtio: one way barrier for split vring avail idx Joyce Kong
` (3 subsequent siblings)
13 siblings, 1 reply; 32+ messages in thread
From: Joyce Kong @ 2020-04-24 3:39 UTC (permalink / raw)
To: maxime.coquelin, stephen, xiaolong.ye, tiwei.bie, zhihong.wang,
thomas, jerinj, yinan.wang, honnappa.nagarahalli, gavin.hu
Cc: nd, dev
If VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, the frontend and
backend are assumed to be implemented in software, that is, they can
run on identical CPUs in an SMP configuration.
Thus a weak form of memory barriers, like rte_smp_r/wmb rather than
rte_cio_r/wmb, is sufficient for this case (vq->hw->weak_barriers == 1)
and yields better performance.
For this case, the patch helps yield even better performance by
replacing the two-way barriers with C11 one-way barriers for the used
index in the split ring.
Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
---
drivers/net/virtio/virtio_ethdev.c | 7 +---
drivers/net/virtio/virtio_ring.h | 2 +-
drivers/net/virtio/virtio_rxtx.c | 35 ++++++-----------
drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +--
drivers/net/virtio/virtio_rxtx_simple_sse.c | 4 +-
.../net/virtio/virtio_user/virtio_user_dev.c | 8 ++--
drivers/net/virtio/virtqueue.c | 2 +-
drivers/net/virtio/virtqueue.h | 38 ++++++++++++++++---
lib/librte_vhost/virtio_net.c | 5 +--
9 files changed, 58 insertions(+), 48 deletions(-)
diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
index f9d0ea70d..ae9f0c315 100644
--- a/drivers/net/virtio/virtio_ethdev.c
+++ b/drivers/net/virtio/virtio_ethdev.c
@@ -285,13 +285,10 @@ virtio_send_command_split(struct virtnet_ctl *cvq,
virtqueue_notify(vq);
- rte_rmb();
- while (VIRTQUEUE_NUSED(vq) == 0) {
- rte_rmb();
+ while (virtqueue_nused(vq) == 0)
usleep(100);
- }
- while (VIRTQUEUE_NUSED(vq)) {
+ while (virtqueue_nused(vq)) {
uint32_t idx, desc_idx, used_idx;
struct vring_used_elem *uep;
diff --git a/drivers/net/virtio/virtio_ring.h b/drivers/net/virtio/virtio_ring.h
index 7ba34662e..0f6574f68 100644
--- a/drivers/net/virtio/virtio_ring.h
+++ b/drivers/net/virtio/virtio_ring.h
@@ -59,7 +59,7 @@ struct vring_used_elem {
struct vring_used {
uint16_t flags;
- volatile uint16_t idx;
+ uint16_t idx;
struct vring_used_elem ring[0];
};
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index 752faa0f6..0beb5deed 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -45,7 +45,7 @@ virtio_dev_rx_queue_done(void *rxq, uint16_t offset)
struct virtnet_rx *rxvq = rxq;
struct virtqueue *vq = rxvq->vq;
- return VIRTQUEUE_NUSED(vq) >= offset;
+ return virtqueue_nused(vq) >= offset;
}
void
@@ -1243,9 +1243,7 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
if (unlikely(hw->started == 0))
return nb_rx;
- nb_used = VIRTQUEUE_NUSED(vq);
-
- virtio_rmb(hw->weak_barriers);
+ nb_used = virtqueue_nused(vq);
num = likely(nb_used <= nb_pkts) ? nb_used : nb_pkts;
if (unlikely(num > VIRTIO_MBUF_BURST_SZ))
@@ -1458,12 +1456,10 @@ virtio_recv_pkts_inorder(void *rx_queue,
if (unlikely(hw->started == 0))
return nb_rx;
- nb_used = VIRTQUEUE_NUSED(vq);
+ nb_used = virtqueue_nused(vq);
nb_used = RTE_MIN(nb_used, nb_pkts);
nb_used = RTE_MIN(nb_used, VIRTIO_MBUF_BURST_SZ);
- virtio_rmb(hw->weak_barriers);
-
PMD_RX_LOG(DEBUG, "used:%d", nb_used);
nb_enqueued = 0;
@@ -1552,8 +1548,7 @@ virtio_recv_pkts_inorder(void *rx_queue,
uint16_t rcv_cnt = RTE_MIN((uint16_t)seg_res,
VIRTIO_MBUF_BURST_SZ);
- if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) {
- virtio_rmb(hw->weak_barriers);
+ if (likely(virtqueue_nused(vq) >= rcv_cnt)) {
num = virtqueue_dequeue_rx_inorder(vq, rcv_pkts, len,
rcv_cnt);
uint16_t extra_idx = 0;
@@ -1644,9 +1639,7 @@ virtio_recv_mergeable_pkts(void *rx_queue,
if (unlikely(hw->started == 0))
return nb_rx;
- nb_used = VIRTQUEUE_NUSED(vq);
-
- virtio_rmb(hw->weak_barriers);
+ nb_used = virtqueue_nused(vq);
PMD_RX_LOG(DEBUG, "used:%d", nb_used);
@@ -1734,8 +1727,7 @@ virtio_recv_mergeable_pkts(void *rx_queue,
uint16_t rcv_cnt = RTE_MIN((uint16_t)seg_res,
VIRTIO_MBUF_BURST_SZ);
- if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) {
- virtio_rmb(hw->weak_barriers);
+ if (likely(virtqueue_nused(vq) >= rcv_cnt)) {
num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len,
rcv_cnt);
uint16_t extra_idx = 0;
@@ -2108,9 +2100,9 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
return nb_pkts;
PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
- nb_used = VIRTQUEUE_NUSED(vq);
- virtio_rmb(hw->weak_barriers);
+ nb_used = virtqueue_nused(vq);
+
if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
virtio_xmit_cleanup(vq, nb_used);
@@ -2142,8 +2134,8 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
/* Positive value indicates it need free vring descriptors */
if (unlikely(need > 0)) {
- nb_used = VIRTQUEUE_NUSED(vq);
- virtio_rmb(hw->weak_barriers);
+ nb_used = virtqueue_nused(vq);
+
need = RTE_MIN(need, (int)nb_used);
virtio_xmit_cleanup(vq, need);
@@ -2180,11 +2172,9 @@ static __rte_always_inline int
virtio_xmit_try_cleanup_inorder(struct virtqueue *vq, uint16_t need)
{
uint16_t nb_used, nb_clean, nb_descs;
- struct virtio_hw *hw = vq->hw;
nb_descs = vq->vq_free_cnt + need;
- nb_used = VIRTQUEUE_NUSED(vq);
- virtio_rmb(hw->weak_barriers);
+ nb_used = virtqueue_nused(vq);
nb_clean = RTE_MIN(need, (int)nb_used);
virtio_xmit_cleanup_inorder(vq, nb_clean);
@@ -2213,9 +2203,8 @@ virtio_xmit_pkts_inorder(void *tx_queue,
VIRTQUEUE_DUMP(vq);
PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
- nb_used = VIRTQUEUE_NUSED(vq);
+ nb_used = virtqueue_nused(vq);
- virtio_rmb(hw->weak_barriers);
if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
virtio_xmit_cleanup_inorder(vq, nb_used);
diff --git a/drivers/net/virtio/virtio_rxtx_simple_neon.c b/drivers/net/virtio/virtio_rxtx_simple_neon.c
index 992e71f01..363e2b330 100644
--- a/drivers/net/virtio/virtio_rxtx_simple_neon.c
+++ b/drivers/net/virtio/virtio_rxtx_simple_neon.c
@@ -83,9 +83,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
return 0;
- nb_used = VIRTQUEUE_NUSED(vq);
-
- rte_rmb();
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
if (unlikely(nb_used == 0))
return 0;
diff --git a/drivers/net/virtio/virtio_rxtx_simple_sse.c b/drivers/net/virtio/virtio_rxtx_simple_sse.c
index f9ec4ae69..1056e9c20 100644
--- a/drivers/net/virtio/virtio_rxtx_simple_sse.c
+++ b/drivers/net/virtio/virtio_rxtx_simple_sse.c
@@ -85,9 +85,7 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
return 0;
- nb_used = VIRTQUEUE_NUSED(vq);
-
- rte_compiler_barrier();
+ nb_used = virtqueue_nused(vq);
if (unlikely(nb_used == 0))
return 0;
diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c
index 1c6b26f8d..7fb135f49 100644
--- a/drivers/net/virtio/virtio_user/virtio_user_dev.c
+++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c
@@ -730,8 +730,10 @@ virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx)
struct vring *vring = &dev->vrings[queue_idx];
/* Consume avail ring, using used ring idx as first one */
- while (vring->used->idx != vring->avail->idx) {
- avail_idx = (vring->used->idx) & (vring->num - 1);
+ while (__atomic_load_n(&vring->used->idx, __ATOMIC_RELAXED)
+ != vring->avail->idx) {
+ avail_idx = __atomic_load_n(&vring->used->idx, __ATOMIC_RELAXED)
+ & (vring->num - 1);
desc_idx = vring->avail->ring[avail_idx];
n_descs = virtio_user_handle_ctrl_msg(dev, vring, desc_idx);
@@ -741,6 +743,6 @@ virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx)
uep->id = desc_idx;
uep->len = n_descs;
- vring->used->idx++;
+ __atomic_add_fetch(&vring->used->idx, 1, __ATOMIC_RELAXED);
}
}
diff --git a/drivers/net/virtio/virtqueue.c b/drivers/net/virtio/virtqueue.c
index 0b4e3bf3e..b0f61dabc 100644
--- a/drivers/net/virtio/virtqueue.c
+++ b/drivers/net/virtio/virtqueue.c
@@ -92,7 +92,7 @@ virtqueue_rxvq_flush_split(struct virtqueue *vq)
uint16_t used_idx, desc_idx;
uint16_t nb_used, i;
- nb_used = VIRTQUEUE_NUSED(vq);
+ nb_used = virtqueue_nused(vq);
for (i = 0; i < nb_used; i++) {
used_idx = vq->vq_used_cons_idx & (vq->vq_nentries - 1);
diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
index e3d38b5a6..f1815d3f4 100644
--- a/drivers/net/virtio/virtqueue.h
+++ b/drivers/net/virtio/virtqueue.h
@@ -464,8 +464,33 @@ virtio_get_queue_type(struct virtio_hw *hw, uint16_t vtpci_queue_idx)
return VTNET_TQ;
}
-#define VIRTQUEUE_NUSED(vq) ((uint16_t)((vq)->vq_split.ring.used->idx - \
- (vq)->vq_used_cons_idx))
+/* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+static inline uint16_t
+virtqueue_nused(const struct virtqueue *vq)
+{
+ uint16_t idx;
+
+ if (vq->hw->weak_barriers) {
+ /**
+ * x86 prefers using rte_smp_rmb over __atomic_load_n as it
+ * reports a slightly better perf, which comes from the saved
+ * branch by the compiler.
+ * The if and else branches are identical with the smp and cio
+ * barriers both defined as compiler barriers on x86.
+ */
+#ifdef RTE_ARCH_X86_64
+ idx = vq->vq_split.ring.used->idx;
+ rte_smp_rmb();
+#else
+ idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx,
+ __ATOMIC_ACQUIRE);
+#endif
+ } else {
+ idx = vq->vq_split.ring.used->idx;
+ rte_cio_rmb();
+ }
+ return idx - vq->vq_used_cons_idx;
+}
void vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx);
void vq_ring_free_chain_packed(struct virtqueue *vq, uint16_t used_idx);
@@ -534,7 +559,8 @@ virtqueue_notify(struct virtqueue *vq)
#ifdef RTE_LIBRTE_VIRTIO_DEBUG_DUMP
#define VIRTQUEUE_DUMP(vq) do { \
uint16_t used_idx, nused; \
- used_idx = (vq)->vq_split.ring.used->idx; \
+ used_idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx, \
+ __ATOMIC_RELAXED);
nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \
if (vtpci_packed_queue((vq)->hw)) { \
PMD_INIT_LOG(DEBUG, \
@@ -549,9 +575,9 @@ virtqueue_notify(struct virtqueue *vq)
"VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \
" avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \
" avail.flags=0x%x; used.flags=0x%x", \
- (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \
- (vq)->vq_desc_head_idx, (vq)->vq_split.ring.avail->idx, \
- (vq)->vq_used_cons_idx, (vq)->vq_split.ring.used->idx, \
+ (vq)->vq_nentries, (vq)->vq_free_cnt, nused, (vq)->vq_desc_head_idx, \
+ (vq)->vq_split.ring.avail->idx, (vq)->vq_used_cons_idx, \
+ __atomic_load_n(&(vq)->vq_split.ring.used->idx, __ATOMIC_RELAXED), \
(vq)->vq_split.ring.avail->flags, (vq)->vq_split.ring.used->flags); \
} while (0)
#else
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 37c47c7dc..7f6e7f2c1 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -77,11 +77,10 @@ flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
}
vq->last_used_idx += vq->shadow_used_idx;
- rte_smp_wmb();
-
vhost_log_cache_sync(dev, vq);
- *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
+ __atomic_add_fetch(&vq->used->idx, vq->shadow_used_idx,
+ __ATOMIC_RELEASE);
vq->shadow_used_idx = 0;
vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
sizeof(vq->used->idx));
--
2.17.1
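One detail that is easy to miss in the virtio_user hunk: the accesses to
vring->used->idx become __ATOMIC_RELAXED rather than acquire/release. Since
the volatile qualifier is dropped from the idx field, something must still
keep the compiler from tearing or caching the 16-bit access; a relaxed atomic
does exactly that without imposing ordering the control-queue path does not
appear to need. A minimal sketch of the idiom, with illustrative names:

#include <stdint.h>

/* Relaxed atomics stand in for 'volatile': each access is untorn
 * and actually performed, but no ordering is imposed on the
 * surrounding loads and stores.
 */
static inline uint16_t idx_read(const uint16_t *idx)
{
	return __atomic_load_n(idx, __ATOMIC_RELAXED);
}

static inline void idx_bump(uint16_t *idx)
{
	__atomic_add_fetch(idx, 1, __ATOMIC_RELAXED);
}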
* Re: [dpdk-dev] [PATCH v4 1/2] virtio: one way barrier for split vring used idx
2020-04-24 3:39 ` [dpdk-dev] [PATCH v4 1/2] virtio: one way barrier for split vring used idx Joyce Kong
@ 2020-04-27 9:03 ` Maxime Coquelin
0 siblings, 0 replies; 32+ messages in thread
From: Maxime Coquelin @ 2020-04-27 9:03 UTC (permalink / raw)
To: Joyce Kong, stephen, xiaolong.ye, tiwei.bie, zhihong.wang,
thomas, jerinj, yinan.wang, honnappa.nagarahalli, gavin.hu
Cc: nd, dev
On 4/24/20 5:39 AM, Joyce Kong wrote:
> In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend
> and backend are assumed to be implemented in software, that is they can
> run on identical CPUs in an SMP configuration.
> Thus a weak form of memory barriers like rte_smp_r/wmb, other than
> rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
> and yields better performance.
> For the above case, this patch helps yielding even better performance
> by replacing the two-way barriers with C11 one-way barriers for used
> index in split ring.
>
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> ---
> drivers/net/virtio/virtio_ethdev.c | 7 +---
> drivers/net/virtio/virtio_ring.h | 2 +-
> drivers/net/virtio/virtio_rxtx.c | 35 ++++++-----------
> drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +--
> drivers/net/virtio/virtio_rxtx_simple_sse.c | 4 +-
> .../net/virtio/virtio_user/virtio_user_dev.c | 8 ++--
> drivers/net/virtio/virtqueue.c | 2 +-
> drivers/net/virtio/virtqueue.h | 38 ++++++++++++++++---
> lib/librte_vhost/virtio_net.c | 5 +--
> 9 files changed, 58 insertions(+), 48 deletions(-)
>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Thanks,
Maxime
* [dpdk-dev] [PATCH v4 2/2] virtio: one way barrier for split vring avail idx
2020-02-12 9:24 [dpdk-dev] [PATCH v1 0/2] one way barrier for split vring idx Joyce Kong
` (9 preceding siblings ...)
2020-04-24 3:39 ` [dpdk-dev] [PATCH v4 1/2] virtio: one way barrier for split vring used idx Joyce Kong
@ 2020-04-24 3:39 ` Joyce Kong
2020-04-27 9:03 ` Maxime Coquelin
2020-04-30 9:14 ` [dpdk-dev] [PATCH v5 0/2] one way barrier for split vring idx Joyce Kong
` (2 subsequent siblings)
13 siblings, 1 reply; 32+ messages in thread
From: Joyce Kong @ 2020-04-24 3:39 UTC (permalink / raw)
To: maxime.coquelin, stephen, xiaolong.ye, tiwei.bie, zhihong.wang,
thomas, jerinj, yinan.wang, honnappa.nagarahalli, gavin.hu
Cc: nd, dev
If VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, the frontend and
backend are assumed to be implemented in software, that is, they can
run on identical CPUs in an SMP configuration.
Thus a weak form of memory barriers, like rte_smp_r/wmb rather than
rte_cio_r/wmb, is sufficient for this case (vq->hw->weak_barriers == 1)
and yields better performance.
For this case, the patch helps yield even better performance by
replacing the two-way barriers with C11 one-way barriers for the avail
index in the split ring.
Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
---
drivers/net/virtio/virtqueue.h | 20 ++++++++++++++++++--
lib/librte_vhost/virtio_net.c | 14 +++++---------
2 files changed, 23 insertions(+), 11 deletions(-)
diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
index f1815d3f4..131ea71e7 100644
--- a/drivers/net/virtio/virtqueue.h
+++ b/drivers/net/virtio/virtqueue.h
@@ -500,8 +500,24 @@ void vq_ring_free_inorder(struct virtqueue *vq, uint16_t desc_idx,
static inline void
vq_update_avail_idx(struct virtqueue *vq)
{
- virtio_wmb(vq->hw->weak_barriers);
- vq->vq_split.ring.avail->idx = vq->vq_avail_idx;
+ if (vq->hw->weak_barriers) {
+ /* x86 prefers using rte_smp_wmb over __atomic_store_n as
+ * it reports a slightly better perf, which comes from the
+ * saved branch by the compiler.
+ * The if and else branches are identical with the smp and
+ * cio barriers both defined as compiler barriers on x86.
+ */
+#ifdef RTE_ARCH_X86_64
+ rte_smp_wmb();
+ vq->vq_split.ring.avail->idx = vq->vq_avail_idx;
+#else
+ __atomic_store_n(&vq->vq_split.ring.avail->idx,
+ vq->vq_avail_idx, __ATOMIC_RELEASE);
+#endif
+ } else {
+ rte_cio_wmb();
+ vq->vq_split.ring.avail->idx = vq->vq_avail_idx;
+ }
}
static inline void
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 7f6e7f2c1..4c5380bc1 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -991,13 +991,11 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
struct buf_vector buf_vec[BUF_VECTOR_MAX];
uint16_t avail_head;
- avail_head = *((volatile uint16_t *)&vq->avail->idx);
-
/*
* The ordering between avail index and
* desc reads needs to be enforced.
*/
- rte_smp_rmb();
+ avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
@@ -1712,16 +1710,14 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
}
}
- free_entries = *((volatile uint16_t *)&vq->avail->idx) -
- vq->last_avail_idx;
- if (free_entries == 0)
- return 0;
-
/*
* The ordering between avail index and
* desc reads needs to be enforced.
*/
- rte_smp_rmb();
+ free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
+ vq->last_avail_idx;
+ if (free_entries == 0)
+ return 0;
rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
--
2.17.1
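On the x86-specific branch in the comments above: rte_smp_r/wmb and
rte_cio_r/wmb all expand to plain compiler barriers on x86, where TSO already
provides the required load/store ordering, so the rte_smp_* path under
RTE_ARCH_X86_64 gives the same semantics while letting the compiler collapse
the weak_barriers branch. A sketch of what the store side effectively reduces
to (illustrative names, not the DPDK source):

#include <stdint.h>

struct avail_ring {
	uint16_t idx;
};

static inline void update_avail_idx(struct avail_ring *r, uint16_t new_idx)
{
#ifdef RTE_ARCH_X86_64
	/* rte_smp_wmb() is only a compiler barrier on x86 (TSO) */
	__asm__ volatile ("" ::: "memory");
	r->idx = new_idx;
#else
	/* elsewhere a real release store is needed */
	__atomic_store_n(&r->idx, new_idx, __ATOMIC_RELEASE);
#endif
}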
* Re: [dpdk-dev] [PATCH v4 2/2] virtio: one way barrier for split vring avail idx
2020-04-24 3:39 ` [dpdk-dev] [PATCH v4 2/2] virtio: one way barrier for split vring avail idx Joyce Kong
@ 2020-04-27 9:03 ` Maxime Coquelin
0 siblings, 0 replies; 32+ messages in thread
From: Maxime Coquelin @ 2020-04-27 9:03 UTC (permalink / raw)
To: Joyce Kong, stephen, xiaolong.ye, tiwei.bie, zhihong.wang,
thomas, jerinj, yinan.wang, honnappa.nagarahalli, gavin.hu
Cc: nd, dev
On 4/24/20 5:39 AM, Joyce Kong wrote:
> In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend
> and backend are assumed to be implemented in software, that is they can
> run on identical CPUs in an SMP configuration.
> Thus a weak form of memory barriers like rte_smp_r/wmb, other than
> rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
> and yields better performance.
> For the above case, this patch helps yielding even better performance
> by replacing the two-way barriers with C11 one-way barriers for avail
> index in split ring.
>
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> ---
> drivers/net/virtio/virtqueue.h | 20 ++++++++++++++++++--
> lib/librte_vhost/virtio_net.c | 14 +++++---------
> 2 files changed, 23 insertions(+), 11 deletions(-)
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Thanks,
Maxime
* [dpdk-dev] [PATCH v5 0/2] one way barrier for split vring idx
2020-02-12 9:24 [dpdk-dev] [PATCH v1 0/2] one way barrier for split vring idx Joyce Kong
` (10 preceding siblings ...)
2020-04-24 3:39 ` [dpdk-dev] [PATCH v4 2/2] virtio: one way barrier for split vring avail idx Joyce Kong
@ 2020-04-30 9:14 ` Joyce Kong
2020-04-30 20:54 ` Maxime Coquelin
2020-04-30 9:14 ` [dpdk-dev] [PATCH v5 1/2] virtio: one way barrier for split vring used idx Joyce Kong
2020-04-30 9:14 ` [dpdk-dev] [PATCH v5 2/2] virtio: one way barrier for split vring avail idx Joyce Kong
13 siblings, 1 reply; 32+ messages in thread
From: Joyce Kong @ 2020-04-30 9:14 UTC (permalink / raw)
To: maxime.coquelin, stephen, xiaolong.ye, zhihong.wang, thomas,
jerinj, yinan.wang, ferruh.yigit, honnappa.nagarahalli, gavin.hu
Cc: nd, dev
This patchset replaces the two-way barriers with C11 one-way barriers
for the split vring idx when the frontend and backend are implemented
in software.
In PVP benchmarking, the 2c1q test showed a 20% throughput increase
at an acceptable loss rate of 0.001% on the Thunderx2 platform.[1]
In vhost-user + virtio-user benchmarking, a 4% performance gain was
measured on the Thunderx2 platform in a 2c1q RFC2544 test with 0.001%
loss, and a 4.7% gain was measured on a Dell platform in a 1c1q
RFC2544 test with zero packet loss.[2]
[1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html
[2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html
PVP test with virtio 1.0 normal path
v5:
Fix the compiling issue when 'CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=y'
is enabled.
v4:
Remove some duplicated code comments.
v3:
Fix some style errors.
v2:
Add test performance statistics.
Joyce Kong (2):
virtio: one way barrier for split vring used idx
virtio: one way barrier for split vring avail idx
drivers/net/virtio/virtio_ethdev.c | 7 +--
drivers/net/virtio/virtio_ring.h | 2 +-
drivers/net/virtio/virtio_rxtx.c | 35 ++++-------
drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +-
drivers/net/virtio/virtio_rxtx_simple_sse.c | 4 +-
.../net/virtio/virtio_user/virtio_user_dev.c | 8 ++-
drivers/net/virtio/virtqueue.c | 2 +-
drivers/net/virtio/virtqueue.h | 58 ++++++++++++++++---
lib/librte_vhost/virtio_net.c | 19 +++---
9 files changed, 81 insertions(+), 59 deletions(-)
--
2.17.1
* Re: [dpdk-dev] [PATCH v5 0/2] one way barrier for split vring idx
2020-04-30 9:14 ` [dpdk-dev] [PATCH v5 0/2] one way barrier for split vring idx Joyce Kong
@ 2020-04-30 20:54 ` Maxime Coquelin
0 siblings, 0 replies; 32+ messages in thread
From: Maxime Coquelin @ 2020-04-30 20:54 UTC (permalink / raw)
To: Joyce Kong, stephen, xiaolong.ye, zhihong.wang, thomas, jerinj,
yinan.wang, ferruh.yigit, honnappa.nagarahalli, gavin.hu
Cc: nd, dev
On 4/30/20 11:14 AM, Joyce Kong wrote:
> This patchset replaces the two-way barriers with C11 one-way barriers
> for split vring idx, when the frontend and backend are implemented
> in software.
>
> By doing PVP benchmarking, the test result of 2c1q showed the throughput
> increased 20% with the 0.001% of acceptable loss rate on Thunderx2
> platform.[1]
>
> By doing vhost-user + virtio-user case benchmarking, 4% performance gain
> was measured on Thunderx2 platform by 2c1q RFC2544 test of 0.001% loss,
> and 4.7% performance was improved on Dell platform by 1c1q RFC2544 test
> of zero packet loss.[2]
>
> [1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html
> [2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html
> PVP test with virtio 1.0 normal path
>
> v5:
> Fix the compling issue when 'CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=y'
> is enabled.
>
> v4:
> Remove some duplicated code comment.
>
> v3:
> Modify some style error.
>
> v2:
> Add test performance statistics.
>
> Joyce Kong (2):
> virtio: one way barrier for split vring used idx
> virtio: one way barrier for split vring avail idx
>
> drivers/net/virtio/virtio_ethdev.c | 7 +--
> drivers/net/virtio/virtio_ring.h | 2 +-
> drivers/net/virtio/virtio_rxtx.c | 35 ++++-------
> drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +-
> drivers/net/virtio/virtio_rxtx_simple_sse.c | 4 +-
> .../net/virtio/virtio_user/virtio_user_dev.c | 8 ++-
> drivers/net/virtio/virtqueue.c | 2 +-
> drivers/net/virtio/virtqueue.h | 58 ++++++++++++++++---
> lib/librte_vhost/virtio_net.c | 19 +++---
> 9 files changed, 81 insertions(+), 59 deletions(-)
>
Applied to dpdk-next-virtio/master.
Thanks,
Maxime
* [dpdk-dev] [PATCH v5 1/2] virtio: one way barrier for split vring used idx
2020-02-12 9:24 [dpdk-dev] [PATCH v1 0/2] one way barrier for split vring idx Joyce Kong
` (11 preceding siblings ...)
2020-04-30 9:14 ` [dpdk-dev] [PATCH v5 0/2] one way barrier for split vring idx Joyce Kong
@ 2020-04-30 9:14 ` Joyce Kong
2020-04-30 22:58 ` Ferruh Yigit
2020-04-30 9:14 ` [dpdk-dev] [PATCH v5 2/2] virtio: one way barrier for split vring avail idx Joyce Kong
13 siblings, 1 reply; 32+ messages in thread
From: Joyce Kong @ 2020-04-30 9:14 UTC (permalink / raw)
To: maxime.coquelin, stephen, xiaolong.ye, zhihong.wang, thomas,
jerinj, yinan.wang, ferruh.yigit, honnappa.nagarahalli, gavin.hu
Cc: nd, dev
If VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, the frontend and
backend are assumed to be implemented in software, that is, they can
run on identical CPUs in an SMP configuration.
Thus a weak form of memory barriers, like rte_smp_r/wmb rather than
rte_cio_r/wmb, is sufficient for this case (vq->hw->weak_barriers == 1)
and yields better performance.
For this case, the patch helps yield even better performance by
replacing the two-way barriers with C11 one-way barriers for the used
index in the split ring.
Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
drivers/net/virtio/virtio_ethdev.c | 7 +---
drivers/net/virtio/virtio_ring.h | 2 +-
drivers/net/virtio/virtio_rxtx.c | 35 ++++++-----------
drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +--
drivers/net/virtio/virtio_rxtx_simple_sse.c | 4 +-
.../net/virtio/virtio_user/virtio_user_dev.c | 8 ++--
drivers/net/virtio/virtqueue.c | 2 +-
drivers/net/virtio/virtqueue.h | 38 ++++++++++++++++---
lib/librte_vhost/virtio_net.c | 5 +--
9 files changed, 58 insertions(+), 48 deletions(-)
diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
index 37766cbb6..c36404dc6 100644
--- a/drivers/net/virtio/virtio_ethdev.c
+++ b/drivers/net/virtio/virtio_ethdev.c
@@ -289,13 +289,10 @@ virtio_send_command_split(struct virtnet_ctl *cvq,
virtqueue_notify(vq);
- rte_rmb();
- while (VIRTQUEUE_NUSED(vq) == 0) {
- rte_rmb();
+ while (virtqueue_nused(vq) == 0)
usleep(100);
- }
- while (VIRTQUEUE_NUSED(vq)) {
+ while (virtqueue_nused(vq)) {
uint32_t idx, desc_idx, used_idx;
struct vring_used_elem *uep;
diff --git a/drivers/net/virtio/virtio_ring.h b/drivers/net/virtio/virtio_ring.h
index 7ba34662e..0f6574f68 100644
--- a/drivers/net/virtio/virtio_ring.h
+++ b/drivers/net/virtio/virtio_ring.h
@@ -59,7 +59,7 @@ struct vring_used_elem {
struct vring_used {
uint16_t flags;
- volatile uint16_t idx;
+ uint16_t idx;
struct vring_used_elem ring[0];
};
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index 060410577..25703e616 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -45,7 +45,7 @@ virtio_dev_rx_queue_done(void *rxq, uint16_t offset)
struct virtnet_rx *rxvq = rxq;
struct virtqueue *vq = rxvq->vq;
- return VIRTQUEUE_NUSED(vq) >= offset;
+ return virtqueue_nused(vq) >= offset;
}
void
@@ -1243,9 +1243,7 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
if (unlikely(hw->started == 0))
return nb_rx;
- nb_used = VIRTQUEUE_NUSED(vq);
-
- virtio_rmb(hw->weak_barriers);
+ nb_used = virtqueue_nused(vq);
num = likely(nb_used <= nb_pkts) ? nb_used : nb_pkts;
if (unlikely(num > VIRTIO_MBUF_BURST_SZ))
@@ -1458,12 +1456,10 @@ virtio_recv_pkts_inorder(void *rx_queue,
if (unlikely(hw->started == 0))
return nb_rx;
- nb_used = VIRTQUEUE_NUSED(vq);
+ nb_used = virtqueue_nused(vq);
nb_used = RTE_MIN(nb_used, nb_pkts);
nb_used = RTE_MIN(nb_used, VIRTIO_MBUF_BURST_SZ);
- virtio_rmb(hw->weak_barriers);
-
PMD_RX_LOG(DEBUG, "used:%d", nb_used);
nb_enqueued = 0;
@@ -1552,8 +1548,7 @@ virtio_recv_pkts_inorder(void *rx_queue,
uint16_t rcv_cnt = RTE_MIN((uint16_t)seg_res,
VIRTIO_MBUF_BURST_SZ);
- if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) {
- virtio_rmb(hw->weak_barriers);
+ if (likely(virtqueue_nused(vq) >= rcv_cnt)) {
num = virtqueue_dequeue_rx_inorder(vq, rcv_pkts, len,
rcv_cnt);
uint16_t extra_idx = 0;
@@ -1644,9 +1639,7 @@ virtio_recv_mergeable_pkts(void *rx_queue,
if (unlikely(hw->started == 0))
return nb_rx;
- nb_used = VIRTQUEUE_NUSED(vq);
-
- virtio_rmb(hw->weak_barriers);
+ nb_used = virtqueue_nused(vq);
PMD_RX_LOG(DEBUG, "used:%d", nb_used);
@@ -1734,8 +1727,7 @@ virtio_recv_mergeable_pkts(void *rx_queue,
uint16_t rcv_cnt = RTE_MIN((uint16_t)seg_res,
VIRTIO_MBUF_BURST_SZ);
- if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) {
- virtio_rmb(hw->weak_barriers);
+ if (likely(virtqueue_nused(vq) >= rcv_cnt)) {
num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len,
rcv_cnt);
uint16_t extra_idx = 0;
@@ -2108,9 +2100,9 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
return nb_pkts;
PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
- nb_used = VIRTQUEUE_NUSED(vq);
- virtio_rmb(hw->weak_barriers);
+ nb_used = virtqueue_nused(vq);
+
if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
virtio_xmit_cleanup(vq, nb_used);
@@ -2142,8 +2134,8 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
/* Positive value indicates it need free vring descriptors */
if (unlikely(need > 0)) {
- nb_used = VIRTQUEUE_NUSED(vq);
- virtio_rmb(hw->weak_barriers);
+ nb_used = virtqueue_nused(vq);
+
need = RTE_MIN(need, (int)nb_used);
virtio_xmit_cleanup(vq, need);
@@ -2180,11 +2172,9 @@ static __rte_always_inline int
virtio_xmit_try_cleanup_inorder(struct virtqueue *vq, uint16_t need)
{
uint16_t nb_used, nb_clean, nb_descs;
- struct virtio_hw *hw = vq->hw;
nb_descs = vq->vq_free_cnt + need;
- nb_used = VIRTQUEUE_NUSED(vq);
- virtio_rmb(hw->weak_barriers);
+ nb_used = virtqueue_nused(vq);
nb_clean = RTE_MIN(need, (int)nb_used);
virtio_xmit_cleanup_inorder(vq, nb_clean);
@@ -2213,9 +2203,8 @@ virtio_xmit_pkts_inorder(void *tx_queue,
VIRTQUEUE_DUMP(vq);
PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
- nb_used = VIRTQUEUE_NUSED(vq);
+ nb_used = virtqueue_nused(vq);
- virtio_rmb(hw->weak_barriers);
if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
virtio_xmit_cleanup_inorder(vq, nb_used);
diff --git a/drivers/net/virtio/virtio_rxtx_simple_neon.c b/drivers/net/virtio/virtio_rxtx_simple_neon.c
index 992e71f01..363e2b330 100644
--- a/drivers/net/virtio/virtio_rxtx_simple_neon.c
+++ b/drivers/net/virtio/virtio_rxtx_simple_neon.c
@@ -83,9 +83,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
return 0;
- nb_used = VIRTQUEUE_NUSED(vq);
-
- rte_rmb();
+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+ nb_used = virtqueue_nused(vq);
if (unlikely(nb_used == 0))
return 0;
diff --git a/drivers/net/virtio/virtio_rxtx_simple_sse.c b/drivers/net/virtio/virtio_rxtx_simple_sse.c
index f9ec4ae69..1056e9c20 100644
--- a/drivers/net/virtio/virtio_rxtx_simple_sse.c
+++ b/drivers/net/virtio/virtio_rxtx_simple_sse.c
@@ -85,9 +85,7 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
return 0;
- nb_used = VIRTQUEUE_NUSED(vq);
-
- rte_compiler_barrier();
+ nb_used = virtqueue_nused(vq);
if (unlikely(nb_used == 0))
return 0;
diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c
index 1c6b26f8d..7fb135f49 100644
--- a/drivers/net/virtio/virtio_user/virtio_user_dev.c
+++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c
@@ -730,8 +730,10 @@ virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx)
struct vring *vring = &dev->vrings[queue_idx];
/* Consume avail ring, using used ring idx as first one */
- while (vring->used->idx != vring->avail->idx) {
- avail_idx = (vring->used->idx) & (vring->num - 1);
+ while (__atomic_load_n(&vring->used->idx, __ATOMIC_RELAXED)
+ != vring->avail->idx) {
+ avail_idx = __atomic_load_n(&vring->used->idx, __ATOMIC_RELAXED)
+ & (vring->num - 1);
desc_idx = vring->avail->ring[avail_idx];
n_descs = virtio_user_handle_ctrl_msg(dev, vring, desc_idx);
@@ -741,6 +743,6 @@ virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx)
uep->id = desc_idx;
uep->len = n_descs;
- vring->used->idx++;
+ __atomic_add_fetch(&vring->used->idx, 1, __ATOMIC_RELAXED);
}
}
diff --git a/drivers/net/virtio/virtqueue.c b/drivers/net/virtio/virtqueue.c
index 0b4e3bf3e..b0f61dabc 100644
--- a/drivers/net/virtio/virtqueue.c
+++ b/drivers/net/virtio/virtqueue.c
@@ -92,7 +92,7 @@ virtqueue_rxvq_flush_split(struct virtqueue *vq)
uint16_t used_idx, desc_idx;
uint16_t nb_used, i;
- nb_used = VIRTQUEUE_NUSED(vq);
+ nb_used = virtqueue_nused(vq);
for (i = 0; i < nb_used; i++) {
used_idx = vq->vq_used_cons_idx & (vq->vq_nentries - 1);
diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
index e3d38b5a6..6a70299a9 100644
--- a/drivers/net/virtio/virtqueue.h
+++ b/drivers/net/virtio/virtqueue.h
@@ -464,8 +464,33 @@ virtio_get_queue_type(struct virtio_hw *hw, uint16_t vtpci_queue_idx)
return VTNET_TQ;
}
-#define VIRTQUEUE_NUSED(vq) ((uint16_t)((vq)->vq_split.ring.used->idx - \
- (vq)->vq_used_cons_idx))
+/* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+static inline uint16_t
+virtqueue_nused(const struct virtqueue *vq)
+{
+ uint16_t idx;
+
+ if (vq->hw->weak_barriers) {
+ /**
+ * x86 prefers using rte_smp_rmb over __atomic_load_n as it
+ * reports a slightly better perf, which comes from the saved
+ * branch by the compiler.
+ * The if and else branches are identical with the smp and cio
+ * barriers both defined as compiler barriers on x86.
+ */
+#ifdef RTE_ARCH_X86_64
+ idx = vq->vq_split.ring.used->idx;
+ rte_smp_rmb();
+#else
+ idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx,
+ __ATOMIC_ACQUIRE);
+#endif
+ } else {
+ idx = vq->vq_split.ring.used->idx;
+ rte_cio_rmb();
+ }
+ return idx - vq->vq_used_cons_idx;
+}
void vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx);
void vq_ring_free_chain_packed(struct virtqueue *vq, uint16_t used_idx);
@@ -534,7 +559,8 @@ virtqueue_notify(struct virtqueue *vq)
#ifdef RTE_LIBRTE_VIRTIO_DEBUG_DUMP
#define VIRTQUEUE_DUMP(vq) do { \
uint16_t used_idx, nused; \
- used_idx = (vq)->vq_split.ring.used->idx; \
+ used_idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx, \
+ __ATOMIC_RELAXED); \
nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \
if (vtpci_packed_queue((vq)->hw)) { \
PMD_INIT_LOG(DEBUG, \
@@ -549,9 +575,9 @@ virtqueue_notify(struct virtqueue *vq)
"VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \
" avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \
" avail.flags=0x%x; used.flags=0x%x", \
- (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \
- (vq)->vq_desc_head_idx, (vq)->vq_split.ring.avail->idx, \
- (vq)->vq_used_cons_idx, (vq)->vq_split.ring.used->idx, \
+ (vq)->vq_nentries, (vq)->vq_free_cnt, nused, (vq)->vq_desc_head_idx, \
+ (vq)->vq_split.ring.avail->idx, (vq)->vq_used_cons_idx, \
+ __atomic_load_n(&(vq)->vq_split.ring.used->idx, __ATOMIC_RELAXED), \
(vq)->vq_split.ring.avail->flags, (vq)->vq_split.ring.used->flags); \
} while (0)
#else
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 1fc30c681..494e574c2 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -107,11 +107,10 @@ flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
}
vq->last_used_idx += vq->shadow_used_idx;
- rte_smp_wmb();
-
vhost_log_cache_sync(dev, vq);
- *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
+ __atomic_add_fetch(&vq->used->idx, vq->shadow_used_idx,
+ __ATOMIC_RELEASE);
vq->shadow_used_idx = 0;
vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
sizeof(vq->used->idx));
--
2.17.1
^ permalink raw reply [flat|nested] 32+ messages in thread
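To see the one-way barrier contract of this patch in isolation: the backend
release-increments the used index only after the used element is written, and
the frontend acquire-loads the index before reading elements. Below is a
minimal standalone sketch of that pairing; the struct and function names are
hypothetical, only the two __atomic calls mirror the patch:
#include <stdint.h>
struct used_ring {
	uint16_t idx;        /* written by the producer, read by the consumer */
	uint32_t ring[256];  /* used elements, power-of-two sized */
};
/* Producer: fill the slot first, then release-increment idx so a consumer
 * that observes the new idx also observes the slot contents, as
 * flush_shadow_used_ring_split() now does with __ATOMIC_RELEASE. */
static void
used_publish(struct used_ring *r, uint16_t prod_idx, uint32_t elem)
{
	r->ring[prod_idx & 255] = elem;
	__atomic_add_fetch(&r->idx, 1, __ATOMIC_RELEASE);
}
/* Consumer: acquire-load idx, as virtqueue_nused() now does; every slot
 * below the returned count is then safe to dequeue with no further
 * read barrier. */
static uint16_t
used_nused(const struct used_ring *r, uint16_t cons_idx)
{
	return __atomic_load_n(&r->idx, __ATOMIC_ACQUIRE) - cons_idx;
}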
* Re: [dpdk-dev] [PATCH v5 1/2] virtio: one way barrier for split vring used idx
2020-04-30 9:14 ` [dpdk-dev] [PATCH v5 1/2] virtio: one way barrier for split vring used idx Joyce Kong
@ 2020-04-30 22:58 ` Ferruh Yigit
2020-05-04 10:04 ` Maxime Coquelin
0 siblings, 1 reply; 32+ messages in thread
From: Ferruh Yigit @ 2020-04-30 22:58 UTC (permalink / raw)
To: Joyce Kong, maxime.coquelin, stephen, xiaolong.ye, zhihong.wang,
thomas, jerinj, yinan.wang, honnappa.nagarahalli, gavin.hu
Cc: nd, dev
On 4/30/2020 10:14 AM, Joyce Kong wrote:
> In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, the frontend
> and backend are assumed to be implemented in software, that is, they can
> run on identical CPUs in an SMP configuration.
> Thus a weak form of memory barriers like rte_smp_r/wmb, rather than
> rte_cio_r/wmb, is sufficient for this case (vq->hw->weak_barriers == 1)
> and yields better performance.
> For the above case, this patch helps yield even better performance
> by replacing the two-way barriers with C11 one-way barriers for used
> index in split ring.
>
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
<...>
> @@ -464,8 +464,33 @@ virtio_get_queue_type(struct virtio_hw *hw, uint16_t vtpci_queue_idx)
> return VTNET_TQ;
> }
>
> -#define VIRTQUEUE_NUSED(vq) ((uint16_t)((vq)->vq_split.ring.used->idx - \
> - (vq)->vq_used_cons_idx))
> +/* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
> +static inline uint16_t
> +virtqueue_nused(const struct virtqueue *vq)
> +{
> + uint16_t idx;
> +
> + if (vq->hw->weak_barriers) {
> + /**
> + * x86 prefers using rte_smp_rmb over __atomic_load_n as it
> + * reports a slightly better perf, which comes from the saved
> + * branch by the compiler.
> + * The if and else branches are identical with the smp and cio
> + * barriers both defined as compiler barriers on x86.
> + */
> +#ifdef RTE_ARCH_X86_64
> + idx = vq->vq_split.ring.used->idx;
> + rte_smp_rmb();
> +#else
> + idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx,
> + __ATOMIC_ACQUIRE);
> +#endif
> + } else {
> + idx = vq->vq_split.ring.used->idx;
> + rte_cio_rmb();
> + }
> + return idx - vq->vq_used_cons_idx;
> +}
The AltiVec implementation (virtio_rxtx_simple_altivec.c) also uses the
'VIRTQUEUE_NUSED' macro, so it needs to be updated with this change as well.
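Assuming the AltiVec receive path mirrors the SSE one shown above (a
VIRTQUEUE_NUSED read followed by rte_compiler_barrier), the analogous hunk
would look like the sketch below; the context is illustrative, not taken
from the tree:
--- a/drivers/net/virtio/virtio_rxtx_simple_altivec.c
+++ b/drivers/net/virtio/virtio_rxtx_simple_altivec.c
@@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
- nb_used = VIRTQUEUE_NUSED(vq);
-
- rte_compiler_barrier();
+ nb_used = virtqueue_nused(vq);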
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [dpdk-dev] [PATCH v5 1/2] virtio: one way barrier for split vring used idx
2020-04-30 22:58 ` Ferruh Yigit
@ 2020-05-04 10:04 ` Maxime Coquelin
0 siblings, 0 replies; 32+ messages in thread
From: Maxime Coquelin @ 2020-05-04 10:04 UTC (permalink / raw)
To: Ferruh Yigit, Joyce Kong, stephen, xiaolong.ye, zhihong.wang,
thomas, jerinj, yinan.wang, honnappa.nagarahalli, gavin.hu
Cc: nd, dev
On 5/1/20 12:58 AM, Ferruh Yigit wrote:
> On 4/30/2020 10:14 AM, Joyce Kong wrote:
>> In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, the frontend
>> and backend are assumed to be implemented in software, that is, they can
>> run on identical CPUs in an SMP configuration.
>> Thus a weak form of memory barriers like rte_smp_r/wmb, rather than
>> rte_cio_r/wmb, is sufficient for this case (vq->hw->weak_barriers == 1)
>> and yields better performance.
>> For the above case, this patch helps yield even better performance
>> by replacing the two-way barriers with C11 one-way barriers for used
>> index in split ring.
>>
>> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
>> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
>> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
>
> <...>
>
>> @@ -464,8 +464,33 @@ virtio_get_queue_type(struct virtio_hw *hw, uint16_t vtpci_queue_idx)
>> return VTNET_TQ;
>> }
>>
>> -#define VIRTQUEUE_NUSED(vq) ((uint16_t)((vq)->vq_split.ring.used->idx - \
>> - (vq)->vq_used_cons_idx))
>> +/* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
>> +static inline uint16_t
>> +virtqueue_nused(const struct virtqueue *vq)
>> +{
>> + uint16_t idx;
>> +
>> + if (vq->hw->weak_barriers) {
>> + /**
>> + * x86 prefers using rte_smp_rmb over __atomic_load_n as it
>> + * reports a slightly better perf, which comes from the saved
>> + * branch by the compiler.
>> + * The if and else branches are identical with the smp and cio
>> + * barriers both defined as compiler barriers on x86.
>> + */
>> +#ifdef RTE_ARCH_X86_64
>> + idx = vq->vq_split.ring.used->idx;
>> + rte_smp_rmb();
>> +#else
>> + idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx,
>> + __ATOMIC_ACQUIRE);
>> +#endif
>> + } else {
>> + idx = vq->vq_split.ring.used->idx;
>> + rte_cio_rmb();
>> + }
>> + return idx - vq->vq_used_cons_idx;
>> +}
>
> The AltiVec implementation (virtio_rxtx_simple_altivec.c) also uses the
> 'VIRTQUEUE_NUSED' macro, so it needs to be updated with this change as well.
>
I reproduced and fixed the build issue.
You can fetch my tree with the fixed series.
Thanks,
Maxime
^ permalink raw reply [flat|nested] 32+ messages in thread
* [dpdk-dev] [PATCH v5 2/2] virtio: one way barrier for split vring avail idx
2020-02-12 9:24 [dpdk-dev] [PATCH v1 0/2] one way barrier for split vring idx Joyce Kong
` (12 preceding siblings ...)
2020-04-30 9:14 ` [dpdk-dev] [PATCH v5 1/2] virtio: one way barrier for split vring used idx Joyce Kong
@ 2020-04-30 9:14 ` Joyce Kong
13 siblings, 0 replies; 32+ messages in thread
From: Joyce Kong @ 2020-04-30 9:14 UTC (permalink / raw)
To: maxime.coquelin, stephen, xiaolong.ye, zhihong.wang, thomas,
jerinj, yinan.wang, ferruh.yigit, honnappa.nagarahalli, gavin.hu
Cc: nd, dev
In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, the frontend
and backend are assumed to be implemented in software, that is, they can
run on identical CPUs in an SMP configuration.
Thus a weak form of memory barriers like rte_smp_r/wmb, rather than
rte_cio_r/wmb, is sufficient for this case (vq->hw->weak_barriers == 1)
and yields better performance.
For the above case, this patch helps yield even better performance
by replacing the two-way barriers with C11 one-way barriers for avail
index in split ring.
Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
drivers/net/virtio/virtqueue.h | 20 ++++++++++++++++++--
lib/librte_vhost/virtio_net.c | 14 +++++---------
2 files changed, 23 insertions(+), 11 deletions(-)
diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
index 6a70299a9..4fae22400 100644
--- a/drivers/net/virtio/virtqueue.h
+++ b/drivers/net/virtio/virtqueue.h
@@ -500,8 +500,24 @@ void vq_ring_free_inorder(struct virtqueue *vq, uint16_t desc_idx,
static inline void
vq_update_avail_idx(struct virtqueue *vq)
{
- virtio_wmb(vq->hw->weak_barriers);
- vq->vq_split.ring.avail->idx = vq->vq_avail_idx;
+ if (vq->hw->weak_barriers) {
+ /* x86 prefers using rte_smp_wmb over __atomic_store_n as
+ * it reports a slightly better perf, which comes from the
+ * saved branch by the compiler.
+ * The if and else branches are identical with the smp and
+ * cio barriers both defined as compiler barriers on x86.
+ */
+#ifdef RTE_ARCH_X86_64
+ rte_smp_wmb();
+ vq->vq_split.ring.avail->idx = vq->vq_avail_idx;
+#else
+ __atomic_store_n(&vq->vq_split.ring.avail->idx,
+ vq->vq_avail_idx, __ATOMIC_RELEASE);
+#endif
+ } else {
+ rte_cio_wmb();
+ vq->vq_split.ring.avail->idx = vq->vq_avail_idx;
+ }
}
static inline void
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 494e574c2..db3413dd9 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -977,13 +977,11 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
struct buf_vector buf_vec[BUF_VECTOR_MAX];
uint16_t avail_head;
- avail_head = *((volatile uint16_t *)&vq->avail->idx);
-
/*
* The ordering between avail index and
* desc reads needs to be enforced.
*/
- rte_smp_rmb();
+ avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
@@ -1698,16 +1696,14 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
}
}
- free_entries = *((volatile uint16_t *)&vq->avail->idx) -
- vq->last_avail_idx;
- if (free_entries == 0)
- return 0;
-
/*
* The ordering between avail index and
* desc reads needs to be enforced.
*/
- rte_smp_rmb();
+ free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
+ vq->last_avail_idx;
+ if (free_entries == 0)
+ return 0;
rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
--
2.17.1
^ permalink raw reply [flat|nested] 32+ messages in thread
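The avail-index direction is symmetric: the driver release-stores avail->idx
after the descriptors are written (vq_update_avail_idx), and the vhost
backend acquire-loads it before walking the ring (virtio_dev_rx/tx_split).
A minimal sketch of that pairing, again with hypothetical names wrapped
around the two __atomic calls from the patch:
#include <stdint.h>
/* Driver: descriptors must be globally visible before the index that
 * publishes them, hence the release store, as in vq_update_avail_idx(). */
static void
avail_publish(uint16_t *avail_idx, uint16_t new_idx)
{
	__atomic_store_n(avail_idx, new_idx, __ATOMIC_RELEASE);
}
/* Backend: the acquire load orders the index read before the subsequent
 * descriptor reads, replacing the old volatile read plus rte_smp_rmb()
 * pair in virtio_dev_tx_split(). */
static uint16_t
avail_free_entries(const uint16_t *avail_idx, uint16_t last_avail_idx)
{
	return __atomic_load_n(avail_idx, __ATOMIC_ACQUIRE) - last_avail_idx;
}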