The similar changes for the packed ring were measured to give a 10% performance lift and were merged. [1] In principle, less restrictive barriers generate better performance. However, no real performance lift was measured with the split ring on the limited number of existing HW implementations; the reason may be that they are not in the critical path. To keep consistent with the barrier usage in the packed ring implementation and to be future-proof, we still believe the patches make sense, and we are looking forward to verifying the performance when new HWs are available. [1] https://patchwork.dpdk.org/cover/59283/ Joyce Kong (2): virtio: one way barrier for split vring used idx virtio: one way barrier for split vring avail idx drivers/net/virtio/virtio_ethdev.c | 9 ++-- drivers/net/virtio/virtio_ring.h | 2 +- drivers/net/virtio/virtio_rxtx.c | 46 ++++++++-------- drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +- drivers/net/virtio/virtio_rxtx_simple_sse.c | 5 +- .../net/virtio/virtio_user/virtio_user_dev.c | 8 +-- drivers/net/virtio/virtqueue.c | 2 +- drivers/net/virtio/virtqueue.h | 53 ++++++++++++++++--- lib/librte_vhost/virtio_net.c | 19 +++---- 9 files changed, 90 insertions(+), 59 deletions(-) -- 2.17.1
In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend and backend are assumed to be implemented in software, that is they can run on identical CPUs in an SMP configuration. Thus a weak form of memory barriers like rte_smp_r/wmb, other than rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1) and yields better performance. For the above case, this patch helps yielding even better performance by replacing the two-way barriers with C11 one-way barriers for used index in split ring. Signed-off-by: Joyce Kong <joyce.kong@arm.com> Reviewed-by: Gavin Hu <gavin.hu@arm.com> --- drivers/net/virtio/virtio_ethdev.c | 9 ++-- drivers/net/virtio/virtio_ring.h | 2 +- drivers/net/virtio/virtio_rxtx.c | 46 +++++++++---------- drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +- drivers/net/virtio/virtio_rxtx_simple_sse.c | 5 +- .../net/virtio/virtio_user/virtio_user_dev.c | 8 ++-- drivers/net/virtio/virtqueue.c | 2 +- drivers/net/virtio/virtqueue.h | 34 +++++++++++--- lib/librte_vhost/virtio_net.c | 5 +- 9 files changed, 68 insertions(+), 48 deletions(-) diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c index f9d0ea70d..a4a865bfa 100644 --- a/drivers/net/virtio/virtio_ethdev.c +++ b/drivers/net/virtio/virtio_ethdev.c @@ -285,13 +285,12 @@ virtio_send_command_split(struct virtnet_ctl *cvq, virtqueue_notify(vq); - rte_rmb(); - while (VIRTQUEUE_NUSED(vq) == 0) { - rte_rmb(); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + while (virtqueue_nused(vq) == 0) usleep(100); - } - while (VIRTQUEUE_NUSED(vq)) { + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + while (virtqueue_nused(vq)) { uint32_t idx, desc_idx, used_idx; struct vring_used_elem *uep; diff --git a/drivers/net/virtio/virtio_ring.h b/drivers/net/virtio/virtio_ring.h index 7ba34662e..0f6574f68 100644 --- a/drivers/net/virtio/virtio_ring.h +++ b/drivers/net/virtio/virtio_ring.h @@ -59,7 +59,7 @@ struct vring_used_elem { struct 
vring_used { uint16_t flags; - volatile uint16_t idx; + uint16_t idx; struct vring_used_elem ring[0]; }; diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c index 752faa0f6..9ba26fd95 100644 --- a/drivers/net/virtio/virtio_rxtx.c +++ b/drivers/net/virtio/virtio_rxtx.c @@ -45,7 +45,7 @@ virtio_dev_rx_queue_done(void *rxq, uint16_t offset) struct virtnet_rx *rxvq = rxq; struct virtqueue *vq = rxvq->vq; - return VIRTQUEUE_NUSED(vq) >= offset; + return virtqueue_nused(vq) >= offset; } void @@ -1243,9 +1243,8 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) if (unlikely(hw->started == 0)) return nb_rx; - nb_used = VIRTQUEUE_NUSED(vq); - - virtio_rmb(hw->weak_barriers); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); num = likely(nb_used <= nb_pkts) ? nb_used : nb_pkts; if (unlikely(num > VIRTIO_MBUF_BURST_SZ)) @@ -1458,12 +1457,11 @@ virtio_recv_pkts_inorder(void *rx_queue, if (unlikely(hw->started == 0)) return nb_rx; - nb_used = VIRTQUEUE_NUSED(vq); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); nb_used = RTE_MIN(nb_used, nb_pkts); nb_used = RTE_MIN(nb_used, VIRTIO_MBUF_BURST_SZ); - virtio_rmb(hw->weak_barriers); - PMD_RX_LOG(DEBUG, "used:%d", nb_used); nb_enqueued = 0; @@ -1552,8 +1550,8 @@ virtio_recv_pkts_inorder(void *rx_queue, uint16_t rcv_cnt = RTE_MIN((uint16_t)seg_res, VIRTIO_MBUF_BURST_SZ); - if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) { - virtio_rmb(hw->weak_barriers); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + if (likely(virtqueue_nused(vq) >= rcv_cnt)) { num = virtqueue_dequeue_rx_inorder(vq, rcv_pkts, len, rcv_cnt); uint16_t extra_idx = 0; @@ -1644,9 +1642,8 @@ virtio_recv_mergeable_pkts(void *rx_queue, if (unlikely(hw->started == 0)) return nb_rx; - nb_used = VIRTQUEUE_NUSED(vq); - - virtio_rmb(hw->weak_barriers); + /* virtqueue_nused has a load-acquire or rte_cio_rmb 
inside */ + nb_used = virtqueue_nused(vq); PMD_RX_LOG(DEBUG, "used:%d", nb_used); @@ -1734,8 +1731,8 @@ virtio_recv_mergeable_pkts(void *rx_queue, uint16_t rcv_cnt = RTE_MIN((uint16_t)seg_res, VIRTIO_MBUF_BURST_SZ); - if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) { - virtio_rmb(hw->weak_barriers); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + if (likely(virtqueue_nused(vq) >= rcv_cnt)) { num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len, rcv_cnt); uint16_t extra_idx = 0; @@ -2108,9 +2105,10 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) return nb_pkts; PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts); - nb_used = VIRTQUEUE_NUSED(vq); - virtio_rmb(hw->weak_barriers); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); + if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh)) virtio_xmit_cleanup(vq, nb_used); @@ -2142,8 +2140,11 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) /* Positive value indicates it need free vring descriptors */ if (unlikely(need > 0)) { - nb_used = VIRTQUEUE_NUSED(vq); - virtio_rmb(hw->weak_barriers); + /* virtqueue_nused has a load-acquire or + * rte_cio_rmb inside + */ + nb_used = virtqueue_nused(vq); + need = RTE_MIN(need, (int)nb_used); virtio_xmit_cleanup(vq, need); @@ -2180,11 +2181,10 @@ static __rte_always_inline int virtio_xmit_try_cleanup_inorder(struct virtqueue *vq, uint16_t need) { uint16_t nb_used, nb_clean, nb_descs; - struct virtio_hw *hw = vq->hw; nb_descs = vq->vq_free_cnt + need; - nb_used = VIRTQUEUE_NUSED(vq); - virtio_rmb(hw->weak_barriers); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); nb_clean = RTE_MIN(need, (int)nb_used); virtio_xmit_cleanup_inorder(vq, nb_clean); @@ -2213,9 +2213,9 @@ virtio_xmit_pkts_inorder(void *tx_queue, VIRTQUEUE_DUMP(vq); PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts); - nb_used = VIRTQUEUE_NUSED(vq); + 
/* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); - virtio_rmb(hw->weak_barriers); if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh)) virtio_xmit_cleanup_inorder(vq, nb_used); diff --git a/drivers/net/virtio/virtio_rxtx_simple_neon.c b/drivers/net/virtio/virtio_rxtx_simple_neon.c index 992e71f01..363e2b330 100644 --- a/drivers/net/virtio/virtio_rxtx_simple_neon.c +++ b/drivers/net/virtio/virtio_rxtx_simple_neon.c @@ -83,9 +83,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP)) return 0; - nb_used = VIRTQUEUE_NUSED(vq); - - rte_rmb(); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); if (unlikely(nb_used == 0)) return 0; diff --git a/drivers/net/virtio/virtio_rxtx_simple_sse.c b/drivers/net/virtio/virtio_rxtx_simple_sse.c index f9ec4ae69..45a45e6f4 100644 --- a/drivers/net/virtio/virtio_rxtx_simple_sse.c +++ b/drivers/net/virtio/virtio_rxtx_simple_sse.c @@ -85,9 +85,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP)) return 0; - nb_used = VIRTQUEUE_NUSED(vq); - - rte_compiler_barrier(); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); if (unlikely(nb_used == 0)) return 0; diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c index 1c6b26f8d..7fb135f49 100644 --- a/drivers/net/virtio/virtio_user/virtio_user_dev.c +++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c @@ -730,8 +730,10 @@ virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx) struct vring *vring = &dev->vrings[queue_idx]; /* Consume avail ring, using used ring idx as first one */ - while (vring->used->idx != vring->avail->idx) { - avail_idx = (vring->used->idx) & (vring->num - 1); + while (__atomic_load_n(&vring->used->idx, __ATOMIC_RELAXED) + != 
vring->avail->idx) { + avail_idx = __atomic_load_n(&vring->used->idx, __ATOMIC_RELAXED) + & (vring->num - 1); desc_idx = vring->avail->ring[avail_idx]; n_descs = virtio_user_handle_ctrl_msg(dev, vring, desc_idx); @@ -741,6 +743,6 @@ virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx) uep->id = desc_idx; uep->len = n_descs; - vring->used->idx++; + __atomic_add_fetch(&vring->used->idx, 1, __ATOMIC_RELAXED); } } diff --git a/drivers/net/virtio/virtqueue.c b/drivers/net/virtio/virtqueue.c index 0b4e3bf3e..b0f61dabc 100644 --- a/drivers/net/virtio/virtqueue.c +++ b/drivers/net/virtio/virtqueue.c @@ -92,7 +92,7 @@ virtqueue_rxvq_flush_split(struct virtqueue *vq) uint16_t used_idx, desc_idx; uint16_t nb_used, i; - nb_used = VIRTQUEUE_NUSED(vq); + nb_used = virtqueue_nused(vq); for (i = 0; i < nb_used; i++) { used_idx = vq->vq_used_cons_idx & (vq->vq_nentries - 1); diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h index 58ad7309a..13fdcb13a 100644 --- a/drivers/net/virtio/virtqueue.h +++ b/drivers/net/virtio/virtqueue.h @@ -464,8 +464,29 @@ virtio_get_queue_type(struct virtio_hw *hw, uint16_t vtpci_queue_idx) return VTNET_TQ; } -#define VIRTQUEUE_NUSED(vq) ((uint16_t)((vq)->vq_split.ring.used->idx - \ - (vq)->vq_used_cons_idx)) +static inline uint16_t +virtqueue_nused(struct virtqueue *vq) +{ + uint16_t idx; + if (vq->hw->weak_barriers) { +/* x86 prefers to using rte_smp_rmb over __atomic_load_n as it reports + * a slightly better perf, which comes from the saved branch by the compiler. + * The if and else branches are identical with the smp and cio barriers both + * defined as compiler barriers on x86. 
+ */ +#ifdef RTE_ARCH_X86_64 + idx = vq->vq_split.ring.used->idx; + rte_smp_rmb(); +#else + idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx, + __ATOMIC_ACQUIRE); +#endif + } else { + idx = vq->vq_split.ring.used->idx; + rte_cio_rmb(); + } + return (idx - vq->vq_used_cons_idx); +} void vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx); void vq_ring_free_chain_packed(struct virtqueue *vq, uint16_t used_idx); @@ -534,7 +555,8 @@ virtqueue_notify(struct virtqueue *vq) #ifdef RTE_LIBRTE_VIRTIO_DEBUG_DUMP #define VIRTQUEUE_DUMP(vq) do { \ uint16_t used_idx, nused; \ - used_idx = (vq)->vq_split.ring.used->idx; \ + used_idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx, \ + __ATOMIC_RELAXED); nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \ if (vtpci_packed_queue((vq)->hw)) { \ PMD_INIT_LOG(DEBUG, \ @@ -549,9 +571,9 @@ virtqueue_notify(struct virtqueue *vq) "VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \ " avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \ " avail.flags=0x%x; used.flags=0x%x", \ - (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \ - (vq)->vq_desc_head_idx, (vq)->vq_split.ring.avail->idx, \ - (vq)->vq_used_cons_idx, (vq)->vq_split.ring.used->idx, \ + (vq)->vq_nentries, (vq)->vq_free_cnt, nused, (vq)->vq_desc_head_idx, \ + (vq)->vq_split.ring.avail->idx, (vq)->vq_used_cons_idx, \ + __atomic_load_n(&(vq)->vq_split.ring.used->idx, __ATOMIC_RELAXED), \ (vq)->vq_split.ring.avail->flags, (vq)->vq_split.ring.used->flags); \ } while (0) #else diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 37c47c7dc..7f6e7f2c1 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -77,11 +77,10 @@ flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq) } vq->last_used_idx += vq->shadow_used_idx; - rte_smp_wmb(); - vhost_log_cache_sync(dev, vq); - *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx; + __atomic_add_fetch(&vq->used->idx, vq->shadow_used_idx, + 
__ATOMIC_RELEASE); vq->shadow_used_idx = 0; vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx), sizeof(vq->used->idx)); -- 2.17.1
In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend and backend are assumed to be implemented in software, that is they can run on identical CPUs in an SMP configuration. Thus a weak form of memory barriers like rte_smp_r/wmb, other than rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1) and yields better performance. For the above case, this patch helps yielding even better performance by replacing the two-way barriers with C11 one-way barriers for avail index in split ring. Signed-off-by: Joyce Kong <joyce.kong@arm.com> Reviewed-by: Gavin Hu <gavin.hu@arm.com> --- drivers/net/virtio/virtqueue.h | 19 +++++++++++++++++-- lib/librte_vhost/virtio_net.c | 14 +++++--------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h index 13fdcb13a..bbe36c107 100644 --- a/drivers/net/virtio/virtqueue.h +++ b/drivers/net/virtio/virtqueue.h @@ -496,8 +496,23 @@ void vq_ring_free_inorder(struct virtqueue *vq, uint16_t desc_idx, static inline void vq_update_avail_idx(struct virtqueue *vq) { - virtio_wmb(vq->hw->weak_barriers); - vq->vq_split.ring.avail->idx = vq->vq_avail_idx; + if (vq->hw->weak_barriers) { +/* x86 prefers to using rte_smp_wmb over __atomic_store_n as it reports + * a slightly better perf, which comes from the saved branch by the compiler. + * The if and else branches are identical with the smp and cio barriers both + * defined as compiler barriers on x86. 
+ */ +#ifdef RTE_ARCH_X86_64 + rte_smp_wmb(); + vq->vq_split.ring.avail->idx = vq->vq_avail_idx; +#else + __atomic_store_n(&vq->vq_split.ring.avail->idx, + vq->vq_avail_idx, __ATOMIC_RELEASE); +#endif + } else { + rte_cio_wmb(); + vq->vq_split.ring.avail->idx = vq->vq_avail_idx; + } } static inline void diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 7f6e7f2c1..4c5380bc1 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -991,13 +991,11 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, struct buf_vector buf_vec[BUF_VECTOR_MAX]; uint16_t avail_head; - avail_head = *((volatile uint16_t *)&vq->avail->idx); - /* * The ordering between avail index and * desc reads needs to be enforced. */ - rte_smp_rmb(); + avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE); rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); @@ -1712,16 +1710,14 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, } } - free_entries = *((volatile uint16_t *)&vq->avail->idx) - - vq->last_avail_idx; - if (free_entries == 0) - return 0; - /* * The ordering between avail index and * desc reads needs to be enforced. */ - rte_smp_rmb(); + free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) - + vq->last_avail_idx; + if (free_entries == 0) + return 0; rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); -- 2.17.1
This patch set replaces the two-way barriers with C11 one-way barriers for split vring idx, when the frontend and backend are implemented in software. By doing PVP benchmarking, the test result showed the throughput increased 20% with the 0.001% of acceptable loss rate on Thunderx2 platform.[1] By doing vhost-user + virtio-user case benchmarking, 4% performance gain in the RFC2544 test of 0.001% of acceptable loss rate was measured on Thunderx2 platform.[2] [1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html [2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html PVP test with virtio 1.0 normal path v2: Add test performance statistics. Joyce Kong (2): virtio: one way barrier for split vring used idx virtio: one way barrier for split vring avail idx drivers/net/virtio/virtio_ethdev.c | 9 ++-- drivers/net/virtio/virtio_ring.h | 2 +- drivers/net/virtio/virtio_rxtx.c | 46 ++++++++-------- drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +- drivers/net/virtio/virtio_rxtx_simple_sse.c | 5 +- .../net/virtio/virtio_user/virtio_user_dev.c | 8 +-- drivers/net/virtio/virtqueue.c | 2 +- drivers/net/virtio/virtqueue.h | 53 ++++++++++++++++--- lib/librte_vhost/virtio_net.c | 19 +++---- 9 files changed, 90 insertions(+), 59 deletions(-) -- 2.17.1
In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend and backend are assumed to be implemented in software, that is they can run on identical CPUs in an SMP configuration. Thus a weak form of memory barriers like rte_smp_r/wmb, other than rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1) and yields better performance. For the above case, this patch helps yielding even better performance by replacing the two-way barriers with C11 one-way barriers for used index in split ring. Signed-off-by: Joyce Kong <joyce.kong@arm.com> Reviewed-by: Gavin Hu <gavin.hu@arm.com> --- drivers/net/virtio/virtio_ethdev.c | 9 ++-- drivers/net/virtio/virtio_ring.h | 2 +- drivers/net/virtio/virtio_rxtx.c | 46 +++++++++---------- drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +- drivers/net/virtio/virtio_rxtx_simple_sse.c | 5 +- .../net/virtio/virtio_user/virtio_user_dev.c | 8 ++-- drivers/net/virtio/virtqueue.c | 2 +- drivers/net/virtio/virtqueue.h | 34 +++++++++++--- lib/librte_vhost/virtio_net.c | 5 +- 9 files changed, 68 insertions(+), 48 deletions(-) diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c index f9d0ea70d..a4a865bfa 100644 --- a/drivers/net/virtio/virtio_ethdev.c +++ b/drivers/net/virtio/virtio_ethdev.c @@ -285,13 +285,12 @@ virtio_send_command_split(struct virtnet_ctl *cvq, virtqueue_notify(vq); - rte_rmb(); - while (VIRTQUEUE_NUSED(vq) == 0) { - rte_rmb(); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + while (virtqueue_nused(vq) == 0) usleep(100); - } - while (VIRTQUEUE_NUSED(vq)) { + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + while (virtqueue_nused(vq)) { uint32_t idx, desc_idx, used_idx; struct vring_used_elem *uep; diff --git a/drivers/net/virtio/virtio_ring.h b/drivers/net/virtio/virtio_ring.h index 7ba34662e..0f6574f68 100644 --- a/drivers/net/virtio/virtio_ring.h +++ b/drivers/net/virtio/virtio_ring.h @@ -59,7 +59,7 @@ struct vring_used_elem { struct 
vring_used { uint16_t flags; - volatile uint16_t idx; + uint16_t idx; struct vring_used_elem ring[0]; }; diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c index 752faa0f6..9ba26fd95 100644 --- a/drivers/net/virtio/virtio_rxtx.c +++ b/drivers/net/virtio/virtio_rxtx.c @@ -45,7 +45,7 @@ virtio_dev_rx_queue_done(void *rxq, uint16_t offset) struct virtnet_rx *rxvq = rxq; struct virtqueue *vq = rxvq->vq; - return VIRTQUEUE_NUSED(vq) >= offset; + return virtqueue_nused(vq) >= offset; } void @@ -1243,9 +1243,8 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) if (unlikely(hw->started == 0)) return nb_rx; - nb_used = VIRTQUEUE_NUSED(vq); - - virtio_rmb(hw->weak_barriers); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); num = likely(nb_used <= nb_pkts) ? nb_used : nb_pkts; if (unlikely(num > VIRTIO_MBUF_BURST_SZ)) @@ -1458,12 +1457,11 @@ virtio_recv_pkts_inorder(void *rx_queue, if (unlikely(hw->started == 0)) return nb_rx; - nb_used = VIRTQUEUE_NUSED(vq); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); nb_used = RTE_MIN(nb_used, nb_pkts); nb_used = RTE_MIN(nb_used, VIRTIO_MBUF_BURST_SZ); - virtio_rmb(hw->weak_barriers); - PMD_RX_LOG(DEBUG, "used:%d", nb_used); nb_enqueued = 0; @@ -1552,8 +1550,8 @@ virtio_recv_pkts_inorder(void *rx_queue, uint16_t rcv_cnt = RTE_MIN((uint16_t)seg_res, VIRTIO_MBUF_BURST_SZ); - if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) { - virtio_rmb(hw->weak_barriers); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + if (likely(virtqueue_nused(vq) >= rcv_cnt)) { num = virtqueue_dequeue_rx_inorder(vq, rcv_pkts, len, rcv_cnt); uint16_t extra_idx = 0; @@ -1644,9 +1642,8 @@ virtio_recv_mergeable_pkts(void *rx_queue, if (unlikely(hw->started == 0)) return nb_rx; - nb_used = VIRTQUEUE_NUSED(vq); - - virtio_rmb(hw->weak_barriers); + /* virtqueue_nused has a load-acquire or rte_cio_rmb 
inside */ + nb_used = virtqueue_nused(vq); PMD_RX_LOG(DEBUG, "used:%d", nb_used); @@ -1734,8 +1731,8 @@ virtio_recv_mergeable_pkts(void *rx_queue, uint16_t rcv_cnt = RTE_MIN((uint16_t)seg_res, VIRTIO_MBUF_BURST_SZ); - if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) { - virtio_rmb(hw->weak_barriers); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + if (likely(virtqueue_nused(vq) >= rcv_cnt)) { num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len, rcv_cnt); uint16_t extra_idx = 0; @@ -2108,9 +2105,10 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) return nb_pkts; PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts); - nb_used = VIRTQUEUE_NUSED(vq); - virtio_rmb(hw->weak_barriers); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); + if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh)) virtio_xmit_cleanup(vq, nb_used); @@ -2142,8 +2140,11 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) /* Positive value indicates it need free vring descriptors */ if (unlikely(need > 0)) { - nb_used = VIRTQUEUE_NUSED(vq); - virtio_rmb(hw->weak_barriers); + /* virtqueue_nused has a load-acquire or + * rte_cio_rmb inside + */ + nb_used = virtqueue_nused(vq); + need = RTE_MIN(need, (int)nb_used); virtio_xmit_cleanup(vq, need); @@ -2180,11 +2181,10 @@ static __rte_always_inline int virtio_xmit_try_cleanup_inorder(struct virtqueue *vq, uint16_t need) { uint16_t nb_used, nb_clean, nb_descs; - struct virtio_hw *hw = vq->hw; nb_descs = vq->vq_free_cnt + need; - nb_used = VIRTQUEUE_NUSED(vq); - virtio_rmb(hw->weak_barriers); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); nb_clean = RTE_MIN(need, (int)nb_used); virtio_xmit_cleanup_inorder(vq, nb_clean); @@ -2213,9 +2213,9 @@ virtio_xmit_pkts_inorder(void *tx_queue, VIRTQUEUE_DUMP(vq); PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts); - nb_used = VIRTQUEUE_NUSED(vq); + 
/* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); - virtio_rmb(hw->weak_barriers); if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh)) virtio_xmit_cleanup_inorder(vq, nb_used); diff --git a/drivers/net/virtio/virtio_rxtx_simple_neon.c b/drivers/net/virtio/virtio_rxtx_simple_neon.c index 992e71f01..363e2b330 100644 --- a/drivers/net/virtio/virtio_rxtx_simple_neon.c +++ b/drivers/net/virtio/virtio_rxtx_simple_neon.c @@ -83,9 +83,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP)) return 0; - nb_used = VIRTQUEUE_NUSED(vq); - - rte_rmb(); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); if (unlikely(nb_used == 0)) return 0; diff --git a/drivers/net/virtio/virtio_rxtx_simple_sse.c b/drivers/net/virtio/virtio_rxtx_simple_sse.c index f9ec4ae69..45a45e6f4 100644 --- a/drivers/net/virtio/virtio_rxtx_simple_sse.c +++ b/drivers/net/virtio/virtio_rxtx_simple_sse.c @@ -85,9 +85,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP)) return 0; - nb_used = VIRTQUEUE_NUSED(vq); - - rte_compiler_barrier(); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); if (unlikely(nb_used == 0)) return 0; diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c index 1c6b26f8d..7fb135f49 100644 --- a/drivers/net/virtio/virtio_user/virtio_user_dev.c +++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c @@ -730,8 +730,10 @@ virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx) struct vring *vring = &dev->vrings[queue_idx]; /* Consume avail ring, using used ring idx as first one */ - while (vring->used->idx != vring->avail->idx) { - avail_idx = (vring->used->idx) & (vring->num - 1); + while (__atomic_load_n(&vring->used->idx, __ATOMIC_RELAXED) + != 
vring->avail->idx) { + avail_idx = __atomic_load_n(&vring->used->idx, __ATOMIC_RELAXED) + & (vring->num - 1); desc_idx = vring->avail->ring[avail_idx]; n_descs = virtio_user_handle_ctrl_msg(dev, vring, desc_idx); @@ -741,6 +743,6 @@ virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx) uep->id = desc_idx; uep->len = n_descs; - vring->used->idx++; + __atomic_add_fetch(&vring->used->idx, 1, __ATOMIC_RELAXED); } } diff --git a/drivers/net/virtio/virtqueue.c b/drivers/net/virtio/virtqueue.c index 0b4e3bf3e..b0f61dabc 100644 --- a/drivers/net/virtio/virtqueue.c +++ b/drivers/net/virtio/virtqueue.c @@ -92,7 +92,7 @@ virtqueue_rxvq_flush_split(struct virtqueue *vq) uint16_t used_idx, desc_idx; uint16_t nb_used, i; - nb_used = VIRTQUEUE_NUSED(vq); + nb_used = virtqueue_nused(vq); for (i = 0; i < nb_used; i++) { used_idx = vq->vq_used_cons_idx & (vq->vq_nentries - 1); diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h index 58ad7309a..13fdcb13a 100644 --- a/drivers/net/virtio/virtqueue.h +++ b/drivers/net/virtio/virtqueue.h @@ -464,8 +464,29 @@ virtio_get_queue_type(struct virtio_hw *hw, uint16_t vtpci_queue_idx) return VTNET_TQ; } -#define VIRTQUEUE_NUSED(vq) ((uint16_t)((vq)->vq_split.ring.used->idx - \ - (vq)->vq_used_cons_idx)) +static inline uint16_t +virtqueue_nused(struct virtqueue *vq) +{ + uint16_t idx; + if (vq->hw->weak_barriers) { +/* x86 prefers to using rte_smp_rmb over __atomic_load_n as it reports + * a slightly better perf, which comes from the saved branch by the compiler. + * The if and else branches are identical with the smp and cio barriers both + * defined as compiler barriers on x86. 
+ */ +#ifdef RTE_ARCH_X86_64 + idx = vq->vq_split.ring.used->idx; + rte_smp_rmb(); +#else + idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx, + __ATOMIC_ACQUIRE); +#endif + } else { + idx = vq->vq_split.ring.used->idx; + rte_cio_rmb(); + } + return (idx - vq->vq_used_cons_idx); +} void vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx); void vq_ring_free_chain_packed(struct virtqueue *vq, uint16_t used_idx); @@ -534,7 +555,8 @@ virtqueue_notify(struct virtqueue *vq) #ifdef RTE_LIBRTE_VIRTIO_DEBUG_DUMP #define VIRTQUEUE_DUMP(vq) do { \ uint16_t used_idx, nused; \ - used_idx = (vq)->vq_split.ring.used->idx; \ + used_idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx, \ + __ATOMIC_RELAXED); nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \ if (vtpci_packed_queue((vq)->hw)) { \ PMD_INIT_LOG(DEBUG, \ @@ -549,9 +571,9 @@ virtqueue_notify(struct virtqueue *vq) "VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \ " avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \ " avail.flags=0x%x; used.flags=0x%x", \ - (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \ - (vq)->vq_desc_head_idx, (vq)->vq_split.ring.avail->idx, \ - (vq)->vq_used_cons_idx, (vq)->vq_split.ring.used->idx, \ + (vq)->vq_nentries, (vq)->vq_free_cnt, nused, (vq)->vq_desc_head_idx, \ + (vq)->vq_split.ring.avail->idx, (vq)->vq_used_cons_idx, \ + __atomic_load_n(&(vq)->vq_split.ring.used->idx, __ATOMIC_RELAXED), \ (vq)->vq_split.ring.avail->flags, (vq)->vq_split.ring.used->flags); \ } while (0) #else diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 37c47c7dc..7f6e7f2c1 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -77,11 +77,10 @@ flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq) } vq->last_used_idx += vq->shadow_used_idx; - rte_smp_wmb(); - vhost_log_cache_sync(dev, vq); - *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx; + __atomic_add_fetch(&vq->used->idx, vq->shadow_used_idx, + 
__ATOMIC_RELEASE); vq->shadow_used_idx = 0; vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx), sizeof(vq->used->idx)); -- 2.17.1
In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend and backend are assumed to be implemented in software, that is they can run on identical CPUs in an SMP configuration. Thus a weak form of memory barriers like rte_smp_r/wmb, other than rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1) and yields better performance. For the above case, this patch helps yielding even better performance by replacing the two-way barriers with C11 one-way barriers for avail index in split ring. Signed-off-by: Joyce Kong <joyce.kong@arm.com> Reviewed-by: Gavin Hu <gavin.hu@arm.com> --- drivers/net/virtio/virtqueue.h | 19 +++++++++++++++++-- lib/librte_vhost/virtio_net.c | 14 +++++--------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h index 13fdcb13a..bbe36c107 100644 --- a/drivers/net/virtio/virtqueue.h +++ b/drivers/net/virtio/virtqueue.h @@ -496,8 +496,23 @@ void vq_ring_free_inorder(struct virtqueue *vq, uint16_t desc_idx, static inline void vq_update_avail_idx(struct virtqueue *vq) { - virtio_wmb(vq->hw->weak_barriers); - vq->vq_split.ring.avail->idx = vq->vq_avail_idx; + if (vq->hw->weak_barriers) { +/* x86 prefers to using rte_smp_wmb over __atomic_store_n as it reports + * a slightly better perf, which comes from the saved branch by the compiler. + * The if and else branches are identical with the smp and cio barriers both + * defined as compiler barriers on x86. 
+ */ +#ifdef RTE_ARCH_X86_64 + rte_smp_wmb(); + vq->vq_split.ring.avail->idx = vq->vq_avail_idx; +#else + __atomic_store_n(&vq->vq_split.ring.avail->idx, + vq->vq_avail_idx, __ATOMIC_RELEASE); +#endif + } else { + rte_cio_wmb(); + vq->vq_split.ring.avail->idx = vq->vq_avail_idx; + } } static inline void diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 7f6e7f2c1..4c5380bc1 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -991,13 +991,11 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, struct buf_vector buf_vec[BUF_VECTOR_MAX]; uint16_t avail_head; - avail_head = *((volatile uint16_t *)&vq->avail->idx); - /* * The ordering between avail index and * desc reads needs to be enforced. */ - rte_smp_rmb(); + avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE); rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); @@ -1712,16 +1710,14 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, } } - free_entries = *((volatile uint16_t *)&vq->avail->idx) - - vq->last_avail_idx; - if (free_entries == 0) - return 0; - /* * The ordering between avail index and * desc reads needs to be enforced. */ - rte_smp_rmb(); + free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) - + vq->last_avail_idx; + if (free_entries == 0) + return 0; rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); -- 2.17.1
On Thu, 2 Apr 2020 10:57:52 +0800 Joyce Kong <joyce.kong@arm.com> wrote: > - (vq)->vq_used_cons_idx)) > +static inline uint16_t > +virtqueue_nused(struct virtqueue *vq) vq is unmodified and should be const > +{ > + uint16_t idx; > + if (vq->hw->weak_barriers) { Put blank line between declaration and if statement > +/* x86 prefers to using rte_smp_rmb over __atomic_load_n as it reports > + * a slightly better perf, which comes from the saved branch by the compiler. > + * The if and else branches are identical with the smp and cio barriers both > + * defined as compiler barriers on x86. > + */ Do not put comments on left margin (except in function prolog). > +#ifdef RTE_ARCH_X86_64 > + idx = vq->vq_split.ring.used->idx; > + rte_smp_rmb(); > +#else > + idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx, > + __ATOMIC_ACQUIRE); > +#endif > + } else { > + idx = vq->vq_split.ring.used->idx; > + rte_cio_rmb(); > + } > + return (idx - vq->vq_used_cons_idx); Parenthesis around arguments to return are unnecessary. BSD code likes it, Linux style does not. > +} This kind of arch specific code is hard to maintain. Does it really make that much difference.
Hi Stephen, > -----Original Message----- > From: Stephen Hemminger <stephen@networkplumber.org> > Sent: Thursday, April 2, 2020 11:48 PM > To: Joyce Kong <Joyce.Kong@arm.com> > Cc: maxime.coquelin@redhat.com; tiwei.bie@intel.com; > zhihong.wang@intel.com; thomas@monjalon.net; jerinj@marvell.com; > yinan.wang@intel.com; Honnappa Nagarahalli > <Honnappa.Nagarahalli@arm.com>; Gavin Hu <Gavin.Hu@arm.com>; nd > <nd@arm.com>; dev@dpdk.org > Subject: Re: [dpdk-dev] [PATCH v2 1/2] virtio: one way barrier for split vring > used idx > > On Thu, 2 Apr 2020 10:57:52 +0800 > Joyce Kong <joyce.kong@arm.com> wrote: > > > - (vq)->vq_used_cons_idx)) > > +static inline uint16_t > > +virtqueue_nused(struct virtqueue *vq) > vq is unmodified and should be const > > > +{ > > + uint16_t idx; > > + if (vq->hw->weak_barriers) { > Put blank line between declaration and if statement Will fix in v3. > > > +/* x86 prefers to using rte_smp_rmb over __atomic_load_n as it reports > > + * a slightly better perf, which comes from the saved branch by the > compiler. > > + * The if and else branches are identical with the smp and cio barriers > both > > + * defined as compiler barriers on x86. > > + */ > > Do not put comments on left margin (except in function prolog). Will fix in v3. > > > +#ifdef RTE_ARCH_X86_64 > > + idx = vq->vq_split.ring.used->idx; > > + rte_smp_rmb(); > > +#else > > + idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx, > > + __ATOMIC_ACQUIRE); > > +#endif > > + } else { > > + idx = vq->vq_split.ring.used->idx; > > + rte_cio_rmb(); > > + } > > + return (idx - vq->vq_used_cons_idx); > > Parenthesis around arguments to return are unnecessary. > BSD code likes it, Linux style does not. Will fix in v3. > > > +} > > This kind of arch specific code is hard to maintain. > Does it really make that much difference. Yes, a stronger than required barrier is a performance killer, especially in the fast path. The test was conducted on the ThunderX2+Intel XL710 testbed, the PVP test case. 
/Gavin
This patch set replaces the two-way barriers with C11 one-way barriers for split vring idx, when the frontend and backend are implemented in software. By doing PVP benchmarking, the test result showed the throughput increased 20% with the 0.001% of acceptable loss rate on Thunderx2 platform.[1] By doing vhost-user + virtio-user case benchmarking, 4% performance gain in the RFC2544 test of 0.001% of acceptable loss rate was measured on Thunderx2 platform.[2] [1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html [2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html PVP test with virtio 1.0 normal path v3: Modify some style error. v2: Add test performance statistics. Joyce Kong (2): virtio: one way barrier for split vring used idx virtio: one way barrier for split vring avail idx drivers/net/virtio/virtio_ethdev.c | 9 ++- drivers/net/virtio/virtio_ring.h | 2 +- drivers/net/virtio/virtio_rxtx.c | 46 +++++++-------- drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +- drivers/net/virtio/virtio_rxtx_simple_sse.c | 5 +- .../net/virtio/virtio_user/virtio_user_dev.c | 8 ++- drivers/net/virtio/virtqueue.c | 2 +- drivers/net/virtio/virtqueue.h | 57 ++++++++++++++++--- lib/librte_vhost/virtio_net.c | 19 +++---- 9 files changed, 94 insertions(+), 59 deletions(-) -- 2.17.1
In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend and backend are assumed to be implemented in software, that is they can run on identical CPUs in an SMP configuration. Thus a weak form of memory barriers like rte_smp_r/wmb, other than rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1) and yields better performance. For the above case, this patch helps yielding even better performance by replacing the two-way barriers with C11 one-way barriers for used index in split ring. Signed-off-by: Joyce Kong <joyce.kong@arm.com> Reviewed-by: Gavin Hu <gavin.hu@arm.com> --- drivers/net/virtio/virtio_ethdev.c | 9 ++-- drivers/net/virtio/virtio_ring.h | 2 +- drivers/net/virtio/virtio_rxtx.c | 46 +++++++++---------- drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +- drivers/net/virtio/virtio_rxtx_simple_sse.c | 5 +- .../net/virtio/virtio_user/virtio_user_dev.c | 8 ++-- drivers/net/virtio/virtqueue.c | 2 +- drivers/net/virtio/virtqueue.h | 37 ++++++++++++--- lib/librte_vhost/virtio_net.c | 5 +- 9 files changed, 71 insertions(+), 48 deletions(-) diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c index f9d0ea70d..a4a865bfa 100644 --- a/drivers/net/virtio/virtio_ethdev.c +++ b/drivers/net/virtio/virtio_ethdev.c @@ -285,13 +285,12 @@ virtio_send_command_split(struct virtnet_ctl *cvq, virtqueue_notify(vq); - rte_rmb(); - while (VIRTQUEUE_NUSED(vq) == 0) { - rte_rmb(); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + while (virtqueue_nused(vq) == 0) usleep(100); - } - while (VIRTQUEUE_NUSED(vq)) { + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + while (virtqueue_nused(vq)) { uint32_t idx, desc_idx, used_idx; struct vring_used_elem *uep; diff --git a/drivers/net/virtio/virtio_ring.h b/drivers/net/virtio/virtio_ring.h index 7ba34662e..0f6574f68 100644 --- a/drivers/net/virtio/virtio_ring.h +++ b/drivers/net/virtio/virtio_ring.h @@ -59,7 +59,7 @@ struct vring_used_elem { 
struct vring_used { uint16_t flags; - volatile uint16_t idx; + uint16_t idx; struct vring_used_elem ring[0]; }; diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c index 752faa0f6..9ba26fd95 100644 --- a/drivers/net/virtio/virtio_rxtx.c +++ b/drivers/net/virtio/virtio_rxtx.c @@ -45,7 +45,7 @@ virtio_dev_rx_queue_done(void *rxq, uint16_t offset) struct virtnet_rx *rxvq = rxq; struct virtqueue *vq = rxvq->vq; - return VIRTQUEUE_NUSED(vq) >= offset; + return virtqueue_nused(vq) >= offset; } void @@ -1243,9 +1243,8 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) if (unlikely(hw->started == 0)) return nb_rx; - nb_used = VIRTQUEUE_NUSED(vq); - - virtio_rmb(hw->weak_barriers); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); num = likely(nb_used <= nb_pkts) ? nb_used : nb_pkts; if (unlikely(num > VIRTIO_MBUF_BURST_SZ)) @@ -1458,12 +1457,11 @@ virtio_recv_pkts_inorder(void *rx_queue, if (unlikely(hw->started == 0)) return nb_rx; - nb_used = VIRTQUEUE_NUSED(vq); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); nb_used = RTE_MIN(nb_used, nb_pkts); nb_used = RTE_MIN(nb_used, VIRTIO_MBUF_BURST_SZ); - virtio_rmb(hw->weak_barriers); - PMD_RX_LOG(DEBUG, "used:%d", nb_used); nb_enqueued = 0; @@ -1552,8 +1550,8 @@ virtio_recv_pkts_inorder(void *rx_queue, uint16_t rcv_cnt = RTE_MIN((uint16_t)seg_res, VIRTIO_MBUF_BURST_SZ); - if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) { - virtio_rmb(hw->weak_barriers); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + if (likely(virtqueue_nused(vq) >= rcv_cnt)) { num = virtqueue_dequeue_rx_inorder(vq, rcv_pkts, len, rcv_cnt); uint16_t extra_idx = 0; @@ -1644,9 +1642,8 @@ virtio_recv_mergeable_pkts(void *rx_queue, if (unlikely(hw->started == 0)) return nb_rx; - nb_used = VIRTQUEUE_NUSED(vq); - - virtio_rmb(hw->weak_barriers); + /* virtqueue_nused has a load-acquire or 
rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); PMD_RX_LOG(DEBUG, "used:%d", nb_used); @@ -1734,8 +1731,8 @@ virtio_recv_mergeable_pkts(void *rx_queue, uint16_t rcv_cnt = RTE_MIN((uint16_t)seg_res, VIRTIO_MBUF_BURST_SZ); - if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) { - virtio_rmb(hw->weak_barriers); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + if (likely(virtqueue_nused(vq) >= rcv_cnt)) { num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len, rcv_cnt); uint16_t extra_idx = 0; @@ -2108,9 +2105,10 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) return nb_pkts; PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts); - nb_used = VIRTQUEUE_NUSED(vq); - virtio_rmb(hw->weak_barriers); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); + if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh)) virtio_xmit_cleanup(vq, nb_used); @@ -2142,8 +2140,11 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) /* Positive value indicates it need free vring descriptors */ if (unlikely(need > 0)) { - nb_used = VIRTQUEUE_NUSED(vq); - virtio_rmb(hw->weak_barriers); + /* virtqueue_nused has a load-acquire or + * rte_cio_rmb inside + */ + nb_used = virtqueue_nused(vq); + need = RTE_MIN(need, (int)nb_used); virtio_xmit_cleanup(vq, need); @@ -2180,11 +2181,10 @@ static __rte_always_inline int virtio_xmit_try_cleanup_inorder(struct virtqueue *vq, uint16_t need) { uint16_t nb_used, nb_clean, nb_descs; - struct virtio_hw *hw = vq->hw; nb_descs = vq->vq_free_cnt + need; - nb_used = VIRTQUEUE_NUSED(vq); - virtio_rmb(hw->weak_barriers); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); nb_clean = RTE_MIN(need, (int)nb_used); virtio_xmit_cleanup_inorder(vq, nb_clean); @@ -2213,9 +2213,9 @@ virtio_xmit_pkts_inorder(void *tx_queue, VIRTQUEUE_DUMP(vq); PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts); - nb_used = 
VIRTQUEUE_NUSED(vq); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); - virtio_rmb(hw->weak_barriers); if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh)) virtio_xmit_cleanup_inorder(vq, nb_used); diff --git a/drivers/net/virtio/virtio_rxtx_simple_neon.c b/drivers/net/virtio/virtio_rxtx_simple_neon.c index 992e71f01..363e2b330 100644 --- a/drivers/net/virtio/virtio_rxtx_simple_neon.c +++ b/drivers/net/virtio/virtio_rxtx_simple_neon.c @@ -83,9 +83,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP)) return 0; - nb_used = VIRTQUEUE_NUSED(vq); - - rte_rmb(); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); if (unlikely(nb_used == 0)) return 0; diff --git a/drivers/net/virtio/virtio_rxtx_simple_sse.c b/drivers/net/virtio/virtio_rxtx_simple_sse.c index f9ec4ae69..45a45e6f4 100644 --- a/drivers/net/virtio/virtio_rxtx_simple_sse.c +++ b/drivers/net/virtio/virtio_rxtx_simple_sse.c @@ -85,9 +85,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP)) return 0; - nb_used = VIRTQUEUE_NUSED(vq); - - rte_compiler_barrier(); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); if (unlikely(nb_used == 0)) return 0; diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c index 1c6b26f8d..7fb135f49 100644 --- a/drivers/net/virtio/virtio_user/virtio_user_dev.c +++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c @@ -730,8 +730,10 @@ virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx) struct vring *vring = &dev->vrings[queue_idx]; /* Consume avail ring, using used ring idx as first one */ - while (vring->used->idx != vring->avail->idx) { - avail_idx = (vring->used->idx) & (vring->num - 1); + while (__atomic_load_n(&vring->used->idx, 
__ATOMIC_RELAXED) + != vring->avail->idx) { + avail_idx = __atomic_load_n(&vring->used->idx, __ATOMIC_RELAXED) + & (vring->num - 1); desc_idx = vring->avail->ring[avail_idx]; n_descs = virtio_user_handle_ctrl_msg(dev, vring, desc_idx); @@ -741,6 +743,6 @@ virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx) uep->id = desc_idx; uep->len = n_descs; - vring->used->idx++; + __atomic_add_fetch(&vring->used->idx, 1, __ATOMIC_RELAXED); } } diff --git a/drivers/net/virtio/virtqueue.c b/drivers/net/virtio/virtqueue.c index 0b4e3bf3e..b0f61dabc 100644 --- a/drivers/net/virtio/virtqueue.c +++ b/drivers/net/virtio/virtqueue.c @@ -92,7 +92,7 @@ virtqueue_rxvq_flush_split(struct virtqueue *vq) uint16_t used_idx, desc_idx; uint16_t nb_used, i; - nb_used = VIRTQUEUE_NUSED(vq); + nb_used = virtqueue_nused(vq); for (i = 0; i < nb_used; i++) { used_idx = vq->vq_used_cons_idx & (vq->vq_nentries - 1); diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h index 58ad7309a..54dc63c93 100644 --- a/drivers/net/virtio/virtqueue.h +++ b/drivers/net/virtio/virtqueue.h @@ -464,8 +464,32 @@ virtio_get_queue_type(struct virtio_hw *hw, uint16_t vtpci_queue_idx) return VTNET_TQ; } -#define VIRTQUEUE_NUSED(vq) ((uint16_t)((vq)->vq_split.ring.used->idx - \ - (vq)->vq_used_cons_idx)) +static inline uint16_t +virtqueue_nused(const struct virtqueue *vq) +{ + uint16_t idx; + + if (vq->hw->weak_barriers) { + /** + * x86 prefers to using rte_smp_rmb over __atomic_load_n as it + * reports a slightly better perf, which comes from the saved + * branch by the compiler. + * The if and else branches are identical with the smp and cio + * barriers both defined as compiler barriers on x86. 
+ */ +#ifdef RTE_ARCH_X86_64 + idx = vq->vq_split.ring.used->idx; + rte_smp_rmb(); +#else + idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx, + __ATOMIC_ACQUIRE); +#endif + } else { + idx = vq->vq_split.ring.used->idx; + rte_cio_rmb(); + } + return idx - vq->vq_used_cons_idx; +} void vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx); void vq_ring_free_chain_packed(struct virtqueue *vq, uint16_t used_idx); @@ -534,7 +558,8 @@ virtqueue_notify(struct virtqueue *vq) #ifdef RTE_LIBRTE_VIRTIO_DEBUG_DUMP #define VIRTQUEUE_DUMP(vq) do { \ uint16_t used_idx, nused; \ - used_idx = (vq)->vq_split.ring.used->idx; \ + used_idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx, \ + __ATOMIC_RELAXED); nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \ if (vtpci_packed_queue((vq)->hw)) { \ PMD_INIT_LOG(DEBUG, \ @@ -549,9 +574,9 @@ virtqueue_notify(struct virtqueue *vq) "VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \ " avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \ " avail.flags=0x%x; used.flags=0x%x", \ - (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \ - (vq)->vq_desc_head_idx, (vq)->vq_split.ring.avail->idx, \ - (vq)->vq_used_cons_idx, (vq)->vq_split.ring.used->idx, \ + (vq)->vq_nentries, (vq)->vq_free_cnt, nused, (vq)->vq_desc_head_idx, \ + (vq)->vq_split.ring.avail->idx, (vq)->vq_used_cons_idx, \ + __atomic_load_n(&(vq)->vq_split.ring.used->idx, __ATOMIC_RELAXED), \ (vq)->vq_split.ring.avail->flags, (vq)->vq_split.ring.used->flags); \ } while (0) #else diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 37c47c7dc..7f6e7f2c1 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -77,11 +77,10 @@ flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq) } vq->last_used_idx += vq->shadow_used_idx; - rte_smp_wmb(); - vhost_log_cache_sync(dev, vq); - *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx; + __atomic_add_fetch(&vq->used->idx, vq->shadow_used_idx, + 
__ATOMIC_RELEASE); vq->shadow_used_idx = 0; vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx), sizeof(vq->used->idx)); -- 2.17.1
In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend and backend are assumed to be implemented in software, that is they can run on identical CPUs in an SMP configuration. Thus a weak form of memory barriers like rte_smp_r/wmb, other than rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1) and yields better performance. For the above case, this patch helps yielding even better performance by replacing the two-way barriers with C11 one-way barriers for avail index in split ring. Signed-off-by: Joyce Kong <joyce.kong@arm.com> Reviewed-by: Gavin Hu <gavin.hu@arm.com> --- drivers/net/virtio/virtqueue.h | 20 ++++++++++++++++++-- lib/librte_vhost/virtio_net.c | 14 +++++--------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h index 54dc63c93..a33b078c4 100644 --- a/drivers/net/virtio/virtqueue.h +++ b/drivers/net/virtio/virtqueue.h @@ -499,8 +499,24 @@ void vq_ring_free_inorder(struct virtqueue *vq, uint16_t desc_idx, static inline void vq_update_avail_idx(struct virtqueue *vq) { - virtio_wmb(vq->hw->weak_barriers); - vq->vq_split.ring.avail->idx = vq->vq_avail_idx; + if (vq->hw->weak_barriers) { + /* x86 prefers to using rte_smp_wmb over __atomic_store_n as + * it reports a slightly better perf, which comes from the + * saved branch by the compiler. + * The if and else branches are identical with the smp and + * cio barriers both defined as compiler barriers on x86. 
+ */ +#ifdef RTE_ARCH_X86_64 + rte_smp_wmb(); + vq->vq_split.ring.avail->idx = vq->vq_avail_idx; +#else + __atomic_store_n(&vq->vq_split.ring.avail->idx, + vq->vq_avail_idx, __ATOMIC_RELEASE); +#endif + } else { + rte_cio_wmb(); + vq->vq_split.ring.avail->idx = vq->vq_avail_idx; + } } static inline void diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 7f6e7f2c1..4c5380bc1 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -991,13 +991,11 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, struct buf_vector buf_vec[BUF_VECTOR_MAX]; uint16_t avail_head; - avail_head = *((volatile uint16_t *)&vq->avail->idx); - /* * The ordering between avail index and * desc reads needs to be enforced. */ - rte_smp_rmb(); + avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE); rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); @@ -1712,16 +1710,14 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, } } - free_entries = *((volatile uint16_t *)&vq->avail->idx) - - vq->last_avail_idx; - if (free_entries == 0) - return 0; - /* * The ordering between avail index and * desc reads needs to be enforced. */ - rte_smp_rmb(); + free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) - + vq->last_avail_idx; + if (free_entries == 0) + return 0; rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); -- 2.17.1
<snip> > > Subject: Re: [dpdk-dev] [PATCH v2 1/2] virtio: one way barrier for > > split vring used idx > > > > On Thu, 2 Apr 2020 10:57:52 +0800 > > Joyce Kong <joyce.kong@arm.com> wrote: > > > > > -(vq)->vq_used_cons_idx)) > > > +static inline uint16_t > > > +virtqueue_nused(struct virtqueue *vq) > > vq is unmodified and should be const > > > > > +{ > > > +uint16_t idx; > > > +if (vq->hw->weak_barriers) { > > Put blank line between declaration and if statement > Will fix in v3. > > > > > +/* x86 prefers to using rte_smp_rmb over __atomic_load_n as it > > > +reports > > > + * a slightly better perf, which comes from the saved branch by the > > compiler. > > > + * The if and else branches are identical with the smp and cio > > > + barriers > > both > > > + * defined as compiler barriers on x86. > > > + */ > > > > Do not put comments on left margin (except in function prolog). > Will fix in v3. > > > > > +#ifdef RTE_ARCH_X86_64 > > > +idx = vq->vq_split.ring.used->idx; > > > +rte_smp_rmb(); > > > +#else > > > +idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx, > > > +__ATOMIC_ACQUIRE); > > > +#endif > > > +} else { > > > +idx = vq->vq_split.ring.used->idx; > > > +rte_cio_rmb(); > > > +} > > > +return (idx - vq->vq_used_cons_idx); > > > > Parenthesis around arguments to return are unnecessary. > > BSD code likes it, Linux style does not. > Will fix in v3. > > > > > +} > > > > This kind of arch specific code is hard to maintain. > > Does it really make that much difference. > Yes, a stronger than required barrier is a performance killer, especially in the > fast path. > The test was conducted on the ThunderX2+Intel XL710 testbed, the PVP test > case. I think if the performance deference is not much we should stay with C11 built-ins for x86. How much is the performance difference on x86? > /Gavin
<snip> > > > Subject: Re: [dpdk-dev] [PATCH v2 1/2] virtio: one way barrier for > > > split vring used idx > > > > > > On Thu, 2 Apr 2020 10:57:52 +0800 > > > Joyce Kong <joyce.kong@arm.com> wrote: > > > > > > > -(vq)->vq_used_cons_idx)) > > > > +static inline uint16_t > > > > +virtqueue_nused(struct virtqueue *vq) > > > vq is unmodified and should be const > > > > > > > +{ > > > > +uint16_t idx; > > > > +if (vq->hw->weak_barriers) { > > > Put blank line between declaration and if statement > > Will fix in v3. > > > > > > > +/* x86 prefers to using rte_smp_rmb over __atomic_load_n as it > > > > +reports > > > > + * a slightly better perf, which comes from the saved branch by > > > > +the > > > compiler. > > > > + * The if and else branches are identical with the smp and cio > > > > + barriers > > > both > > > > + * defined as compiler barriers on x86. > > > > + */ > > > > > > Do not put comments on left margin (except in function prolog). > > Will fix in v3. > > > > > > > +#ifdef RTE_ARCH_X86_64 > > > > +idx = vq->vq_split.ring.used->idx; rte_smp_rmb(); #else idx = > > > +__atomic_load_n(&(vq)->vq_split.ring.used->idx, > > > > +__ATOMIC_ACQUIRE); > > > > +#endif > > > > +} else { > > > > +idx = vq->vq_split.ring.used->idx; rte_cio_rmb(); } return (idx - > > > > +vq->vq_used_cons_idx); > > > > > > Parenthesis around arguments to return are unnecessary. > > > BSD code likes it, Linux style does not. > > Will fix in v3. > > > > > > > +} > > > > > > This kind of arch specific code is hard to maintain. > > > Does it really make that much difference. > > Yes, a stronger than required barrier is a performance killer, > > especially in the fast path. > > The test was conducted on the ThunderX2+Intel XL710 testbed, the PVP > > test case. > > /Gavin > I think if the performance deference is not much we should stay with C11 built-ins for x86. How much is the > performance difference on x86? 
According to intel's PVP test for packed ring, ~1.5% perf loss would be caused on x86 if staying with __atomic_load_n. So rte_smp_rmb is kept for x86 in packed ring patches which have been merged and the similar change is done in split ring patches.
Hi, Joyce Thanks for the patch. On 04/06, Joyce Kong wrote: >This patch set replaces the two-way barriers with C11 one-way barriers >for split vring idx, when the frontend and backend are implemented >in software. > >By doing PVP benchmarking, the test result showed the throughput increased >20% with the 0.001% of acceptable loss rate on Thunderx2 platform.[1] > >By doing vhost-user + virtio-user case benchmarking, 4% performance gain >in the RFC2544 test of 0.001% of acceptable loss rate was measured on >Thunderx2 platform.[2] One quick question is do you have the performance number on IA platform since the code touches the common part of the virtio PMD? Thanks, Xiaolong > >[1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html >[2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html > PVP test with virtio 1.0 normal path > >v3: >Modify some style error. > >v2: >Add test performance statistics. > >Joyce Kong (2): > virtio: one way barrier for split vring used idx > virtio: one way barrier for split vring avail idx > > drivers/net/virtio/virtio_ethdev.c | 9 ++- > drivers/net/virtio/virtio_ring.h | 2 +- > drivers/net/virtio/virtio_rxtx.c | 46 +++++++-------- > drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +- > drivers/net/virtio/virtio_rxtx_simple_sse.c | 5 +- > .../net/virtio/virtio_user/virtio_user_dev.c | 8 ++- > drivers/net/virtio/virtqueue.c | 2 +- > drivers/net/virtio/virtqueue.h | 57 ++++++++++++++++--- > lib/librte_vhost/virtio_net.c | 19 +++---- > 9 files changed, 94 insertions(+), 59 deletions(-) > >-- >2.17.1 >
On 04/06, Joyce Kong wrote:
>In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend
>and backend are assumed to be implemented in software, that is they can
>run on identical CPUs in an SMP configuration.
>Thus a weak form of memory barriers like rte_smp_r/wmb, other than
>rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
>and yields better performance.
>For the above case, this patch helps yielding even better performance
>by replacing the two-way barriers with C11 one-way barriers for used
>index in split ring.
>
>Signed-off-by: Joyce Kong <joyce.kong@arm.com>
>Reviewed-by: Gavin Hu <gavin.hu@arm.com>
>---
> drivers/net/virtio/virtio_ethdev.c | 9 ++--
> drivers/net/virtio/virtio_ring.h | 2 +-
> drivers/net/virtio/virtio_rxtx.c | 46 +++++++++----------
> drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +-
> drivers/net/virtio/virtio_rxtx_simple_sse.c | 5 +-
> .../net/virtio/virtio_user/virtio_user_dev.c | 8 ++--
> drivers/net/virtio/virtqueue.c | 2 +-
> drivers/net/virtio/virtqueue.h | 37 ++++++++++++---
> lib/librte_vhost/virtio_net.c | 5 +-
> 9 files changed, 71 insertions(+), 48 deletions(-)
>
>diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
>index f9d0ea70d..a4a865bfa 100644
>--- a/drivers/net/virtio/virtio_ethdev.c
>+++ b/drivers/net/virtio/virtio_ethdev.c
>@@ -285,13 +285,12 @@ virtio_send_command_split(struct virtnet_ctl *cvq,
>
> virtqueue_notify(vq);
>
>- rte_rmb();
>- while (VIRTQUEUE_NUSED(vq) == 0) {
>- rte_rmb();
>+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
>+ while (virtqueue_nused(vq) == 0)
> usleep(100);
>- }
>
>- while (VIRTQUEUE_NUSED(vq)) {
>+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
>+ while (virtqueue_nused(vq)) {
> uint32_t idx, desc_idx, used_idx;
> struct vring_used_elem *uep;
>
>diff --git a/drivers/net/virtio/virtio_ring.h b/drivers/net/virtio/virtio_ring.h
>index 7ba34662e..0f6574f68 100644
>--- a/drivers/net/virtio/virtio_ring.h
>+++ b/drivers/net/virtio/virtio_ring.h
>@@ -59,7 +59,7 @@ struct vring_used_elem {
>
> struct vring_used {
> uint16_t flags;
>- volatile uint16_t idx;
>+ uint16_t idx;
> struct vring_used_elem ring[0];
> };
>
>diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
>index 752faa0f6..9ba26fd95 100644
>--- a/drivers/net/virtio/virtio_rxtx.c
>+++ b/drivers/net/virtio/virtio_rxtx.c
>@@ -45,7 +45,7 @@ virtio_dev_rx_queue_done(void *rxq, uint16_t offset)
> struct virtnet_rx *rxvq = rxq;
> struct virtqueue *vq = rxvq->vq;
>
>- return VIRTQUEUE_NUSED(vq) >= offset;
>+ return virtqueue_nused(vq) >= offset;
> }
>
> void
>@@ -1243,9 +1243,8 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
> if (unlikely(hw->started == 0))
> return nb_rx;
>
>- nb_used = VIRTQUEUE_NUSED(vq);
>-
>- virtio_rmb(hw->weak_barriers);
>+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
Small nit: I don't think we need to add this comment to every occurrence of
virtqueue_nused — what about moving it to the definition of this function?
Thanks,
Xiaolong
> -----Original Message-----
> From: Ye Xiaolong <xiaolong.ye@intel.com>
> Sent: Friday, April 17, 2020 2:52 PM
> To: Joyce Kong <Joyce.Kong@arm.com>
> Cc: maxime.coquelin@redhat.com; stephen@networkplumber.org;
> tiwei.bie@intel.com; zhihong.wang@intel.com; thomas@monjalon.net;
> jerinj@marvell.com; yinan.wang@intel.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Gavin Hu <Gavin.Hu@arm.com>; nd
> <nd@arm.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v3 1/2] virtio: one way barrier for split vring
> used idx
>
> On 04/06, Joyce Kong wrote:
> >In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the
> >frontend and backend are assumed to be implemented in software, that is
> >they can run on identical CPUs in an SMP configuration.
> >Thus a weak form of memory barriers like rte_smp_r/wmb, other than
> >rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
> >and yields better performance.
> >For the above case, this patch helps yielding even better performance
> >by replacing the two-way barriers with C11 one-way barriers for used
> >index in split ring.
> >
> >Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> >Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> >---
> > drivers/net/virtio/virtio_ethdev.c | 9 ++--
> > drivers/net/virtio/virtio_ring.h | 2 +-
> > drivers/net/virtio/virtio_rxtx.c | 46 +++++++++----------
> > drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +-
> > drivers/net/virtio/virtio_rxtx_simple_sse.c | 5 +-
> > .../net/virtio/virtio_user/virtio_user_dev.c | 8 ++--
> > drivers/net/virtio/virtqueue.c | 2 +-
> > drivers/net/virtio/virtqueue.h | 37 ++++++++++++---
> > lib/librte_vhost/virtio_net.c | 5 +-
> > 9 files changed, 71 insertions(+), 48 deletions(-)
> >
> >diff --git a/drivers/net/virtio/virtio_ethdev.c
> >b/drivers/net/virtio/virtio_ethdev.c
> >index f9d0ea70d..a4a865bfa 100644
> >--- a/drivers/net/virtio/virtio_ethdev.c
> >+++ b/drivers/net/virtio/virtio_ethdev.c
> >@@ -285,13 +285,12 @@ virtio_send_command_split(struct virtnet_ctl
> >*cvq,
> >
> > virtqueue_notify(vq);
> >
> >- rte_rmb();
> >- while (VIRTQUEUE_NUSED(vq) == 0) {
> >- rte_rmb();
> >+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
> >+ while (virtqueue_nused(vq) == 0)
> > usleep(100);
> >- }
> >
> >- while (VIRTQUEUE_NUSED(vq)) {
> >+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
> >+ while (virtqueue_nused(vq)) {
> > uint32_t idx, desc_idx, used_idx;
> > struct vring_used_elem *uep;
> >
> >diff --git a/drivers/net/virtio/virtio_ring.h
> >b/drivers/net/virtio/virtio_ring.h
> >index 7ba34662e..0f6574f68 100644
> >--- a/drivers/net/virtio/virtio_ring.h
> >+++ b/drivers/net/virtio/virtio_ring.h
> >@@ -59,7 +59,7 @@ struct vring_used_elem {
> >
> > struct vring_used {
> > uint16_t flags;
> >- volatile uint16_t idx;
> >+ uint16_t idx;
> > struct vring_used_elem ring[0];
> > };
> >
> >diff --git a/drivers/net/virtio/virtio_rxtx.c
> >b/drivers/net/virtio/virtio_rxtx.c
> >index 752faa0f6..9ba26fd95 100644
> >--- a/drivers/net/virtio/virtio_rxtx.c
> >+++ b/drivers/net/virtio/virtio_rxtx.c
> >@@ -45,7 +45,7 @@ virtio_dev_rx_queue_done(void *rxq, uint16_t offset)
> > struct virtnet_rx *rxvq = rxq;
> > struct virtqueue *vq = rxvq->vq;
> >
> >- return VIRTQUEUE_NUSED(vq) >= offset;
> >+ return virtqueue_nused(vq) >= offset;
> > }
> >
> > void
> >@@ -1243,9 +1243,8 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf
> **rx_pkts, uint16_t nb_pkts)
> > if (unlikely(hw->started == 0))
> > return nb_rx;
> >
> >- nb_used = VIRTQUEUE_NUSED(vq);
> >-
> >- virtio_rmb(hw->weak_barriers);
> >+ /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
>
> Small nit, I don't think we need to add this comment to every occurrence of
> virtqueue_nused, what about moving it to the definition of this function?
>
> Thanks,
> Xiaolong
Will modify it as suggested in v4.
Thanks,
Joyce
This patchset replaces the two-way barriers with C11 one-way barriers for split vring idx, when the frontend and backend are implemented in software. By doing PVP benchmarking, the test result of 2c1q showed the throughput increased 20% with the 0.001% of acceptable loss rate on Thunderx2 platform.[1] By doing vhost-user + virtio-user case benchmarking, 4% performance gain was measured on Thunderx2 platform by 2c1q RFC2544 test of 0.001% loss, and 4.7% performance was improved on Dell platform by 1c1q RFC2544 test of zero packet loss.[2] [1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html [2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html PVP test with virtio 1.0 normal path v4: Remove some duplicated code comment. v3: Modify some style error. v2: Add test performance statistics. Joyce Kong (2): virtio: one way barrier for split vring used idx virtio: one way barrier for split vring avail idx drivers/net/virtio/virtio_ethdev.c | 7 +-- drivers/net/virtio/virtio_ring.h | 2 +- drivers/net/virtio/virtio_rxtx.c | 35 ++++------- drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +- drivers/net/virtio/virtio_rxtx_simple_sse.c | 4 +- .../net/virtio/virtio_user/virtio_user_dev.c | 8 ++- drivers/net/virtio/virtqueue.c | 2 +- drivers/net/virtio/virtqueue.h | 58 ++++++++++++++++--- lib/librte_vhost/virtio_net.c | 19 +++--- 9 files changed, 81 insertions(+), 59 deletions(-) -- 2.17.1
In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend and backend are assumed to be implemented in software, that is they can run on identical CPUs in an SMP configuration. Thus a weak form of memory barriers like rte_smp_r/wmb, other than rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1) and yields better performance. For the above case, this patch helps yielding even better performance by replacing the two-way barriers with C11 one-way barriers for used index in split ring. Signed-off-by: Joyce Kong <joyce.kong@arm.com> Reviewed-by: Gavin Hu <gavin.hu@arm.com> --- drivers/net/virtio/virtio_ethdev.c | 7 +--- drivers/net/virtio/virtio_ring.h | 2 +- drivers/net/virtio/virtio_rxtx.c | 35 ++++++----------- drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +-- drivers/net/virtio/virtio_rxtx_simple_sse.c | 4 +- .../net/virtio/virtio_user/virtio_user_dev.c | 8 ++-- drivers/net/virtio/virtqueue.c | 2 +- drivers/net/virtio/virtqueue.h | 38 ++++++++++++++++--- lib/librte_vhost/virtio_net.c | 5 +-- 9 files changed, 58 insertions(+), 48 deletions(-) diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c index f9d0ea70d..ae9f0c315 100644 --- a/drivers/net/virtio/virtio_ethdev.c +++ b/drivers/net/virtio/virtio_ethdev.c @@ -285,13 +285,10 @@ virtio_send_command_split(struct virtnet_ctl *cvq, virtqueue_notify(vq); - rte_rmb(); - while (VIRTQUEUE_NUSED(vq) == 0) { - rte_rmb(); + while (virtqueue_nused(vq) == 0) usleep(100); - } - while (VIRTQUEUE_NUSED(vq)) { + while (virtqueue_nused(vq)) { uint32_t idx, desc_idx, used_idx; struct vring_used_elem *uep; diff --git a/drivers/net/virtio/virtio_ring.h b/drivers/net/virtio/virtio_ring.h index 7ba34662e..0f6574f68 100644 --- a/drivers/net/virtio/virtio_ring.h +++ b/drivers/net/virtio/virtio_ring.h @@ -59,7 +59,7 @@ struct vring_used_elem { struct vring_used { uint16_t flags; - volatile uint16_t idx; + uint16_t idx; struct vring_used_elem ring[0]; }; diff --git 
a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c index 752faa0f6..0beb5deed 100644 --- a/drivers/net/virtio/virtio_rxtx.c +++ b/drivers/net/virtio/virtio_rxtx.c @@ -45,7 +45,7 @@ virtio_dev_rx_queue_done(void *rxq, uint16_t offset) struct virtnet_rx *rxvq = rxq; struct virtqueue *vq = rxvq->vq; - return VIRTQUEUE_NUSED(vq) >= offset; + return virtqueue_nused(vq) >= offset; } void @@ -1243,9 +1243,7 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) if (unlikely(hw->started == 0)) return nb_rx; - nb_used = VIRTQUEUE_NUSED(vq); - - virtio_rmb(hw->weak_barriers); + nb_used = virtqueue_nused(vq); num = likely(nb_used <= nb_pkts) ? nb_used : nb_pkts; if (unlikely(num > VIRTIO_MBUF_BURST_SZ)) @@ -1458,12 +1456,10 @@ virtio_recv_pkts_inorder(void *rx_queue, if (unlikely(hw->started == 0)) return nb_rx; - nb_used = VIRTQUEUE_NUSED(vq); + nb_used = virtqueue_nused(vq); nb_used = RTE_MIN(nb_used, nb_pkts); nb_used = RTE_MIN(nb_used, VIRTIO_MBUF_BURST_SZ); - virtio_rmb(hw->weak_barriers); - PMD_RX_LOG(DEBUG, "used:%d", nb_used); nb_enqueued = 0; @@ -1552,8 +1548,7 @@ virtio_recv_pkts_inorder(void *rx_queue, uint16_t rcv_cnt = RTE_MIN((uint16_t)seg_res, VIRTIO_MBUF_BURST_SZ); - if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) { - virtio_rmb(hw->weak_barriers); + if (likely(virtqueue_nused(vq) >= rcv_cnt)) { num = virtqueue_dequeue_rx_inorder(vq, rcv_pkts, len, rcv_cnt); uint16_t extra_idx = 0; @@ -1644,9 +1639,7 @@ virtio_recv_mergeable_pkts(void *rx_queue, if (unlikely(hw->started == 0)) return nb_rx; - nb_used = VIRTQUEUE_NUSED(vq); - - virtio_rmb(hw->weak_barriers); + nb_used = virtqueue_nused(vq); PMD_RX_LOG(DEBUG, "used:%d", nb_used); @@ -1734,8 +1727,7 @@ virtio_recv_mergeable_pkts(void *rx_queue, uint16_t rcv_cnt = RTE_MIN((uint16_t)seg_res, VIRTIO_MBUF_BURST_SZ); - if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) { - virtio_rmb(hw->weak_barriers); + if (likely(virtqueue_nused(vq) >= rcv_cnt)) { num = 
virtqueue_dequeue_burst_rx(vq, rcv_pkts, len, rcv_cnt); uint16_t extra_idx = 0; @@ -2108,9 +2100,9 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) return nb_pkts; PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts); - nb_used = VIRTQUEUE_NUSED(vq); - virtio_rmb(hw->weak_barriers); + nb_used = virtqueue_nused(vq); + if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh)) virtio_xmit_cleanup(vq, nb_used); @@ -2142,8 +2134,8 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) /* Positive value indicates it need free vring descriptors */ if (unlikely(need > 0)) { - nb_used = VIRTQUEUE_NUSED(vq); - virtio_rmb(hw->weak_barriers); + nb_used = virtqueue_nused(vq); + need = RTE_MIN(need, (int)nb_used); virtio_xmit_cleanup(vq, need); @@ -2180,11 +2172,9 @@ static __rte_always_inline int virtio_xmit_try_cleanup_inorder(struct virtqueue *vq, uint16_t need) { uint16_t nb_used, nb_clean, nb_descs; - struct virtio_hw *hw = vq->hw; nb_descs = vq->vq_free_cnt + need; - nb_used = VIRTQUEUE_NUSED(vq); - virtio_rmb(hw->weak_barriers); + nb_used = virtqueue_nused(vq); nb_clean = RTE_MIN(need, (int)nb_used); virtio_xmit_cleanup_inorder(vq, nb_clean); @@ -2213,9 +2203,8 @@ virtio_xmit_pkts_inorder(void *tx_queue, VIRTQUEUE_DUMP(vq); PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts); - nb_used = VIRTQUEUE_NUSED(vq); + nb_used = virtqueue_nused(vq); - virtio_rmb(hw->weak_barriers); if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh)) virtio_xmit_cleanup_inorder(vq, nb_used); diff --git a/drivers/net/virtio/virtio_rxtx_simple_neon.c b/drivers/net/virtio/virtio_rxtx_simple_neon.c index 992e71f01..363e2b330 100644 --- a/drivers/net/virtio/virtio_rxtx_simple_neon.c +++ b/drivers/net/virtio/virtio_rxtx_simple_neon.c @@ -83,9 +83,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP)) return 0; - nb_used = VIRTQUEUE_NUSED(vq); - - rte_rmb(); + /* virtqueue_nused has 
a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); if (unlikely(nb_used == 0)) return 0; diff --git a/drivers/net/virtio/virtio_rxtx_simple_sse.c b/drivers/net/virtio/virtio_rxtx_simple_sse.c index f9ec4ae69..1056e9c20 100644 --- a/drivers/net/virtio/virtio_rxtx_simple_sse.c +++ b/drivers/net/virtio/virtio_rxtx_simple_sse.c @@ -85,9 +85,7 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP)) return 0; - nb_used = VIRTQUEUE_NUSED(vq); - - rte_compiler_barrier(); + nb_used = virtqueue_nused(vq); if (unlikely(nb_used == 0)) return 0; diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c index 1c6b26f8d..7fb135f49 100644 --- a/drivers/net/virtio/virtio_user/virtio_user_dev.c +++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c @@ -730,8 +730,10 @@ virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx) struct vring *vring = &dev->vrings[queue_idx]; /* Consume avail ring, using used ring idx as first one */ - while (vring->used->idx != vring->avail->idx) { - avail_idx = (vring->used->idx) & (vring->num - 1); + while (__atomic_load_n(&vring->used->idx, __ATOMIC_RELAXED) + != vring->avail->idx) { + avail_idx = __atomic_load_n(&vring->used->idx, __ATOMIC_RELAXED) + & (vring->num - 1); desc_idx = vring->avail->ring[avail_idx]; n_descs = virtio_user_handle_ctrl_msg(dev, vring, desc_idx); @@ -741,6 +743,6 @@ virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx) uep->id = desc_idx; uep->len = n_descs; - vring->used->idx++; + __atomic_add_fetch(&vring->used->idx, 1, __ATOMIC_RELAXED); } } diff --git a/drivers/net/virtio/virtqueue.c b/drivers/net/virtio/virtqueue.c index 0b4e3bf3e..b0f61dabc 100644 --- a/drivers/net/virtio/virtqueue.c +++ b/drivers/net/virtio/virtqueue.c @@ -92,7 +92,7 @@ virtqueue_rxvq_flush_split(struct virtqueue *vq) uint16_t used_idx, desc_idx; uint16_t nb_used, i; - nb_used = 
VIRTQUEUE_NUSED(vq); + nb_used = virtqueue_nused(vq); for (i = 0; i < nb_used; i++) { used_idx = vq->vq_used_cons_idx & (vq->vq_nentries - 1); diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h index e3d38b5a6..f1815d3f4 100644 --- a/drivers/net/virtio/virtqueue.h +++ b/drivers/net/virtio/virtqueue.h @@ -464,8 +464,33 @@ virtio_get_queue_type(struct virtio_hw *hw, uint16_t vtpci_queue_idx) return VTNET_TQ; } -#define VIRTQUEUE_NUSED(vq) ((uint16_t)((vq)->vq_split.ring.used->idx - \ - (vq)->vq_used_cons_idx)) +/* virtqueue_nused has load-acquire or rte_cio_rmb insed */ +static inline uint16_t +virtqueue_nused(const struct virtqueue *vq) +{ + uint16_t idx; + + if (vq->hw->weak_barriers) { + /** + * x86 prefers to using rte_smp_rmb over __atomic_load_n as it + * reports a slightly better perf, which comes from the saved + * branch by the compiler. + * The if and else branches are identical with the smp and cio + * barriers both defined as compiler barriers on x86. 
+ */ +#ifdef RTE_ARCH_X86_64 + idx = vq->vq_split.ring.used->idx; + rte_smp_rmb(); +#else + idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx, + __ATOMIC_ACQUIRE); +#endif + } else { + idx = vq->vq_split.ring.used->idx; + rte_cio_rmb(); + } + return idx - vq->vq_used_cons_idx; +} void vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx); void vq_ring_free_chain_packed(struct virtqueue *vq, uint16_t used_idx); @@ -534,7 +559,8 @@ virtqueue_notify(struct virtqueue *vq) #ifdef RTE_LIBRTE_VIRTIO_DEBUG_DUMP #define VIRTQUEUE_DUMP(vq) do { \ uint16_t used_idx, nused; \ - used_idx = (vq)->vq_split.ring.used->idx; \ + used_idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx, \ + __ATOMIC_RELAXED); nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \ if (vtpci_packed_queue((vq)->hw)) { \ PMD_INIT_LOG(DEBUG, \ @@ -549,9 +575,9 @@ virtqueue_notify(struct virtqueue *vq) "VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \ " avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \ " avail.flags=0x%x; used.flags=0x%x", \ - (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \ - (vq)->vq_desc_head_idx, (vq)->vq_split.ring.avail->idx, \ - (vq)->vq_used_cons_idx, (vq)->vq_split.ring.used->idx, \ + (vq)->vq_nentries, (vq)->vq_free_cnt, nused, (vq)->vq_desc_head_idx, \ + (vq)->vq_split.ring.avail->idx, (vq)->vq_used_cons_idx, \ + __atomic_load_n(&(vq)->vq_split.ring.used->idx, __ATOMIC_RELAXED), \ (vq)->vq_split.ring.avail->flags, (vq)->vq_split.ring.used->flags); \ } while (0) #else diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 37c47c7dc..7f6e7f2c1 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -77,11 +77,10 @@ flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq) } vq->last_used_idx += vq->shadow_used_idx; - rte_smp_wmb(); - vhost_log_cache_sync(dev, vq); - *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx; + __atomic_add_fetch(&vq->used->idx, vq->shadow_used_idx, + 
__ATOMIC_RELEASE); vq->shadow_used_idx = 0; vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx), sizeof(vq->used->idx)); -- 2.17.1
In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend and backend are assumed to be implemented in software, that is they can run on identical CPUs in an SMP configuration. Thus a weak form of memory barriers like rte_smp_r/wmb, other than rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1) and yields better performance. For the above case, this patch helps yielding even better performance by replacing the two-way barriers with C11 one-way barriers for avail index in split ring. Signed-off-by: Joyce Kong <joyce.kong@arm.com> Reviewed-by: Gavin Hu <gavin.hu@arm.com> --- drivers/net/virtio/virtqueue.h | 20 ++++++++++++++++++-- lib/librte_vhost/virtio_net.c | 14 +++++--------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h index f1815d3f4..131ea71e7 100644 --- a/drivers/net/virtio/virtqueue.h +++ b/drivers/net/virtio/virtqueue.h @@ -500,8 +500,24 @@ void vq_ring_free_inorder(struct virtqueue *vq, uint16_t desc_idx, static inline void vq_update_avail_idx(struct virtqueue *vq) { - virtio_wmb(vq->hw->weak_barriers); - vq->vq_split.ring.avail->idx = vq->vq_avail_idx; + if (vq->hw->weak_barriers) { + /* x86 prefers to using rte_smp_wmb over __atomic_store_n as + * it reports a slightly better perf, which comes from the + * saved branch by the compiler. + * The if and else branches are identical with the smp and + * cio barriers both defined as compiler barriers on x86. 
+ */ +#ifdef RTE_ARCH_X86_64 + rte_smp_wmb(); + vq->vq_split.ring.avail->idx = vq->vq_avail_idx; +#else + __atomic_store_n(&vq->vq_split.ring.avail->idx, + vq->vq_avail_idx, __ATOMIC_RELEASE); +#endif + } else { + rte_cio_wmb(); + vq->vq_split.ring.avail->idx = vq->vq_avail_idx; + } } static inline void diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 7f6e7f2c1..4c5380bc1 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -991,13 +991,11 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, struct buf_vector buf_vec[BUF_VECTOR_MAX]; uint16_t avail_head; - avail_head = *((volatile uint16_t *)&vq->avail->idx); - /* * The ordering between avail index and * desc reads needs to be enforced. */ - rte_smp_rmb(); + avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE); rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); @@ -1712,16 +1710,14 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, } } - free_entries = *((volatile uint16_t *)&vq->avail->idx) - - vq->last_avail_idx; - if (free_entries == 0) - return 0; - /* * The ordering between avail index and * desc reads needs to be enforced. */ - rte_smp_rmb(); + free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) - + vq->last_avail_idx; + if (free_entries == 0) + return 0; rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); -- 2.17.1
On 4/24/20 5:39 AM, Joyce Kong wrote:
> In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend
> and backend are assumed to be implemented in software, that is they can
> run on identical CPUs in an SMP configuration.
> Thus a weak form of memory barriers like rte_smp_r/wmb, other than
> rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
> and yields better performance.
> For the above case, this patch helps yielding even better performance
> by replacing the two-way barriers with C11 one-way barriers for used
> index in split ring.
>
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> ---
> drivers/net/virtio/virtio_ethdev.c | 7 +---
> drivers/net/virtio/virtio_ring.h | 2 +-
> drivers/net/virtio/virtio_rxtx.c | 35 ++++++-----------
> drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +--
> drivers/net/virtio/virtio_rxtx_simple_sse.c | 4 +-
> .../net/virtio/virtio_user/virtio_user_dev.c | 8 ++--
> drivers/net/virtio/virtqueue.c | 2 +-
> drivers/net/virtio/virtqueue.h | 38 ++++++++++++++++---
> lib/librte_vhost/virtio_net.c | 5 +--
> 9 files changed, 58 insertions(+), 48 deletions(-)
>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Thanks,
Maxime
On 4/24/20 5:39 AM, Joyce Kong wrote:
> In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend
> and backend are assumed to be implemented in software, that is they can
> run on identical CPUs in an SMP configuration.
> Thus a weak form of memory barriers like rte_smp_r/wmb, other than
> rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
> and yields better performance.
> For the above case, this patch helps yielding even better performance
> by replacing the two-way barriers with C11 one-way barriers for avail
> index in split ring.
>
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> ---
> drivers/net/virtio/virtqueue.h | 20 ++++++++++++++++++--
> lib/librte_vhost/virtio_net.c | 14 +++++---------
> 2 files changed, 23 insertions(+), 11 deletions(-)
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Thanks,
Maxime
On 4/24/20 5:39 AM, Joyce Kong wrote:
> This patchset replaces the two-way barriers with C11 one-way barriers
> for split vring idx, when the frontend and backend are implemented
> in software.
>
> By doing PVP benchmarking, the test result of 2c1q showed the throughput
> increased 20% with the 0.001% of acceptable loss rate on Thunderx2
> platform.[1]
>
> By doing vhost-user + virtio-user case benchmarking, 4% performance gain
> was measured on Thunderx2 platform by 2c1q RFC2544 test of 0.001% loss,
> and 4.7% performance was improved on Dell platform by 1c1q RFC2544 test
> of zero packet loss.[2]
>
> [1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html
> [2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html
> PVP test with virtio 1.0 normal path
>
> v4:
> Remove some duplicated code comment.
>
> v3:
> Modify some style error.
>
> v2:
> Add test performance statistics.
>
> Joyce Kong (2):
> virtio: one way barrier for split vring used idx
> virtio: one way barrier for split vring avail idx
>
> drivers/net/virtio/virtio_ethdev.c | 7 +--
> drivers/net/virtio/virtio_ring.h | 2 +-
> drivers/net/virtio/virtio_rxtx.c | 35 ++++-------
> drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +-
> drivers/net/virtio/virtio_rxtx_simple_sse.c | 4 +-
> .../net/virtio/virtio_user/virtio_user_dev.c | 8 ++-
> drivers/net/virtio/virtqueue.c | 2 +-
> drivers/net/virtio/virtqueue.h | 58 ++++++++++++++++---
> lib/librte_vhost/virtio_net.c | 19 +++---
> 9 files changed, 81 insertions(+), 59 deletions(-)
>
Applied to dpdk-next-virtio/master
Thanks,
Maxime
On 4/24/2020 4:39 AM, Joyce Kong wrote:
> This patchset replaces the two-way barriers with C11 one-way barriers
> for split vring idx, when the frontend and backend are implemented
> in software.
>
> By doing PVP benchmarking, the test result of 2c1q showed the throughput
> increased 20% with the 0.001% of acceptable loss rate on Thunderx2
> platform.[1]
>
> By doing vhost-user + virtio-user case benchmarking, 4% performance gain
> was measured on Thunderx2 platform by 2c1q RFC2544 test of 0.001% loss,
> and 4.7% performance was improved on Dell platform by 1c1q RFC2544 test
> of zero packet loss.[2]
>
> [1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html
> [2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html
> PVP test with virtio 1.0 normal path
>
> v4:
> Remove some duplicated code comment.
>
> v3:
> Modify some style error.
>
> v2:
> Add test performance statistics.
>
> Joyce Kong (2):
> virtio: one way barrier for split vring used idx
> virtio: one way barrier for split vring avail idx
>
Hi Joyce,
When 'CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=y' is enabled, the build fails. Can you
please check it?
Thanks,
ferruh
On 4/29/20 7:45 PM, Ferruh Yigit wrote: > On 4/24/2020 4:39 AM, Joyce Kong wrote: >> This patchset replaces the two-way barriers with C11 one-way barriers >> for split vring idx, when the frontend and backend are implemented >> in software. >> >> By doing PVP benchmarking, the test result of 2c1q showed the throughput >> increased 20% with the 0.001% of acceptable loss rate on Thunderx2 >> platform.[1] >> >> By doing vhost-user + virtio-user case benchmarking, 4% performance gain >> was measured on Thunderx2 platform by 2c1q RFC2544 test of 0.001% loss, >> and 4.7% performance was improved on Dell platform by 1c1q RFC2544 test >> of zero packet loss.[2] >> >> [1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html >> [2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html >> PVP test with virtio 1.0 normal path >> >> v4: >> Remove some duplicated code comment. >> >> v3: >> Modify some style error. >> >> v2: >> Add test performance statistics. >> >> Joyce Kong (2): >> virtio: one way barrier for split vring used idx >> virtio: one way barrier for split vring avail idx >> > > Hi Joyce, > > When 'CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=y' enabled, build fails. Can you > please check it? Hi Joyce, I will need the fix today, just send a v5 for the series. If not possible to do it today, please let me know. Maxime > Thanks, > ferruh >
This patchset replaces the two-way barriers with C11 one-way barriers for split vring idx, when the frontend and backend are implemented in software. By doing PVP benchmarking, the test result of 2c1q showed the throughput increased 20% with the 0.001% of acceptable loss rate on Thunderx2 platform.[1] By doing vhost-user + virtio-user case benchmarking, 4% performance gain was measured on Thunderx2 platform by 2c1q RFC2544 test of 0.001% loss, and 4.7% performance was improved on Dell platform by 1c1q RFC2544 test of zero packet loss.[2] [1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html [2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html PVP test with virtio 1.0 normal path v5: Fix the compling issue when 'CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=y' is enabled. v4: Remove some duplicated code comment. v3: Modify some style error. v2: Add test performance statistics. Joyce Kong (2): virtio: one way barrier for split vring used idx virtio: one way barrier for split vring avail idx drivers/net/virtio/virtio_ethdev.c | 7 +-- drivers/net/virtio/virtio_ring.h | 2 +- drivers/net/virtio/virtio_rxtx.c | 35 ++++------- drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +- drivers/net/virtio/virtio_rxtx_simple_sse.c | 4 +- .../net/virtio/virtio_user/virtio_user_dev.c | 8 ++- drivers/net/virtio/virtqueue.c | 2 +- drivers/net/virtio/virtqueue.h | 58 ++++++++++++++++--- lib/librte_vhost/virtio_net.c | 19 +++--- 9 files changed, 81 insertions(+), 59 deletions(-) -- 2.17.1
In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend and backend are assumed to be implemented in software, that is they can run on identical CPUs in an SMP configuration. Thus a weak form of memory barriers like rte_smp_r/wmb, other than rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1) and yields better performance. For the above case, this patch helps yielding even better performance by replacing the two-way barriers with C11 one-way barriers for used index in split ring. Signed-off-by: Joyce Kong <joyce.kong@arm.com> Reviewed-by: Gavin Hu <gavin.hu@arm.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> --- drivers/net/virtio/virtio_ethdev.c | 7 +--- drivers/net/virtio/virtio_ring.h | 2 +- drivers/net/virtio/virtio_rxtx.c | 35 ++++++----------- drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +-- drivers/net/virtio/virtio_rxtx_simple_sse.c | 4 +- .../net/virtio/virtio_user/virtio_user_dev.c | 8 ++-- drivers/net/virtio/virtqueue.c | 2 +- drivers/net/virtio/virtqueue.h | 38 ++++++++++++++++--- lib/librte_vhost/virtio_net.c | 5 +-- 9 files changed, 58 insertions(+), 48 deletions(-) diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c index 37766cbb6..c36404dc6 100644 --- a/drivers/net/virtio/virtio_ethdev.c +++ b/drivers/net/virtio/virtio_ethdev.c @@ -289,13 +289,10 @@ virtio_send_command_split(struct virtnet_ctl *cvq, virtqueue_notify(vq); - rte_rmb(); - while (VIRTQUEUE_NUSED(vq) == 0) { - rte_rmb(); + while (virtqueue_nused(vq) == 0) usleep(100); - } - while (VIRTQUEUE_NUSED(vq)) { + while (virtqueue_nused(vq)) { uint32_t idx, desc_idx, used_idx; struct vring_used_elem *uep; diff --git a/drivers/net/virtio/virtio_ring.h b/drivers/net/virtio/virtio_ring.h index 7ba34662e..0f6574f68 100644 --- a/drivers/net/virtio/virtio_ring.h +++ b/drivers/net/virtio/virtio_ring.h @@ -59,7 +59,7 @@ struct vring_used_elem { struct vring_used { uint16_t flags; - volatile uint16_t idx; + uint16_t 
idx; struct vring_used_elem ring[0]; }; diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c index 060410577..25703e616 100644 --- a/drivers/net/virtio/virtio_rxtx.c +++ b/drivers/net/virtio/virtio_rxtx.c @@ -45,7 +45,7 @@ virtio_dev_rx_queue_done(void *rxq, uint16_t offset) struct virtnet_rx *rxvq = rxq; struct virtqueue *vq = rxvq->vq; - return VIRTQUEUE_NUSED(vq) >= offset; + return virtqueue_nused(vq) >= offset; } void @@ -1243,9 +1243,7 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) if (unlikely(hw->started == 0)) return nb_rx; - nb_used = VIRTQUEUE_NUSED(vq); - - virtio_rmb(hw->weak_barriers); + nb_used = virtqueue_nused(vq); num = likely(nb_used <= nb_pkts) ? nb_used : nb_pkts; if (unlikely(num > VIRTIO_MBUF_BURST_SZ)) @@ -1458,12 +1456,10 @@ virtio_recv_pkts_inorder(void *rx_queue, if (unlikely(hw->started == 0)) return nb_rx; - nb_used = VIRTQUEUE_NUSED(vq); + nb_used = virtqueue_nused(vq); nb_used = RTE_MIN(nb_used, nb_pkts); nb_used = RTE_MIN(nb_used, VIRTIO_MBUF_BURST_SZ); - virtio_rmb(hw->weak_barriers); - PMD_RX_LOG(DEBUG, "used:%d", nb_used); nb_enqueued = 0; @@ -1552,8 +1548,7 @@ virtio_recv_pkts_inorder(void *rx_queue, uint16_t rcv_cnt = RTE_MIN((uint16_t)seg_res, VIRTIO_MBUF_BURST_SZ); - if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) { - virtio_rmb(hw->weak_barriers); + if (likely(virtqueue_nused(vq) >= rcv_cnt)) { num = virtqueue_dequeue_rx_inorder(vq, rcv_pkts, len, rcv_cnt); uint16_t extra_idx = 0; @@ -1644,9 +1639,7 @@ virtio_recv_mergeable_pkts(void *rx_queue, if (unlikely(hw->started == 0)) return nb_rx; - nb_used = VIRTQUEUE_NUSED(vq); - - virtio_rmb(hw->weak_barriers); + nb_used = virtqueue_nused(vq); PMD_RX_LOG(DEBUG, "used:%d", nb_used); @@ -1734,8 +1727,7 @@ virtio_recv_mergeable_pkts(void *rx_queue, uint16_t rcv_cnt = RTE_MIN((uint16_t)seg_res, VIRTIO_MBUF_BURST_SZ); - if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) { - virtio_rmb(hw->weak_barriers); + if 
(likely(virtqueue_nused(vq) >= rcv_cnt)) { num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len, rcv_cnt); uint16_t extra_idx = 0; @@ -2108,9 +2100,9 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) return nb_pkts; PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts); - nb_used = VIRTQUEUE_NUSED(vq); - virtio_rmb(hw->weak_barriers); + nb_used = virtqueue_nused(vq); + if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh)) virtio_xmit_cleanup(vq, nb_used); @@ -2142,8 +2134,8 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) /* Positive value indicates it need free vring descriptors */ if (unlikely(need > 0)) { - nb_used = VIRTQUEUE_NUSED(vq); - virtio_rmb(hw->weak_barriers); + nb_used = virtqueue_nused(vq); + need = RTE_MIN(need, (int)nb_used); virtio_xmit_cleanup(vq, need); @@ -2180,11 +2172,9 @@ static __rte_always_inline int virtio_xmit_try_cleanup_inorder(struct virtqueue *vq, uint16_t need) { uint16_t nb_used, nb_clean, nb_descs; - struct virtio_hw *hw = vq->hw; nb_descs = vq->vq_free_cnt + need; - nb_used = VIRTQUEUE_NUSED(vq); - virtio_rmb(hw->weak_barriers); + nb_used = virtqueue_nused(vq); nb_clean = RTE_MIN(need, (int)nb_used); virtio_xmit_cleanup_inorder(vq, nb_clean); @@ -2213,9 +2203,8 @@ virtio_xmit_pkts_inorder(void *tx_queue, VIRTQUEUE_DUMP(vq); PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts); - nb_used = VIRTQUEUE_NUSED(vq); + nb_used = virtqueue_nused(vq); - virtio_rmb(hw->weak_barriers); if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh)) virtio_xmit_cleanup_inorder(vq, nb_used); diff --git a/drivers/net/virtio/virtio_rxtx_simple_neon.c b/drivers/net/virtio/virtio_rxtx_simple_neon.c index 992e71f01..363e2b330 100644 --- a/drivers/net/virtio/virtio_rxtx_simple_neon.c +++ b/drivers/net/virtio/virtio_rxtx_simple_neon.c @@ -83,9 +83,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP)) return 0; - nb_used = 
VIRTQUEUE_NUSED(vq); - - rte_rmb(); + /* virtqueue_nused has a load-acquire or rte_cio_rmb inside */ + nb_used = virtqueue_nused(vq); if (unlikely(nb_used == 0)) return 0; diff --git a/drivers/net/virtio/virtio_rxtx_simple_sse.c b/drivers/net/virtio/virtio_rxtx_simple_sse.c index f9ec4ae69..1056e9c20 100644 --- a/drivers/net/virtio/virtio_rxtx_simple_sse.c +++ b/drivers/net/virtio/virtio_rxtx_simple_sse.c @@ -85,9 +85,7 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP)) return 0; - nb_used = VIRTQUEUE_NUSED(vq); - - rte_compiler_barrier(); + nb_used = virtqueue_nused(vq); if (unlikely(nb_used == 0)) return 0; diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c index 1c6b26f8d..7fb135f49 100644 --- a/drivers/net/virtio/virtio_user/virtio_user_dev.c +++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c @@ -730,8 +730,10 @@ virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx) struct vring *vring = &dev->vrings[queue_idx]; /* Consume avail ring, using used ring idx as first one */ - while (vring->used->idx != vring->avail->idx) { - avail_idx = (vring->used->idx) & (vring->num - 1); + while (__atomic_load_n(&vring->used->idx, __ATOMIC_RELAXED) + != vring->avail->idx) { + avail_idx = __atomic_load_n(&vring->used->idx, __ATOMIC_RELAXED) + & (vring->num - 1); desc_idx = vring->avail->ring[avail_idx]; n_descs = virtio_user_handle_ctrl_msg(dev, vring, desc_idx); @@ -741,6 +743,6 @@ virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx) uep->id = desc_idx; uep->len = n_descs; - vring->used->idx++; + __atomic_add_fetch(&vring->used->idx, 1, __ATOMIC_RELAXED); } } diff --git a/drivers/net/virtio/virtqueue.c b/drivers/net/virtio/virtqueue.c index 0b4e3bf3e..b0f61dabc 100644 --- a/drivers/net/virtio/virtqueue.c +++ b/drivers/net/virtio/virtqueue.c @@ -92,7 +92,7 @@ virtqueue_rxvq_flush_split(struct virtqueue *vq) 
uint16_t used_idx, desc_idx; uint16_t nb_used, i; - nb_used = VIRTQUEUE_NUSED(vq); + nb_used = virtqueue_nused(vq); for (i = 0; i < nb_used; i++) { used_idx = vq->vq_used_cons_idx & (vq->vq_nentries - 1); diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h index e3d38b5a6..6a70299a9 100644 --- a/drivers/net/virtio/virtqueue.h +++ b/drivers/net/virtio/virtqueue.h @@ -464,8 +464,33 @@ virtio_get_queue_type(struct virtio_hw *hw, uint16_t vtpci_queue_idx) return VTNET_TQ; } -#define VIRTQUEUE_NUSED(vq) ((uint16_t)((vq)->vq_split.ring.used->idx - \ - (vq)->vq_used_cons_idx)) +/* virtqueue_nused has load-acquire or rte_cio_rmb insed */ +static inline uint16_t +virtqueue_nused(const struct virtqueue *vq) +{ + uint16_t idx; + + if (vq->hw->weak_barriers) { + /** + * x86 prefers to using rte_smp_rmb over __atomic_load_n as it + * reports a slightly better perf, which comes from the saved + * branch by the compiler. + * The if and else branches are identical with the smp and cio + * barriers both defined as compiler barriers on x86. 
+ */ +#ifdef RTE_ARCH_X86_64 + idx = vq->vq_split.ring.used->idx; + rte_smp_rmb(); +#else + idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx, + __ATOMIC_ACQUIRE); +#endif + } else { + idx = vq->vq_split.ring.used->idx; + rte_cio_rmb(); + } + return idx - vq->vq_used_cons_idx; +} void vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx); void vq_ring_free_chain_packed(struct virtqueue *vq, uint16_t used_idx); @@ -534,7 +559,8 @@ virtqueue_notify(struct virtqueue *vq) #ifdef RTE_LIBRTE_VIRTIO_DEBUG_DUMP #define VIRTQUEUE_DUMP(vq) do { \ uint16_t used_idx, nused; \ - used_idx = (vq)->vq_split.ring.used->idx; \ + used_idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx, \ + __ATOMIC_RELAXED); \ nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \ if (vtpci_packed_queue((vq)->hw)) { \ PMD_INIT_LOG(DEBUG, \ @@ -549,9 +575,9 @@ virtqueue_notify(struct virtqueue *vq) "VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \ " avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \ " avail.flags=0x%x; used.flags=0x%x", \ - (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \ - (vq)->vq_desc_head_idx, (vq)->vq_split.ring.avail->idx, \ - (vq)->vq_used_cons_idx, (vq)->vq_split.ring.used->idx, \ + (vq)->vq_nentries, (vq)->vq_free_cnt, nused, (vq)->vq_desc_head_idx, \ + (vq)->vq_split.ring.avail->idx, (vq)->vq_used_cons_idx, \ + __atomic_load_n(&(vq)->vq_split.ring.used->idx, __ATOMIC_RELAXED), \ (vq)->vq_split.ring.avail->flags, (vq)->vq_split.ring.used->flags); \ } while (0) #else diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 1fc30c681..494e574c2 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -107,11 +107,10 @@ flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq) } vq->last_used_idx += vq->shadow_used_idx; - rte_smp_wmb(); - vhost_log_cache_sync(dev, vq); - *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx; + __atomic_add_fetch(&vq->used->idx, vq->shadow_used_idx, + 
__ATOMIC_RELEASE); vq->shadow_used_idx = 0; vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx), sizeof(vq->used->idx)); -- 2.17.1
In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend and backend are assumed to be implemented in software, that is they can run on identical CPUs in an SMP configuration. Thus a weak form of memory barriers like rte_smp_r/wmb, other than rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1) and yields better performance. For the above case, this patch helps yielding even better performance by replacing the two-way barriers with C11 one-way barriers for avail index in split ring. Signed-off-by: Joyce Kong <joyce.kong@arm.com> Reviewed-by: Gavin Hu <gavin.hu@arm.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> --- drivers/net/virtio/virtqueue.h | 20 ++++++++++++++++++-- lib/librte_vhost/virtio_net.c | 14 +++++--------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h index 6a70299a9..4fae22400 100644 --- a/drivers/net/virtio/virtqueue.h +++ b/drivers/net/virtio/virtqueue.h @@ -500,8 +500,24 @@ void vq_ring_free_inorder(struct virtqueue *vq, uint16_t desc_idx, static inline void vq_update_avail_idx(struct virtqueue *vq) { - virtio_wmb(vq->hw->weak_barriers); - vq->vq_split.ring.avail->idx = vq->vq_avail_idx; + if (vq->hw->weak_barriers) { + /* x86 prefers using rte_smp_wmb over __atomic_store_n as + * it reports a slightly better perf, which comes from the + * saved branch by the compiler. + * The if and else branches are identical with the smp and + * cio barriers both defined as compiler barriers on x86. 
+ */ +#ifdef RTE_ARCH_X86_64 + rte_smp_wmb(); + vq->vq_split.ring.avail->idx = vq->vq_avail_idx; +#else + __atomic_store_n(&vq->vq_split.ring.avail->idx, + vq->vq_avail_idx, __ATOMIC_RELEASE); +#endif + } else { + rte_cio_wmb(); + vq->vq_split.ring.avail->idx = vq->vq_avail_idx; + } } static inline void diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 494e574c2..db3413dd9 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -977,13 +977,11 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, struct buf_vector buf_vec[BUF_VECTOR_MAX]; uint16_t avail_head; - avail_head = *((volatile uint16_t *)&vq->avail->idx); - /* * The ordering between avail index and * desc reads needs to be enforced. */ - rte_smp_rmb(); + avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE); rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); @@ -1698,16 +1696,14 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, } } - free_entries = *((volatile uint16_t *)&vq->avail->idx) - - vq->last_avail_idx; - if (free_entries == 0) - return 0; - /* * The ordering between avail index and * desc reads needs to be enforced. */ - rte_smp_rmb(); + free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) - + vq->last_avail_idx; + if (free_entries == 0) + return 0; rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); -- 2.17.1
> -----Original Message----- > From: Maxime Coquelin <maxime.coquelin@redhat.com> > Sent: Thursday, April 30, 2020 5:09 PM > To: Ferruh Yigit <ferruh.yigit@intel.com>; Joyce Kong > <Joyce.Kong@arm.com>; stephen@networkplumber.org; > xiaolong.ye@intel.com; tiwei.bie@intel.com; zhihong.wang@intel.com; > thomas@monjalon.net; jerinj@marvell.com; yinan.wang@intel.com; > Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gavin Hu > <Gavin.Hu@arm.com> > Cc: nd <nd@arm.com>; dev@dpdk.org > Subject: Re: [dpdk-dev] [PATCH v4 0/2] one way barrier for split vring idx > > > > On 4/29/20 7:45 PM, Ferruh Yigit wrote: > > On 4/24/2020 4:39 AM, Joyce Kong wrote: > >> This patchset replaces the two-way barriers with C11 one-way barriers > >> for split vring idx, when the frontend and backend are implemented in > >> software. > >> > >> By doing PVP benchmarking, the test result of 2c1q showed the > >> throughput increased 20% with the 0.001% of acceptable loss rate on > >> Thunderx2 platform.[1] > >> > >> By doing vhost-user + virtio-user case benchmarking, 4% performance > >> gain was measured on Thunderx2 platform by 2c1q RFC2544 test of > >> 0.001% loss, and 4.7% performance was improved on Dell platform by > >> 1c1q RFC2544 test of zero packet loss.[2] > >> > >> [1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html > >> > [2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test > _plan.html > >> PVP test with virtio 1.0 normal path > >> > >> v4: > >> Remove some duplicated code comment. > >> > >> v3: > >> Modify some style error. > >> > >> v2: > >> Add test performance statistics. > >> > >> Joyce Kong (2): > >> virtio: one way barrier for split vring used idx > >> virtio: one way barrier for split vring avail idx > >> > > > > Hi Joyce, > > > > When 'CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=y' enabled, build fails. > Can > > you please check it? > > Hi Joyce, > > I will need the fix today, just send a v5 for the series. 
> If not possible to do it today, please let me know. > > Maxime > Hi Maxime, I have just fixed the issue and sent out the v5, please have a review. Thanks. > > > Thanks, > > ferruh > >
On 4/30/20 11:16 AM, Joyce Kong wrote: >> -----Original Message----- >> From: Maxime Coquelin <maxime.coquelin@redhat.com> >> Sent: Thursday, April 30, 2020 5:09 PM >> To: Ferruh Yigit <ferruh.yigit@intel.com>; Joyce Kong >> <Joyce.Kong@arm.com>; stephen@networkplumber.org; >> xiaolong.ye@intel.com; tiwei.bie@intel.com; zhihong.wang@intel.com; >> thomas@monjalon.net; jerinj@marvell.com; yinan.wang@intel.com; >> Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gavin Hu >> <Gavin.Hu@arm.com> >> Cc: nd <nd@arm.com>; dev@dpdk.org >> Subject: Re: [dpdk-dev] [PATCH v4 0/2] one way barrier for split vring idx >> >> >> >> On 4/29/20 7:45 PM, Ferruh Yigit wrote: >>> On 4/24/2020 4:39 AM, Joyce Kong wrote: >>>> This patchset replaces the two-way barriers with C11 one-way barriers >>>> for split vring idx, when the frontend and backend are implemented in >>>> software. >>>> >>>> By doing PVP benchmarking, the test result of 2c1q showed the >>>> throughput increased 20% with the 0.001% of acceptable loss rate on >>>> Thunderx2 platform.[1] >>>> >>>> By doing vhost-user + virtio-user case benchmarking, 4% performance >>>> gain was measured on Thunderx2 platform by 2c1q RFC2544 test of >>>> 0.001% loss, and 4.7% performance was improved on Dell platform by >>>> 1c1q RFC2544 test of zero packet loss.[2] >>>> >>>> [1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html >>>> >> [2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test >> _plan.html >>>> PVP test with virtio 1.0 normal path >>>> >>>> v4: >>>> Remove some duplicated code comment. >>>> >>>> v3: >>>> Modify some style error. >>>> >>>> v2: >>>> Add test performance statistics. >>>> >>>> Joyce Kong (2): >>>> virtio: one way barrier for split vring used idx >>>> virtio: one way barrier for split vring avail idx >>>> >>> >>> Hi Joyce, >>> >>> When 'CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=y' enabled, build fails. >> Can >>> you please check it? 
>> >> Hi Joyce, >> >> I will need the fix today, just send a v5 for the series. >> If not possible to do it today, please let me know. >> >> Maxime >> > Hi Maxime, > I have just fixed the issue and sent out the v5, please have a review. > Thanks. Thanks! That was fast :) Maxime >> >>> Thanks, >>> ferruh >>> >
On 4/30/20 11:14 AM, Joyce Kong wrote:
> This patchset replaces the two-way barriers with C11 one-way barriers
> for split vring idx, when the frontend and backend are implemented
> in software.
>
> By doing PVP benchmarking, the test result of 2c1q showed the throughput
> increased 20% with the 0.001% of acceptable loss rate on Thunderx2
> platform.[1]
>
> By doing vhost-user + virtio-user case benchmarking, 4% performance gain
> was measured on Thunderx2 platform by 2c1q RFC2544 test of 0.001% loss,
> and 4.7% performance was improved on Dell platform by 1c1q RFC2544 test
> of zero packet loss.[2]
>
> [1]https://doc.dpdk.org/guides/howto/pvp_reference_benchmark.html
> [2]https://doc.dpdk.org/dts/test_plans/pvp_multi_paths_performance_test_plan.html
> PVP test with virtio 1.0 normal path
>
> v5:
> Fix the compiling issue when 'CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=y'
> is enabled.
>
> v4:
> Remove some duplicated code comment.
>
> v3:
> Modify some style error.
>
> v2:
> Add test performance statistics.
>
> Joyce Kong (2):
> virtio: one way barrier for split vring used idx
> virtio: one way barrier for split vring avail idx
>
> drivers/net/virtio/virtio_ethdev.c | 7 +--
> drivers/net/virtio/virtio_ring.h | 2 +-
> drivers/net/virtio/virtio_rxtx.c | 35 ++++-------
> drivers/net/virtio/virtio_rxtx_simple_neon.c | 5 +-
> drivers/net/virtio/virtio_rxtx_simple_sse.c | 4 +-
> .../net/virtio/virtio_user/virtio_user_dev.c | 8 ++-
> drivers/net/virtio/virtqueue.c | 2 +-
> drivers/net/virtio/virtqueue.h | 58 ++++++++++++++++---
> lib/librte_vhost/virtio_net.c | 19 +++---
> 9 files changed, 81 insertions(+), 59 deletions(-)
>
Applied to dpdk-next-virtio/master.
Thanks,
Maxime
On 4/30/2020 10:14 AM, Joyce Kong wrote: > In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend > and backend are assumed to be implemented in software, that is they can > run on identical CPUs in an SMP configuration. > Thus a weak form of memory barriers like rte_smp_r/wmb, other than > rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1) > and yields better performance. > For the above case, this patch helps yielding even better performance > by replacing the two-way barriers with C11 one-way barriers for used > index in split ring. > > Signed-off-by: Joyce Kong <joyce.kong@arm.com> > Reviewed-by: Gavin Hu <gavin.hu@arm.com> > Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> <...> > @@ -464,8 +464,33 @@ virtio_get_queue_type(struct virtio_hw *hw, uint16_t vtpci_queue_idx) > return VTNET_TQ; > } > > -#define VIRTQUEUE_NUSED(vq) ((uint16_t)((vq)->vq_split.ring.used->idx - \ > - (vq)->vq_used_cons_idx)) > +/* virtqueue_nused has load-acquire or rte_cio_rmb insed */ > +static inline uint16_t > +virtqueue_nused(const struct virtqueue *vq) > +{ > + uint16_t idx; > + > + if (vq->hw->weak_barriers) { > + /** > + * x86 prefers to using rte_smp_rmb over __atomic_load_n as it > + * reports a slightly better perf, which comes from the saved > + * branch by the compiler. > + * The if and else branches are identical with the smp and cio > + * barriers both defined as compiler barriers on x86. > + */ > +#ifdef RTE_ARCH_X86_64 > + idx = vq->vq_split.ring.used->idx; > + rte_smp_rmb(); > +#else > + idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx, > + __ATOMIC_ACQUIRE); > +#endif > + } else { > + idx = vq->vq_split.ring.used->idx; > + rte_cio_rmb(); > + } > + return idx - vq->vq_used_cons_idx; > +} AltiVec implementation (virtio_rxtx_simple_altivec.c) is also using 'VIRTQUEUE_NUSED' macro, it also needs to be updated with this change.
On 5/1/20 12:58 AM, Ferruh Yigit wrote:
> On 4/30/2020 10:14 AM, Joyce Kong wrote:
>> In case VIRTIO_F_ORDER_PLATFORM(36) is not negotiated, then the frontend
>> and backend are assumed to be implemented in software, that is they can
>> run on identical CPUs in an SMP configuration.
>> Thus a weak form of memory barriers like rte_smp_r/wmb, other than
>> rte_cio_r/wmb, is sufficient for this case(vq->hw->weak_barriers == 1)
>> and yields better performance.
>> For the above case, this patch helps yielding even better performance
>> by replacing the two-way barriers with C11 one-way barriers for used
>> index in split ring.
>>
>> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
>> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
>> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
>
> <...>
>
>> @@ -464,8 +464,33 @@ virtio_get_queue_type(struct virtio_hw *hw, uint16_t vtpci_queue_idx)
>> return VTNET_TQ;
>> }
>>
>> -#define VIRTQUEUE_NUSED(vq) ((uint16_t)((vq)->vq_split.ring.used->idx - \
>> - (vq)->vq_used_cons_idx))
>> +/* virtqueue_nused has load-acquire or rte_cio_rmb insed */
>> +static inline uint16_t
>> +virtqueue_nused(const struct virtqueue *vq)
>> +{
>> + uint16_t idx;
>> +
>> + if (vq->hw->weak_barriers) {
>> + /**
>> + * x86 prefers to using rte_smp_rmb over __atomic_load_n as it
>> + * reports a slightly better perf, which comes from the saved
>> + * branch by the compiler.
>> + * The if and else branches are identical with the smp and cio
>> + * barriers both defined as compiler barriers on x86.
>> + */
>> +#ifdef RTE_ARCH_X86_64
>> + idx = vq->vq_split.ring.used->idx;
>> + rte_smp_rmb();
>> +#else
>> + idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx,
>> + __ATOMIC_ACQUIRE);
>> +#endif
>> + } else {
>> + idx = vq->vq_split.ring.used->idx;
>> + rte_cio_rmb();
>> + }
>> + return idx - vq->vq_used_cons_idx;
>> +}
>
> AltiVec implementation (virtio_rxtx_simple_altivec.c) is also using
> 'VIRTQUEUE_NUSED' macro, it also needs to be updated with this change.
>
I reproduced and fixed the build issue.
You can fetch my tree with the fixed series.
Thanks,
Maxime