Backport the entire series [0] 'vhost: I-cache pressure optimizations' because it is motivated by a performance degradation from v17.11. Also pick [1] to allow for a clean backport of the GPA logging support. The result is an almost clean backport. Only the lack of [2] generates conflicts. But that is part of a feature [3] that, IMHO, does not meet the requirements to be backported to the stable branch. [0] https://patches.dpdk.org/cover/53807/ [1] b473ec113 vhost: batch used descs chains write-back with packed ring [2] b13ad2dec vhost: provide helpers for virtio ring relay [3] https://patches.dpdk.org/cover/48747/ Adrian Moreno (1): vhost: convert buffer addresses to GPA for logging Maxime Coquelin (6): vhost: batch used descs chains write-back with packed ring vhost: un-inline dirty pages logging functions vhost: do not inline packed and split functions vhost: do not inline unlikely fragmented buffers code vhost: simplify descriptor buffer prefetching eal/x86: force inlining of all memcpy and mov helpers .../common/include/arch/x86/rte_memcpy.h | 18 +- lib/librte_vhost/rte_vhost.h | 2 +- lib/librte_vhost/vhost.c | 204 ++++++++++++++++++ lib/librte_vhost/vhost.h | 164 ++++---------- lib/librte_vhost/virtio_net.c | 202 ++++++++--------- 5 files changed, 343 insertions(+), 247 deletions(-) -- 2.21.0
From: Maxime Coquelin <maxime.coquelin@redhat.com> [ upstream commit b473ec1131ee44ee25e0536a04be65246b93f4f3 ] Instead of writing back descriptors chains in order, let's write the first chain flags last in order to improve batching. Also, move the write barrier in logging cache sync, so that it is done only when logging is enabled. It means there is now one more barrier for split ring when logging is enabled. With Kernel's pktgen benchmark, ~3% performance gain is measured. Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com> Acked-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Tiwei Bie <tiwei.bie@intel.com> --- lib/librte_vhost/vhost.h | 7 ++----- lib/librte_vhost/virtio_net.c | 19 ++++++++++++++++--- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index daccf5c65..47d64bad3 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -456,12 +456,9 @@ vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq) !dev->log_base)) return; - log_base = (unsigned long *)(uintptr_t)dev->log_base; + rte_smp_wmb(); - /* - * It is expected a write memory barrier has been issued - * before this function is called. - */ + log_base = (unsigned long *)(uintptr_t)dev->log_base; for (i = 0; i < vq->log_cache_nb_elem; i++) { struct log_cache_entry *elem = vq->log_cache + i; diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 740db2ed7..6ae617698 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -136,6 +136,8 @@ flush_shadow_used_ring_packed(struct virtio_net *dev, { int i; uint16_t used_idx = vq->last_used_idx; + uint16_t head_idx = vq->last_used_idx; + uint16_t head_flags = 0; /* Split loop in two to save memory barriers */ for (i = 0; i < vq->shadow_used_idx; i++) { @@ -165,12 +167,17 @@ flush_shadow_used_ring_packed(struct virtio_net *dev, flags &= ~VRING_DESC_F_AVAIL; } - vq->desc_packed[vq->last_used_idx].flags = flags; + if (i > 0) { + vq->desc_packed[vq->last_used_idx].flags = flags; - vhost_log_cache_used_vring(dev, vq, + vhost_log_cache_used_vring(dev, vq, vq->last_used_idx * sizeof(struct vring_packed_desc), sizeof(struct vring_packed_desc)); + } else { + head_idx = vq->last_used_idx; + head_flags = flags; + } vq->last_used_idx += vq->shadow_used_packed[i].count; if (vq->last_used_idx >= vq->size) { @@ -179,7 +186,13 @@ flush_shadow_used_ring_packed(struct virtio_net *dev, } } - rte_smp_wmb(); + vq->desc_packed[head_idx].flags = head_flags; + + vhost_log_cache_used_vring(dev, vq, + head_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + vq->shadow_used_idx = 0; vhost_log_cache_sync(dev, vq); } -- 2.21.0
From: Maxime Coquelin <maxime.coquelin@redhat.com> [ upstream commit 094b643d9b425c942aa18d1d229832f8d96940af ] In order to reduce the I-cache pressure, this patch removes the inlining of the dirty pages logging functions, that we can consider as cold path. Indeed, these functions are only called while doing live migration, so not called most of the time. Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com> Reviewed-by: Tiwei Bie <tiwei.bie@intel.com> --- lib/librte_vhost/vhost.c | 131 +++++++++++++++++++++++++++++++++++++++ lib/librte_vhost/vhost.h | 129 ++++---------------------------------- 2 files changed, 143 insertions(+), 117 deletions(-) diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c index 5dc82197b..fb21e1ddd 100644 --- a/lib/librte_vhost/vhost.c +++ b/lib/librte_vhost/vhost.c @@ -69,6 +69,137 @@ __vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } +#define VHOST_LOG_PAGE 4096 + +/* + * Atomically set a bit in memory. + */ +static __rte_always_inline void +vhost_set_bit(unsigned int nr, volatile uint8_t *addr) +{ +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100) + /* + * __sync_ built-ins are deprecated, but __atomic_ ones + * are sub-optimized in older GCC versions. + */ + __sync_fetch_and_or_1(addr, (1U << nr)); +#else + __atomic_fetch_or(addr, (1U << nr), __ATOMIC_RELAXED); +#endif +} + +static __rte_always_inline void +vhost_log_page(uint8_t *log_base, uint64_t page) +{ + vhost_set_bit(page % 8, &log_base[page / 8]); +} + +void +__vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len) +{ + uint64_t page; + + if (unlikely(!dev->log_base || !len)) + return; + + if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8))) + return; + + /* To make sure guest memory updates are committed before logging */ + rte_smp_wmb(); + + page = addr / VHOST_LOG_PAGE; + while (page * VHOST_LOG_PAGE < addr + len) { + vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page); + page += 1; + } +} + +void +__vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + unsigned long *log_base; + int i; + + if (unlikely(!dev->log_base)) + return; + + rte_smp_wmb(); + + log_base = (unsigned long *)(uintptr_t)dev->log_base; + + for (i = 0; i < vq->log_cache_nb_elem; i++) { + struct log_cache_entry *elem = vq->log_cache + i; + +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100) + /* + * '__sync' builtins are deprecated, but '__atomic' ones + * are sub-optimized in older GCC versions. + */ + __sync_fetch_and_or(log_base + elem->offset, elem->val); +#else + __atomic_fetch_or(log_base + elem->offset, elem->val, + __ATOMIC_RELAXED); +#endif + } + + rte_smp_wmb(); + + vq->log_cache_nb_elem = 0; +} + +static __rte_always_inline void +vhost_log_cache_page(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t page) +{ + uint32_t bit_nr = page % (sizeof(unsigned long) << 3); + uint32_t offset = page / (sizeof(unsigned long) << 3); + int i; + + for (i = 0; i < vq->log_cache_nb_elem; i++) { + struct log_cache_entry *elem = vq->log_cache + i; + + if (elem->offset == offset) { + elem->val |= (1UL << bit_nr); + return; + } + } + + if (unlikely(i >= VHOST_LOG_CACHE_NR)) { + /* + * No more room for a new log cache entry, + * so write the dirty log map directly. + */ + rte_smp_wmb(); + vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page); + + return; + } + + vq->log_cache[i].offset = offset; + vq->log_cache[i].val = (1UL << bit_nr); + vq->log_cache_nb_elem++; +} + +void +__vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t addr, uint64_t len) +{ + uint64_t page; + + if (unlikely(!dev->log_base || !len)) + return; + + if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8))) + return; + + page = addr / VHOST_LOG_PAGE; + while (page * VHOST_LOG_PAGE < addr + len) { + vhost_log_cache_page(dev, vq, page); + page += 1; + } +} + void cleanup_vq(struct vhost_virtqueue *vq, int destroy) { diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 47d64bad3..a4f69f98a 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -399,138 +399,33 @@ desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter) wrap_counter != !!(flags & VRING_DESC_F_USED); } -#define VHOST_LOG_PAGE 4096 - -/* - * Atomically set a bit in memory. - */ -static __rte_always_inline void -vhost_set_bit(unsigned int nr, volatile uint8_t *addr) -{ -#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100) - /* - * __sync_ built-ins are deprecated, but __atomic_ ones - * are sub-optimized in older GCC versions. - */ - __sync_fetch_and_or_1(addr, (1U << nr)); -#else - __atomic_fetch_or(addr, (1U << nr), __ATOMIC_RELAXED); -#endif -} - -static __rte_always_inline void -vhost_log_page(uint8_t *log_base, uint64_t page) -{ - vhost_set_bit(page % 8, &log_base[page / 8]); -} +void __vhost_log_cache_write(struct virtio_net *dev, + struct vhost_virtqueue *vq, + uint64_t addr, uint64_t len); +void __vhost_log_cache_sync(struct virtio_net *dev, + struct vhost_virtqueue *vq); +void __vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len); static __rte_always_inline void vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len) { - uint64_t page; - - if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) || - !dev->log_base || !len)) - return; - - if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8))) - return; - - /* To make sure guest memory updates are committed before logging */ - rte_smp_wmb(); - - page = addr / VHOST_LOG_PAGE; - while (page * VHOST_LOG_PAGE < addr + len) { - vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page); - page += 1; - } + if (unlikely(dev->features & (1ULL << VHOST_F_LOG_ALL))) + __vhost_log_write(dev, addr, len); } static __rte_always_inline void vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq) { - unsigned long *log_base; - int i; - - if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) || - !dev->log_base)) - return; - - rte_smp_wmb(); - - log_base = (unsigned long *)(uintptr_t)dev->log_base; - - for (i = 0; i < vq->log_cache_nb_elem; i++) { - struct log_cache_entry *elem = vq->log_cache + i; - -#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100) - /* - * '__sync' builtins are deprecated, but '__atomic' ones - * are sub-optimized in older GCC versions. - */ - __sync_fetch_and_or(log_base + elem->offset, elem->val); -#else - __atomic_fetch_or(log_base + elem->offset, elem->val, - __ATOMIC_RELAXED); -#endif - } - - rte_smp_wmb(); - - vq->log_cache_nb_elem = 0; -} - -static __rte_always_inline void -vhost_log_cache_page(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint64_t page) -{ - uint32_t bit_nr = page % (sizeof(unsigned long) << 3); - uint32_t offset = page / (sizeof(unsigned long) << 3); - int i; - - for (i = 0; i < vq->log_cache_nb_elem; i++) { - struct log_cache_entry *elem = vq->log_cache + i; - - if (elem->offset == offset) { - elem->val |= (1UL << bit_nr); - return; - } - } - - if (unlikely(i >= VHOST_LOG_CACHE_NR)) { - /* - * No more room for a new log cache entry, - * so write the dirty log map directly. - */ - rte_smp_wmb(); - vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page); - - return; - } - - vq->log_cache[i].offset = offset; - vq->log_cache[i].val = (1UL << bit_nr); - vq->log_cache_nb_elem++; + if (unlikely(dev->features & (1ULL << VHOST_F_LOG_ALL))) + __vhost_log_cache_sync(dev, vq); } static __rte_always_inline void vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq, uint64_t addr, uint64_t len) { - uint64_t page; - - if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) || - !dev->log_base || !len)) - return; - - if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8))) - return; - - page = addr / VHOST_LOG_PAGE; - while (page * VHOST_LOG_PAGE < addr + len) { - vhost_log_cache_page(dev, vq, page); - page += 1; - } + if (unlikely(dev->features & (1ULL << VHOST_F_LOG_ALL))) + __vhost_log_cache_write(dev, vq, addr, len); } static __rte_always_inline void -- 2.21.0
From: Maxime Coquelin <maxime.coquelin@redhat.com> [ upstream commit 5a5f6e78b24881941a75d6b9f1214f0ee0e2f9aa] At runtime either packed Tx/Rx functions will always be called, or split Tx/Rx functions will always be called. This patch removes the forced inlining in order to reduce the I-cache pressure. Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com> Reviewed-by: Tiwei Bie <tiwei.bie@intel.com> --- lib/librte_vhost/virtio_net.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 6ae617698..26601c2d4 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -811,7 +811,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, return error; } -static __rte_always_inline uint32_t +static __rte_noinline uint32_t virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) { @@ -869,7 +869,7 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } -static __rte_always_inline uint32_t +static __rte_noinline uint32_t virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) { @@ -1339,7 +1339,7 @@ get_zmbuf(struct vhost_virtqueue *vq) return NULL; } -static __rte_always_inline uint16_t +static __rte_noinline uint16_t virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) { @@ -1461,7 +1461,7 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return i; } -static __rte_always_inline uint16_t +static __rte_noinline uint16_t virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) { -- 2.21.0
From: Maxime Coquelin <maxime.coquelin@redhat.com> [ upstream commit 084fac96ca0331f2646002c302f389415fef69e2 ] Handling of fragmented virtio-net header and indirect descriptors tables was implemented to fix CVE-2018-1059. It should never happen with healthy guests and so is already considered as unlikely code path. This patch moves these bits into non-inline dedicated functions to reduce the I-cache pressure. Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com> Reviewed-by: Tiwei Bie <tiwei.bie@intel.com> Conflicts: lib/librte_vhost/vdpa.c: Due to not having [1]. Skiping changes on non existing function lib/librte_vhost/vhost.h: Due to not having [1]. alloc_copy_ind_table() is still in virtio_net.c lib/librte_vhost/virtio_net.c: Due to not having [1]. Remove alloc_copy_ind_table() [1] b13ad2dec vhost: provide helpers for virtio ring relay --- lib/librte_vhost/vhost.c | 33 +++++++++ lib/librte_vhost/vhost.h | 3 + lib/librte_vhost/virtio_net.c | 133 ++++++++++++++-------------------- 3 files changed, 92 insertions(+), 77 deletions(-) diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c index fb21e1ddd..1b62fde68 100644 --- a/lib/librte_vhost/vhost.c +++ b/lib/librte_vhost/vhost.c @@ -200,6 +200,39 @@ __vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq, } } +void * +vhost_alloc_copy_ind_table(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t desc_addr, uint64_t desc_len) +{ + void *idesc; + uint64_t src, dst; + uint64_t len, remain = desc_len; + + idesc = rte_malloc(__func__, desc_len, 0); + if (unlikely(!idesc)) + return NULL; + + dst = (uint64_t)(uintptr_t)idesc; + + while (remain) { + len = remain; + src = vhost_iova_to_vva(dev, vq, desc_addr, &len, + VHOST_ACCESS_RO); + if (unlikely(!src || !len)) { + rte_free(idesc); + return NULL; + } + + rte_memcpy((void *)(uintptr_t)dst, (void *)(uintptr_t)src, len); + + remain -= len; + dst += len; + desc_addr += len; + } + + return idesc; +} + void cleanup_vq(struct vhost_virtqueue *vq, int destroy) { diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index a4f69f98a..5a857b155 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -558,6 +558,9 @@ void vhost_backend_cleanup(struct virtio_net *dev); uint64_t __vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, uint64_t iova, uint64_t *len, uint8_t perm); +void *vhost_alloc_copy_ind_table(struct virtio_net *dev, + struct vhost_virtqueue *vq, + uint64_t desc_addr, uint64_t desc_len); int vring_translate(struct virtio_net *dev, struct vhost_virtqueue *vq); void vring_invalidate(struct virtio_net *dev, struct vhost_virtqueue *vq); diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 26601c2d4..0a0b44c8b 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -37,39 +37,6 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring) return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring; } -static __rte_always_inline void * -alloc_copy_ind_table(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint64_t desc_addr, uint64_t desc_len) -{ - void *idesc; - uint64_t src, dst; - uint64_t len, remain = desc_len; - - idesc = rte_malloc(__func__, desc_len, 0); - if (unlikely(!idesc)) - return 0; - - dst = (uint64_t)(uintptr_t)idesc; - - while (remain) { - len = remain; - src = vhost_iova_to_vva(dev, vq, desc_addr, &len, - VHOST_ACCESS_RO); - if (unlikely(!src || !len)) { - rte_free(idesc); - return 0; - } - - rte_memcpy((void *)(uintptr_t)dst, (void *)(uintptr_t)src, len); - - remain -= len; - dst += len; - desc_addr += len; - } - - return idesc; -} - static __rte_always_inline void free_ind_table(void *idesc) { @@ -377,7 +344,7 @@ fill_vec_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq, * The indirect desc table is not contiguous * in process VA space, we have to copy it. */ - idesc = alloc_copy_ind_table(dev, vq, + idesc = vhost_alloc_copy_ind_table(dev, vq, vq->desc[idx].addr, vq->desc[idx].len); if (unlikely(!idesc)) return -1; @@ -494,7 +461,8 @@ fill_vec_buf_packed_indirect(struct virtio_net *dev, * The indirect desc table is not contiguous * in process VA space, we have to copy it. */ - idescs = alloc_copy_ind_table(dev, vq, desc->addr, desc->len); + idescs = vhost_alloc_copy_ind_table(dev, + vq, desc->addr, desc->len); if (unlikely(!idescs)) return -1; @@ -650,6 +618,36 @@ reserve_avail_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } +static __rte_noinline void +copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct buf_vector *buf_vec, + struct virtio_net_hdr_mrg_rxbuf *hdr) +{ + uint64_t len; + uint64_t remain = dev->vhost_hlen; + uint64_t src = (uint64_t)(uintptr_t)hdr, dst; + uint64_t iova = buf_vec->buf_iova; + + while (remain) { + len = RTE_MIN(remain, + buf_vec->buf_len); + dst = buf_vec->buf_addr; + rte_memcpy((void *)(uintptr_t)dst, + (void *)(uintptr_t)src, + len); + + PRINT_PACKET(dev, (uintptr_t)dst, + (uint32_t)len, 0); + vhost_log_cache_write(dev, vq, + iova, len); + + remain -= len; + iova += len; + src += len; + buf_vec++; + } +} + static __rte_always_inline int copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf *m, struct buf_vector *buf_vec, @@ -743,30 +741,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, num_buffers); if (unlikely(hdr == &tmp_hdr)) { - uint64_t len; - uint64_t remain = dev->vhost_hlen; - uint64_t src = (uint64_t)(uintptr_t)hdr, dst; - uint64_t iova = buf_vec[0].buf_iova; - uint16_t hdr_vec_idx = 0; - - while (remain) { - len = RTE_MIN(remain, - buf_vec[hdr_vec_idx].buf_len); - dst = buf_vec[hdr_vec_idx].buf_addr; - rte_memcpy((void *)(uintptr_t)dst, - (void *)(uintptr_t)src, - len); - - PRINT_PACKET(dev, (uintptr_t)dst, - (uint32_t)len, 0); - vhost_log_cache_write(dev, vq, - iova, len); - - remain -= len; - iova += len; - src += len; - hdr_vec_idx++; - } + copy_vnet_hdr_to_desc(dev, vq, buf_vec, hdr); } else { PRINT_PACKET(dev, (uintptr_t)hdr_addr, dev->vhost_hlen, 0); @@ -1102,6 +1077,27 @@ vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m) } } +static __rte_noinline void +copy_vnet_hdr_from_desc(struct virtio_net_hdr *hdr, + struct buf_vector *buf_vec) +{ + uint64_t len; + uint64_t remain = sizeof(struct virtio_net_hdr); + uint64_t src; + uint64_t dst = (uint64_t)(uintptr_t)hdr; + + while (remain) { + len = RTE_MIN(remain, buf_vec->buf_len); + src = buf_vec->buf_addr; + rte_memcpy((void *)(uintptr_t)dst, + (void *)(uintptr_t)src, len); + + remain -= len; + dst += len; + buf_vec++; + } +} + static __rte_always_inline int copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, struct buf_vector *buf_vec, uint16_t nr_vec, @@ -1133,28 +1129,11 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, if (virtio_net_with_host_offload(dev)) { if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) { - uint64_t len; - uint64_t remain = sizeof(struct virtio_net_hdr); - uint64_t src; - uint64_t dst = (uint64_t)(uintptr_t)&tmp_hdr; - uint16_t hdr_vec_idx = 0; - /* * No luck, the virtio-net header doesn't fit * in a contiguous virtual area. */ - while (remain) { - len = RTE_MIN(remain, - buf_vec[hdr_vec_idx].buf_len); - src = buf_vec[hdr_vec_idx].buf_addr; - rte_memcpy((void *)(uintptr_t)dst, - (void *)(uintptr_t)src, len); - - remain -= len; - dst += len; - hdr_vec_idx++; - } - + copy_vnet_hdr_from_desc(&tmp_hdr, buf_vec); hdr = &tmp_hdr; } else { hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr); -- 2.21.0
From: Maxime Coquelin <maxime.coquelin@redhat.com> [ upstream commit d1134c09e37608c949b75ebfc7ea01000b134c5c ] Now that we have a single function to map the descriptors buffers, let's prefetch them there as it is the earliest place we can do it. Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com> Reviewed-by: Tiwei Bie <tiwei.bie@intel.com> --- lib/librte_vhost/virtio_net.c | 32 ++------------------------------ 1 file changed, 2 insertions(+), 30 deletions(-) diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 0a0b44c8b..83046cc6b 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -293,6 +293,8 @@ map_one_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, if (unlikely(!desc_addr)) return -1; + rte_prefetch0((void *)(uintptr_t)desc_addr); + buf_vec[vec_id].buf_iova = desc_iova; buf_vec[vec_id].buf_addr = desc_addr; buf_vec[vec_id].buf_len = desc_chunck_len; @@ -673,9 +675,6 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, buf_iova = buf_vec[vec_idx].buf_iova; buf_len = buf_vec[vec_idx].buf_len; - if (nr_vec > 1) - rte_prefetch0((void *)(uintptr_t)buf_vec[1].buf_addr); - if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) { error = -1; goto out; @@ -718,10 +717,6 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, buf_iova = buf_vec[vec_idx].buf_iova; buf_len = buf_vec[vec_idx].buf_len; - /* Prefetch next buffer address. */ - if (vec_idx + 1 < nr_vec) - rte_prefetch0((void *)(uintptr_t) - buf_vec[vec_idx + 1].buf_addr); buf_offset = 0; buf_avail = buf_len; } @@ -818,8 +813,6 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, break; } - rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr); - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", dev->vid, vq->last_avail_idx, vq->last_avail_idx + num_buffers); @@ -867,8 +860,6 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, break; } - rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr); - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", dev->vid, vq->last_avail_idx, vq->last_avail_idx + num_buffers); @@ -1124,9 +1115,6 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, goto out; } - if (likely(nr_vec > 1)) - rte_prefetch0((void *)(uintptr_t)buf_vec[1].buf_addr); - if (virtio_net_with_host_offload(dev)) { if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) { /* @@ -1137,7 +1125,6 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, hdr = &tmp_hdr; } else { hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr); - rte_prefetch0(hdr); } } @@ -1167,9 +1154,6 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen; } - rte_prefetch0((void *)(uintptr_t) - (buf_addr + buf_offset)); - PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset), (uint32_t)buf_avail, 0); @@ -1235,14 +1219,6 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, buf_iova = buf_vec[vec_idx].buf_iova; buf_len = buf_vec[vec_idx].buf_len; - /* - * Prefecth desc n + 1 buffer while - * desc n buffer is processed. - */ - if (vec_idx + 1 < nr_vec) - rte_prefetch0((void *)(uintptr_t) - buf_vec[vec_idx + 1].buf_addr); - buf_offset = 0; buf_avail = buf_len; @@ -1386,8 +1362,6 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, if (likely(dev->dequeue_zero_copy == 0)) update_shadow_used_ring_split(vq, head_idx, 0); - rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr); - pkts[i] = rte_pktmbuf_alloc(mbuf_pool); if (unlikely(pkts[i] == NULL)) { RTE_LOG(ERR, VHOST_DATA, @@ -1499,8 +1473,6 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, update_shadow_used_ring_packed(vq, buf_id, 0, desc_count); - rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr); - pkts[i] = rte_pktmbuf_alloc(mbuf_pool); if (unlikely(pkts[i] == NULL)) { RTE_LOG(ERR, VHOST_DATA, -- 2.21.0
From: Maxime Coquelin <maxime.coquelin@redhat.com> [ upstream commit 1f4d55be438b428bed74f2e3dc49cfd6efc3e6fd ] Some helpers in the header file are forced inlined other are only inlined, this patch forces inline for all. It will avoid it to be embedded as functions when called multiple times in the same object file. For example, when we added packed ring support in vhost-user library, rte_memcpy_generic got no more inlined. Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com> Acked-by: Bruce Richardson <bruce.richardson@intel.com> --- .../common/include/arch/x86/rte_memcpy.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h index 7b758094d..ba44c4a32 100644 --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h @@ -115,7 +115,7 @@ rte_mov256(uint8_t *dst, const uint8_t *src) * Copy 128-byte blocks from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) { __m512i zmm0, zmm1; @@ -163,7 +163,7 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n) } } -static inline void * +static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { uintptr_t dstu = (uintptr_t)dst; @@ -330,7 +330,7 @@ rte_mov64(uint8_t *dst, const uint8_t *src) * Copy 128 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov128(uint8_t *dst, const uint8_t *src) { rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); @@ -343,7 +343,7 @@ rte_mov128(uint8_t *dst, const uint8_t *src) * Copy 128-byte blocks from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) { __m256i ymm0, ymm1, ymm2, ymm3; @@ -363,7 +363,7 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) } } -static inline void * +static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { uintptr_t dstu = (uintptr_t)dst; @@ -523,7 +523,7 @@ rte_mov64(uint8_t *dst, const uint8_t *src) * Copy 128 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov128(uint8_t *dst, const uint8_t *src) { rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); @@ -655,7 +655,7 @@ __extension__ ({ \ } \ }) -static inline void * +static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; @@ -800,7 +800,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) #endif /* RTE_MACHINE_CPUFLAG */ -static inline void * +static __rte_always_inline void * rte_memcpy_aligned(void *dst, const void *src, size_t n) { void *ret = dst; @@ -860,7 +860,7 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) return ret; } -static inline void * +static __rte_always_inline void * rte_memcpy(void *dst, const void *src, size_t n) { if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK)) -- 2.21.0
[ upstream commit 1fc3b3f06aa9c79c749e8587859d75d237ba9161 ] Add IOVA versions of dirty page logging functions. Note that the API facing rte_vhost_log_write is not modified. So, make explicit that it expects the address in GPA space. Fixes: 69c90e98f483 ("vhost: enable IOMMU support") Cc: stable@dpdk.org Signed-off-by: Adrian Moreno <amorenoz@redhat.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Conflicts: lib/librte_vhost/vdpa.c: Due to not having [1] [1] b13ad2dec vhost: provide helpers for virtio ring relay --- lib/librte_vhost/rte_vhost.h | 2 +- lib/librte_vhost/vhost.c | 40 +++++++++++++++++++++++++++++++++++ lib/librte_vhost/vhost.h | 31 +++++++++++++++++++++++++++ lib/librte_vhost/virtio_net.c | 12 ++++++----- 4 files changed, 79 insertions(+), 6 deletions(-) diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h index ae96ed715..ce1f12e1d 100644 --- a/lib/librte_vhost/rte_vhost.h +++ b/lib/librte_vhost/rte_vhost.h @@ -225,7 +225,7 @@ rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem, * @param vid * vhost device ID * @param addr - * the starting address for write + * the starting address for write (in guest physical address space) * @param len * the length to write */ diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c index 1b62fde68..f00efb382 100644 --- a/lib/librte_vhost/vhost.c +++ b/lib/librte_vhost/vhost.c @@ -115,6 +115,26 @@ __vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len) } } +void +__vhost_log_write_iova(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t iova, uint64_t len) +{ + uint64_t hva, gpa, map_len; + map_len = len; + + hva = __vhost_iova_to_vva(dev, vq, iova, &map_len, VHOST_ACCESS_RW); + if (map_len != len) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to write log for IOVA 0x%" PRIx64 ". No IOTLB entry found\n", + iova); + return; + } + + gpa = hva_to_gpa(dev, hva, len); + if (gpa) + __vhost_log_write(dev, gpa, len); +} + void __vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -200,6 +220,26 @@ __vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq, } } +void +__vhost_log_cache_write_iova(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t iova, uint64_t len) +{ + uint64_t hva, gpa, map_len; + map_len = len; + + hva = __vhost_iova_to_vva(dev, vq, iova, &map_len, VHOST_ACCESS_RW); + if (map_len != len) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to write log for IOVA 0x%" PRIx64 ". No IOTLB entry found\n", + iova); + return; + } + + gpa = hva_to_gpa(dev, hva, len); + if (gpa) + __vhost_log_cache_write(dev, vq, gpa, len); +} + void * vhost_alloc_copy_ind_table(struct virtio_net *dev, struct vhost_virtqueue *vq, uint64_t desc_addr, uint64_t desc_len) diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 5a857b155..4279db95a 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -402,9 +402,14 @@ desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter) void __vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq, uint64_t addr, uint64_t len); +void __vhost_log_cache_write_iova(struct virtio_net *dev, + struct vhost_virtqueue *vq, + uint64_t iova, uint64_t len); void __vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq); void __vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len); +void __vhost_log_write_iova(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t iova, uint64_t len); static __rte_always_inline void vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len) @@ -442,6 +447,32 @@ vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq, vhost_log_write(dev, vq->log_guest_addr + offset, len); } +static __rte_always_inline void +vhost_log_cache_write_iova(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t iova, uint64_t len) +{ + if (likely(!(dev->features & (1ULL << VHOST_F_LOG_ALL)))) + return; + + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + __vhost_log_cache_write_iova(dev, vq, iova, len); + else + __vhost_log_cache_write(dev, vq, iova, len); +} + +static __rte_always_inline void +vhost_log_write_iova(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t iova, uint64_t len) +{ + if (likely(!(dev->features & (1ULL << VHOST_F_LOG_ALL)))) + return; + + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + __vhost_log_write_iova(dev, vq, iova, len); + else + __vhost_log_write(dev, iova, len); +} + /* Macros for printing using RTE_LOG */ #define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1 #define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1 diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 83046cc6b..ebeec8fd1 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -184,7 +184,8 @@ do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) for (i = 0; i < count; i++) { rte_memcpy(elem[i].dst, elem[i].src, elem[i].len); - vhost_log_cache_write(dev, vq, elem[i].log_addr, elem[i].len); + vhost_log_cache_write_iova(dev, vq, elem[i].log_addr, + elem[i].len); PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0); } @@ -640,7 +641,7 @@ copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, PRINT_PACKET(dev, (uintptr_t)dst, (uint32_t)len, 0); - vhost_log_cache_write(dev, vq, + vhost_log_cache_write_iova(dev, vq, iova, len); remain -= len; @@ -740,7 +741,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, } else { PRINT_PACKET(dev, (uintptr_t)hdr_addr, dev->vhost_hlen, 0); - vhost_log_cache_write(dev, vq, + vhost_log_cache_write_iova(dev, vq, buf_vec[0].buf_iova, dev->vhost_hlen); } @@ -755,8 +756,9 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, rte_memcpy((void *)((uintptr_t)(buf_addr + buf_offset)), rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), cpy_len); - vhost_log_cache_write(dev, vq, buf_iova + buf_offset, - cpy_len); + vhost_log_cache_write_iova(dev, vq, + buf_iova + buf_offset, + cpy_len); PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset), cpy_len, 0); } else { -- 2.21.0
On 17/12/2019 18:06, Adrian Moreno wrote:
> Backport the entire series [0] 'vhost: I-cache pressure optimizations'
> because it is motivated by a performance degradation from v17.11.
>
> Also pick [1] to allow for a clean backport of the GPA logging support.
>
> The result is an almost clean backport. Only the lack of [2] generates
> conflicts. But that is part of a feature [3] that, IMHO, does not meet the
> requirements to be backported to the stable branch.
>
> [0] https://patches.dpdk.org/cover/53807/
> [1] b473ec113 vhost: batch used descs chains write-back with packed ring
> [2] b13ad2dec vhost: provide helpers for virtio ring relay
> [3] https://patches.dpdk.org/cover/48747/
>
>
> Adrian Moreno (1):
> vhost: convert buffer addresses to GPA for logging
>
> Maxime Coquelin (6):
> vhost: batch used descs chains write-back with packed ring
> vhost: un-inline dirty pages logging functions
> vhost: do not inline packed and split functions
> vhost: do not inline unlikely fragmented buffers code
> vhost: simplify descriptor buffer prefetching
> eal/x86: force inlining of all memcpy and mov helpers
>
> .../common/include/arch/x86/rte_memcpy.h | 18 +-
> lib/librte_vhost/rte_vhost.h | 2 +-
> lib/librte_vhost/vhost.c | 204 ++++++++++++++++++
> lib/librte_vhost/vhost.h | 164 ++++----------
> lib/librte_vhost/virtio_net.c | 202 ++++++++---------
> 5 files changed, 343 insertions(+), 247 deletions(-)
>
Thanks Adrian, applied to 18.11 branch.