From: Tiwei Bie
To: dev@dpdk.org
Cc: yliu@fridaylinux.org, maxime.coquelin@redhat.com, Zhihong Wang, Zhiyong Yang
Date: Fri, 8 Sep 2017 20:50:46 +0800
Message-Id: <20170908125046.28739-1-tiwei.bie@intel.com>
X-Mailer: git-send-email 2.13.2
In-Reply-To: <20170824021939.21306-1-tiwei.bie@intel.com>
References: <20170824021939.21306-1-tiwei.bie@intel.com>
Subject: [dpdk-dev] [PATCH v2] vhost: adaptively batch small guest memory copies

This patch adaptively batches the small guest memory copies. By batching
the small copies, the efficiency of executing the memory LOAD instructions
can be improved greatly, because the memory LOAD latency can be effectively
hidden by the pipeline. We saw significant performance boosts in the PVP
test with small packets.

This patch improves the performance for small packets and also
distinguishes packets by size. So although the performance for big packets
doesn't change, this distinction makes it relatively easy to add special
optimizations for big packets later.

Signed-off-by: Tiwei Bie
Signed-off-by: Zhihong Wang
Signed-off-by: Zhiyong Yang
Reviewed-by: Maxime Coquelin
---
This optimization depends on the CPU's internal pipeline design, so further
tests (e.g. on ARM) from the community are appreciated.

v2: s/burst_copy/batch_copy/g

 lib/librte_vhost/vhost.c      |   2 +-
 lib/librte_vhost/vhost.h      |  13 +++
 lib/librte_vhost/vhost_user.c |  12 +++
 lib/librte_vhost/virtio_net.c | 240 ++++++++++++++++++++++++++++++++----------
 4 files changed, 209 insertions(+), 58 deletions(-)

diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 0b6aa1c..474b6e4 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -102,7 +102,7 @@ free_device(struct virtio_net *dev)
 		vq = dev->virtqueue[i];
 
 		rte_free(vq->shadow_used_ring);
-
+		rte_free(vq->batch_copy_elems);
 		rte_free(vq);
 	}
 
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 0f294f3..78fd895 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -81,6 +81,16 @@ struct zcopy_mbuf {
 };
 TAILQ_HEAD(zcopy_mbuf_list, zcopy_mbuf);
 
+/*
+ * Structure contains the info for each batched memory copy.
+ */
+struct batch_copy_elem {
+	void *dst;
+	void *src;
+	uint32_t len;
+	uint64_t log_addr;
+};
+
 /**
  * Structure contains variables relevant to RX/TX virtqueues.
  */
@@ -114,6 +124,9 @@ struct vhost_virtqueue {
 
 	struct vring_used_elem	*shadow_used_ring;
 	uint16_t		shadow_used_idx;
+
+	struct batch_copy_elem	*batch_copy_elems;
+	uint16_t		batch_copy_nb_elems;
 } __rte_cache_aligned;
 
 /* Old kernels have no such macros defined */
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index ad2e8d3..b62e382 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -230,6 +230,15 @@ vhost_user_set_vring_num(struct virtio_net *dev,
 		return -1;
 	}
 
+	vq->batch_copy_elems = rte_malloc(NULL,
+				vq->size * sizeof(struct batch_copy_elem),
+				RTE_CACHE_LINE_SIZE);
+	if (!vq->batch_copy_elems) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"failed to allocate memory for batching copy.\n");
+		return -1;
+	}
+
 	return 0;
 }
 
@@ -741,6 +750,9 @@ vhost_user_get_vring_base(struct virtio_net *dev,
 	rte_free(vq->shadow_used_ring);
 	vq->shadow_used_ring = NULL;
 
+	rte_free(vq->batch_copy_elems);
+	vq->batch_copy_elems = NULL;
+
 	return 0;
 }
 
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index a5f0eeb..f8732df 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -49,6 +49,8 @@
 
 #define MAX_PKT_BURST 32
 
+#define MAX_BATCH_LEN 256
+
 static bool
 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
 {
@@ -105,6 +107,31 @@ update_shadow_used_ring(struct vhost_virtqueue *vq,
 	vq->shadow_used_ring[i].len = len;
 }
 
+static inline void
+do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+	struct batch_copy_elem *elem = vq->batch_copy_elems;
+	uint16_t count = vq->batch_copy_nb_elems;
+	int i;
+
+	for (i = 0; i < count; i++) {
+		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
+		vhost_log_write(dev, elem[i].log_addr, elem[i].len);
+		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
+	}
+}
+
+static inline void
+do_data_copy_dequeue(struct vhost_virtqueue *vq)
+{
+	struct batch_copy_elem *elem = vq->batch_copy_elems;
+	uint16_t count = vq->batch_copy_nb_elems;
+	int i;
+
+	for (i = 0; i < count; i++)
+		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
+}
+
 /* avoid write operation when necessary, to lessen cache issues */
 #define ASSIGN_UNLESS_EQUAL(var, val) do {	\
 	if ((var) != (val))			\
@@ -168,8 +195,9 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 }
 
 static __rte_always_inline int
-copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs,
-		  struct rte_mbuf *m, uint16_t desc_idx, uint32_t size)
+copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		  struct vring_desc *descs, struct rte_mbuf *m,
+		  uint16_t desc_idx, uint32_t size)
 {
 	uint32_t desc_avail, desc_offset;
 	uint32_t mbuf_avail, mbuf_offset;
@@ -178,6 +206,9 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs,
 	uint64_t desc_addr;
 	/* A counter to avoid desc dead loop chain */
 	uint16_t nr_desc = 1;
+	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
+	uint16_t copy_nb = vq->batch_copy_nb_elems;
+	int error = 0;
 
 	desc = &descs[desc_idx];
 	desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
@@ -186,8 +217,10 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs,
 	 * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
 	 * otherwise stores offset on the stack instead of in a register.
	 */
-	if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
-		return -1;
+	if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr) {
+		error = -1;
+		goto out;
+	}
 
 	rte_prefetch0((void *)(uintptr_t)desc_addr);
 
@@ -213,27 +246,43 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs,
 		if (desc_avail == 0) {
 			if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
 				/* Room in vring buffer is not enough */
-				return -1;
+				error = -1;
+				goto out;
+			}
+			if (unlikely(desc->next >= size || ++nr_desc > size)) {
+				error = -1;
+				goto out;
 			}
-			if (unlikely(desc->next >= size || ++nr_desc > size))
-				return -1;
 
 			desc = &descs[desc->next];
 			desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
-			if (unlikely(!desc_addr))
-				return -1;
+			if (unlikely(!desc_addr)) {
+				error = -1;
+				goto out;
+			}
 
 			desc_offset = 0;
 			desc_avail  = desc->len;
 		}
 
 		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-			cpy_len);
-		vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
-		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-			     cpy_len, 0);
+		if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) {
+			rte_memcpy((void *)((uintptr_t)(desc_addr +
+							desc_offset)),
+				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
+				cpy_len);
+			vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
+			PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
+				     cpy_len, 0);
+		} else {
+			batch_copy[copy_nb].dst =
+				(void *)((uintptr_t)(desc_addr + desc_offset));
+			batch_copy[copy_nb].src =
+				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
+			batch_copy[copy_nb].log_addr = desc->addr + desc_offset;
+			batch_copy[copy_nb].len = cpy_len;
+			copy_nb++;
+		}
 
 		mbuf_avail  -= cpy_len;
 		mbuf_offset += cpy_len;
@@ -241,7 +290,10 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs,
 		desc_offset += cpy_len;
 	}
 
-	return 0;
+out:
+	vq->batch_copy_nb_elems = copy_nb;
+
+	return error;
 }
 
 /**
@@ -284,6 +336,8 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 	LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
 		dev->vid, start_idx, start_idx + count);
 
+	vq->batch_copy_nb_elems = 0;
+
 	/* Retrieve all of the desc indexes first to avoid caching issues.
	 */
 	rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
 	for (i = 0; i < count; i++) {
@@ -318,7 +372,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 			sz = vq->size;
 		}
 
-		err = copy_mbuf_to_desc(dev, descs, pkts[i], desc_idx, sz);
+		err = copy_mbuf_to_desc(dev, vq, descs, pkts[i], desc_idx, sz);
 		if (unlikely(err)) {
 			used_idx = (start_idx + i) & (vq->size - 1);
 			vq->used->ring[used_idx].len = dev->vhost_hlen;
@@ -331,6 +385,8 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 			rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
 	}
 
+	do_data_copy_enqueue(dev, vq);
+
 	rte_smp_wmb();
 
 	*(volatile uint16_t *)&vq->used->idx += count;
@@ -439,8 +495,9 @@ reserve_avail_buf_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 }
 
 static __rte_always_inline int
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
-			    struct buf_vector *buf_vec, uint16_t num_buffers)
+copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
+			    struct rte_mbuf *m, struct buf_vector *buf_vec,
+			    uint16_t num_buffers)
 {
 	uint32_t vec_idx = 0;
 	uint64_t desc_addr;
@@ -449,13 +506,20 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
 	uint32_t cpy_len;
 	uint64_t hdr_addr, hdr_phys_addr;
 	struct rte_mbuf *hdr_mbuf;
+	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
+	uint16_t copy_nb = vq->batch_copy_nb_elems;
+	int error = 0;
 
-	if (unlikely(m == NULL))
-		return -1;
+	if (unlikely(m == NULL)) {
+		error = -1;
+		goto out;
+	}
 
 	desc_addr = rte_vhost_gpa_to_vva(dev->mem, buf_vec[vec_idx].buf_addr);
-	if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
-		return -1;
+	if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr) {
+		error = -1;
+		goto out;
+	}
 
 	hdr_mbuf = m;
 	hdr_addr = desc_addr;
@@ -476,8 +540,10 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
 			vec_idx++;
 			desc_addr = rte_vhost_gpa_to_vva(dev->mem,
					buf_vec[vec_idx].buf_addr);
-			if (unlikely(!desc_addr))
-				return -1;
+			if (unlikely(!desc_addr)) {
+				error = -1;
+				goto out;
+			}
 
 			/* Prefetch buffer address.
			 */
 			rte_prefetch0((void *)(uintptr_t)desc_addr);
@@ -509,13 +575,27 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
 		}
 
 		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-			cpy_len);
-		vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
-			cpy_len);
-		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-			cpy_len, 0);
+
+		if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) {
+			rte_memcpy((void *)((uintptr_t)(desc_addr +
+							desc_offset)),
+				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
+				cpy_len);
+			vhost_log_write(dev,
+				buf_vec[vec_idx].buf_addr + desc_offset,
+				cpy_len);
+			PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
+				cpy_len, 0);
+		} else {
+			batch_copy[copy_nb].dst =
+				(void *)((uintptr_t)(desc_addr + desc_offset));
+			batch_copy[copy_nb].src =
+				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
+			batch_copy[copy_nb].log_addr =
+				buf_vec[vec_idx].buf_addr + desc_offset;
+			batch_copy[copy_nb].len = cpy_len;
+			copy_nb++;
+		}
 
 		mbuf_avail  -= cpy_len;
 		mbuf_offset += cpy_len;
@@ -523,7 +603,10 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
 		desc_offset += cpy_len;
 	}
 
-	return 0;
+out:
+	vq->batch_copy_nb_elems = copy_nb;
+
+	return error;
 }
 
 static __rte_always_inline uint32_t
@@ -551,6 +634,8 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
 	if (count == 0)
 		return 0;
 
+	vq->batch_copy_nb_elems = 0;
+
 	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
 
 	vq->shadow_used_idx = 0;
@@ -572,7 +657,7 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
 			dev->vid, vq->last_avail_idx,
 			vq->last_avail_idx + num_buffers);
 
-		if (copy_mbuf_to_desc_mergeable(dev, pkts[pkt_idx],
+		if (copy_mbuf_to_desc_mergeable(dev, vq, pkts[pkt_idx],
 						buf_vec, num_buffers) < 0) {
 			vq->shadow_used_idx -= num_buffers;
 			break;
@@ -581,6 +666,8 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
 		vq->last_avail_idx += num_buffers;
 	}
 
+	do_data_copy_enqueue(dev, vq);
+
 	if (likely(vq->shadow_used_idx)) {
 		flush_shadow_used_ring(dev, vq);
 
@@ -766,8 +853,9 @@ put_zmbuf(struct zcopy_mbuf *zmbuf)
 }
 
 static __rte_always_inline int
-copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
-		  uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx,
+copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		  struct vring_desc *descs, uint16_t max_desc,
+		  struct rte_mbuf *m, uint16_t desc_idx,
 		  struct rte_mempool *mbuf_pool)
 {
 	struct vring_desc *desc;
@@ -779,15 +867,22 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
 	struct virtio_net_hdr *hdr = NULL;
 	/* A counter to avoid desc dead loop chain */
 	uint32_t nr_desc = 1;
+	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
+	uint16_t copy_nb = vq->batch_copy_nb_elems;
+	int error = 0;
 
 	desc = &descs[desc_idx];
 	if (unlikely((desc->len < dev->vhost_hlen)) ||
-			(desc->flags & VRING_DESC_F_INDIRECT))
-		return -1;
+			(desc->flags & VRING_DESC_F_INDIRECT)) {
+		error = -1;
+		goto out;
+	}
 
 	desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
-	if (unlikely(!desc_addr))
-		return -1;
+	if (unlikely(!desc_addr)) {
+		error = -1;
+		goto out;
+	}
 
 	if (virtio_net_with_host_offload(dev)) {
 		hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
@@ -802,12 +897,16 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
 	if (likely((desc->len == dev->vhost_hlen) &&
 		   (desc->flags & VRING_DESC_F_NEXT) != 0)) {
 		desc = &descs[desc->next];
-		if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
-			return -1;
+		if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) {
+			error = -1;
+			goto out;
+		}
 
 		desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
-		if (unlikely(!desc_addr))
-			return -1;
+		if (unlikely(!desc_addr)) {
+			error = -1;
+			goto out;
+		}
 
 		desc_offset = 0;
 		desc_avail  = desc->len;
@@ -846,10 +945,23 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
 			 */
 			mbuf_avail = cpy_len;
 		} else {
-			rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
-						mbuf_offset),
-				(void *)((uintptr_t)(desc_addr + desc_offset)),
-				cpy_len);
+			if (likely(cpy_len > MAX_BATCH_LEN ||
+				   copy_nb >= vq->size)) {
+				rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
+							mbuf_offset),
+					(void *)((uintptr_t)(desc_addr +
+							desc_offset)),
+					cpy_len);
+			} else {
+				batch_copy[copy_nb].dst =
+					rte_pktmbuf_mtod_offset(cur, void *,
+							mbuf_offset);
+				batch_copy[copy_nb].src =
+					(void *)((uintptr_t)(desc_addr +
+							desc_offset));
+				batch_copy[copy_nb].len = cpy_len;
+				copy_nb++;
+			}
 		}
 
 		mbuf_avail  -= cpy_len;
@@ -863,15 +975,21 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
 			break;
 
 		if (unlikely(desc->next >= max_desc ||
-				++nr_desc > max_desc))
-			return -1;
+				++nr_desc > max_desc)) {
+			error = -1;
+			goto out;
+		}
 		desc = &descs[desc->next];
-		if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
-			return -1;
+		if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) {
+			error = -1;
+			goto out;
+		}
 
 		desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
-		if (unlikely(!desc_addr))
-			return -1;
+		if (unlikely(!desc_addr)) {
+			error = -1;
+			goto out;
+		}
 
 		rte_prefetch0((void *)(uintptr_t)desc_addr);
 
@@ -890,7 +1008,8 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
 			if (unlikely(cur == NULL)) {
 				RTE_LOG(ERR, VHOST_DATA, "Failed to "
 					"allocate memory for mbuf.\n");
-				return -1;
+				error = -1;
+				goto out;
 			}
 			if (unlikely(dev->dequeue_zero_copy))
 				rte_mbuf_refcnt_update(cur, 1);
@@ -912,7 +1031,10 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
 	if (hdr)
 		vhost_dequeue_offload(hdr, m);
 
-	return 0;
+out:
+	vq->batch_copy_nb_elems = copy_nb;
+
+	return error;
 }
 
 static __rte_always_inline void
@@ -1016,6 +1138,8 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 	if (unlikely(vq->enabled == 0))
 		return 0;
 
+	vq->batch_copy_nb_elems = 0;
+
 	if (unlikely(dev->dequeue_zero_copy)) {
 		struct zcopy_mbuf *zmbuf, *next;
 		int nr_updated = 0;
@@ -1136,7 +1260,8 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 			break;
 		}
 
-		err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool);
+		err = copy_desc_to_mbuf(dev, vq, desc, sz, pkts[i], idx,
+					mbuf_pool);
 		if (unlikely(err)) {
 			rte_pktmbuf_free(pkts[i]);
 			break;
@@ -1168,6 +1293,7 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 	vq->last_avail_idx += i;
 
 	if (likely(dev->dequeue_zero_copy == 0)) {
+		do_data_copy_dequeue(vq);
 		vq->last_used_idx += i;
 		update_used_idx(dev, vq, i);
 	}
-- 
2.7.4
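
For readers who want the gist of the change without walking the whole diff, the
pattern reduces to: record each small copy in a preallocated per-virtqueue array
and execute the recorded copies back to back once per burst, while copies larger
than MAX_BATCH_LEN (or arriving when the array is full) are done immediately.
The sketch below is a standalone illustration of that pattern only: the trimmed
batch_copy_elem and the MAX_BATCH_LEN threshold mirror the patch, but the
copy_batcher wrapper, plain memcpy and the demo main() are made up for the
example and are not DPDK APIs (the rte_memcpy, vhost_log_write and PRINT_PACKET
bookkeeping from do_data_copy_enqueue() is omitted).

/*
 * Standalone sketch of adaptive copy batching: small copies are queued in a
 * preallocated array and flushed together, large copies go straight to memcpy.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_BATCH_LEN 256	/* copies larger than this bypass the batch */

struct batch_copy_elem {
	void *dst;
	const void *src;
	uint32_t len;
};

struct copy_batcher {
	struct batch_copy_elem *elems;	/* preallocated, ring-sized array */
	uint16_t nb_elems;		/* number of pending small copies */
	uint16_t capacity;
};

/* Record a copy, or perform it immediately if it is large or the batch is full. */
static void
batcher_copy(struct copy_batcher *b, void *dst, const void *src, uint32_t len)
{
	if (len > MAX_BATCH_LEN || b->nb_elems >= b->capacity) {
		memcpy(dst, src, len);
		return;
	}
	b->elems[b->nb_elems++] = (struct batch_copy_elem){ dst, src, len };
}

/* Execute all pending small copies back to back so the loads can pipeline. */
static void
batcher_flush(struct copy_batcher *b)
{
	for (uint16_t i = 0; i < b->nb_elems; i++)
		memcpy(b->elems[i].dst, b->elems[i].src, b->elems[i].len);
	b->nb_elems = 0;
}

int
main(void)
{
	char src[4][64], dst[4][64];
	struct batch_copy_elem elems[16];
	struct copy_batcher b = { .elems = elems, .capacity = 16 };

	for (int i = 0; i < 4; i++) {
		snprintf(src[i], sizeof(src[i]), "packet-%d", i);
		batcher_copy(&b, dst[i], src[i], sizeof(src[i]));
	}
	batcher_flush(&b);	/* one flush per burst, as the patch does per RX/TX burst */

	for (int i = 0; i < 4; i++)
		printf("%s\n", dst[i]);
	return 0;
}

As in the patch, the array is sized so a single burst cannot overflow it (the
vhost code allocates vq->size elements and checks copy_nb >= vq->size), and the
flush happens once per burst: on the enqueue path do_data_copy_enqueue() runs
before the write barrier and the used index update, so the data is in place
before the descriptors are exposed to the guest as used.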