From: Yuanhan Liu
To: dev@dpdk.org
Cc: Maxime Coquelin, Yuanhan Liu
Date: Tue, 23 Aug 2016 16:10:37 +0800
Message-Id: <1471939839-29778-5-git-send-email-yuanhan.liu@linux.intel.com>
In-Reply-To: <1471939839-29778-1-git-send-email-yuanhan.liu@linux.intel.com>
References: <1471939839-29778-1-git-send-email-yuanhan.liu@linux.intel.com>
Subject: [dpdk-dev] [PATCH 4/6] vhost: add Tx zero copy

The basic idea of Tx zero copy is: instead of copying data from the
desc buf, we let the mbuf reference the desc buf addr directly.

Doing so, however, has one major issue: we can't update the used ring
at the end of rte_vhost_dequeue_burst. Because no copy is done there,
an update of the used ring would let the driver reclaim the desc buf.
As a result, DPDK might reference a stale memory region.

To update the used ring properly, this patch does several tricks:

- When an mbuf references a desc buf, its refcnt is increased by 1.
  This pins the mbuf, so that a free from the DPDK application won't
  actually free it; the refcnt is merely decreased by 1.

- All such mbufs are chained together (by tailq), and the chain is
  checked on every rte_vhost_dequeue_burst entrance to see whether an
  mbuf has been freed (refcnt equal to 1). If so, we are the last user
  of that mbuf and it is safe to update the used ring.

- "struct zcopy_mbuf" is introduced to associate an mbuf with the
  right desc idx.

Tx zero copy is introduced for performance reasons; some rough tests
show about a 40% performance boost for 1400B packets. For small
packets (e.g. 64B) it actually slows things down a bit. That is
expected, because this patch introduces some extra work, which
outweighs the benefit of saving the copy of a few bytes.
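
For reference, a minimal sketch of the pin/check cycle described above
(illustration only, not part of the diff below; zcopy_pin is a
hypothetical name, while mbuf_is_consumed mirrors the helper this patch
adds), using the existing rte_mbuf refcnt API:

    #include <stdbool.h>

    #include <rte_mbuf.h>

    /* Pin a zero-copy mbuf before handing it over to the application:
     * a later rte_pktmbuf_free() from the application then only drops
     * the refcnt back to 1 instead of really freeing the buffer.
     */
    static void
    zcopy_pin(struct rte_mbuf *m)
    {
        rte_mbuf_refcnt_update(m, 1);
    }

    /* An mbuf (chain) is consumed once every segment's refcnt is back
     * to 1, i.e. vhost is the last user; only then is it safe to
     * update the used ring entry and really free the mbuf.
     */
    static bool
    mbuf_is_consumed(struct rte_mbuf *m)
    {
        while (m) {
            if (rte_mbuf_refcnt_read(m) > 1)
                return false;
            m = m->next;
        }

        return true;
    }
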
Signed-off-by: Yuanhan Liu
---
 lib/librte_vhost/vhost.c      |   2 +
 lib/librte_vhost/vhost.h      |  21 ++++++
 lib/librte_vhost/vhost_user.c |  41 +++++++++-
 lib/librte_vhost/virtio_net.c | 169 +++++++++++++++++++++++++++++++++++++-----
 4 files changed, 214 insertions(+), 19 deletions(-)

diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 46095c3..ab25649 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -141,6 +141,8 @@ init_vring_queue(struct vhost_virtqueue *vq, int qp_idx)
 	/* always set the default vq pair to enabled */
 	if (qp_idx == 0)
 		vq->enabled = 1;
+
+	TAILQ_INIT(&vq->zmbuf_list);
 }
 
 static void
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 8565fa1..718133e 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -36,6 +36,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -61,6 +62,19 @@ struct buf_vector {
 	uint32_t desc_idx;
 };
 
+/*
+ * A structure to hold some fields needed in zero copy code path,
+ * mainly for associating an mbuf with the right desc_idx.
+ */
+struct zcopy_mbuf {
+	struct rte_mbuf *mbuf;
+	uint32_t desc_idx;
+	uint16_t in_use;
+
+	TAILQ_ENTRY(zcopy_mbuf) next;
+};
+TAILQ_HEAD(zcopy_mbuf_list, zcopy_mbuf);
+
 /**
  * Structure contains variables relevant to RX/TX virtqueues.
  */
@@ -85,6 +99,12 @@ struct vhost_virtqueue {
 
 	/* Physical address of used ring, for logging */
 	uint64_t		log_guest_addr;
+
+	uint16_t		nr_zmbuf;
+	uint16_t		zmbuf_size;
+	uint16_t		last_zmbuf_idx;
+	struct zcopy_mbuf	*zmbufs;
+	struct zcopy_mbuf_list	zmbuf_list;
 } __rte_cache_aligned;
 
 /* Old kernels have no such macro defined */
@@ -147,6 +167,7 @@ struct virtio_net {
 	uint32_t		max_guest_pages;
 	struct guest_page	*guest_pages;
 
+	int			tx_zero_copy;
 } __rte_cache_aligned;
 
 /**
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index 045d4f0..189b57b 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -180,7 +180,22 @@ static int
 vhost_user_set_vring_num(struct virtio_net *dev,
			 struct vhost_vring_state *state)
 {
-	dev->virtqueue[state->index]->size = state->num;
+	struct vhost_virtqueue *vq = dev->virtqueue[state->index];
+
+	vq->size = state->num;
+
+	if (dev->tx_zero_copy) {
+		vq->last_zmbuf_idx = 0;
+		vq->zmbuf_size = vq->size * 2;
+		vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size *
+				sizeof(struct zcopy_mbuf), 0);
+		if (vq->zmbufs == NULL) {
+			RTE_LOG(WARNING, VHOST_CONFIG,
+				"failed to allocate mem for zero copy; "
+				"zero copy is force disabled\n");
+			dev->tx_zero_copy = 0;
+		}
+	}
 
 	return 0;
 }
@@ -649,11 +664,32 @@ vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg)
 	vq->kickfd = file.fd;
 
 	if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) {
+		if (dev->tx_zero_copy) {
+			RTE_LOG(INFO, VHOST_CONFIG,
+				"Tx zero copy is enabled\n");
+		}
+
 		if (notify_ops->new_device(dev->vid) == 0)
 			dev->flags |= VIRTIO_DEV_RUNNING;
 	}
 }
 
+static void
+free_zmbufs(struct vhost_virtqueue *vq)
+{
+	struct zcopy_mbuf *zmbuf, *next;
+
+	for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
+	     zmbuf != NULL; zmbuf = next) {
+		next = TAILQ_NEXT(zmbuf, next);
+
+		rte_pktmbuf_free(zmbuf->mbuf);
+		TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
+	}
+
+	rte_free(vq->zmbufs);
+}
+
 /*
  * when virtio is stopped, qemu will send us the GET_VRING_BASE message.
 */
@@ -682,6 +718,9 @@ vhost_user_get_vring_base(struct virtio_net *dev,
 	dev->virtqueue[state->index]->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
 
+	if (dev->tx_zero_copy)
+		free_zmbufs(dev->virtqueue[state->index]);
+
 	return 0;
 }
 
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 1c2ee47..d7e0335 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -678,6 +678,43 @@ make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
 	return 0;
 }
 
+static inline struct zcopy_mbuf * __attribute__((always_inline))
+get_zmbuf(struct vhost_virtqueue *vq)
+{
+	uint16_t i;
+	uint16_t last;
+	int tries = 0;
+
+	/* search [last_zmbuf_idx, zmbuf_size) */
+	i = vq->last_zmbuf_idx;
+	last = vq->zmbuf_size;
+
+again:
+	for (; i < last; i++) {
+		if (vq->zmbufs[i].in_use == 0) {
+			vq->last_zmbuf_idx = i + 1;
+			vq->zmbufs[i].in_use = 1;
+			return &vq->zmbufs[i];
+		}
+	}
+
+	tries++;
+	if (tries == 1) {
+		/* search [0, last_zmbuf_idx) */
+		i = 0;
+		last = vq->last_zmbuf_idx;
+		goto again;
+	}
+
+	return NULL;
+}
+
+static inline void __attribute__((always_inline))
+put_zmbuf(struct zcopy_mbuf *zmbuf)
+{
+	zmbuf->in_use = 0;
+}
+
 static inline int __attribute__((always_inline))
 copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		  struct rte_mbuf *m, uint16_t desc_idx,
@@ -701,6 +738,27 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	if (unlikely(!desc_addr))
 		return -1;
 
+	if (dev->tx_zero_copy) {
+		struct zcopy_mbuf *zmbuf;
+
+		zmbuf = get_zmbuf(vq);
+		if (!zmbuf)
+			return -1;
+		zmbuf->mbuf = m;
+		zmbuf->desc_idx = desc_idx;
+
+		/*
+		 * Pin lock the mbuf; we will check later to see whether
+		 * the mbuf is freed (when we are the last user) or not.
+		 * If that's the case, we then could update the used ring
+		 * safely.
+		 */
+		rte_mbuf_refcnt_update(m, 1);
+
+		vq->nr_zmbuf += 1;
+		TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
+	}
+
 	hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
 	rte_prefetch0(hdr);
 
@@ -733,9 +791,28 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
 	while (1) {
 		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-		rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset),
-			(void *)((uintptr_t)(desc_addr + desc_offset)),
-			cpy_len);
+		if (dev->tx_zero_copy) {
+			cur->data_len = cpy_len;
+			cur->data_off = 0;
+			cur->buf_addr = (void *)(uintptr_t)desc_addr;
+			/*
+			 * TODO: we need to handle the case where a desc buf
+			 * crosses two pages.
+			 */
+			cur->buf_physaddr = gpa_to_hpa(dev, desc->addr +
+					desc_offset, cpy_len);
+
+			/*
+			 * In zero copy mode, one mbuf can only reference data
+			 * for one desc buf, or part of one.
+			 */
+			mbuf_avail = cpy_len;
+		} else {
+			rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
+						mbuf_offset),
+				(void *)((uintptr_t)(desc_addr + desc_offset)),
+				cpy_len);
+		}
 
 		mbuf_avail  -= cpy_len;
 		mbuf_offset += cpy_len;
@@ -796,6 +873,49 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	return 0;
 }
 
+static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		 uint32_t used_idx, uint32_t desc_idx)
+{
+	vq->used->ring[used_idx].id  = desc_idx;
+	vq->used->ring[used_idx].len = 0;
+	vhost_log_used_vring(dev, vq,
+			offsetof(struct vring_used, ring[used_idx]),
+			sizeof(vq->used->ring[used_idx]));
+}
+
+static inline void __attribute__((always_inline))
+update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint32_t count)
+{
+	if (count == 0)
+		return;
+
+	rte_smp_wmb();
+	rte_smp_rmb();
+
+	vq->used->idx += count;
+	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+			sizeof(vq->used->idx));
+
+	/* Kick guest if required. */
+	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+			&& (vq->callfd >= 0))
+		eventfd_write(vq->callfd, (eventfd_t)1);
+}
+
+static inline bool __attribute__((always_inline))
+mbuf_is_consumed(struct rte_mbuf *m)
+{
+	while (m) {
+		if (rte_mbuf_refcnt_read(m) > 1)
+			return false;
+		m = m->next;
+	}
+
+	return true;
+}
+
 uint16_t
 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
@@ -823,6 +943,30 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 	if (unlikely(vq->enabled == 0))
 		return 0;
 
+	if (dev->tx_zero_copy) {
+		struct zcopy_mbuf *zmbuf, *next;
+		int nr_updated = 0;
+
+		for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
+		     zmbuf != NULL; zmbuf = next) {
+			next = TAILQ_NEXT(zmbuf, next);
+
+			if (mbuf_is_consumed(zmbuf->mbuf)) {
+				used_idx = vq->last_used_idx++ & (vq->size - 1);
+				update_used_ring(dev, vq, used_idx,
+						 zmbuf->desc_idx);
+				nr_updated += 1;
+
+				TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
+				rte_pktmbuf_free(zmbuf->mbuf);
+				put_zmbuf(zmbuf);
+				vq->nr_zmbuf -= 1;
+			}
+		}
+
+		update_used_idx(dev, vq, nr_updated);
+	}
+
 	/*
 	 * Construct a RARP broadcast packet, and inject it to the "pkts"
 	 * array, to looks like that guest actually send such packet.
@@ -870,11 +1014,8 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 		used_idx  = (vq->last_used_idx  + i) & (vq->size - 1);
 		desc_indexes[i] = vq->avail->ring[avail_idx];
 
-		vq->used->ring[used_idx].id  = desc_indexes[i];
-		vq->used->ring[used_idx].len = 0;
-		vhost_log_used_vring(dev, vq,
-				offsetof(struct vring_used, ring[used_idx]),
-				sizeof(vq->used->ring[used_idx]));
+		if (dev->tx_zero_copy == 0)
+			update_used_ring(dev, vq, used_idx, desc_indexes[i]);
 	}
 
 	/* Prefetch descriptor index. */
@@ -898,19 +1039,11 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 			break;
 		}
 	}
-
-	rte_smp_wmb();
-	rte_smp_rmb();
-	vq->used->idx += i;
 	vq->last_avail_idx += i;
 	vq->last_used_idx += i;
-	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
-			sizeof(vq->used->idx));
-	/* Kick guest if required. */
-	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
-			&& (vq->callfd >= 0))
-		eventfd_write(vq->callfd, (eventfd_t)1);
+
+	if (dev->tx_zero_copy == 0)
+		update_used_idx(dev, vq, i);
 
 out:
 	if (unlikely(rarp_mbuf != NULL)) {
-- 
1.9.0