From: "Xie, Huawei" <huawei.xie@intel.com>
To: Yuanhan Liu <yuanhan.liu@linux.intel.com>, "dev@dpdk.org" <dev@dpdk.org>
Cc: Victor Kaplansky <vkaplans@redhat.com>,
"Michael S. Tsirkin" <mst@redhat.com>
Subject: Re: [dpdk-dev] [PATCH v2 1/7] vhost: refactor rte_vhost_dequeue_burst
Date: Thu, 3 Mar 2016 16:21:19 +0000 [thread overview]
Message-ID: <C37D651A908B024F974696C65296B57B4C626D94@SHSMSX101.ccr.corp.intel.com> (raw)
In-Reply-To: <1455803352-5518-2-git-send-email-yuanhan.liu@linux.intel.com>
On 2/18/2016 9:48 PM, Yuanhan Liu wrote:
> The current rte_vhost_dequeue_burst() implementation is a bit messy
> and logic twisted. And you could see repeat code here and there: it
> invokes rte_pktmbuf_alloc() three times at three different places!
>
> However, rte_vhost_dequeue_burst() actually does a simple job: copy
> the packet data from vring desc to mbuf. What's tricky here is:
>
> - desc buff could be chained (by desc->next field), so that you need
> fetch next one if current is wholly drained.
>
> - One mbuf could not be big enough to hold all desc buff, hence you
> need to chain the mbuf as well, by the mbuf->next field.
>
> Even though, the logic could be simple. Here is the pseudo code.
>
> while (this_desc_is_not_drained_totally || has_next_desc) {
> if (this_desc_has_drained_totally) {
> this_desc = next_desc();
> }
>
> if (mbuf_has_no_room) {
> mbuf = allocate_a_new_mbuf();
> }
>
> COPY(mbuf, desc);
> }
>
> And this is how I refactored rte_vhost_dequeue_burst.
>
> Note that the old patch does a special handling for skipping virtio
> header. However, that could be simply done by adjusting desc_avail
> and desc_offset var:
>
> desc_avail = desc->len - vq->vhost_hlen;
> desc_offset = vq->vhost_hlen;
>
> This refactor makes the code much more readable (IMO), yet it reduces
> binary code size (nearly 2K).
>
> Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> ---
>
> v2: - fix potential NULL dereference bug of var "prev" and "head"
> ---
> lib/librte_vhost/vhost_rxtx.c | 297 +++++++++++++++++-------------------------
> 1 file changed, 116 insertions(+), 181 deletions(-)
>
> diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> index 5e7e5b1..d5cd0fa 100644
> --- a/lib/librte_vhost/vhost_rxtx.c
> +++ b/lib/librte_vhost/vhost_rxtx.c
> @@ -702,21 +702,104 @@ vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
> }
> }
>
> +static inline struct rte_mbuf *
> +copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
> + uint16_t desc_idx, struct rte_mempool *mbuf_pool)
> +{
> + struct vring_desc *desc;
> + uint64_t desc_addr;
> + uint32_t desc_avail, desc_offset;
> + uint32_t mbuf_avail, mbuf_offset;
> + uint32_t cpy_len;
> + struct rte_mbuf *head = NULL;
> + struct rte_mbuf *cur = NULL, *prev = NULL;
> + struct virtio_net_hdr *hdr;
> +
> + desc = &vq->desc[desc_idx];
> + desc_addr = gpa_to_vva(dev, desc->addr);
> + rte_prefetch0((void *)(uintptr_t)desc_addr);
> +
> + /* Retrieve virtio net header */
> + hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
> + desc_avail = desc->len - vq->vhost_hlen;
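There is a serious bug here: desc->len - vq->vhost_hlen could underflow.
desc_avail is a uint32_t, so if desc->len is smaller than vq->vhost_hlen
the subtraction wraps around to a huge value and the copy loop below would
read far past the end of the descriptor buffer.
A VM could easily create this case. Let us fix it here.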
> + desc_offset = vq->vhost_hlen;
> +
> + mbuf_avail = 0;
> + mbuf_offset = 0;
> + while (desc_avail || (desc->flags & VRING_DESC_F_NEXT) != 0) {
> + /* This desc reachs to its end, get the next one */
> + if (desc_avail == 0) {
> + desc = &vq->desc[desc->next];
> +
> + desc_addr = gpa_to_vva(dev, desc->addr);
> + rte_prefetch0((void *)(uintptr_t)desc_addr);
> +
> + desc_offset = 0;
> + desc_avail = desc->len;
> +
> + PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
> + }
> +
> + /*
> + * This mbuf reachs to its end, get a new one
> + * to hold more data.
> + */
> + if (mbuf_avail == 0) {
> + cur = rte_pktmbuf_alloc(mbuf_pool);
> + if (unlikely(!cur)) {
> + RTE_LOG(ERR, VHOST_DATA, "Failed to "
> + "allocate memory for mbuf.\n");
> + if (head)
> + rte_pktmbuf_free(head);
> + return NULL;
> + }
> + if (!head) {
> + head = cur;
> + } else {
> + prev->next = cur;
> + prev->data_len = mbuf_offset;
> + head->nb_segs += 1;
> + }
> + head->pkt_len += mbuf_offset;
> + prev = cur;
> +
> + mbuf_offset = 0;
> + mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
> + }
> +
> + cpy_len = RTE_MIN(desc_avail, mbuf_avail);
> + rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset),
> + (void *)((uintptr_t)(desc_addr + desc_offset)),
> + cpy_len);
> +
> + mbuf_avail -= cpy_len;
> + mbuf_offset += cpy_len;
> + desc_avail -= cpy_len;
> + desc_offset += cpy_len;
> + }
> +
> + if (prev) {
> + prev->data_len = mbuf_offset;
> + head->pkt_len += mbuf_offset;
> +
> + if (hdr->flags != 0 || hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE)
> + vhost_dequeue_offload(hdr, head);
> + }
> +
> + return head;
> +}
> +
> uint16_t
> rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
> struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
> {
> - struct rte_mbuf *m, *prev;
> struct vhost_virtqueue *vq;
> - struct vring_desc *desc;
> - uint64_t vb_addr = 0;
> - uint64_t vb_net_hdr_addr = 0;
> - uint32_t head[MAX_PKT_BURST];
> + uint32_t desc_indexes[MAX_PKT_BURST];
> uint32_t used_idx;
> uint32_t i;
> - uint16_t free_entries, entry_success = 0;
> + uint16_t free_entries;
> uint16_t avail_idx;
> - struct virtio_net_hdr *hdr = NULL;
> + struct rte_mbuf *m;
>
> if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
> RTE_LOG(ERR, VHOST_DATA,
> @@ -730,197 +813,49 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
> return 0;
>
> avail_idx = *((volatile uint16_t *)&vq->avail->idx);
> -
> - /* If there are no available buffers then return. */
> - if (vq->last_used_idx == avail_idx)
> + free_entries = avail_idx - vq->last_used_idx;
> + if (free_entries == 0)
> return 0;
>
> - LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__,
> - dev->device_fh);
> + LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__, dev->device_fh);
>
> - /* Prefetch available ring to retrieve head indexes. */
> - rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
> + used_idx = vq->last_used_idx & (vq->size -1);
>
> - /*get the number of free entries in the ring*/
> - free_entries = (avail_idx - vq->last_used_idx);
> + /* Prefetch available ring to retrieve head indexes. */
> + rte_prefetch0(&vq->avail->ring[used_idx]);
>
> - free_entries = RTE_MIN(free_entries, count);
> - /* Limit to MAX_PKT_BURST. */
> - free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);
> + count = RTE_MIN(count, MAX_PKT_BURST);
> + count = RTE_MIN(count, free_entries);
> + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") about to dequeue %u buffers\n",
> + dev->device_fh, count);
>
> - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
> - dev->device_fh, free_entries);
> /* Retrieve all of the head indexes first to avoid caching issues. */
> - for (i = 0; i < free_entries; i++)
> - head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
> + for (i = 0; i < count; i++) {
> + desc_indexes[i] = vq->avail->ring[(vq->last_used_idx + i) &
> + (vq->size - 1)];
> + }
>
> /* Prefetch descriptor index. */
> - rte_prefetch0(&vq->desc[head[entry_success]]);
> + rte_prefetch0(&vq->desc[desc_indexes[0]]);
> rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
>
> - while (entry_success < free_entries) {
> - uint32_t vb_avail, vb_offset;
> - uint32_t seg_avail, seg_offset;
> - uint32_t cpy_len;
> - uint32_t seg_num = 0;
> - struct rte_mbuf *cur;
> - uint8_t alloc_err = 0;
> -
> - desc = &vq->desc[head[entry_success]];
> -
> - vb_net_hdr_addr = gpa_to_vva(dev, desc->addr);
> - hdr = (struct virtio_net_hdr *)((uintptr_t)vb_net_hdr_addr);
> -
> - /* Discard first buffer as it is the virtio header */
> - if (desc->flags & VRING_DESC_F_NEXT) {
> - desc = &vq->desc[desc->next];
> - vb_offset = 0;
> - vb_avail = desc->len;
> - } else {
> - vb_offset = vq->vhost_hlen;
> - vb_avail = desc->len - vb_offset;
> - }
> -
> - /* Buffer address translation. */
> - vb_addr = gpa_to_vva(dev, desc->addr);
> - /* Prefetch buffer address. */
> - rte_prefetch0((void *)(uintptr_t)vb_addr);
> -
> - used_idx = vq->last_used_idx & (vq->size - 1);
> -
> - if (entry_success < (free_entries - 1)) {
> - /* Prefetch descriptor index. */
> - rte_prefetch0(&vq->desc[head[entry_success+1]]);
> - rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
> - }
> -
> - /* Update used index buffer information. */
> - vq->used->ring[used_idx].id = head[entry_success];
> - vq->used->ring[used_idx].len = 0;
> -
> - /* Allocate an mbuf and populate the structure. */
> - m = rte_pktmbuf_alloc(mbuf_pool);
> - if (unlikely(m == NULL)) {
> - RTE_LOG(ERR, VHOST_DATA,
> - "Failed to allocate memory for mbuf.\n");
> - break;
> - }
> - seg_offset = 0;
> - seg_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
> - cpy_len = RTE_MIN(vb_avail, seg_avail);
> -
> - PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);
> -
> - seg_num++;
> - cur = m;
> - prev = m;
> - while (cpy_len != 0) {
> - rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, seg_offset),
> - (void *)((uintptr_t)(vb_addr + vb_offset)),
> - cpy_len);
> -
> - seg_offset += cpy_len;
> - vb_offset += cpy_len;
> - vb_avail -= cpy_len;
> - seg_avail -= cpy_len;
> -
> - if (vb_avail != 0) {
> - /*
> - * The segment reachs to its end,
> - * while the virtio buffer in TX vring has
> - * more data to be copied.
> - */
> - cur->data_len = seg_offset;
> - m->pkt_len += seg_offset;
> - /* Allocate mbuf and populate the structure. */
> - cur = rte_pktmbuf_alloc(mbuf_pool);
> - if (unlikely(cur == NULL)) {
> - RTE_LOG(ERR, VHOST_DATA, "Failed to "
> - "allocate memory for mbuf.\n");
> - rte_pktmbuf_free(m);
> - alloc_err = 1;
> - break;
> - }
> -
> - seg_num++;
> - prev->next = cur;
> - prev = cur;
> - seg_offset = 0;
> - seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
> - } else {
> - if (desc->flags & VRING_DESC_F_NEXT) {
> - /*
> - * There are more virtio buffers in
> - * same vring entry need to be copied.
> - */
> - if (seg_avail == 0) {
> - /*
> - * The current segment hasn't
> - * room to accomodate more
> - * data.
> - */
> - cur->data_len = seg_offset;
> - m->pkt_len += seg_offset;
> - /*
> - * Allocate an mbuf and
> - * populate the structure.
> - */
> - cur = rte_pktmbuf_alloc(mbuf_pool);
> - if (unlikely(cur == NULL)) {
> - RTE_LOG(ERR,
> - VHOST_DATA,
> - "Failed to "
> - "allocate memory "
> - "for mbuf\n");
> - rte_pktmbuf_free(m);
> - alloc_err = 1;
> - break;
> - }
> - seg_num++;
> - prev->next = cur;
> - prev = cur;
> - seg_offset = 0;
> - seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
> - }
> -
> - desc = &vq->desc[desc->next];
> -
> - /* Buffer address translation. */
> - vb_addr = gpa_to_vva(dev, desc->addr);
> - /* Prefetch buffer address. */
> - rte_prefetch0((void *)(uintptr_t)vb_addr);
> - vb_offset = 0;
> - vb_avail = desc->len;
> -
> - PRINT_PACKET(dev, (uintptr_t)vb_addr,
> - desc->len, 0);
> - } else {
> - /* The whole packet completes. */
> - cur->data_len = seg_offset;
> - m->pkt_len += seg_offset;
> - vb_avail = 0;
> - }
> - }
> -
> - cpy_len = RTE_MIN(vb_avail, seg_avail);
> - }
> -
> - if (unlikely(alloc_err == 1))
> + for (i = 0; i < count; i++) {
> + m = copy_desc_to_mbuf(dev, vq, desc_indexes[i], mbuf_pool);
> + if (m == NULL)
> break;
> + pkts[i] = m;
>
> - m->nb_segs = seg_num;
> - if ((hdr->flags != 0) || (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE))
> - vhost_dequeue_offload(hdr, m);
> -
> - pkts[entry_success] = m;
> - vq->last_used_idx++;
> - entry_success++;
> + used_idx = vq->last_used_idx++ & (vq->size - 1);
> + vq->used->ring[used_idx].id = desc_indexes[i];
> + vq->used->ring[used_idx].len = 0;
> }
>
> rte_compiler_barrier();
> - vq->used->idx += entry_success;
> + vq->used->idx += i;
> +
> /* Kick guest if required. */
> if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
> eventfd_write(vq->callfd, (eventfd_t)1);
> - return entry_success;
> +
> + return i;
> }
next prev parent reply other threads:[~2016-03-03 16:21 UTC|newest]
Thread overview: 84+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-12-03 6:06 [dpdk-dev] [PATCH 0/5 for 2.3] vhost rxtx refactor Yuanhan Liu
2015-12-03 6:06 ` [dpdk-dev] [PATCH 1/5] vhost: refactor rte_vhost_dequeue_burst Yuanhan Liu
2015-12-03 7:02 ` Stephen Hemminger
2015-12-03 7:25 ` Yuanhan Liu
2015-12-03 7:03 ` Stephen Hemminger
2015-12-12 6:55 ` Rich Lane
2015-12-14 1:55 ` Yuanhan Liu
2016-01-26 10:30 ` Xie, Huawei
2016-01-27 3:26 ` Yuanhan Liu
2016-01-27 6:12 ` Xie, Huawei
2016-01-27 6:16 ` Yuanhan Liu
2015-12-03 6:06 ` [dpdk-dev] [PATCH 2/5] vhost: refactor virtio_dev_rx Yuanhan Liu
2015-12-11 20:42 ` Rich Lane
2015-12-14 1:47 ` Yuanhan Liu
2016-01-21 13:50 ` Jérôme Jutteau
2016-01-27 3:27 ` Yuanhan Liu
2015-12-03 6:06 ` [dpdk-dev] [PATCH 3/5] vhost: refactor virtio_dev_merge_rx Yuanhan Liu
2015-12-03 6:06 ` [dpdk-dev] [PATCH 4/5] vhost: do not use rte_memcpy for virtio_hdr copy Yuanhan Liu
2016-01-27 2:46 ` Xie, Huawei
2016-01-27 3:22 ` Yuanhan Liu
2016-01-27 5:56 ` Xie, Huawei
2016-01-27 6:02 ` Yuanhan Liu
2016-01-27 6:16 ` Xie, Huawei
2016-01-27 6:35 ` Yuanhan Liu
2015-12-03 6:06 ` [dpdk-dev] [PATCH 5/5] vhost: don't use unlikely for VIRTIO_NET_F_MRG_RXBUF detection Yuanhan Liu
2016-02-17 22:50 ` [dpdk-dev] [PATCH 0/5 for 2.3] vhost rxtx refactor Thomas Monjalon
2016-02-18 4:09 ` Yuanhan Liu
2016-02-18 13:49 ` [dpdk-dev] [PATCH v2 0/7] " Yuanhan Liu
2016-02-18 13:49 ` [dpdk-dev] [PATCH v2 1/7] vhost: refactor rte_vhost_dequeue_burst Yuanhan Liu
2016-03-03 16:21 ` Xie, Huawei [this message]
2016-03-04 2:21 ` Yuanhan Liu
2016-03-07 2:19 ` Xie, Huawei
2016-03-07 2:44 ` Yuanhan Liu
2016-03-03 16:30 ` Xie, Huawei
2016-03-04 2:17 ` Yuanhan Liu
2016-03-07 2:32 ` Xie, Huawei
2016-03-07 2:48 ` Yuanhan Liu
2016-03-07 2:59 ` Xie, Huawei
2016-03-07 6:14 ` Yuanhan Liu
2016-03-03 17:19 ` Xie, Huawei
2016-03-04 2:11 ` Yuanhan Liu
2016-03-07 2:55 ` Xie, Huawei
2016-03-03 17:40 ` Xie, Huawei
2016-03-04 2:32 ` Yuanhan Liu
2016-03-07 3:02 ` Xie, Huawei
2016-03-07 3:03 ` Xie, Huawei
2016-02-18 13:49 ` [dpdk-dev] [PATCH v2 2/7] vhost: refactor virtio_dev_rx Yuanhan Liu
2016-03-07 3:34 ` Xie, Huawei
2016-03-08 12:27 ` Yuanhan Liu
2016-02-18 13:49 ` [dpdk-dev] [PATCH v2 3/7] vhost: refactor virtio_dev_merge_rx Yuanhan Liu
2016-03-07 6:22 ` Xie, Huawei
2016-03-07 6:36 ` Yuanhan Liu
2016-03-07 6:38 ` Xie, Huawei
2016-03-07 6:51 ` Yuanhan Liu
2016-03-07 7:03 ` Xie, Huawei
2016-03-07 7:16 ` Xie, Huawei
2016-03-07 8:20 ` Yuanhan Liu
2016-03-07 7:52 ` Xie, Huawei
2016-03-07 8:38 ` Yuanhan Liu
2016-03-07 9:27 ` Xie, Huawei
2016-02-18 13:49 ` [dpdk-dev] [PATCH v2 4/7] vhost: do not use rte_memcpy for virtio_hdr copy Yuanhan Liu
2016-03-07 1:20 ` Xie, Huawei
2016-03-07 4:20 ` Stephen Hemminger
2016-03-07 5:24 ` Xie, Huawei
2016-03-07 6:21 ` Yuanhan Liu
2016-02-18 13:49 ` [dpdk-dev] [PATCH v2 5/7] vhost: don't use unlikely for VIRTIO_NET_F_MRG_RXBUF detection Yuanhan Liu
2016-02-18 13:49 ` [dpdk-dev] [PATCH v2 6/7] vhost: do sanity check for desc->len Yuanhan Liu
2016-02-18 13:49 ` [dpdk-dev] [PATCH v2 7/7] vhost: do sanity check for desc->next Yuanhan Liu
2016-03-07 3:10 ` Xie, Huawei
2016-03-07 6:57 ` Yuanhan Liu
2016-02-29 16:06 ` [dpdk-dev] [PATCH v2 0/7] vhost rxtx refactor Thomas Monjalon
2016-03-01 6:01 ` Yuanhan Liu
2016-03-10 4:32 ` [dpdk-dev] [PATCH v3 0/8] vhost rxtx refactor and fixes Yuanhan Liu
2016-03-10 4:32 ` [dpdk-dev] [PATCH v3 1/8] vhost: refactor rte_vhost_dequeue_burst Yuanhan Liu
2016-03-10 4:32 ` [dpdk-dev] [PATCH v3 2/8] vhost: refactor virtio_dev_rx Yuanhan Liu
2016-03-10 4:32 ` [dpdk-dev] [PATCH v3 3/8] vhost: refactor virtio_dev_merge_rx Yuanhan Liu
2016-03-11 16:18 ` Thomas Monjalon
2016-03-14 7:35 ` [dpdk-dev] [PATCH v4 " Yuanhan Liu
2016-03-10 4:32 ` [dpdk-dev] [PATCH v3 4/8] vhost: do not use rte_memcpy for virtio_hdr copy Yuanhan Liu
2016-03-10 4:32 ` [dpdk-dev] [PATCH v3 5/8] vhost: don't use unlikely for VIRTIO_NET_F_MRG_RXBUF detection Yuanhan Liu
2016-03-10 4:32 ` [dpdk-dev] [PATCH v3 6/8] vhost: do sanity check for desc->len Yuanhan Liu
2016-03-10 4:32 ` [dpdk-dev] [PATCH v3 7/8] vhost: do sanity check for desc->next against with vq->size Yuanhan Liu
2016-03-10 4:32 ` [dpdk-dev] [PATCH v3 8/8] vhost: avoid dead loop chain Yuanhan Liu
2016-03-14 23:09 ` [dpdk-dev] [PATCH v3 0/8] vhost rxtx refactor and fixes Thomas Monjalon
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=C37D651A908B024F974696C65296B57B4C626D94@SHSMSX101.ccr.corp.intel.com \
--to=huawei.xie@intel.com \
--cc=dev@dpdk.org \
--cc=mst@redhat.com \
--cc=vkaplans@redhat.com \
--cc=yuanhan.liu@linux.intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).