From: "Ouyang, Changchun" <changchun.ouyang@intel.com>
To: "dev@dpdk.org" <dev@dpdk.org>
Subject: Re: [dpdk-dev] [PATCH] examples/vhost: Support jumbo frame in user space vhost
Date: Thu, 21 Aug 2014 01:24:33 +0000
Message-ID: <F52918179C57134FAEC9EA62FA2F962511839B27@shsmsx102.ccr.corp.intel.com>
In-Reply-To: <1408078681-3511-1-git-send-email-changchun.ouyang@intel.com>
Hi all,
Are there any comments on this patch?
And what is the status of merging it into the mainline?
Thanks in advance
Changchun
> -----Original Message-----
> From: Ouyang, Changchun
> Sent: Friday, August 15, 2014 12:58 PM
> To: dev@dpdk.org
> Cc: Cao, Waterman; Ouyang, Changchun
> Subject: [PATCH] examples/vhost: Support jumbo frame in user space vhost
>
> This patch supports the mergeable RX feature, and thus jumbo frame RX and TX,
> in the user space vhost (acting as the virtio backend).
>
> On RX, it reserves enough room in the vring to hold one complete scattered
> packet received by the PMD from the physical port, and then copies the data
> from the mbuf into the vring buffers, possibly spanning several vring entries
> and descriptors.
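
To make the RX side concrete for reviewers: the core of copy_from_mbuf_to_vring()
is a two-cursor copy, where one cursor walks the mbuf segments, the other walks
the guest buffers collected in buf_vec, and each round copies
min(bytes left in segment, bytes left in buffer). Below is a stripped-down,
self-contained sketch of that loop in plain C; the seg/buf structs and the
copy_segs_to_bufs() helper are made up for illustration only and are not the
actual DPDK/vhost structures.

#include <stdint.h>
#include <string.h>

/* Simplified stand-ins for an mbuf segment chain and the vring buffer vector. */
struct seg { const uint8_t *data; uint32_t len; struct seg *next; };
struct buf { uint8_t *addr; uint32_t len; };

/*
 * Copy a scattered packet (segment chain) into up to 'nbuf' guest buffers.
 * Returns the number of bytes copied, or 0 if the buffers are too small.
 */
static uint32_t
copy_segs_to_bufs(const struct seg *s, struct buf *bufs, uint32_t nbuf)
{
	uint32_t b = 0, seg_off = 0, buf_off = 0, copied = 0;
	uint32_t seg_avail, buf_avail;

	if (s == NULL || nbuf == 0)
		return 0;
	seg_avail = s->len;
	buf_avail = bufs[0].len;

	while (s != NULL) {
		uint32_t cpy = seg_avail < buf_avail ? seg_avail : buf_avail;

		memcpy(bufs[b].addr + buf_off, s->data + seg_off, cpy);
		seg_off += cpy;
		buf_off += cpy;
		seg_avail -= cpy;
		buf_avail -= cpy;
		copied += cpy;

		if (seg_avail == 0) {
			/* Segment finished: move to the next mbuf segment. */
			s = s->next;
			seg_off = 0;
			seg_avail = (s != NULL) ? s->len : 0;
		}
		if (s != NULL && buf_avail == 0) {
			/* Guest buffer finished: move to the next vring buffer. */
			if (++b == nbuf)
				return 0;	/* ran out of guest buffers */
			buf_off = 0;
			buf_avail = bufs[b].len;
		}
	}
	return copied;
}

The real code additionally writes the virtio_net_hdr_mrg_rxbuf header with
num_buffers and fills the used ring as each guest buffer is consumed.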
>
> On TX, it takes a jumbo frame, possibly described by several vring descriptors
> chained together via the 'NEXT' flag, copies them into one scattered packet,
> and transmits it to the physical port through the PMD.
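
The TX side is the reverse gather: follow the VRING_DESC_F_NEXT chain and link
each copied piece into a segment chain that the PMD can transmit. A rough
standalone sketch follows (again with simplified stand-in structs and a
hypothetical gather_desc_chain() helper, not the real descriptor layout):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define F_NEXT 0x1	/* stand-in for VRING_DESC_F_NEXT */

/* Simplified stand-ins for a vring descriptor and a packet segment. */
struct desc { const uint8_t *addr; uint32_t len; uint16_t flags; uint16_t next; };
struct seg  { uint8_t *data; uint32_t len; struct seg *next; };

/*
 * Gather a NEXT-chained descriptor list into a newly allocated segment chain.
 * Returns the head segment, or NULL on allocation failure.
 */
static struct seg *
gather_desc_chain(const struct desc *ring, uint16_t idx)
{
	struct seg *head = NULL, *prev = NULL;

	for (;;) {
		const struct desc *d = &ring[idx];
		struct seg *s = malloc(sizeof(*s) + d->len);

		if (s == NULL) {
			/* Allocation failed: free what was gathered so far. */
			while (head != NULL) {
				struct seg *n = head->next;
				free(head);
				head = n;
			}
			return NULL;
		}
		s->data = (uint8_t *)(s + 1);
		s->len = d->len;
		s->next = NULL;
		memcpy(s->data, d->addr, d->len);

		if (prev != NULL)
			prev->next = s;
		else
			head = s;
		prev = s;

		if (!(d->flags & F_NEXT))
			break;
		idx = d->next;
	}
	return head;
}

In the patch itself, virtio_dev_merge_tx() copies into fixed-size mbufs taken
from a mempool instead of malloc'ing, so one descriptor may also be split
across several mbuf segments when it is larger than the mbuf data room.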
>
> Signed-off-by: Changchun Ouyang <changchun.ouyang@intel.com>
> Acked-by: Huawei Xie <huawei.xie@intel.com>
> ---
> examples/vhost/main.c       | 726 ++++++++++++++++++++++++++++++++++++++++----
> examples/vhost/virtio-net.h | 14 +
> 2 files changed, 687 insertions(+), 53 deletions(-)
>
> diff --git a/examples/vhost/main.c b/examples/vhost/main.c
> index 193aa25..7d9e6a2 100644
> --- a/examples/vhost/main.c
> +++ b/examples/vhost/main.c
> @@ -106,6 +106,8 @@
>  #define BURST_RX_WAIT_US	15	/* Defines how long we wait between retries on RX */
> #define BURST_RX_RETRIES 4 /* Number of retries on RX. */
>
> +#define JUMBO_FRAME_MAX_SIZE 0x2600
> +
> /* State of virtio device. */
> #define DEVICE_MAC_LEARNING 0
> #define DEVICE_RX 1
> @@ -676,8 +678,12 @@ us_vhost_parse_args(int argc, char **argv)
> us_vhost_usage(prgname);
> return -1;
> } else {
> -				if (ret)
> +				if (ret) {
> +					vmdq_conf_default.rxmode.jumbo_frame = 1;
> +					vmdq_conf_default.rxmode.max_rx_pkt_len
> +						= JUMBO_FRAME_MAX_SIZE;
> 					VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
> +				}
> }
> }
>
> @@ -797,6 +803,14 @@ us_vhost_parse_args(int argc, char **argv)
> return -1;
> }
>
> +	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
> +		RTE_LOG(INFO, VHOST_PORT,
> +			"Vhost zero copy doesn't support jumbo frame,"
> +			"please specify '--mergeable 0' to disable the "
> +			"mergeable feature.\n");
> +		return -1;
> +	}
> +
> return 0;
> }
>
> @@ -916,7 +930,7 @@ gpa_to_hpa(struct virtio_net *dev, uint64_t guest_pa,
> * This function adds buffers to the virtio devices RX virtqueue. Buffers can
> * be received from the physical port or from another virtio device. A packet
> * count is returned to indicate the number of packets that were succesfully
> - * added to the RX queue.
> + * added to the RX queue. This function works when mergeable is disabled.
> */
>  static inline uint32_t __attribute__((always_inline))
>  virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
> @@ -930,7 +944,6 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
> uint64_t buff_hdr_addr = 0;
> uint32_t head[MAX_PKT_BURST], packet_len = 0;
> uint32_t head_idx, packet_success = 0;
> - uint32_t mergeable, mrg_count = 0;
> uint32_t retry = 0;
> uint16_t avail_idx, res_cur_idx;
> uint16_t res_base_idx, res_end_idx;
> @@ -940,6 +953,7 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
>  	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
> vq = dev->virtqueue[VIRTIO_RXQ];
> count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
> +
>  	/* As many data cores may want access to available buffers, they need to be reserved. */
>  	do {
>  		res_base_idx = vq->last_used_idx_res;
> @@ -976,9 +990,6 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
> /* Prefetch available ring to retrieve indexes. */
> rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);
>
> - /* Check if the VIRTIO_NET_F_MRG_RXBUF feature is enabled. */
> - mergeable = dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
> -
> /* Retrieve all of the head indexes first to avoid caching issues. */
> for (head_idx = 0; head_idx < count; head_idx++)
>  		head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)];
> @@ -997,56 +1008,44 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
> /* Prefetch buffer address. */
> rte_prefetch0((void*)(uintptr_t)buff_addr);
>
> -		if (mergeable && (mrg_count != 0)) {
> -			desc->len = packet_len = rte_pktmbuf_data_len(buff);
> -		} else {
> -			/* Copy virtio_hdr to packet and increment buffer address */
> -			buff_hdr_addr = buff_addr;
> -			packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
> + /* Copy virtio_hdr to packet and increment buffer address */
> + buff_hdr_addr = buff_addr;
> + packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
>
> - /*
> -		 * If the descriptors are chained the header and data are placed in
> -		 * separate buffers.
> - */
> - if (desc->flags & VRING_DESC_F_NEXT) {
> - desc->len = vq->vhost_hlen;
> - desc = &vq->desc[desc->next];
> - /* Buffer address translation. */
> - buff_addr = gpa_to_vva(dev, desc->addr);
> - desc->len = rte_pktmbuf_data_len(buff);
> - } else {
> - buff_addr += vq->vhost_hlen;
> - desc->len = packet_len;
> - }
> + /*
> + * If the descriptors are chained the header and data are
> + * placed in separate buffers.
> + */
> + if (desc->flags & VRING_DESC_F_NEXT) {
> + desc->len = vq->vhost_hlen;
> + desc = &vq->desc[desc->next];
> + /* Buffer address translation. */
> + buff_addr = gpa_to_vva(dev, desc->addr);
> + desc->len = rte_pktmbuf_data_len(buff);
> + } else {
> + buff_addr += vq->vhost_hlen;
> + desc->len = packet_len;
> }
>
> -		PRINT_PACKET(dev, (uintptr_t)buff_addr, rte_pktmbuf_data_len(buff), 0);
> -
> /* Update used ring with desc information */
>  		vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success];
>  		vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len;
>
> /* Copy mbuf data to buffer */
> -		rte_memcpy((void *)(uintptr_t)buff_addr, (const void*)buff->pkt.data, rte_pktmbuf_data_len(buff));
> + rte_memcpy((void *)(uintptr_t)buff_addr,
> + (const void *)buff->pkt.data,
> + rte_pktmbuf_data_len(buff));
> + PRINT_PACKET(dev, (uintptr_t)buff_addr,
> + rte_pktmbuf_data_len(buff), 0);
>
> res_cur_idx++;
> packet_success++;
>
> -		/* If mergeable is disabled then a header is required per buffer. */
> -		if (!mergeable) {
> -			rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void*)&virtio_hdr, vq->vhost_hlen);
> -			PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
> -		} else {
> -			mrg_count++;
> -			/* Merge buffer can only handle so many buffers at a time. Tell the guest if this limit is reached. */
> -			if ((mrg_count == MAX_MRG_PKT_BURST) || (res_cur_idx == res_end_idx)) {
> -				virtio_hdr.num_buffers = mrg_count;
> -				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", dev->device_fh, virtio_hdr.num_buffers);
> -				rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void*)&virtio_hdr, vq->vhost_hlen);
> -				PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
> -				mrg_count = 0;
> -			}
> -		}
> +		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
> +			(const void *)&virtio_hdr, vq->vhost_hlen);
> +
> +		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
> +
> if (res_cur_idx < res_end_idx) {
> /* Prefetch descriptor index. */
> rte_prefetch0(&vq->desc[head[packet_success]]);
> @@ -1068,6 +1067,356 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
> return count;
> }
>
> +static inline uint32_t __attribute__((always_inline))
> +copy_from_mbuf_to_vring(struct virtio_net *dev,
> + uint16_t res_base_idx, uint16_t res_end_idx,
> + struct rte_mbuf *pkt)
> +{
> + uint32_t vec_idx = 0;
> + uint32_t entry_success = 0;
> + struct vhost_virtqueue *vq;
> + /* The virtio_hdr is initialised to 0. */
> + struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {
> + {0, 0, 0, 0, 0, 0}, 0};
> + uint16_t cur_idx = res_base_idx;
> + uint64_t vb_addr = 0;
> + uint64_t vb_hdr_addr = 0;
> + uint32_t seg_offset = 0;
> + uint32_t vb_offset = 0;
> + uint32_t seg_avail;
> + uint32_t vb_avail;
> + uint32_t cpy_len, entry_len;
> +
> + if (pkt == NULL)
> + return 0;
> +
> + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| "
> + "End Index %d\n",
> + dev->device_fh, cur_idx, res_end_idx);
> +
> + /*
> + * Convert from gpa to vva
> + * (guest physical addr -> vhost virtual addr)
> + */
> + vq = dev->virtqueue[VIRTIO_RXQ];
> + vb_addr =
> + gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
> + vb_hdr_addr = vb_addr;
> +
> + /* Prefetch buffer address. */
> + rte_prefetch0((void *)(uintptr_t)vb_addr);
> +
> + virtio_hdr.num_buffers = res_end_idx - res_base_idx;
> +
> +	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n",
> +		dev->device_fh, virtio_hdr.num_buffers);
> +
> + rte_memcpy((void *)(uintptr_t)vb_hdr_addr,
> + (const void *)&virtio_hdr, vq->vhost_hlen);
> +
> + PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1);
> +
> + seg_avail = rte_pktmbuf_data_len(pkt);
> + vb_offset = vq->vhost_hlen;
> + vb_avail =
> + vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen;
> +
> + entry_len = vq->vhost_hlen;
> +
> + if (vb_avail == 0) {
> + uint32_t desc_idx =
> + vq->buf_vec[vec_idx].desc_idx;
> + vq->desc[desc_idx].len = vq->vhost_hlen;
> +
> + if ((vq->desc[desc_idx].flags
> + & VRING_DESC_F_NEXT) == 0) {
> + /* Update used ring with desc information */
> + vq->used->ring[cur_idx & (vq->size - 1)].id
> + = vq->buf_vec[vec_idx].desc_idx;
> + vq->used->ring[cur_idx & (vq->size - 1)].len
> + = entry_len;
> +
> + entry_len = 0;
> + cur_idx++;
> + entry_success++;
> + }
> +
> + vec_idx++;
> + vb_addr =
> + gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
> +
> + /* Prefetch buffer address. */
> + rte_prefetch0((void *)(uintptr_t)vb_addr);
> + vb_offset = 0;
> + vb_avail = vq->buf_vec[vec_idx].buf_len;
> + }
> +
> + cpy_len = RTE_MIN(vb_avail, seg_avail);
> +
> + while (cpy_len > 0) {
> + /* Copy mbuf data to vring buffer */
> +		rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset),
> +			(const void *)(rte_pktmbuf_mtod(pkt, char *) + seg_offset),
> +			cpy_len);
> +
> + PRINT_PACKET(dev,
> + (uintptr_t)(vb_addr + vb_offset),
> + cpy_len, 0);
> +
> + seg_offset += cpy_len;
> + vb_offset += cpy_len;
> + seg_avail -= cpy_len;
> + vb_avail -= cpy_len;
> + entry_len += cpy_len;
> +
> + if (seg_avail != 0) {
> + /*
> + * The virtio buffer in this vring
> + * entry reach to its end.
> + * But the segment doesn't complete.
> + */
> + if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags &
> + VRING_DESC_F_NEXT) == 0) {
> + /* Update used ring with desc information */
> + vq->used->ring[cur_idx & (vq->size - 1)].id
> + = vq->buf_vec[vec_idx].desc_idx;
> + vq->used->ring[cur_idx & (vq->size - 1)].len
> + = entry_len;
> + entry_len = 0;
> + cur_idx++;
> + entry_success++;
> + }
> +
> + vec_idx++;
> + vb_addr = gpa_to_vva(dev,
> + vq->buf_vec[vec_idx].buf_addr);
> + vb_offset = 0;
> + vb_avail = vq->buf_vec[vec_idx].buf_len;
> + cpy_len = RTE_MIN(vb_avail, seg_avail);
> + } else {
> + /*
> + * This current segment complete, need continue to
> + * check if the whole packet complete or not.
> + */
> + pkt = pkt->pkt.next;
> + if (pkt != NULL) {
> + /*
> + * There are more segments.
> + */
> + if (vb_avail == 0) {
> + /*
> + * This current buffer from vring is
> + * used up, need fetch next buffer
> + * from buf_vec.
> + */
> +					uint32_t desc_idx =
> +						vq->buf_vec[vec_idx].desc_idx;
> +					vq->desc[desc_idx].len = vb_offset;
> +
> +					if ((vq->desc[desc_idx].flags &
> +						VRING_DESC_F_NEXT) == 0) {
> +						uint16_t wrapped_idx =
> +							cur_idx & (vq->size - 1);
> +						/*
> +						 * Update used ring with the
> +						 * descriptor information
> +						 */
> +						vq->used->ring[wrapped_idx].id
> +							= desc_idx;
> +						vq->used->ring[wrapped_idx].len
> +							= entry_len;
> +						entry_success++;
> +						entry_len = 0;
> +						cur_idx++;
> +					}
> +
> +					/* Get next buffer from buf_vec. */
> +					vec_idx++;
> +					vb_addr = gpa_to_vva(dev,
> +						vq->buf_vec[vec_idx].buf_addr);
> +					vb_avail =
> +						vq->buf_vec[vec_idx].buf_len;
> +					vb_offset = 0;
> + }
> +
> + seg_offset = 0;
> + seg_avail = rte_pktmbuf_data_len(pkt);
> + cpy_len = RTE_MIN(vb_avail, seg_avail);
> + } else {
> + /*
> + * This whole packet completes.
> + */
> + uint32_t desc_idx =
> + vq->buf_vec[vec_idx].desc_idx;
> + vq->desc[desc_idx].len = vb_offset;
> +
> + while (vq->desc[desc_idx].flags &
> + VRING_DESC_F_NEXT) {
> + desc_idx = vq->desc[desc_idx].next;
> + vq->desc[desc_idx].len = 0;
> + }
> +
> + /* Update used ring with desc information */
> + vq->used->ring[cur_idx & (vq->size - 1)].id
> + = vq->buf_vec[vec_idx].desc_idx;
> + vq->used->ring[cur_idx & (vq->size - 1)].len
> + = entry_len;
> + entry_len = 0;
> + cur_idx++;
> + entry_success++;
> + seg_avail = 0;
> + cpy_len = RTE_MIN(vb_avail, seg_avail);
> + }
> + }
> + }
> +
> + return entry_success;
> +}
> +
> +/*
> + * This function adds buffers to the virtio devices RX virtqueue.
> +Buffers can
> + * be received from the physical port or from another virtio device. A
> +packet
> + * count is returned to indicate the number of packets that were
> +succesfully
> + * added to the RX queue. This function works for mergeable RX.
> + */
> +static inline uint32_t __attribute__((always_inline))
> +virtio_dev_merge_rx(struct virtio_net *dev, struct rte_mbuf **pkts,
> + uint32_t count)
> +{
> + struct vhost_virtqueue *vq;
> + uint32_t pkt_idx = 0, entry_success = 0;
> + uint32_t retry = 0;
> + uint16_t avail_idx, res_cur_idx;
> + uint16_t res_base_idx, res_end_idx;
> + uint8_t success = 0;
> +
> + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n",
> + dev->device_fh);
> + vq = dev->virtqueue[VIRTIO_RXQ];
> + count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
> +
> + if (count == 0)
> + return 0;
> +
> + for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
> + uint32_t secure_len = 0;
> + uint16_t need_cnt;
> + uint32_t vec_idx = 0;
> +		uint32_t pkt_len = pkts[pkt_idx]->pkt.pkt_len + vq->vhost_hlen;
> + uint16_t i, id;
> +
> + do {
> + /*
> + * As many data cores may want access to available
> + * buffers, they need to be reserved.
> + */
> + res_base_idx = vq->last_used_idx_res;
> + res_cur_idx = res_base_idx;
> +
> +			do {
> +				avail_idx = *((volatile uint16_t *)&vq->avail->idx);
> +				if (unlikely(res_cur_idx == avail_idx)) {
> +					/*
> +					 * If retry is enabled and the queue is
> +					 * full then we wait and retry to avoid
> +					 * packet loss.
> +					 */
> +					if (enable_retry) {
> +						uint8_t cont = 0;
> +						for (retry = 0; retry < burst_rx_retry_num; retry++) {
> +							rte_delay_us(burst_rx_delay_time);
> +							avail_idx =
> +								*((volatile uint16_t *)&vq->avail->idx);
> +							if (likely(res_cur_idx != avail_idx)) {
> +								cont = 1;
> +								break;
> +							}
> +						}
> +						if (cont == 1)
> +							continue;
> +					}
> +
> +					LOG_DEBUG(VHOST_DATA,
> +						"(%"PRIu64") Failed "
> +						"to get enough desc from "
> +						"vring\n",
> +						dev->device_fh);
> +					return pkt_idx;
> +				} else {
> +					uint16_t wrapped_idx =
> +						(res_cur_idx) & (vq->size - 1);
> +					uint32_t idx =
> +						vq->avail->ring[wrapped_idx];
> +					uint8_t next_desc;
> +
> +					do {
> +						next_desc = 0;
> +						secure_len += vq->desc[idx].len;
> +						if (vq->desc[idx].flags &
> +							VRING_DESC_F_NEXT) {
> +							idx = vq->desc[idx].next;
> +							next_desc = 1;
> +						}
> +					} while (next_desc);
> +
> +					res_cur_idx++;
> +				}
> +			} while (pkt_len > secure_len);
> +
> +			/* vq->last_used_idx_res is atomically updated. */
> +			success = rte_atomic16_cmpset(&vq->last_used_idx_res,
> +							res_base_idx,
> +							res_cur_idx);
> +		} while (success == 0);
> +
> + id = res_base_idx;
> + need_cnt = res_cur_idx - res_base_idx;
> +
> + for (i = 0; i < need_cnt; i++, id++) {
> + uint16_t wrapped_idx = id & (vq->size - 1);
> + uint32_t idx = vq->avail->ring[wrapped_idx];
> + uint8_t next_desc;
> + do {
> + next_desc = 0;
> + vq->buf_vec[vec_idx].buf_addr =
> + vq->desc[idx].addr;
> + vq->buf_vec[vec_idx].buf_len =
> + vq->desc[idx].len;
> + vq->buf_vec[vec_idx].desc_idx = idx;
> + vec_idx++;
> +
> +				if (vq->desc[idx].flags &
> +						VRING_DESC_F_NEXT) {
> + idx = vq->desc[idx].next;
> + next_desc = 1;
> + }
> + } while (next_desc);
> + }
> +
> + res_end_idx = res_cur_idx;
> +
> +		entry_success = copy_from_mbuf_to_vring(dev, res_base_idx,
> +			res_end_idx, pkts[pkt_idx]);
> +
> + rte_compiler_barrier();
> +
> + /*
> + * Wait until it's our turn to add our buffer
> + * to the used ring.
> + */
> + while (unlikely(vq->last_used_idx != res_base_idx))
> + rte_pause();
> +
> + *(volatile uint16_t *)&vq->used->idx += entry_success;
> + vq->last_used_idx = res_end_idx;
> +
> + /* Kick the guest if necessary. */
> + if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
> + eventfd_write((int)vq->kickfd, 1);
> + }
> +
> + return count;
> +}
> +
> /*
> * Compares a packet destination MAC address to a device MAC address.
> */
> @@ -1199,8 +1548,17 @@ virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m)
> 				/*drop the packet if the device is marked for removal*/
> 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", dev_ll->dev->device_fh);
> 			} else {
> +				uint32_t mergeable =
> +					dev_ll->dev->features &
> +					(1 << VIRTIO_NET_F_MRG_RXBUF);
> +
> 				/*send the packet to the local virtio device*/
> -				ret = virtio_dev_rx(dev_ll->dev, &m, 1);
> +				if (likely(mergeable == 0))
> +					ret = virtio_dev_rx(dev_ll->dev, &m, 1);
> +				else
> +					ret = virtio_dev_merge_rx(dev_ll->dev,
> +						&m, 1);
> +
> 				if (enable_stats) {
> 					rte_atomic64_add(
> 					&dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
> @@ -1231,7 +1589,7 @@ virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *
> struct mbuf_table *tx_q;
> struct vlan_ethhdr *vlan_hdr;
> struct rte_mbuf **m_table;
> - struct rte_mbuf *mbuf;
> + struct rte_mbuf *mbuf, *prev;
> unsigned len, ret, offset = 0;
> const uint16_t lcore_id = rte_lcore_id();
>  	struct virtio_net_data_ll *dev_ll = ll_root_used;
> @@ -1284,12 +1642,14 @@ virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *
> /* Allocate an mbuf and populate the structure. */
> mbuf = rte_pktmbuf_alloc(mbuf_pool);
> if (unlikely(mbuf == NULL)) {
> -		RTE_LOG(ERR, VHOST_DATA, "Failed to allocate memory for mbuf.\n");
> + RTE_LOG(ERR, VHOST_DATA,
> + "Failed to allocate memory for mbuf.\n");
> return;
> }
>
> mbuf->pkt.data_len = m->pkt.data_len + VLAN_HLEN + offset;
> - mbuf->pkt.pkt_len = mbuf->pkt.data_len;
> + mbuf->pkt.pkt_len = m->pkt.pkt_len + VLAN_HLEN + offset;
> + mbuf->pkt.nb_segs = m->pkt.nb_segs;
>
> /* Copy ethernet header to mbuf. */
>  	rte_memcpy((void*)mbuf->pkt.data, (const void*)m->pkt.data, ETH_HLEN);
> @@ -1304,6 +1664,29 @@ virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *
> 
>  	/* Copy the remaining packet contents to the mbuf. */
>  	rte_memcpy((void*) ((uint8_t*)mbuf->pkt.data + VLAN_ETH_HLEN),
>  		(const void*) ((uint8_t*)m->pkt.data + ETH_HLEN), (m->pkt.data_len - ETH_HLEN));
> +
> + /* Copy the remaining segments for the whole packet. */
> + prev = mbuf;
> + while (m->pkt.next) {
> + /* Allocate an mbuf and populate the structure. */
> +		struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool);
> + if (unlikely(next_mbuf == NULL)) {
> + rte_pktmbuf_free(mbuf);
> + RTE_LOG(ERR, VHOST_DATA,
> + "Failed to allocate memory for mbuf.\n");
> + return;
> + }
> +
> + m = m->pkt.next;
> + prev->pkt.next = next_mbuf;
> + prev = next_mbuf;
> + next_mbuf->pkt.data_len = m->pkt.data_len;
> +
> + /* Copy data to next mbuf. */
> +		rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *),
> +			rte_pktmbuf_mtod(m, const void *), m->pkt.data_len);
> + }
> +
> tx_q->m_table[len] = mbuf;
> len++;
> if (enable_stats) {
> @@ -1394,6 +1777,7 @@ virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
> 
>  		/* Setup dummy mbuf. This is copied to a real mbuf if transmitted out the physical port. */
>  		m.pkt.data_len = desc->len;
> +		m.pkt.pkt_len = desc->len;
>  		m.pkt.data = (void*)(uintptr_t)buff_addr;
> 
>  		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
> @@ -1420,6 +1804,227 @@ virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
> eventfd_write((int)vq->kickfd, 1);
> }
>
> +/* This function works for TX packets with mergeable feature enabled. */
> +static inline void __attribute__((always_inline))
> +virtio_dev_merge_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool)
> +{
> + struct rte_mbuf *m, *prev;
> + struct vhost_virtqueue *vq;
> + struct vring_desc *desc;
> + uint64_t vb_addr = 0;
> + uint32_t head[MAX_PKT_BURST];
> + uint32_t used_idx;
> + uint32_t i;
> + uint16_t free_entries, entry_success = 0;
> + uint16_t avail_idx;
> + uint32_t buf_size = MBUF_SIZE - (sizeof(struct rte_mbuf)
> + + RTE_PKTMBUF_HEADROOM);
> +
> + vq = dev->virtqueue[VIRTIO_TXQ];
> + avail_idx = *((volatile uint16_t *)&vq->avail->idx);
> +
> + /* If there are no available buffers then return. */
> + if (vq->last_used_idx == avail_idx)
> + return;
> +
> + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_tx()\n",
> + dev->device_fh);
> +
> + /* Prefetch available ring to retrieve head indexes. */
> + rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
> +
> + /*get the number of free entries in the ring*/
> + free_entries = (avail_idx - vq->last_used_idx);
> +
> + /* Limit to MAX_PKT_BURST. */
> + free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);
> +
> + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
> + dev->device_fh, free_entries);
> + /* Retrieve all of the head indexes first to avoid caching issues. */
> + for (i = 0; i < free_entries; i++)
> +		head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
> +
> + /* Prefetch descriptor index. */
> + rte_prefetch0(&vq->desc[head[entry_success]]);
> + rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
> +
> + while (entry_success < free_entries) {
> + uint32_t vb_avail, vb_offset;
> + uint32_t seg_avail, seg_offset;
> + uint32_t cpy_len;
> + uint32_t seg_num = 0;
> + struct rte_mbuf *cur;
> + uint8_t alloc_err = 0;
> +
> + desc = &vq->desc[head[entry_success]];
> +
> + /* Discard first buffer as it is the virtio header */
> + desc = &vq->desc[desc->next];
> +
> + /* Buffer address translation. */
> + vb_addr = gpa_to_vva(dev, desc->addr);
> + /* Prefetch buffer address. */
> + rte_prefetch0((void *)(uintptr_t)vb_addr);
> +
> + used_idx = vq->last_used_idx & (vq->size - 1);
> +
> + if (entry_success < (free_entries - 1)) {
> + /* Prefetch descriptor index. */
> + rte_prefetch0(&vq->desc[head[entry_success+1]]);
> +			rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
> + }
> +
> + /* Update used index buffer information. */
> + vq->used->ring[used_idx].id = head[entry_success];
> + vq->used->ring[used_idx].len = 0;
> +
> + vb_offset = 0;
> + vb_avail = desc->len;
> + seg_offset = 0;
> + seg_avail = buf_size;
> + cpy_len = RTE_MIN(vb_avail, seg_avail);
> +
> + PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);
> +
> + /* Allocate an mbuf and populate the structure. */
> + m = rte_pktmbuf_alloc(mbuf_pool);
> + if (unlikely(m == NULL)) {
> + RTE_LOG(ERR, VHOST_DATA,
> + "Failed to allocate memory for mbuf.\n");
> + return;
> + }
> +
> + seg_num++;
> + cur = m;
> + prev = m;
> + while (cpy_len != 0) {
> + rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *)
> + seg_offset),
> + (void *)((uintptr_t)(vb_addr + vb_offset)),
> + cpy_len);
> +
> + seg_offset += cpy_len;
> + vb_offset += cpy_len;
> + vb_avail -= cpy_len;
> + seg_avail -= cpy_len;
> +
> + if (vb_avail != 0) {
> + /*
> + * The segment reachs to its end,
> + * while the virtio buffer in TX vring has
> + * more data to be copied.
> + */
> + cur->pkt.data_len = seg_offset;
> + m->pkt.pkt_len += seg_offset;
> +			/* Allocate mbuf and populate the structure. */
> + cur = rte_pktmbuf_alloc(mbuf_pool);
> + if (unlikely(cur == NULL)) {
> +				RTE_LOG(ERR, VHOST_DATA, "Failed to "
> +					"allocate memory for mbuf.\n");
> + rte_pktmbuf_free(m);
> + alloc_err = 1;
> + break;
> + }
> +
> + seg_num++;
> + prev->pkt.next = cur;
> + prev = cur;
> + seg_offset = 0;
> + seg_avail = buf_size;
> + } else {
> + if (desc->flags & VRING_DESC_F_NEXT) {
> + /*
> + * There are more virtio buffers in
> + * same vring entry need to be copied.
> + */
> + if (seg_avail == 0) {
> + /*
> + * The current segment hasn't
> + * room to accomodate more
> + * data.
> + */
> +					cur->pkt.data_len = seg_offset;
> +					m->pkt.pkt_len += seg_offset;
> +					/*
> +					 * Allocate an mbuf and
> +					 * populate the structure.
> +					 */
> +					cur = rte_pktmbuf_alloc(mbuf_pool);
> +					if (unlikely(cur == NULL)) {
> +						RTE_LOG(ERR, VHOST_DATA,
> +							"Failed to "
> +							"allocate memory "
> +							"for mbuf\n");
> +						rte_pktmbuf_free(m);
> +						alloc_err = 1;
> +						break;
> +					}
> +					seg_num++;
> +					prev->pkt.next = cur;
> +					prev = cur;
> +					seg_offset = 0;
> +					seg_avail = buf_size;
> +				}
> +
> +				desc = &vq->desc[desc->next];
> +
> +				/* Buffer address translation. */
> +				vb_addr = gpa_to_vva(dev, desc->addr);
> +				/* Prefetch buffer address. */
> +				rte_prefetch0((void *)(uintptr_t)vb_addr);
> +				vb_offset = 0;
> +				vb_avail = desc->len;
> +
> +				PRINT_PACKET(dev, (uintptr_t)vb_addr,
> +					desc->len, 0);
> + } else {
> + /* The whole packet completes. */
> + cur->pkt.data_len = seg_offset;
> + m->pkt.pkt_len += seg_offset;
> + vb_avail = 0;
> + }
> + }
> +
> + cpy_len = RTE_MIN(vb_avail, seg_avail);
> + }
> +
> + if (unlikely(alloc_err == 1))
> + break;
> +
> + m->pkt.nb_segs = seg_num;
> +
> + /*
> + * If this is the first received packet we need to learn
> + * the MAC and setup VMDQ
> + */
> + if (dev->ready == DEVICE_MAC_LEARNING) {
> + if (dev->remove || (link_vmdq(dev, m) == -1)) {
> + /*
> + * Discard frame if device is scheduled for
> +				 * removal or a duplicate MAC address is found.
> + */
> + entry_success = free_entries;
> + vq->last_used_idx += entry_success;
> + rte_pktmbuf_free(m);
> + break;
> + }
> + }
> +
> +		virtio_tx_route(dev, m, mbuf_pool, (uint16_t)dev->device_fh);
> + vq->last_used_idx++;
> + entry_success++;
> + rte_pktmbuf_free(m);
> + }
> +
> + rte_compiler_barrier();
> + vq->used->idx += entry_success;
> + /* Kick guest if required. */
> + if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
> + eventfd_write((int)vq->kickfd, 1);
> +
> +}
> +
> /*
>  * This function is called by each data core. It handles all RX/TX registered with the
>  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
> @@ -1440,8 +2045,9 @@ switch_worker(__attribute__((unused)) void *arg)
> const uint16_t lcore_id = rte_lcore_id();
> const uint16_t num_cores = (uint16_t)rte_lcore_count();
> uint16_t rx_count = 0;
> + uint32_t mergeable = 0;
>
> -	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started \n", lcore_id);
> +	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
> lcore_ll = lcore_info[lcore_id].lcore_ll;
> prev_tsc = 0;
>
> @@ -1497,6 +2103,8 @@ switch_worker(__attribute__((unused)) void *arg)
> while (dev_ll != NULL) {
> /*get virtio device ID*/
> dev = dev_ll->dev;
> +				mergeable =
> +					dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
>
> if (dev->remove) {
> dev_ll = dev_ll->next;
> @@ -1510,7 +2118,15 @@ switch_worker(__attribute__((unused)) void *arg)
> (uint16_t)dev->vmdq_rx_q,
> pkts_burst, MAX_PKT_BURST);
>
> if (rx_count) {
> -					ret_count = virtio_dev_rx(dev, pkts_burst, rx_count);
> +					if (likely(mergeable == 0))
> +						ret_count =
> +							virtio_dev_rx(dev,
> +							pkts_burst, rx_count);
> +					else
> +						ret_count =
> +							virtio_dev_merge_rx(dev,
> +							pkts_burst, rx_count);
> +
>  					if (enable_stats) {
>  						rte_atomic64_add(
>  						&dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
> @@ -1520,15 +2136,19 @@ switch_worker(__attribute__((unused)) void
> *arg)
> }
> while (likely(rx_count)) {
> rx_count--;
> -						rte_pktmbuf_free_seg(pkts_burst[rx_count]);
> +						rte_pktmbuf_free(pkts_burst[rx_count]);
> }
>
> }
> }
>
> - if (!dev->remove)
> + if (!dev->remove) {
> /*Handle guest TX*/
> - virtio_dev_tx(dev, mbuf_pool);
> + if (likely(mergeable == 0))
> + virtio_dev_tx(dev, mbuf_pool);
> + else
> +					virtio_dev_merge_tx(dev, mbuf_pool);
> + }
>
> /*move to the next device in the list*/
> dev_ll = dev_ll->next;
> diff --git a/examples/vhost/virtio-net.h b/examples/vhost/virtio-net.h
> index 3d1f255..1a2f0dc 100644
> --- a/examples/vhost/virtio-net.h
> +++ b/examples/vhost/virtio-net.h
> @@ -45,6 +45,18 @@
> /* Enum for virtqueue management. */
> enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
>
> +#define BUF_VECTOR_MAX 256
> +
> +/*
> + * Structure contains buffer address, length and descriptor index
> + * from vring to do scatter RX.
> + */
> +struct buf_vector {
> +	uint64_t buf_addr;
> +	uint32_t buf_len;
> +	uint32_t desc_idx;
> +};
> +
> /*
> * Structure contains variables relevant to TX/RX virtqueues.
> */
> @@ -60,6 +72,8 @@ struct vhost_virtqueue
>  	volatile uint16_t	last_used_idx_res;	/* Used for multiple devices reserving buffers. */
>  	eventfd_t		callfd;			/* Currently unused as polling mode is enabled. */
>  	eventfd_t		kickfd;			/* Used to notify the guest (trigger interrupt). */
> + /* Used for scatter RX. */
> + struct buf_vector buf_vec[BUF_VECTOR_MAX];
> } __rte_cache_aligned;
>
> /*
> --
> 1.8.4.2
Thread overview: 5+ messages
2014-08-15 4:58 Ouyang Changchun
2014-08-21 1:24 ` Ouyang, Changchun [this message]
2014-08-25 15:12 ` Thomas Monjalon
2014-09-24 9:10 ` Fu, JingguoX
2014-09-24 9:25 ` Fu, JingguoX