From: "Ouyang, Changchun"
To: "dev@dpdk.org"
Subject: Re: [dpdk-dev] [PATCH] examples/vhost: Support jumbo frame in user space vhost
Date: Thu, 21 Aug 2014 01:24:33 +0000
In-Reply-To: <1408078681-3511-1-git-send-email-changchun.ouyang@intel.com>
References: <1408078681-3511-1-git-send-email-changchun.ouyang@intel.com>
List-Id: patches and discussions about DPDK

Hi all,

Any comments on this patch? And what is the status of merging it into mainline?

Thanks in advance,
Changchun

> -----Original Message-----
> From: Ouyang, Changchun
> Sent: Friday, August 15, 2014 12:58 PM
> To: dev@dpdk.org
> Cc: Cao, Waterman; Ouyang, Changchun
> Subject: [PATCH] examples/vhost: Support jumbo frame in user space vhost
>
> This patch supports the mergeable RX feature and thus supports jumbo frame
> RX and TX in user space vhost (as the virtio backend).
>
> On RX, it secures enough room in the vring to accommodate one complete
> scattered packet received by the PMD from the physical port, and then copies
> the data from the mbuf into the vring buffers, possibly across a few vring
> entries and descriptors.
>
> On TX, it takes a jumbo frame, possibly described by a few vring descriptors
> chained together with the 'NEXT' flag, copies them into one scattered packet
> and transmits it to the physical port through the PMD.
>
> Signed-off-by: Changchun Ouyang
> Acked-by: Huawei Xie
> ---
>  examples/vhost/main.c       | 726 ++++++++++++++++++++++++++++++++++++++++----
>  examples/vhost/virtio-net.h |  14 +
>  2 files changed, 687 insertions(+), 53 deletions(-)
>
> diff --git a/examples/vhost/main.c b/examples/vhost/main.c
> index 193aa25..7d9e6a2 100644
> --- a/examples/vhost/main.c
> +++ b/examples/vhost/main.c
> @@ -106,6 +106,8 @@
> #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */
> #define BURST_RX_RETRIES 4 /* Number of retries on RX.
*/ >=20 > +#define JUMBO_FRAME_MAX_SIZE 0x2600 > + > /* State of virtio device. */ > #define DEVICE_MAC_LEARNING 0 > #define DEVICE_RX 1 > @@ -676,8 +678,12 @@ us_vhost_parse_args(int argc, char **argv) > us_vhost_usage(prgname); > return -1; > } else { > - if (ret) > + if (ret) { > + > vmdq_conf_default.rxmode.jumbo_frame =3D 1; > + > vmdq_conf_default.rxmode.max_rx_pkt_len > + =3D > JUMBO_FRAME_MAX_SIZE; > VHOST_FEATURES =3D (1ULL << > VIRTIO_NET_F_MRG_RXBUF); > + } > } > } >=20 > @@ -797,6 +803,14 @@ us_vhost_parse_args(int argc, char **argv) > return -1; > } >=20 > + if ((zero_copy =3D=3D 1) && (vmdq_conf_default.rxmode.jumbo_frame > =3D=3D 1)) { > + RTE_LOG(INFO, VHOST_PORT, > + "Vhost zero copy doesn't support jumbo frame," > + "please specify '--mergeable 0' to disable the " > + "mergeable feature.\n"); > + return -1; > + } > + > return 0; > } >=20 > @@ -916,7 +930,7 @@ gpa_to_hpa(struct virtio_net *dev, uint64_t guest_pa, > * This function adds buffers to the virtio devices RX virtqueue. Buffer= s can > * be received from the physical port or from another virtio device. A p= acket > * count is returned to indicate the number of packets that were succesf= ully > - * added to the RX queue. > + * added to the RX queue. This function works when mergeable is disabled= . > */ > static inline uint32_t __attribute__((always_inline)) virtio_dev_rx(str= uct > virtio_net *dev, struct rte_mbuf **pkts, uint32_t count) @@ -930,7 +944,6 > @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t > count) > uint64_t buff_hdr_addr =3D 0; > uint32_t head[MAX_PKT_BURST], packet_len =3D 0; > uint32_t head_idx, packet_success =3D 0; > - uint32_t mergeable, mrg_count =3D 0; > uint32_t retry =3D 0; > uint16_t avail_idx, res_cur_idx; > uint16_t res_base_idx, res_end_idx; > @@ -940,6 +953,7 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf > **pkts, uint32_t count) > LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev- > >device_fh); > vq =3D dev->virtqueue[VIRTIO_RXQ]; > count =3D (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; > + > /* As many data cores may want access to available buffers, they > need to be reserved. */ > do { > res_base_idx =3D vq->last_used_idx_res; @@ -976,9 +990,6 > @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t > count) > /* Prefetch available ring to retrieve indexes. */ > rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]); >=20 > - /* Check if the VIRTIO_NET_F_MRG_RXBUF feature is enabled. */ > - mergeable =3D dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF); > - > /* Retrieve all of the head indexes first to avoid caching issues. */ > for (head_idx =3D 0; head_idx < count; head_idx++) > head[head_idx] =3D vq->avail->ring[(res_cur_idx + head_idx) & > (vq->size - 1)]; @@ -997,56 +1008,44 @@ virtio_dev_rx(struct virtio_net > *dev, struct rte_mbuf **pkts, uint32_t count) > /* Prefetch buffer address. */ > rte_prefetch0((void*)(uintptr_t)buff_addr); >=20 > - if (mergeable && (mrg_count !=3D 0)) { > - desc->len =3D packet_len =3D > rte_pktmbuf_data_len(buff); > - } else { > - /* Copy virtio_hdr to packet and increment buffer > address */ > - buff_hdr_addr =3D buff_addr; > - packet_len =3D rte_pktmbuf_data_len(buff) + vq- > >vhost_hlen; > + /* Copy virtio_hdr to packet and increment buffer address */ > + buff_hdr_addr =3D buff_addr; > + packet_len =3D rte_pktmbuf_data_len(buff) + vq->vhost_hlen; >=20 > - /* > - * If the descriptors are chained the header and data > are placed in > - * separate buffers. 
> - */ > - if (desc->flags & VRING_DESC_F_NEXT) { > - desc->len =3D vq->vhost_hlen; > - desc =3D &vq->desc[desc->next]; > - /* Buffer address translation. */ > - buff_addr =3D gpa_to_vva(dev, desc->addr); > - desc->len =3D rte_pktmbuf_data_len(buff); > - } else { > - buff_addr +=3D vq->vhost_hlen; > - desc->len =3D packet_len; > - } > + /* > + * If the descriptors are chained the header and data are > + * placed in separate buffers. > + */ > + if (desc->flags & VRING_DESC_F_NEXT) { > + desc->len =3D vq->vhost_hlen; > + desc =3D &vq->desc[desc->next]; > + /* Buffer address translation. */ > + buff_addr =3D gpa_to_vva(dev, desc->addr); > + desc->len =3D rte_pktmbuf_data_len(buff); > + } else { > + buff_addr +=3D vq->vhost_hlen; > + desc->len =3D packet_len; > } >=20 > - PRINT_PACKET(dev, (uintptr_t)buff_addr, > rte_pktmbuf_data_len(buff), 0); > - > /* Update used ring with desc information */ > vq->used->ring[res_cur_idx & (vq->size - 1)].id =3D > head[packet_success]; > vq->used->ring[res_cur_idx & (vq->size - 1)].len =3D > packet_len; >=20 > /* Copy mbuf data to buffer */ > - rte_memcpy((void *)(uintptr_t)buff_addr, (const > void*)buff->pkt.data, rte_pktmbuf_data_len(buff)); > + rte_memcpy((void *)(uintptr_t)buff_addr, > + (const void *)buff->pkt.data, > + rte_pktmbuf_data_len(buff)); > + PRINT_PACKET(dev, (uintptr_t)buff_addr, > + rte_pktmbuf_data_len(buff), 0); >=20 > res_cur_idx++; > packet_success++; >=20 > - /* If mergeable is disabled then a header is required per > buffer. */ > - if (!mergeable) { > - rte_memcpy((void *)(uintptr_t)buff_hdr_addr, > (const void*)&virtio_hdr, vq->vhost_hlen); > - PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq- > >vhost_hlen, 1); > - } else { > - mrg_count++; > - /* Merge buffer can only handle so many buffers at a > time. Tell the guest if this limit is reached. */ > - if ((mrg_count =3D=3D MAX_MRG_PKT_BURST) || > (res_cur_idx =3D=3D res_end_idx)) { > - virtio_hdr.num_buffers =3D mrg_count; > - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: > Num merge buffers %d\n", dev->device_fh, virtio_hdr.num_buffers); > - rte_memcpy((void > *)(uintptr_t)buff_hdr_addr, (const void*)&virtio_hdr, vq->vhost_hlen); > - PRINT_PACKET(dev, > (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); > - mrg_count =3D 0; > - } > - } > + rte_memcpy((void *)(uintptr_t)buff_hdr_addr, > + (const void *)&virtio_hdr, vq->vhost_hlen); > + > + PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq- > >vhost_hlen, 1); > + > if (res_cur_idx < res_end_idx) { > /* Prefetch descriptor index. */ > rte_prefetch0(&vq->desc[head[packet_success]]); > @@ -1068,6 +1067,356 @@ virtio_dev_rx(struct virtio_net *dev, struct > rte_mbuf **pkts, uint32_t count) > return count; > } >=20 > +static inline uint32_t __attribute__((always_inline)) > +copy_from_mbuf_to_vring(struct virtio_net *dev, > + uint16_t res_base_idx, uint16_t res_end_idx, > + struct rte_mbuf *pkt) > +{ > + uint32_t vec_idx =3D 0; > + uint32_t entry_success =3D 0; > + struct vhost_virtqueue *vq; > + /* The virtio_hdr is initialised to 0. 
*/ > + struct virtio_net_hdr_mrg_rxbuf virtio_hdr =3D { > + {0, 0, 0, 0, 0, 0}, 0}; > + uint16_t cur_idx =3D res_base_idx; > + uint64_t vb_addr =3D 0; > + uint64_t vb_hdr_addr =3D 0; > + uint32_t seg_offset =3D 0; > + uint32_t vb_offset =3D 0; > + uint32_t seg_avail; > + uint32_t vb_avail; > + uint32_t cpy_len, entry_len; > + > + if (pkt =3D=3D NULL) > + return 0; > + > + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| " > + "End Index %d\n", > + dev->device_fh, cur_idx, res_end_idx); > + > + /* > + * Convert from gpa to vva > + * (guest physical addr -> vhost virtual addr) > + */ > + vq =3D dev->virtqueue[VIRTIO_RXQ]; > + vb_addr =3D > + gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr); > + vb_hdr_addr =3D vb_addr; > + > + /* Prefetch buffer address. */ > + rte_prefetch0((void *)(uintptr_t)vb_addr); > + > + virtio_hdr.num_buffers =3D res_end_idx - res_base_idx; > + > + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge > buffers %d\n", > + dev->device_fh, virtio_hdr.num_buffers); > + > + rte_memcpy((void *)(uintptr_t)vb_hdr_addr, > + (const void *)&virtio_hdr, vq->vhost_hlen); > + > + PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1); > + > + seg_avail =3D rte_pktmbuf_data_len(pkt); > + vb_offset =3D vq->vhost_hlen; > + vb_avail =3D > + vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen; > + > + entry_len =3D vq->vhost_hlen; > + > + if (vb_avail =3D=3D 0) { > + uint32_t desc_idx =3D > + vq->buf_vec[vec_idx].desc_idx; > + vq->desc[desc_idx].len =3D vq->vhost_hlen; > + > + if ((vq->desc[desc_idx].flags > + & VRING_DESC_F_NEXT) =3D=3D 0) { > + /* Update used ring with desc information */ > + vq->used->ring[cur_idx & (vq->size - 1)].id > + =3D vq->buf_vec[vec_idx].desc_idx; > + vq->used->ring[cur_idx & (vq->size - 1)].len > + =3D entry_len; > + > + entry_len =3D 0; > + cur_idx++; > + entry_success++; > + } > + > + vec_idx++; > + vb_addr =3D > + gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr); > + > + /* Prefetch buffer address. */ > + rte_prefetch0((void *)(uintptr_t)vb_addr); > + vb_offset =3D 0; > + vb_avail =3D vq->buf_vec[vec_idx].buf_len; > + } > + > + cpy_len =3D RTE_MIN(vb_avail, seg_avail); > + > + while (cpy_len > 0) { > + /* Copy mbuf data to vring buffer */ > + rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset), > + (const void *)(rte_pktmbuf_mtod(pkt, char*) + > seg_offset), > + cpy_len); > + > + PRINT_PACKET(dev, > + (uintptr_t)(vb_addr + vb_offset), > + cpy_len, 0); > + > + seg_offset +=3D cpy_len; > + vb_offset +=3D cpy_len; > + seg_avail -=3D cpy_len; > + vb_avail -=3D cpy_len; > + entry_len +=3D cpy_len; > + > + if (seg_avail !=3D 0) { > + /* > + * The virtio buffer in this vring > + * entry reach to its end. > + * But the segment doesn't complete. > + */ > + if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags & > + VRING_DESC_F_NEXT) =3D=3D 0) { > + /* Update used ring with desc information */ > + vq->used->ring[cur_idx & (vq->size - 1)].id > + =3D vq->buf_vec[vec_idx].desc_idx; > + vq->used->ring[cur_idx & (vq->size - 1)].len > + =3D entry_len; > + entry_len =3D 0; > + cur_idx++; > + entry_success++; > + } > + > + vec_idx++; > + vb_addr =3D gpa_to_vva(dev, > + vq->buf_vec[vec_idx].buf_addr); > + vb_offset =3D 0; > + vb_avail =3D vq->buf_vec[vec_idx].buf_len; > + cpy_len =3D RTE_MIN(vb_avail, seg_avail); > + } else { > + /* > + * This current segment complete, need continue to > + * check if the whole packet complete or not. > + */ > + pkt =3D pkt->pkt.next; > + if (pkt !=3D NULL) { > + /* > + * There are more segments. 
> + */ > + if (vb_avail =3D=3D 0) { > + /* > + * This current buffer from vring is > + * used up, need fetch next buffer > + * from buf_vec. > + */ > + uint32_t desc_idx =3D > + vq- > >buf_vec[vec_idx].desc_idx; > + vq->desc[desc_idx].len =3D vb_offset; > + > + if ((vq->desc[desc_idx].flags & > + VRING_DESC_F_NEXT) =3D=3D 0) { > + uint16_t wrapped_idx =3D > + cur_idx & (vq->size - > 1); > + /* > + * Update used ring with the > + * descriptor information > + */ > + vq->used- > >ring[wrapped_idx].id > + =3D desc_idx; > + vq->used- > >ring[wrapped_idx].len > + =3D entry_len; > + entry_success++; > + entry_len =3D 0; > + cur_idx++; > + } > + > + /* Get next buffer from buf_vec. */ > + vec_idx++; > + vb_addr =3D gpa_to_vva(dev, > + vq- > >buf_vec[vec_idx].buf_addr); > + vb_avail =3D > + vq- > >buf_vec[vec_idx].buf_len; > + vb_offset =3D 0; > + } > + > + seg_offset =3D 0; > + seg_avail =3D rte_pktmbuf_data_len(pkt); > + cpy_len =3D RTE_MIN(vb_avail, seg_avail); > + } else { > + /* > + * This whole packet completes. > + */ > + uint32_t desc_idx =3D > + vq->buf_vec[vec_idx].desc_idx; > + vq->desc[desc_idx].len =3D vb_offset; > + > + while (vq->desc[desc_idx].flags & > + VRING_DESC_F_NEXT) { > + desc_idx =3D vq->desc[desc_idx].next; > + vq->desc[desc_idx].len =3D 0; > + } > + > + /* Update used ring with desc information */ > + vq->used->ring[cur_idx & (vq->size - 1)].id > + =3D vq->buf_vec[vec_idx].desc_idx; > + vq->used->ring[cur_idx & (vq->size - 1)].len > + =3D entry_len; > + entry_len =3D 0; > + cur_idx++; > + entry_success++; > + seg_avail =3D 0; > + cpy_len =3D RTE_MIN(vb_avail, seg_avail); > + } > + } > + } > + > + return entry_success; > +} > + > +/* > + * This function adds buffers to the virtio devices RX virtqueue. > +Buffers can > + * be received from the physical port or from another virtio device. A > +packet > + * count is returned to indicate the number of packets that were > +succesfully > + * added to the RX queue. This function works for mergeable RX. > + */ > +static inline uint32_t __attribute__((always_inline)) > +virtio_dev_merge_rx(struct virtio_net *dev, struct rte_mbuf **pkts, > + uint32_t count) > +{ > + struct vhost_virtqueue *vq; > + uint32_t pkt_idx =3D 0, entry_success =3D 0; > + uint32_t retry =3D 0; > + uint16_t avail_idx, res_cur_idx; > + uint16_t res_base_idx, res_end_idx; > + uint8_t success =3D 0; > + > + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n", > + dev->device_fh); > + vq =3D dev->virtqueue[VIRTIO_RXQ]; > + count =3D RTE_MIN((uint32_t)MAX_PKT_BURST, count); > + > + if (count =3D=3D 0) > + return 0; > + > + for (pkt_idx =3D 0; pkt_idx < count; pkt_idx++) { > + uint32_t secure_len =3D 0; > + uint16_t need_cnt; > + uint32_t vec_idx =3D 0; > + uint32_t pkt_len =3D pkts[pkt_idx]->pkt.pkt_len + vq- > >vhost_hlen; > + uint16_t i, id; > + > + do { > + /* > + * As many data cores may want access to available > + * buffers, they need to be reserved. > + */ > + res_base_idx =3D vq->last_used_idx_res; > + res_cur_idx =3D res_base_idx; > + > + do { > + avail_idx =3D *((volatile uint16_t *)&vq->avail- > >idx); > + if (unlikely(res_cur_idx =3D=3D avail_idx)) { > + /* > + * If retry is enabled and the queue is > + * full then we wait and retry to avoid > + * packet loss. 
> + */ > + if (enable_retry) { > + uint8_t cont =3D 0; > + for (retry =3D 0; retry < > burst_rx_retry_num; retry++) { > + > rte_delay_us(burst_rx_delay_time); > + avail_idx =3D > + *((volatile > uint16_t *)&vq->avail->idx); > + if > (likely(res_cur_idx !=3D avail_idx)) { > + cont =3D 1; > + break; > + } > + } > + if (cont =3D=3D 1) > + continue; > + } > + > + LOG_DEBUG(VHOST_DATA, > + "(%"PRIu64") Failed " > + "to get enough desc from " > + "vring\n", > + dev->device_fh); > + return pkt_idx; > + } else { > + uint16_t wrapped_idx =3D > + (res_cur_idx) & (vq->size - 1); > + uint32_t idx =3D > + vq->avail->ring[wrapped_idx]; > + uint8_t next_desc; > + > + do { > + next_desc =3D 0; > + secure_len +=3D vq- > >desc[idx].len; > + if (vq->desc[idx].flags & > + > VRING_DESC_F_NEXT) { > + idx =3D vq- > >desc[idx].next; > + next_desc =3D 1; > + } > + } while (next_desc); > + > + res_cur_idx++; > + } > + } while (pkt_len > secure_len); > + > + /* vq->last_used_idx_res is atomically updated. */ > + success =3D rte_atomic16_cmpset(&vq- > >last_used_idx_res, > + res_base_idx, > + res_cur_idx); > + } while (success =3D=3D 0); > + > + id =3D res_base_idx; > + need_cnt =3D res_cur_idx - res_base_idx; > + > + for (i =3D 0; i < need_cnt; i++, id++) { > + uint16_t wrapped_idx =3D id & (vq->size - 1); > + uint32_t idx =3D vq->avail->ring[wrapped_idx]; > + uint8_t next_desc; > + do { > + next_desc =3D 0; > + vq->buf_vec[vec_idx].buf_addr =3D > + vq->desc[idx].addr; > + vq->buf_vec[vec_idx].buf_len =3D > + vq->desc[idx].len; > + vq->buf_vec[vec_idx].desc_idx =3D idx; > + vec_idx++; > + > + if (vq->desc[idx].flags & > VRING_DESC_F_NEXT) { > + idx =3D vq->desc[idx].next; > + next_desc =3D 1; > + } > + } while (next_desc); > + } > + > + res_end_idx =3D res_cur_idx; > + > + entry_success =3D copy_from_mbuf_to_vring(dev, > res_base_idx, > + res_end_idx, pkts[pkt_idx]); > + > + rte_compiler_barrier(); > + > + /* > + * Wait until it's our turn to add our buffer > + * to the used ring. > + */ > + while (unlikely(vq->last_used_idx !=3D res_base_idx)) > + rte_pause(); > + > + *(volatile uint16_t *)&vq->used->idx +=3D entry_success; > + vq->last_used_idx =3D res_end_idx; > + > + /* Kick the guest if necessary. */ > + if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) > + eventfd_write((int)vq->kickfd, 1); > + } > + > + return count; > +} > + > /* > * Compares a packet destination MAC address to a device MAC address. 
> */ > @@ -1199,8 +1548,17 @@ virtio_tx_local(struct virtio_net *dev, struct > rte_mbuf *m) > /*drop the packet if the device is marked for > removal*/ > LOG_DEBUG(VHOST_DATA, "(%"PRIu64") > Device is marked for removal\n", dev_ll->dev->device_fh); > } else { > + uint32_t mergeable =3D > + dev_ll->dev->features & > + (1 << VIRTIO_NET_F_MRG_RXBUF); > + > /*send the packet to the local virtio device*/ > - ret =3D virtio_dev_rx(dev_ll->dev, &m, 1); > + if (likely(mergeable =3D=3D 0)) > + ret =3D virtio_dev_rx(dev_ll->dev, &m, > 1); > + else > + ret =3D virtio_dev_merge_rx(dev_ll- > >dev, > + &m, 1); > + > if (enable_stats) { > rte_atomic64_add( > &dev_statistics[dev_ll->dev- > >device_fh].rx_total_atomic, > @@ -1231,7 +1589,7 @@ virtio_tx_route(struct virtio_net* dev, struct > rte_mbuf *m, struct rte_mempool * > struct mbuf_table *tx_q; > struct vlan_ethhdr *vlan_hdr; > struct rte_mbuf **m_table; > - struct rte_mbuf *mbuf; > + struct rte_mbuf *mbuf, *prev; > unsigned len, ret, offset =3D 0; > const uint16_t lcore_id =3D rte_lcore_id(); > struct virtio_net_data_ll *dev_ll =3D ll_root_used; @@ -1284,12 > +1642,14 @@ virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, > struct rte_mempool * > /* Allocate an mbuf and populate the structure. */ > mbuf =3D rte_pktmbuf_alloc(mbuf_pool); > if (unlikely(mbuf =3D=3D NULL)) { > - RTE_LOG(ERR, VHOST_DATA, "Failed to allocate memory for > mbuf.\n"); > + RTE_LOG(ERR, VHOST_DATA, > + "Failed to allocate memory for mbuf.\n"); > return; > } >=20 > mbuf->pkt.data_len =3D m->pkt.data_len + VLAN_HLEN + offset; > - mbuf->pkt.pkt_len =3D mbuf->pkt.data_len; > + mbuf->pkt.pkt_len =3D m->pkt.pkt_len + VLAN_HLEN + offset; > + mbuf->pkt.nb_segs =3D m->pkt.nb_segs; >=20 > /* Copy ethernet header to mbuf. */ > rte_memcpy((void*)mbuf->pkt.data, (const void*)m->pkt.data, > ETH_HLEN); @@ -1304,6 +1664,29 @@ virtio_tx_route(struct virtio_net* dev, > struct rte_mbuf *m, struct rte_mempool * > /* Copy the remaining packet contents to the mbuf. */ > rte_memcpy((void*) ((uint8_t*)mbuf->pkt.data + VLAN_ETH_HLEN), > (const void*) ((uint8_t*)m->pkt.data + ETH_HLEN), (m- > >pkt.data_len - ETH_HLEN)); > + > + /* Copy the remaining segments for the whole packet. */ > + prev =3D mbuf; > + while (m->pkt.next) { > + /* Allocate an mbuf and populate the structure. */ > + struct rte_mbuf *next_mbuf =3D > rte_pktmbuf_alloc(mbuf_pool); > + if (unlikely(next_mbuf =3D=3D NULL)) { > + rte_pktmbuf_free(mbuf); > + RTE_LOG(ERR, VHOST_DATA, > + "Failed to allocate memory for mbuf.\n"); > + return; > + } > + > + m =3D m->pkt.next; > + prev->pkt.next =3D next_mbuf; > + prev =3D next_mbuf; > + next_mbuf->pkt.data_len =3D m->pkt.data_len; > + > + /* Copy data to next mbuf. */ > + rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *), > + rte_pktmbuf_mtod(m, const void *), m- > >pkt.data_len); > + } > + > tx_q->m_table[len] =3D mbuf; > len++; > if (enable_stats) { > @@ -1394,6 +1777,7 @@ virtio_dev_tx(struct virtio_net* dev, struct > rte_mempool *mbuf_pool) >=20 > /* Setup dummy mbuf. This is copied to a real mbuf if > transmitted out the physical port. */ > m.pkt.data_len =3D desc->len; > + m.pkt.pkt_len =3D desc->len; > m.pkt.data =3D (void*)(uintptr_t)buff_addr; >=20 > PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); @@ > -1420,6 +1804,227 @@ virtio_dev_tx(struct virtio_net* dev, struct > rte_mempool *mbuf_pool) > eventfd_write((int)vq->kickfd, 1); > } >=20 > +/* This function works for TX packets with mergeable feature enabled. 
> +*/ static inline void __attribute__((always_inline)) > +virtio_dev_merge_tx(struct virtio_net *dev, struct rte_mempool > +*mbuf_pool) { > + struct rte_mbuf *m, *prev; > + struct vhost_virtqueue *vq; > + struct vring_desc *desc; > + uint64_t vb_addr =3D 0; > + uint32_t head[MAX_PKT_BURST]; > + uint32_t used_idx; > + uint32_t i; > + uint16_t free_entries, entry_success =3D 0; > + uint16_t avail_idx; > + uint32_t buf_size =3D MBUF_SIZE - (sizeof(struct rte_mbuf) > + + RTE_PKTMBUF_HEADROOM); > + > + vq =3D dev->virtqueue[VIRTIO_TXQ]; > + avail_idx =3D *((volatile uint16_t *)&vq->avail->idx); > + > + /* If there are no available buffers then return. */ > + if (vq->last_used_idx =3D=3D avail_idx) > + return; > + > + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_tx()\n", > + dev->device_fh); > + > + /* Prefetch available ring to retrieve head indexes. */ > + rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]); > + > + /*get the number of free entries in the ring*/ > + free_entries =3D (avail_idx - vq->last_used_idx); > + > + /* Limit to MAX_PKT_BURST. */ > + free_entries =3D RTE_MIN(free_entries, MAX_PKT_BURST); > + > + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", > + dev->device_fh, free_entries); > + /* Retrieve all of the head indexes first to avoid caching issues. */ > + for (i =3D 0; i < free_entries; i++) > + head[i] =3D vq->avail->ring[(vq->last_used_idx + i) & (vq->size - > 1)]; > + > + /* Prefetch descriptor index. */ > + rte_prefetch0(&vq->desc[head[entry_success]]); > + rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); > + > + while (entry_success < free_entries) { > + uint32_t vb_avail, vb_offset; > + uint32_t seg_avail, seg_offset; > + uint32_t cpy_len; > + uint32_t seg_num =3D 0; > + struct rte_mbuf *cur; > + uint8_t alloc_err =3D 0; > + > + desc =3D &vq->desc[head[entry_success]]; > + > + /* Discard first buffer as it is the virtio header */ > + desc =3D &vq->desc[desc->next]; > + > + /* Buffer address translation. */ > + vb_addr =3D gpa_to_vva(dev, desc->addr); > + /* Prefetch buffer address. */ > + rte_prefetch0((void *)(uintptr_t)vb_addr); > + > + used_idx =3D vq->last_used_idx & (vq->size - 1); > + > + if (entry_success < (free_entries - 1)) { > + /* Prefetch descriptor index. */ > + rte_prefetch0(&vq->desc[head[entry_success+1]]); > + rte_prefetch0(&vq->used->ring[(used_idx + 1) & > (vq->size - 1)]); > + } > + > + /* Update used index buffer information. */ > + vq->used->ring[used_idx].id =3D head[entry_success]; > + vq->used->ring[used_idx].len =3D 0; > + > + vb_offset =3D 0; > + vb_avail =3D desc->len; > + seg_offset =3D 0; > + seg_avail =3D buf_size; > + cpy_len =3D RTE_MIN(vb_avail, seg_avail); > + > + PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0); > + > + /* Allocate an mbuf and populate the structure. */ > + m =3D rte_pktmbuf_alloc(mbuf_pool); > + if (unlikely(m =3D=3D NULL)) { > + RTE_LOG(ERR, VHOST_DATA, > + "Failed to allocate memory for mbuf.\n"); > + return; > + } > + > + seg_num++; > + cur =3D m; > + prev =3D m; > + while (cpy_len !=3D 0) { > + rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *) > + seg_offset), > + (void *)((uintptr_t)(vb_addr + vb_offset)), > + cpy_len); > + > + seg_offset +=3D cpy_len; > + vb_offset +=3D cpy_len; > + vb_avail -=3D cpy_len; > + seg_avail -=3D cpy_len; > + > + if (vb_avail !=3D 0) { > + /* > + * The segment reachs to its end, > + * while the virtio buffer in TX vring has > + * more data to be copied. 
> + */ > + cur->pkt.data_len =3D seg_offset; > + m->pkt.pkt_len +=3D seg_offset; > + /* Allocate mbuf and populate the structure. > */ > + cur =3D rte_pktmbuf_alloc(mbuf_pool); > + if (unlikely(cur =3D=3D NULL)) { > + RTE_LOG(ERR, VHOST_DATA, "Failed > to " > + "allocate memory for > mbuf.\n"); > + rte_pktmbuf_free(m); > + alloc_err =3D 1; > + break; > + } > + > + seg_num++; > + prev->pkt.next =3D cur; > + prev =3D cur; > + seg_offset =3D 0; > + seg_avail =3D buf_size; > + } else { > + if (desc->flags & VRING_DESC_F_NEXT) { > + /* > + * There are more virtio buffers in > + * same vring entry need to be copied. > + */ > + if (seg_avail =3D=3D 0) { > + /* > + * The current segment hasn't > + * room to accomodate more > + * data. > + */ > + cur->pkt.data_len =3D > seg_offset; > + m->pkt.pkt_len +=3D > seg_offset; > + /* > + * Allocate an mbuf and > + * populate the structure. > + */ > + cur =3D > rte_pktmbuf_alloc(mbuf_pool); > + if (unlikely(cur =3D=3D NULL)) { > + RTE_LOG(ERR, > + VHOST_DATA, > + "Failed to " > + "allocate > memory " > + "for mbuf\n"); > + > rte_pktmbuf_free(m); > + alloc_err =3D 1; > + break; > + } > + seg_num++; > + prev->pkt.next =3D cur; > + prev =3D cur; > + seg_offset =3D 0; > + seg_avail =3D buf_size; > + } > + > + desc =3D &vq->desc[desc->next]; > + > + /* Buffer address translation. */ > + vb_addr =3D gpa_to_vva(dev, desc- > >addr); > + /* Prefetch buffer address. */ > + rte_prefetch0((void > *)(uintptr_t)vb_addr); > + vb_offset =3D 0; > + vb_avail =3D desc->len; > + > + PRINT_PACKET(dev, > (uintptr_t)vb_addr, > + desc->len, 0); > + } else { > + /* The whole packet completes. */ > + cur->pkt.data_len =3D seg_offset; > + m->pkt.pkt_len +=3D seg_offset; > + vb_avail =3D 0; > + } > + } > + > + cpy_len =3D RTE_MIN(vb_avail, seg_avail); > + } > + > + if (unlikely(alloc_err =3D=3D 1)) > + break; > + > + m->pkt.nb_segs =3D seg_num; > + > + /* > + * If this is the first received packet we need to learn > + * the MAC and setup VMDQ > + */ > + if (dev->ready =3D=3D DEVICE_MAC_LEARNING) { > + if (dev->remove || (link_vmdq(dev, m) =3D=3D -1)) { > + /* > + * Discard frame if device is scheduled for > + * removal or a duplicate MAC address is > found. > + */ > + entry_success =3D free_entries; > + vq->last_used_idx +=3D entry_success; > + rte_pktmbuf_free(m); > + break; > + } > + } > + > + virtio_tx_route(dev, m, mbuf_pool, (uint16_t)dev- > >device_fh); > + vq->last_used_idx++; > + entry_success++; > + rte_pktmbuf_free(m); > + } > + > + rte_compiler_barrier(); > + vq->used->idx +=3D entry_success; > + /* Kick guest if required. */ > + if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) > + eventfd_write((int)vq->kickfd, 1); > + > +} > + > /* > * This function is called by each data core. It handles all RX/TX regis= tered > with the > * core. For TX the specific lcore linked list is used. 
For RX, MAC addr= esses are > compared @@ -1440,8 +2045,9 @@ switch_worker(__attribute__((unused)) > void *arg) > const uint16_t lcore_id =3D rte_lcore_id(); > const uint16_t num_cores =3D (uint16_t)rte_lcore_count(); > uint16_t rx_count =3D 0; > + uint32_t mergeable =3D 0; >=20 > - RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started \n", > lcore_id); > + RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", > lcore_id); > lcore_ll =3D lcore_info[lcore_id].lcore_ll; > prev_tsc =3D 0; >=20 > @@ -1497,6 +2103,8 @@ switch_worker(__attribute__((unused)) void *arg) > while (dev_ll !=3D NULL) { > /*get virtio device ID*/ > dev =3D dev_ll->dev; > + mergeable =3D > + dev->features & (1 << > VIRTIO_NET_F_MRG_RXBUF); >=20 > if (dev->remove) { > dev_ll =3D dev_ll->next; > @@ -1510,7 +2118,15 @@ switch_worker(__attribute__((unused)) void *arg) > (uint16_t)dev->vmdq_rx_q, > pkts_burst, MAX_PKT_BURST); >=20 > if (rx_count) { > - ret_count =3D virtio_dev_rx(dev, > pkts_burst, rx_count); > + if (likely(mergeable =3D=3D 0)) > + ret_count =3D > + virtio_dev_rx(dev, > + pkts_burst, rx_count); > + else > + ret_count =3D > + > virtio_dev_merge_rx(dev, > + pkts_burst, rx_count); > + > if (enable_stats) { > rte_atomic64_add( > &dev_statistics[dev_ll->dev- > >device_fh].rx_total_atomic, > @@ -1520,15 +2136,19 @@ switch_worker(__attribute__((unused)) void > *arg) > } > while (likely(rx_count)) { > rx_count--; > - > rte_pktmbuf_free_seg(pkts_burst[rx_count]); > + > rte_pktmbuf_free(pkts_burst[rx_count]); > } >=20 > } > } >=20 > - if (!dev->remove) > + if (!dev->remove) { > /*Handle guest TX*/ > - virtio_dev_tx(dev, mbuf_pool); > + if (likely(mergeable =3D=3D 0)) > + virtio_dev_tx(dev, mbuf_pool); > + else > + virtio_dev_merge_tx(dev, > mbuf_pool); > + } >=20 > /*move to the next device in the list*/ > dev_ll =3D dev_ll->next; > diff --git a/examples/vhost/virtio-net.h b/examples/vhost/virtio-net.h in= dex > 3d1f255..1a2f0dc 100644 > --- a/examples/vhost/virtio-net.h > +++ b/examples/vhost/virtio-net.h > @@ -45,6 +45,18 @@ > /* Enum for virtqueue management. */ > enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; >=20 > +#define BUF_VECTOR_MAX 256 > + > +/* > + * Structure contains buffer address, length and descriptor index > + * from vring to do scatter RX. > +*/ > +struct buf_vector { > +uint64_t buf_addr; > +uint32_t buf_len; > +uint32_t desc_idx; > +}; > + > /* > * Structure contains variables relevant to TX/RX virtqueues. > */ > @@ -60,6 +72,8 @@ struct vhost_virtqueue > volatile uint16_t last_used_idx_res; /* Used for multiple > devices reserving buffers. */ > eventfd_t callfd; /* > Currently unused as polling mode is enabled. */ > eventfd_t kickfd; /* > Used to notify the guest (trigger interrupt). */ > + /* Used for scatter RX. */ > + struct buf_vector buf_vec[BUF_VECTOR_MAX]; > } __rte_cache_aligned; >=20 > /* > -- > 1.8.4.2
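
A note on the RX path of the patch above: the core idea is to walk each
avail-ring entry's descriptor chain, summing buffer lengths until the whole
scattered packet fits, and then to report how many entries were consumed
through the num_buffers field of the mergeable virtio header. Below is a
minimal, self-contained sketch of just that loop; struct desc, reserve_entries
and RING_SIZE are illustrative stand-ins, not the real vhost/virtio types used
in the patch.

/*
 * Illustrative sketch only -- simplified structures, not the real
 * vhost/virtio types used by the patch.
 */
#include <stdint.h>
#include <stdio.h>

#define VRING_DESC_F_NEXT 1
#define RING_SIZE 8

struct desc {
	uint32_t len;    /* buffer length */
	uint16_t flags;  /* VRING_DESC_F_NEXT if chained */
	uint16_t next;   /* next descriptor index when chained */
};

/*
 * Walk avail-ring entries starting at 'start' until the chained buffers they
 * describe can hold 'pkt_len' bytes. Returns the number of avail entries
 * consumed, or 0 if the ring runs out (the guest must refill it). This mirrors
 * the patch's "secure enough room from vring" loop; the caller would later
 * store this count in the mergeable header's num_buffers field.
 */
static uint16_t
reserve_entries(const struct desc *desc_table, const uint16_t *avail_ring,
		uint16_t start, uint16_t avail_idx, uint32_t pkt_len)
{
	uint32_t room = 0;
	uint16_t cur = start;

	while (room < pkt_len) {
		if (cur == avail_idx)
			return 0;            /* not enough free entries */

		uint16_t idx = avail_ring[cur & (RING_SIZE - 1)];

		/* One avail entry may point at a chain of descriptors. */
		for (;;) {
			room += desc_table[idx].len;
			if (!(desc_table[idx].flags & VRING_DESC_F_NEXT))
				break;
			idx = desc_table[idx].next;
		}
		cur++;
	}
	return cur - start;          /* entries used -> hdr.num_buffers */
}

int main(void)
{
	struct desc d[RING_SIZE];
	uint16_t avail[RING_SIZE];

	for (int i = 0; i < RING_SIZE; i++) {
		d[i] = (struct desc){ .len = 2048, .flags = 0 };
		avail[i] = i;
	}

	/* A 6000-byte jumbo frame needs three 2048-byte guest buffers. */
	printf("num_buffers = %u\n",
		(unsigned)reserve_entries(d, avail, 0, RING_SIZE, 6000));
	return 0;
}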
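
And the TX-path counterpart: a guest packet may be described by several
descriptors chained with VRING_DESC_F_NEXT, and the backend copies it into
however many fixed-size host segments it needs, linking them into one
scattered packet. The stand-alone sketch below shows only that gather step;
struct seg, gather_chain and SEG_SIZE are illustrative placeholders rather
than the real rte_mbuf API.

/* Illustrative sketch only -- not the real rte_mbuf or vring code. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define VRING_DESC_F_NEXT 1
#define SEG_SIZE 2048

struct desc {
	const uint8_t *addr;   /* already-translated buffer address */
	uint32_t len;
	uint16_t flags;
	uint16_t next;
};

struct seg {                     /* stand-in for a single mbuf segment */
	uint8_t data[SEG_SIZE];
	uint32_t data_len;
	struct seg *next;
};

/*
 * Copy one TX packet, described by a descriptor chain, into a chain of
 * fixed-size segments: when the current segment fills up, allocate the
 * next one, much as the patch does with rte_pktmbuf_alloc().
 */
static struct seg *
gather_chain(const struct desc *tbl, uint16_t idx, uint32_t *pkt_len)
{
	struct seg *head = calloc(1, sizeof(*head)), *cur = head;

	*pkt_len = 0;
	for (;;) {
		uint32_t off = 0;
		while (off < tbl[idx].len) {
			if (cur->data_len == SEG_SIZE) {   /* segment full */
				cur->next = calloc(1, sizeof(*cur));
				cur = cur->next;
			}
			uint32_t n = tbl[idx].len - off;
			if (n > SEG_SIZE - cur->data_len)
				n = SEG_SIZE - cur->data_len;
			memcpy(cur->data + cur->data_len, tbl[idx].addr + off, n);
			cur->data_len += n;
			off += n;
			*pkt_len += n;
		}
		if (!(tbl[idx].flags & VRING_DESC_F_NEXT))
			break;
		idx = tbl[idx].next;               /* follow the chain */
	}
	return head;
}

int main(void)
{
	uint8_t a[3000], b[3000];
	memset(a, 0xaa, sizeof(a));
	memset(b, 0xbb, sizeof(b));

	/* Two chained descriptors describe one 6000-byte jumbo frame. */
	struct desc tbl[2] = {
		{ .addr = a, .len = sizeof(a), .flags = VRING_DESC_F_NEXT, .next = 1 },
		{ .addr = b, .len = sizeof(b), .flags = 0 },
	};

	uint32_t pkt_len;
	struct seg *s = gather_chain(tbl, 0, &pkt_len);
	unsigned nsegs = 0;
	for (; s != NULL; s = s->next)
		nsegs++;
	printf("pkt_len=%u nb_segs=%u\n", (unsigned)pkt_len, nsegs);
	return 0;
}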