From mboxrd@z Thu Jan 1 00:00:00 1970
From: Huawei Xie
To: dev@dpdk.org
Date: Fri, 26 Sep 2014 17:45:51 +0800
Message-Id: <1411724758-27488-5-git-send-email-huawei.xie@intel.com>
X-Mailer: git-send-email 1.7.4.1
In-Reply-To: <1411724758-27488-1-git-send-email-huawei.xie@intel.com>
References: <1411724758-27488-1-git-send-email-huawei.xie@intel.com>
Subject: [dpdk-dev] [PATCH v5 04/11] lib/librte_vhost: merge vhost merge-able rx. merge vhost tx fix.
List-Id: patches and discussions about DPDK

Merge the vhost mergeable RX path.

For vhost TX, the earlier mergeable-buffer feature introduced
virtio_dev_merge_tx and called virtio_dev_tx or virtio_dev_merge_tx
depending on whether the vhost device supports the mergeable feature.
There is no real "merge TX"; it is actually a fix for the memcpy from a
chained vring descriptor to a chained mbuf. Use virtio_dev_merge_tx as
the base for vhost TX.

Signed-off-by: Huawei Xie
---
 lib/librte_vhost/rte_virtio_net.h |  16 +-
 lib/librte_vhost/vhost_rxtx.c     | 568 +++++++++++++++++++++++++++++++++-----
 2 files changed, 511 insertions(+), 73 deletions(-)

diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h
index 08dc6f4..99ddfc1 100644
--- a/lib/librte_vhost/rte_virtio_net.h
+++ b/lib/librte_vhost/rte_virtio_net.h
@@ -53,9 +53,18 @@
 /* Enum for virtqueue management. */
 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
 
-
-/*
- * Structure contains variables relevant to TX/RX virtqueues.
+#define BUF_VECTOR_MAX 256
+/**
+ * Structure contains buffer address, length and descriptor index
+ * from vring to do scatter RX.
+ */
+struct buf_vector {
+	uint64_t buf_addr;
+	uint32_t buf_len;
+	uint32_t desc_idx;
+};
+/**
+ * Structure contains variables relevant to RX/TX virtqueues.
  */
 struct vhost_virtqueue
 {
@@ -69,6 +78,7 @@ struct vhost_virtqueue
 	volatile uint16_t	last_used_idx_res;	/* Used for multiple devices reserving buffers. */
 	eventfd_t		callfd;			/* Currently unused as polling mode is enabled. */
 	eventfd_t		kickfd;			/* Used to notify the guest (trigger interrupt). */
+	struct buf_vector	buf_vec[BUF_VECTOR_MAX];	/**< for scatter RX.
*/ } __rte_cache_aligned; diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c index 0d96c43..81368e6 100644 --- a/lib/librte_vhost/vhost_rxtx.c +++ b/lib/librte_vhost/vhost_rxtx.c @@ -49,8 +49,8 @@ * count is returned to indicate the number of packets that were succesfully * added to the RX queue. This function works when mergeable is disabled. */ -uint32_t -rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mbuf **pkts, uint32_t count) +static inline uint32_t __attribute__((always_inline)) +virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, struct rte_mbuf **pkts, uint32_t count) { struct vhost_virtqueue *vq; struct vring_desc *desc; @@ -61,7 +61,6 @@ rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mb uint64_t buff_hdr_addr = 0; uint32_t head[VHOST_MAX_PKT_BURST], packet_len = 0; uint32_t head_idx, packet_success = 0; - uint32_t mergeable, mrg_count = 0; uint16_t avail_idx, res_cur_idx; uint16_t res_base_idx, res_end_idx; uint16_t free_entries; @@ -101,9 +100,6 @@ rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mb /* Prefetch available ring to retrieve indexes. */ rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]); - /* Check if the VIRTIO_NET_F_MRG_RXBUF feature is enabled. */ - mergeable = dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF); - /* Retrieve all of the head indexes first to avoid caching issues. */ for (head_idx = 0; head_idx < count; head_idx++) head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)]; @@ -122,27 +118,23 @@ rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mb /* Prefetch buffer address. */ rte_prefetch0((void*)(uintptr_t)buff_addr); - if (mergeable && (mrg_count != 0)) { - desc->len = packet_len = rte_pktmbuf_data_len(buff); + /* Copy virtio_hdr to packet and increment buffer address */ + buff_hdr_addr = buff_addr; + packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; + + /* + * If the descriptors are chained the header and data are placed in + * separate buffers. + */ + if (desc->flags & VRING_DESC_F_NEXT) { + desc->len = vq->vhost_hlen; + desc = &vq->desc[desc->next]; + /* Buffer address translation. */ + buff_addr = gpa_to_vva(dev, desc->addr); + desc->len = rte_pktmbuf_data_len(buff); } else { - /* Copy virtio_hdr to packet and increment buffer address */ - buff_hdr_addr = buff_addr; - packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; - - /* - * If the descriptors are chained the header and data are placed in - * separate buffers. - */ - if (desc->flags & VRING_DESC_F_NEXT) { - desc->len = vq->vhost_hlen; - desc = &vq->desc[desc->next]; - /* Buffer address translation. */ - buff_addr = gpa_to_vva(dev, desc->addr); - desc->len = rte_pktmbuf_data_len(buff); - } else { - buff_addr += vq->vhost_hlen; - desc->len = packet_len; - } + buff_addr += vq->vhost_hlen; + desc->len = packet_len; } @@ -161,21 +153,9 @@ rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mb res_cur_idx++; packet_success++; - /* If mergeable is disabled then a header is required per buffer. */ - if (!mergeable) { - rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen); - VHOST_PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); - } else { - mrg_count++; - /* Merge buffer can only handle so many buffers at a time. Tell the guest if this limit is reached. 
*/ - if ((mrg_count == VHOST_MAX_MRG_PKT_BURST) || (res_cur_idx == res_end_idx)) { - virtio_hdr.num_buffers = mrg_count; - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", dev->device_fh, virtio_hdr.num_buffers); - rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen); - VHOST_PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); - mrg_count = 0; - } - } + rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, + vq->vhost_hlen); + VHOST_PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); if (res_cur_idx < res_end_idx) { /* Prefetch descriptor index. */ rte_prefetch0(&vq->desc[head[packet_success]]); @@ -197,18 +177,357 @@ rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mb return count; } +static inline uint32_t __attribute__((always_inline)) +copy_from_mbuf_to_vring(struct virtio_net *dev, uint16_t res_base_idx, + uint16_t res_end_idx, struct rte_mbuf *pkt) +{ + uint32_t vec_idx = 0; + uint32_t entry_success = 0; + struct vhost_virtqueue *vq; + /* The virtio_hdr is initialised to 0. */ + struct virtio_net_hdr_mrg_rxbuf virtio_hdr = { + {0, 0, 0, 0, 0, 0}, 0}; + uint16_t cur_idx = res_base_idx; + uint64_t vb_addr = 0; + uint64_t vb_hdr_addr = 0; + uint32_t seg_offset = 0; + uint32_t vb_offset = 0; + uint32_t seg_avail; + uint32_t vb_avail; + uint32_t cpy_len, entry_len; + + if (pkt == NULL) + return 0; + + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| " + "End Index %d\n", + dev->device_fh, cur_idx, res_end_idx); + + /* + * Convert from gpa to vva + * (guest physical addr -> vhost virtual addr) + */ + vq = dev->virtqueue[VIRTIO_RXQ]; + vb_addr = + gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr); + vb_hdr_addr = vb_addr; + + /* Prefetch buffer address. */ + rte_prefetch0((void *)(uintptr_t)vb_addr); + + virtio_hdr.num_buffers = res_end_idx - res_base_idx; + + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", + dev->device_fh, virtio_hdr.num_buffers); + + rte_memcpy((void *)(uintptr_t)vb_hdr_addr, + (const void *)&virtio_hdr, vq->vhost_hlen); + + VHOST_PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1); + + seg_avail = rte_pktmbuf_data_len(pkt); + vb_offset = vq->vhost_hlen; + vb_avail = + vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen; + + entry_len = vq->vhost_hlen; + + if (vb_avail == 0) { + uint32_t desc_idx = + vq->buf_vec[vec_idx].desc_idx; + vq->desc[desc_idx].len = vq->vhost_hlen; + + if ((vq->desc[desc_idx].flags + & VRING_DESC_F_NEXT) == 0) { + /* Update used ring with desc information */ + vq->used->ring[cur_idx & (vq->size - 1)].id + = vq->buf_vec[vec_idx].desc_idx; + vq->used->ring[cur_idx & (vq->size - 1)].len + = entry_len; + + entry_len = 0; + cur_idx++; + entry_success++; + } + + vec_idx++; + vb_addr = + gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr); + + /* Prefetch buffer address. */ + rte_prefetch0((void *)(uintptr_t)vb_addr); + vb_offset = 0; + vb_avail = vq->buf_vec[vec_idx].buf_len; + } + + cpy_len = RTE_MIN(vb_avail, seg_avail); + + while (cpy_len > 0) { + /* Copy mbuf data to vring buffer */ + rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset), + (const void *)(rte_pktmbuf_mtod(pkt, char*) + seg_offset), + cpy_len); + + VHOST_PRINT_PACKET(dev, + (uintptr_t)(vb_addr + vb_offset), + cpy_len, 0); + + seg_offset += cpy_len; + vb_offset += cpy_len; + seg_avail -= cpy_len; + vb_avail -= cpy_len; + entry_len += cpy_len; + + if (seg_avail != 0) { + /* + * The virtio buffer in this vring + * entry reach to its end. 
+ * But the segment doesn't complete. + */ + if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags & + VRING_DESC_F_NEXT) == 0) { + /* Update used ring with desc information */ + vq->used->ring[cur_idx & (vq->size - 1)].id + = vq->buf_vec[vec_idx].desc_idx; + vq->used->ring[cur_idx & (vq->size - 1)].len + = entry_len; + entry_len = 0; + cur_idx++; + entry_success++; + } + + vec_idx++; + vb_addr = gpa_to_vva(dev, + vq->buf_vec[vec_idx].buf_addr); + vb_offset = 0; + vb_avail = vq->buf_vec[vec_idx].buf_len; + cpy_len = RTE_MIN(vb_avail, seg_avail); + } else { + /* + * This current segment complete, need continue to + * check if the whole packet complete or not. + */ + pkt = pkt->pkt.next; + if (pkt != NULL) { + /* + * There are more segments. + */ + if (vb_avail == 0) { + /* + * This current buffer from vring is + * used up, need fetch next buffer + * from buf_vec. + */ + uint32_t desc_idx = + vq->buf_vec[vec_idx].desc_idx; + vq->desc[desc_idx].len = vb_offset; + + if ((vq->desc[desc_idx].flags & + VRING_DESC_F_NEXT) == 0) { + uint16_t wrapped_idx = + cur_idx & (vq->size - 1); + /* + * Update used ring with the + * descriptor information + */ + vq->used->ring[wrapped_idx].id + = desc_idx; + vq->used->ring[wrapped_idx].len + = entry_len; + entry_success++; + entry_len = 0; + cur_idx++; + } + + /* Get next buffer from buf_vec. */ + vec_idx++; + vb_addr = gpa_to_vva(dev, + vq->buf_vec[vec_idx].buf_addr); + vb_avail = + vq->buf_vec[vec_idx].buf_len; + vb_offset = 0; + } + + seg_offset = 0; + seg_avail = rte_pktmbuf_data_len(pkt); + cpy_len = RTE_MIN(vb_avail, seg_avail); + } else { + /* + * This whole packet completes. + */ + uint32_t desc_idx = + vq->buf_vec[vec_idx].desc_idx; + vq->desc[desc_idx].len = vb_offset; + + while (vq->desc[desc_idx].flags & + VRING_DESC_F_NEXT) { + desc_idx = vq->desc[desc_idx].next; + vq->desc[desc_idx].len = 0; + } + + /* Update used ring with desc information */ + vq->used->ring[cur_idx & (vq->size - 1)].id + = vq->buf_vec[vec_idx].desc_idx; + vq->used->ring[cur_idx & (vq->size - 1)].len + = entry_len; + entry_len = 0; + cur_idx++; + entry_success++; + seg_avail = 0; + cpy_len = RTE_MIN(vb_avail, seg_avail); + } + } + } + + return entry_success; +} + +/* + * This function works for mergeable RX. + */ +static inline uint32_t __attribute__((always_inline)) +virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, struct rte_mbuf **pkts, + uint32_t count) +{ + struct vhost_virtqueue *vq; + uint32_t pkt_idx = 0, entry_success = 0; + uint16_t avail_idx, res_cur_idx; + uint16_t res_base_idx, res_end_idx; + uint8_t success = 0; + + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n", + dev->device_fh); + if (unlikely(queue_id != VIRTIO_RXQ)) { + LOG_DEBUG(VHOST_DATA, "mq isn't supported in this version.\n"); + } + + vq = dev->virtqueue[VIRTIO_RXQ]; + count = RTE_MIN((uint32_t)VHOST_MAX_PKT_BURST, count); + + if (count == 0) + return 0; + + for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { + uint32_t secure_len = 0; + uint16_t need_cnt; + uint32_t vec_idx = 0; + uint32_t pkt_len = pkts[pkt_idx]->pkt.pkt_len + vq->vhost_hlen; + uint16_t i, id; + + do { + /* + * As many data cores may want access to available + * buffers, they need to be reserved. 
+ */ + res_base_idx = vq->last_used_idx_res; + res_cur_idx = res_base_idx; + + do { + avail_idx = *((volatile uint16_t *)&vq->avail->idx); + if (unlikely(res_cur_idx == avail_idx)) { + LOG_DEBUG(VHOST_DATA, + "(%"PRIu64") Failed " + "to get enough desc from " + "vring\n", + dev->device_fh); + return pkt_idx; + } else { + uint16_t wrapped_idx = + (res_cur_idx) & (vq->size - 1); + uint32_t idx = + vq->avail->ring[wrapped_idx]; + uint8_t next_desc; + + do { + next_desc = 0; + secure_len += vq->desc[idx].len; + if (vq->desc[idx].flags & + VRING_DESC_F_NEXT) { + idx = vq->desc[idx].next; + next_desc = 1; + } + } while (next_desc); + + res_cur_idx++; + } + } while (pkt_len > secure_len); + + /* vq->last_used_idx_res is atomically updated. */ + success = rte_atomic16_cmpset(&vq->last_used_idx_res, + res_base_idx, + res_cur_idx); + } while (success == 0); + + id = res_base_idx; + need_cnt = res_cur_idx - res_base_idx; + + for (i = 0; i < need_cnt; i++, id++) { + uint16_t wrapped_idx = id & (vq->size - 1); + uint32_t idx = vq->avail->ring[wrapped_idx]; + uint8_t next_desc; + do { + next_desc = 0; + vq->buf_vec[vec_idx].buf_addr = + vq->desc[idx].addr; + vq->buf_vec[vec_idx].buf_len = + vq->desc[idx].len; + vq->buf_vec[vec_idx].desc_idx = idx; + vec_idx++; + + if (vq->desc[idx].flags & VRING_DESC_F_NEXT) { + idx = vq->desc[idx].next; + next_desc = 1; + } + } while (next_desc); + } + + res_end_idx = res_cur_idx; + + entry_success = copy_from_mbuf_to_vring(dev, res_base_idx, + res_end_idx, pkts[pkt_idx]); + + rte_compiler_barrier(); + + /* + * Wait until it's our turn to add our buffer + * to the used ring. + */ + while (unlikely(vq->last_used_idx != res_base_idx)) + rte_pause(); + + *(volatile uint16_t *)&vq->used->idx += entry_success; + vq->last_used_idx = res_end_idx; + + /* Kick the guest if necessary. */ + if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) + eventfd_write((int)vq->kickfd, 1); + } + + return count; +} + +uint32_t +rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mbuf **pkts, uint32_t count) +{ + if (unlikely(dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))) + return virtio_dev_merge_rx(dev, queue_id, pkts, count); + else + return virtio_dev_rx(dev, queue_id, pkts, count); +} + + uint32_t rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint32_t count) { - struct rte_mbuf *mbuf; + struct rte_mbuf *m, *prev; struct vhost_virtqueue *vq; struct vring_desc *desc; - uint64_t buff_addr = 0; + uint64_t vb_addr = 0; uint32_t head[VHOST_MAX_PKT_BURST]; uint32_t used_idx; uint32_t i; - uint16_t free_entries, packet_success = 0; + uint16_t free_entries, entry_success = 0; uint16_t avail_idx; if (unlikely(queue_id != VIRTIO_TXQ)) { @@ -223,7 +542,8 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_me if (vq->last_used_idx == avail_idx) return 0; - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") %s()\n", __func__, + dev->device_fh); /* Prefetch available ring to retrieve head indexes. */ rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]); @@ -231,11 +551,9 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_me /*get the number of free entries in the ring*/ free_entries = (avail_idx - vq->last_used_idx); - if (free_entries > count) - free_entries = count; + free_entries = RTE_MIN(free_entries, count); /* Limit to MAX_PKT_BURST. 
*/ - if (free_entries > VHOST_MAX_PKT_BURST) - free_entries = VHOST_MAX_PKT_BURST; + free_entries = RTE_MIN(free_entries, VHOST_MAX_PKT_BURST); LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", dev->device_fh, free_entries); /* Retrieve all of the head indexes first to avoid caching issues. */ @@ -243,56 +561,166 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_me head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)]; /* Prefetch descriptor index. */ - rte_prefetch0(&vq->desc[head[packet_success]]); + rte_prefetch0(&vq->desc[head[entry_success]]); rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); - while (packet_success < free_entries) { - desc = &vq->desc[head[packet_success]]; + while (entry_success < free_entries) { + uint32_t vb_avail, vb_offset; + uint32_t seg_avail, seg_offset; + uint32_t cpy_len; + uint32_t seg_num = 0; + struct rte_mbuf *cur; + uint8_t alloc_err = 0; + + desc = &vq->desc[head[entry_success]]; /* Discard first buffer as it is the virtio header */ desc = &vq->desc[desc->next]; /* Buffer address translation. */ - buff_addr = gpa_to_vva(dev, desc->addr); + vb_addr = gpa_to_vva(dev, desc->addr); /* Prefetch buffer address. */ - rte_prefetch0((void*)(uintptr_t)buff_addr); + rte_prefetch0((void *)(uintptr_t)vb_addr); used_idx = vq->last_used_idx & (vq->size - 1); - if (packet_success < (free_entries - 1)) { + if (entry_success < (free_entries - 1)) { /* Prefetch descriptor index. */ - rte_prefetch0(&vq->desc[head[packet_success+1]]); + rte_prefetch0(&vq->desc[head[entry_success+1]]); rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]); } /* Update used index buffer information. */ - vq->used->ring[used_idx].id = head[packet_success]; + vq->used->ring[used_idx].id = head[entry_success]; vq->used->ring[used_idx].len = 0; - mbuf = rte_pktmbuf_alloc(mbuf_pool); - if (unlikely(mbuf == NULL)) { - RTE_LOG(ERR, VHOST_DATA, "Failed to allocate memory for mbuf.\n"); - return packet_success; + vb_offset = 0; + vb_avail = desc->len; + + seg_avail = 0; + /* Allocate an mbuf and populate the structure. */ + m = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(m == NULL)) { + RTE_LOG(ERR, VHOST_DATA, + "Failed to allocate memory for mbuf.\n"); + return entry_success; } - mbuf->pkt.data_len = desc->len; - mbuf->pkt.pkt_len = mbuf->pkt.data_len; + seg_offset = 0; + seg_avail = m->buf_len - RTE_PKTMBUF_HEADROOM; + cpy_len = RTE_MIN(vb_avail, seg_avail); + + VHOST_PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0); + + + seg_num++; + cur = m; + prev = m; + while (cpy_len != 0) { + rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *) + seg_offset), + (void *)((uintptr_t)(vb_addr + vb_offset)), + cpy_len); + + seg_offset += cpy_len; + vb_offset += cpy_len; + vb_avail -= cpy_len; + seg_avail -= cpy_len; + + if (vb_avail != 0) { + /* + * The segment reachs to its end, + * while the virtio buffer in TX vring has + * more data to be copied. + */ + cur->pkt.data_len = seg_offset; + m->pkt.pkt_len += seg_offset; + /* Allocate mbuf and populate the structure. */ + cur = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(cur == NULL)) { + RTE_LOG(ERR, VHOST_DATA, "Failed to " + "allocate memory for mbuf.\n"); + rte_pktmbuf_free(m); + alloc_err = 1; + break; + } + + seg_num++; + prev->pkt.next = cur; + prev = cur; + seg_offset = 0; + seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM; + } else { + if (desc->flags & VRING_DESC_F_NEXT) { + /* + * There are more virtio buffers in + * same vring entry need to be copied. 
+ */ + if (seg_avail == 0) { + /* + * The current segment hasn't + * room to accomodate more + * data. + */ + cur->pkt.data_len = seg_offset; + m->pkt.pkt_len += seg_offset; + /* + * Allocate an mbuf and + * populate the structure. + */ + cur = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(cur == NULL)) { + RTE_LOG(ERR, + VHOST_DATA, + "Failed to " + "allocate memory " + "for mbuf\n"); + rte_pktmbuf_free(m); + alloc_err = 1; + break; + } + seg_num++; + prev->pkt.next = cur; + prev = cur; + seg_offset = 0; + seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM; + } + + desc = &vq->desc[desc->next]; + + /* Buffer address translation. */ + vb_addr = gpa_to_vva(dev, desc->addr); + /* Prefetch buffer address. */ + rte_prefetch0((void *)(uintptr_t)vb_addr); + vb_offset = 0; + vb_avail = desc->len; + + VHOST_PRINT_PACKET(dev, (uintptr_t)vb_addr, + desc->len, 0); + } else { + /* The whole packet completes. */ + cur->pkt.data_len = seg_offset; + m->pkt.pkt_len += seg_offset; + vb_avail = 0; + } + } - rte_memcpy((void *) mbuf->pkt.data, - (const void *) buff_addr, mbuf->pkt.data_len); + cpy_len = RTE_MIN(vb_avail, seg_avail); + } - pkts[packet_success] = mbuf; + if (unlikely(alloc_err == 1)) + break; - VHOST_PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); + m->pkt.nb_segs = seg_num; + pkts[entry_success] = m; vq->last_used_idx++; - packet_success++; + entry_success++; } rte_compiler_barrier(); - vq->used->idx += packet_success; + vq->used->idx += entry_success; /* Kick guest if required. */ if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) eventfd_write((int)vq->kickfd, 1); + return entry_success; - return packet_success; } -- 1.8.1.4
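
For readers new to this API, below is a minimal usage sketch (not part of the
patch) showing how an application might drive the two entry points touched
here. Only the rte_vhost_enqueue_burst()/rte_vhost_dequeue_burst() signatures
and the VIRTIO_RXQ/VIRTIO_TXQ queue ids are taken from this patch; the burst
size, the mbuf pool and the way the virtio_net device handle is obtained are
assumptions made purely for illustration.

#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_virtio_net.h>

#define SKETCH_BURST 32	/* arbitrary burst size, not from the patch */

/*
 * Poll one vhost device: drain what the guest transmitted and echo it
 * back into the guest RX ring. rte_vhost_enqueue_burst() internally
 * dispatches to virtio_dev_merge_rx() or virtio_dev_rx() depending on
 * whether VIRTIO_NET_F_MRG_RXBUF was negotiated.
 */
static void
vhost_echo_poll(struct virtio_net *dev, struct rte_mempool *mbuf_pool)
{
	struct rte_mbuf *pkts[SKETCH_BURST];
	uint32_t nb_rx, i;

	/* Guest TX ring -> host mbufs. */
	nb_rx = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool,
			pkts, SKETCH_BURST);
	if (nb_rx == 0)
		return;

	/* Host mbufs -> guest RX ring (data is copied into the vring). */
	rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts, nb_rx);

	/* The copy is complete after enqueue, so the mbufs can be freed. */
	for (i = 0; i < nb_rx; i++)
		rte_pktmbuf_free(pkts[i]);
}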