From: Flavio Leitner <fbl@sysclose.org>
To: dev@dpdk.org
Cc: Maxime Coquelin <maxime.coquelin@redhat.com>,
	Tiwei Bie <tiwei.bie@intel.com>,
	Zhihong Wang <zhihong.wang@intel.com>,
	Obrembski MichalX <michalx.obrembski@intel.com>,
	Stokes Ian <ian.stokes@intel.com>
Subject: [dpdk-dev] [PATCH] vhost: add support for large linear mbufs
Date: Tue,  1 Oct 2019 19:19:35 -0300
Message-ID: <20191001221935.12140-1-fbl@sysclose.org> (raw)

The rte_vhost_dequeue_burst() API supports two ways of dequeuing
data. If the data fits into a single mbuf, all of it is copied and a
single linear mbuf is returned. Otherwise, additional mbufs are
allocated and chained together to return a multi-segment mbuf.
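
For illustration (this helper is not part of the patch), the kind of
segment walking an application must do today when the data does not
fit in a single mbuf:

    /* Hedged sketch: count the payload bytes of a multi-segment
     * mbuf chain returned by rte_vhost_dequeue_burst(). */
    #include <rte_mbuf.h>

    static uint32_t
    count_payload_bytes(const struct rte_mbuf *m)
    {
            uint32_t bytes = 0;

            /* Each segment holds a slice of the packet data. */
            for (; m != NULL; m = m->next)
                    bytes += rte_pktmbuf_data_len(m);

            return bytes;
    }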

While that covers most use cases, it forces applications that need
to work with larger data sizes to support multi-segment mbufs. The
non-linear layout adds complexity to the application and has
performance implications.

To resolve the issue, change the API so that the application can
optionally provide a second mempool containing larger mbufs. If that
pool is not provided (NULL), the behavior is unchanged. Otherwise,
the data size is checked and a linear mbuf is returned from the
corresponding mempool.
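
As a usage sketch (the pool names, sizes and burst size below are
illustrative assumptions, not mandated by the patch), an application
creates a second pool sized for its largest expected frame and passes
it as the new parameter; passing NULL keeps the old behavior:

    #include <rte_lcore.h>
    #include <rte_mbuf.h>
    #include <rte_vhost.h>

    #define BURST_SZ 32

    static uint16_t
    dequeue_linear(int vid, uint16_t queue_id, struct rte_mbuf **pkts)
    {
            static struct rte_mempool *pool, *pool_large;

            if (pool == NULL) {
                    /* Regular mbufs for the common case. */
                    pool = rte_pktmbuf_pool_create("mb_pool", 8191,
                                    256, 0, RTE_MBUF_DEFAULT_BUF_SIZE,
                                    rte_socket_id());
                    /* Larger buffers so jumbo/TSO-sized frames come
                     * back linear.  Error handling omitted. */
                    pool_large = rte_pktmbuf_pool_create("mb_pool_large",
                                    1023, 64, 0,
                                    9728 + RTE_PKTMBUF_HEADROOM,
                                    rte_socket_id());
            }

            /* NULL as the large pool preserves the old chaining. */
            return rte_vhost_dequeue_burst(vid, queue_id, pool,
                            pool_large, pkts, BURST_SZ);
    }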

Signed-off-by: Flavio Leitner <fbl@sysclose.org>
---
 drivers/net/vhost/rte_eth_vhost.c |  4 +--
 examples/tep_termination/main.c   |  2 +-
 examples/vhost/main.c             |  2 +-
 lib/librte_vhost/rte_vhost.h      |  5 ++-
 lib/librte_vhost/virtio_net.c     | 57 +++++++++++++++++++++++--------
 5 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/drivers/net/vhost/rte_eth_vhost.c b/drivers/net/vhost/rte_eth_vhost.c
index 46f01a7f4..ce7f68a5b 100644
--- a/drivers/net/vhost/rte_eth_vhost.c
+++ b/drivers/net/vhost/rte_eth_vhost.c
@@ -393,8 +393,8 @@ eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
 						 VHOST_MAX_PKT_BURST);
 
 		nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
-						  r->mb_pool, &bufs[nb_rx],
-						  num);
+						  r->mb_pool, NULL,
+						  &bufs[nb_rx], num);
 
 		nb_rx += nb_pkts;
 		nb_receive -= nb_pkts;
diff --git a/examples/tep_termination/main.c b/examples/tep_termination/main.c
index ab956ad7c..3ebf0fa6e 100644
--- a/examples/tep_termination/main.c
+++ b/examples/tep_termination/main.c
@@ -697,7 +697,7 @@ switch_worker(__rte_unused void *arg)
 			if (likely(!vdev->remove)) {
 				/* Handle guest TX*/
 				tx_count = rte_vhost_dequeue_burst(vdev->vid,
-						VIRTIO_TXQ, mbuf_pool,
+						VIRTIO_TXQ, mbuf_pool, NULL,
 						pkts_burst, MAX_PKT_BURST);
 				/* If this is the first received packet we need to learn the MAC */
 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index ab649bf14..e9b306af3 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -1092,7 +1092,7 @@ drain_virtio_tx(struct vhost_dev *vdev)
 					pkts, MAX_PKT_BURST);
 	} else {
 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
-					mbuf_pool, pkts, MAX_PKT_BURST);
+					mbuf_pool, NULL, pkts, MAX_PKT_BURST);
 	}
 
 	/* setup VMDq for the first packet */
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 19474bca0..b05fd8e2a 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -593,6 +593,8 @@ uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
  *  virtio queue index in mq case
  * @param mbuf_pool
  *  mbuf_pool where host mbuf is allocated.
+ * @param mbuf_pool_large
+ *  mbuf_pool where larger host mbufs are allocated (may be NULL).
  * @param pkts
  *  array to contain packets to be dequeued
  * @param count
@@ -601,7 +603,8 @@ uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
  *  num of packets dequeued
  */
 uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
-	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
+	struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
+	struct rte_mbuf **pkts, uint16_t count);
 
 /**
  * Get guest mem table: a list of memory regions.
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 5b85b832d..da9d77732 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1291,10 +1291,12 @@ get_zmbuf(struct vhost_virtqueue *vq)
 
 static __rte_noinline uint16_t
 virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
-	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+	struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
+	struct rte_mbuf **pkts, uint16_t count)
 {
 	uint16_t i;
 	uint16_t free_entries;
+	uint16_t mbuf_avail;
 
 	if (unlikely(dev->dequeue_zero_copy)) {
 		struct zcopy_mbuf *zmbuf, *next;
@@ -1340,32 +1342,42 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
 			dev->vid, count);
 
+	/* If the large mempool is provided, compute the size threshold */
+	mbuf_avail = 0;
+	if (mbuf_pool_large)
+		mbuf_avail = rte_pktmbuf_data_room_size(mbuf_pool) - RTE_PKTMBUF_HEADROOM;
+
 	for (i = 0; i < count; i++) {
 		struct buf_vector buf_vec[BUF_VECTOR_MAX];
 		uint16_t head_idx;
-		uint32_t dummy_len;
+		uint32_t buf_len;
 		uint16_t nr_vec = 0;
+		struct rte_mempool *mpool;
 		int err;
 
 		if (unlikely(fill_vec_buf_split(dev, vq,
 						vq->last_avail_idx + i,
 						&nr_vec, buf_vec,
-						&head_idx, &dummy_len,
+						&head_idx, &buf_len,
 						VHOST_ACCESS_RO) < 0))
 			break;
 
 		if (likely(dev->dequeue_zero_copy == 0))
 			update_shadow_used_ring_split(vq, head_idx, 0);
 
-		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
+		if (mbuf_pool_large && buf_len > mbuf_avail)
+			mpool = mbuf_pool_large;
+		else
+			mpool = mbuf_pool;
+
+		pkts[i] = rte_pktmbuf_alloc(mpool);
 		if (unlikely(pkts[i] == NULL)) {
 			RTE_LOG(ERR, VHOST_DATA,
 				"Failed to allocate memory for mbuf.\n");
 			break;
 		}
 
-		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
-				mbuf_pool);
+		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], mpool);
 		if (unlikely(err)) {
 			rte_pktmbuf_free(pkts[i]);
 			break;
@@ -1411,9 +1423,11 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 static __rte_noinline uint16_t
 virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
-	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+	struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
+	struct rte_mbuf **pkts, uint16_t count)
 {
 	uint16_t i;
+	uint16_t mbuf_avail;
 
 	if (unlikely(dev->dequeue_zero_copy)) {
 		struct zcopy_mbuf *zmbuf, *next;
@@ -1448,17 +1462,23 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
 			dev->vid, count);
 
+	/* If the large mempool is provided, compute the size threshold */
+	mbuf_avail = 0;
+	if (mbuf_pool_large)
+		mbuf_avail = rte_pktmbuf_data_room_size(mbuf_pool) - RTE_PKTMBUF_HEADROOM;
+
 	for (i = 0; i < count; i++) {
 		struct buf_vector buf_vec[BUF_VECTOR_MAX];
 		uint16_t buf_id;
-		uint32_t dummy_len;
+		uint32_t buf_len;
 		uint16_t desc_count, nr_vec = 0;
+		struct rte_mempool *mpool;
 		int err;
 
 		if (unlikely(fill_vec_buf_packed(dev, vq,
 						vq->last_avail_idx, &desc_count,
 						buf_vec, &nr_vec,
-						&buf_id, &dummy_len,
+						&buf_id, &buf_len,
 						VHOST_ACCESS_RO) < 0))
 			break;
 
@@ -1466,15 +1486,19 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			update_shadow_used_ring_packed(vq, buf_id, 0,
 					desc_count);
 
-		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
+		if (mbuf_pool_large && buf_len > mbuf_avail)
+			mpool = mbuf_pool_large;
+		else
+			mpool = mbuf_pool;
+
+		pkts[i] = rte_pktmbuf_alloc(mpool);
 		if (unlikely(pkts[i] == NULL)) {
 			RTE_LOG(ERR, VHOST_DATA,
 				"Failed to allocate memory for mbuf.\n");
 			break;
 		}
 
-		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
-				mbuf_pool);
+		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], mpool);
 		if (unlikely(err)) {
 			rte_pktmbuf_free(pkts[i]);
 			break;
@@ -1526,7 +1550,8 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 uint16_t
 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
-	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+	struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
+	struct rte_mbuf **pkts, uint16_t count)
 {
 	struct virtio_net *dev;
 	struct rte_mbuf *rarp_mbuf = NULL;
@@ -1598,9 +1623,11 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 	}
 
 	if (vq_is_packed(dev))
-		count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count);
+		count = virtio_dev_tx_packed(dev, vq, mbuf_pool, mbuf_pool_large,
+					     pkts, count);
 	else
-		count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count);
+		count = virtio_dev_tx_split(dev, vq, mbuf_pool, mbuf_pool_large,
+					    pkts, count);
 
 out:
 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
-- 
2.20.1


