DPDK patches and discussions
* [dpdk-dev] [PATCH] vhost: add support to large linear mbufs
@ 2019-10-01 22:19 Flavio Leitner
  2019-10-01 23:10 ` Flavio Leitner
                   ` (3 more replies)
  0 siblings, 4 replies; 32+ messages in thread
From: Flavio Leitner @ 2019-10-01 22:19 UTC (permalink / raw)
  To: dev
  Cc: Maxime Coquelin, Tiwei Bie, Zhihong Wang, Obrembski MichalX, Stokes Ian

The rte_vhost_dequeue_burst supports two ways of dequeuing data. If
the data fits into a buffer, then all data is copied and a single
linear buffer is returned. Otherwise it allocates additional mbufs
and chains them together to return a multi-segment mbuf.

While that covers most use cases, it forces applications that need
to work with larger data sizes to support multi-segment mbufs. The
non-linear characteristic brings complexity and performance
implications to the application.

To resolve the issue, change the API so that the application can
optionally provide a second mempool containing larger mbufs. If that
is not provided (NULL), the behavior remains the same as before the
change. Otherwise, the data size is checked and the corresponding
mempool is used to return linear mbufs.
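
For illustration, a caller opting in would create the extra mempool and
pass it as the new argument. This is only a sketch: the pool sizes and
names, and the vid/pkts/VIRTIO_TXQ/MAX_PKT_BURST variables, are
placeholders borrowed from the existing examples, not part of this change:

	struct rte_mempool *mp, *mp_large;
	uint16_t count;

	mp = rte_pktmbuf_pool_create("mbuf_pool", 16384, 256, 0,
				     RTE_MBUF_DEFAULT_BUF_SIZE,
				     rte_socket_id());
	/* data_room_size is a uint16_t, so ~64KB is the practical ceiling */
	mp_large = rte_pktmbuf_pool_create("mbuf_pool_large", 1024, 32, 0,
					   65535, rte_socket_id());

	/* passing NULL instead of mp_large keeps the old behavior */
	count = rte_vhost_dequeue_burst(vid, VIRTIO_TXQ, mp, mp_large,
					pkts, MAX_PKT_BURST);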

Signed-off-by: Flavio Leitner <fbl@sysclose.org>
---
 drivers/net/vhost/rte_eth_vhost.c |  4 +--
 examples/tep_termination/main.c   |  2 +-
 examples/vhost/main.c             |  2 +-
 lib/librte_vhost/rte_vhost.h      |  5 ++-
 lib/librte_vhost/virtio_net.c     | 57 +++++++++++++++++++++++--------
 5 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/drivers/net/vhost/rte_eth_vhost.c b/drivers/net/vhost/rte_eth_vhost.c
index 46f01a7f4..ce7f68a5b 100644
--- a/drivers/net/vhost/rte_eth_vhost.c
+++ b/drivers/net/vhost/rte_eth_vhost.c
@@ -393,8 +393,8 @@ eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
 						 VHOST_MAX_PKT_BURST);
 
 		nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
-						  r->mb_pool, &bufs[nb_rx],
-						  num);
+						  r->mb_pool, NULL,
+						  &bufs[nb_rx], num);
 
 		nb_rx += nb_pkts;
 		nb_receive -= nb_pkts;
diff --git a/examples/tep_termination/main.c b/examples/tep_termination/main.c
index ab956ad7c..3ebf0fa6e 100644
--- a/examples/tep_termination/main.c
+++ b/examples/tep_termination/main.c
@@ -697,7 +697,7 @@ switch_worker(__rte_unused void *arg)
 			if (likely(!vdev->remove)) {
 				/* Handle guest TX*/
 				tx_count = rte_vhost_dequeue_burst(vdev->vid,
-						VIRTIO_TXQ, mbuf_pool,
+						VIRTIO_TXQ, mbuf_pool, NULL,
 						pkts_burst, MAX_PKT_BURST);
 				/* If this is the first received packet we need to learn the MAC */
 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index ab649bf14..e9b306af3 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -1092,7 +1092,7 @@ drain_virtio_tx(struct vhost_dev *vdev)
 					pkts, MAX_PKT_BURST);
 	} else {
 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
-					mbuf_pool, pkts, MAX_PKT_BURST);
+					mbuf_pool, NULL, pkts, MAX_PKT_BURST);
 	}
 
 	/* setup VMDq for the first packet */
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 19474bca0..b05fd8e2a 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -593,6 +593,8 @@ uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
  *  virtio queue index in mq case
  * @param mbuf_pool
  *  mbuf_pool where host mbuf is allocated.
+ * @param mbuf_pool_large
+ *  mbuf_pool where larger host mbuf is allocated.
  * @param pkts
  *  array to contain packets to be dequeued
  * @param count
@@ -601,7 +603,8 @@ uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
  *  num of packets dequeued
  */
 uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
-	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
+	struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
+	struct rte_mbuf **pkts, uint16_t count);
 
 /**
  * Get guest mem table: a list of memory regions.
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 5b85b832d..da9d77732 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1291,10 +1291,12 @@ get_zmbuf(struct vhost_virtqueue *vq)
 
 static __rte_noinline uint16_t
 virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
-	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+	struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
+	struct rte_mbuf **pkts, uint16_t count)
 {
 	uint16_t i;
 	uint16_t free_entries;
+	uint16_t mbuf_avail;
 
 	if (unlikely(dev->dequeue_zero_copy)) {
 		struct zcopy_mbuf *zmbuf, *next;
@@ -1340,32 +1342,42 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
 			dev->vid, count);
 
+	/* If the large mpool is provided, find the threshold */
+	mbuf_avail = 0;
+	if (mbuf_pool_large)
+		mbuf_avail = rte_pktmbuf_data_room_size(mbuf_pool) - RTE_PKTMBUF_HEADROOM;
+
 	for (i = 0; i < count; i++) {
 		struct buf_vector buf_vec[BUF_VECTOR_MAX];
 		uint16_t head_idx;
-		uint32_t dummy_len;
+		uint32_t buf_len;
 		uint16_t nr_vec = 0;
+		struct rte_mempool *mpool;
 		int err;
 
 		if (unlikely(fill_vec_buf_split(dev, vq,
 						vq->last_avail_idx + i,
 						&nr_vec, buf_vec,
-						&head_idx, &dummy_len,
+						&head_idx, &buf_len,
 						VHOST_ACCESS_RO) < 0))
 			break;
 
 		if (likely(dev->dequeue_zero_copy == 0))
 			update_shadow_used_ring_split(vq, head_idx, 0);
 
-		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
+		if (mbuf_pool_large && buf_len > mbuf_avail)
+			mpool = mbuf_pool_large;
+		else
+			mpool = mbuf_pool;
+
+		pkts[i] = rte_pktmbuf_alloc(mpool);
 		if (unlikely(pkts[i] == NULL)) {
 			RTE_LOG(ERR, VHOST_DATA,
 				"Failed to allocate memory for mbuf.\n");
 			break;
 		}
 
-		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
-				mbuf_pool);
+		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], mpool);
 		if (unlikely(err)) {
 			rte_pktmbuf_free(pkts[i]);
 			break;
@@ -1411,9 +1423,11 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 static __rte_noinline uint16_t
 virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
-	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+	struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
+	struct rte_mbuf **pkts, uint16_t count)
 {
 	uint16_t i;
+	uint16_t mbuf_avail;
 
 	if (unlikely(dev->dequeue_zero_copy)) {
 		struct zcopy_mbuf *zmbuf, *next;
@@ -1448,17 +1462,23 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
 			dev->vid, count);
 
+	/* If the large mpool is provided, find the threshold */
+	mbuf_avail = 0;
+	if (mbuf_pool_large)
+		mbuf_avail = rte_pktmbuf_data_room_size(mbuf_pool) - RTE_PKTMBUF_HEADROOM;
+
 	for (i = 0; i < count; i++) {
 		struct buf_vector buf_vec[BUF_VECTOR_MAX];
 		uint16_t buf_id;
-		uint32_t dummy_len;
+		uint32_t buf_len;
 		uint16_t desc_count, nr_vec = 0;
+		struct rte_mempool *mpool;
 		int err;
 
 		if (unlikely(fill_vec_buf_packed(dev, vq,
 						vq->last_avail_idx, &desc_count,
 						buf_vec, &nr_vec,
-						&buf_id, &dummy_len,
+						&buf_id, &buf_len,
 						VHOST_ACCESS_RO) < 0))
 			break;
 
@@ -1466,15 +1486,19 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			update_shadow_used_ring_packed(vq, buf_id, 0,
 					desc_count);
 
-		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
+		if (mbuf_pool_large && buf_len > mbuf_avail)
+			mpool = mbuf_pool_large;
+		else
+			mpool = mbuf_pool;
+
+		pkts[i] = rte_pktmbuf_alloc(mpool);
 		if (unlikely(pkts[i] == NULL)) {
 			RTE_LOG(ERR, VHOST_DATA,
 				"Failed to allocate memory for mbuf.\n");
 			break;
 		}
 
-		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
-				mbuf_pool);
+		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], mpool);
 		if (unlikely(err)) {
 			rte_pktmbuf_free(pkts[i]);
 			break;
@@ -1526,7 +1550,8 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 uint16_t
 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
-	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+	struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
+	struct rte_mbuf **pkts, uint16_t count)
 {
 	struct virtio_net *dev;
 	struct rte_mbuf *rarp_mbuf = NULL;
@@ -1598,9 +1623,11 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 	}
 
 	if (vq_is_packed(dev))
-		count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count);
+		count = virtio_dev_tx_packed(dev, vq, mbuf_pool, mbuf_pool_large, pkts,
+                                     count);
 	else
-		count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count);
+		count = virtio_dev_tx_split(dev, vq, mbuf_pool, mbuf_pool_large, pkts,
+                                    count);
 
 out:
 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
-- 
2.20.1


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs
  2019-10-01 22:19 [dpdk-dev] [PATCH] vhost: add support to large linear mbufs Flavio Leitner
@ 2019-10-01 23:10 ` Flavio Leitner
  2019-10-02  4:45 ` Shahaf Shuler
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 32+ messages in thread
From: Flavio Leitner @ 2019-10-01 23:10 UTC (permalink / raw)
  To: dev
  Cc: Maxime Coquelin, Tiwei Bie, Zhihong Wang, Obrembski MichalX, Stokes Ian

On Tue,  1 Oct 2019 19:19:35 -0300
Flavio Leitner <fbl@sysclose.org> wrote:

> The rte_vhost_dequeue_burst supports two ways of dequeuing data. If
> the data fits into a buffer, then all data is copied and a single
> linear buffer is returned. Otherwise it allocates additional mbufs
> and chains them together to return a multiple segments mbuf.
> 
> While that covers most use cases, it forces applications that need
> to work with larger data sizes to support multiple segments mbufs.
> The non-linear characteristic brings complexity and performance
> implications to the application.
> 
> To resolve the issue, change the API so that the application can
> optionally provide a second mempool containing larger mbufs. If that
> is not provided (NULL), the behavior remains as before the change.
> Otherwise, the data size is checked and the corresponding mempool
> is used to return linear mbufs.
> 
> Signed-off-by: Flavio Leitner <fbl@sysclose.org>

The use case behind the patch is TSO support in DPDK-accelerated Open
vSwitch (customers are using ovs-dpdk to send big packets too!). The
effort has been going on for many months now. There is a recent
proposal on the OvS dev mailing list using the API as it is today:
https://mail.openvswitch.org/pipermail/ovs-dev/2019-September/362573.html

There is also a reply from me in that thread with the patches
implementing a proof of concept with the proposed API change:
https://mail.openvswitch.org/pipermail/ovs-dev/2019-September/362881.html

I opted to improve the current API instead of adding a new one, but I
have no strong opinion either way.

I know it's a bit late for v19.11, but since the TSO support effort has
been going on for months now and OvS uses only LTS releases, I wonder
if this can be part of v19.11.

Thanks,
fbl

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs
  2019-10-01 22:19 [dpdk-dev] [PATCH] vhost: add support to large linear mbufs Flavio Leitner
  2019-10-01 23:10 ` Flavio Leitner
@ 2019-10-02  4:45 ` Shahaf Shuler
  2019-10-02  8:04   ` David Marchand
  2019-10-02  7:51 ` Maxime Coquelin
  2019-10-04 20:10 ` [dpdk-dev] [PATCH v2] vhost: add support for large buffers Flavio Leitner
  3 siblings, 1 reply; 32+ messages in thread
From: Shahaf Shuler @ 2019-10-02  4:45 UTC (permalink / raw)
  To: Flavio Leitner, dev
  Cc: Maxime Coquelin, Tiwei Bie, Zhihong Wang, Obrembski MichalX, Stokes Ian

Wednesday, October 2, 2019 1:20 AM, Flavio Leitner:
> Subject: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs
> 
> The rte_vhost_dequeue_burst supports two ways of dequeuing data. If the
> data fits into a buffer, then all data is copied and a single linear buffer is
> returned. Otherwise it allocates additional mbufs and chains them together
> to return a multiple segments mbuf.
> 
> While that covers most use cases, it forces applications that need to work
> with larger data sizes to support multiple segments mbufs.
> The non-linear characteristic brings complexity and performance implications
> to the application.
> 
> To resolve the issue, change the API so that the application can optionally
> provide a second mempool containing larger mbufs. If that is not provided
> (NULL), the behavior remains as before the change.
> Otherwise, the data size is checked and the corresponding mempool is used
> to return linear mbufs.

I understand the motivation.
However, providing a static pool w/ large buffers is not so efficient in terms of memory footprint. You will need to prepare for the worst case (all packets are large) w/ a max size of 64KB.
Also, the two mempools are quite restrictive as the memory fill of the mbufs might be very sparse. E.g. mempool1 mbuf.size = 1.5K, mempool2 mbuf.size = 64K, packet size 4KB.

Instead, how about using the mbuf external buffer feature?
The flow will be:
1. vhost PMD always receives a single mempool (like today)
2. on dequeue, the PMD looks at the virtio packet size. If smaller than the mbuf size, use the mbuf as is (like today)
3. otherwise, allocate a new buffer (inside the PMD) and link it to the mbuf as an external buffer (rte_pktmbuf_attach_extbuf); a rough sketch follows below

The pros of this approach are full flexibility on the memory allocation, and therefore a lower footprint.
The cons are that OVS will need to know how to handle mbufs w/ external buffers (not too complex IMO).
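
For reference, a rough sketch of how step 3 could look using the generic
mbuf helpers (this is not the mlx5 code; m is the mbuf taken from the rxq
mempool, pkt_size is the virtio payload size, and ext_buf_free_cb is a
placeholder callback that rte_free()s the buffer once the last reference
is gone; error handling omitted):

	uint16_t buf_len = RTE_PKTMBUF_HEADROOM + pkt_size +
			   sizeof(struct rte_mbuf_ext_shared_info);
	void *buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
	struct rte_mbuf_ext_shared_info *shinfo;

	/* the helper carves shinfo out of the tail of the buffer */
	shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
						    ext_buf_free_cb, buf);
	rte_pktmbuf_attach_extbuf(m, buf, rte_malloc_virt2iova(buf),
				  buf_len, shinfo);
	rte_pktmbuf_reset_headroom(m);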

[...]


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs
  2019-10-01 22:19 [dpdk-dev] [PATCH] vhost: add support to large linear mbufs Flavio Leitner
  2019-10-01 23:10 ` Flavio Leitner
  2019-10-02  4:45 ` Shahaf Shuler
@ 2019-10-02  7:51 ` Maxime Coquelin
  2019-10-04 20:10 ` [dpdk-dev] [PATCH v2] vhost: add support for large buffers Flavio Leitner
  3 siblings, 0 replies; 32+ messages in thread
From: Maxime Coquelin @ 2019-10-02  7:51 UTC (permalink / raw)
  To: Flavio Leitner, dev
  Cc: Tiwei Bie, Zhihong Wang, Obrembski MichalX, Stokes Ian

Hi Flavio,

On 10/2/19 12:19 AM, Flavio Leitner wrote:
> The rte_vhost_dequeue_burst supports two ways of dequeuing data. If
> the data fits into a buffer, then all data is copied and a single
> linear buffer is returned. Otherwise it allocates additional mbufs
> and chains them together to return a multiple segments mbuf.
> 
> While that covers most use cases, it forces applications that need
> to work with larger data sizes to support multiple segments mbufs.
> The non-linear characteristic brings complexity and performance
> implications to the application.
> 
> To resolve the issue, change the API so that the application can
> optionally provide a second mempool containing larger mbufs. If that
> is not provided (NULL), the behavior remains as before the change.
> Otherwise, the data size is checked and the corresponding mempool
> is used to return linear mbufs.

It is not necessary to break the API (that would require a deprecation
notice); instead, you could create a new API:

static inline uint16_t
vhost_dequeue_burst(int vid, uint16_t queue_id,
		struct rte_mempool *mbuf_pool,
		struct rte_mempool *mbuf_pool_large,
		struct rte_mbuf **pkts, uint16_t count);

uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
				struct rte_mempool *mbuf_pool,
				struct rte_mbuf **pkts, uint16_t count)
{
	return vhost_dequeue_burst(vid, queue_id, mbuf_pool, NULL,
				pkts, count);
}

uint16_t rte_vhost_dequeue_burst2(int vid, uint16_t queue_id,
				struct rte_mempool *mbuf_pool,
				struct rte_mempool *mbuf_pool_large,
				struct rte_mbuf **pkts, uint16_t count)
{
	return vhost_dequeue_burst(vid, queue_id, mbuf_pool,
				mbuf_pool_large,
				pkts, count);
}



Yeah, the name isn't very creative; I'm sure you'll have a better idea!
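
An application wanting large linear mbufs would then call something like
this (just a sketch reusing the placeholder name above, with both pools
created by the application):

	count = rte_vhost_dequeue_burst2(vid, VIRTIO_TXQ, mbuf_pool,
					 mbuf_pool_large, pkts,
					 MAX_PKT_BURST);

while existing callers of rte_vhost_dequeue_burst() stay untouched.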

> 
> Signed-off-by: Flavio Leitner <fbl@sysclose.org>
> ---
>  drivers/net/vhost/rte_eth_vhost.c |  4 +--
>  examples/tep_termination/main.c   |  2 +-
>  examples/vhost/main.c             |  2 +-
>  lib/librte_vhost/rte_vhost.h      |  5 ++-
>  lib/librte_vhost/virtio_net.c     | 57 +++++++++++++++++++++++--------
>  5 files changed, 50 insertions(+), 20 deletions(-)

Maxime

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs
  2019-10-02  4:45 ` Shahaf Shuler
@ 2019-10-02  8:04   ` David Marchand
  2019-10-02  9:00     ` Shahaf Shuler
  0 siblings, 1 reply; 32+ messages in thread
From: David Marchand @ 2019-10-02  8:04 UTC (permalink / raw)
  To: Shahaf Shuler
  Cc: Flavio Leitner, dev, Maxime Coquelin, Tiwei Bie, Zhihong Wang,
	Obrembski MichalX, Stokes Ian

Hello Shahaf,

On Wed, Oct 2, 2019 at 6:46 AM Shahaf Shuler <shahafs@mellanox.com> wrote:
>
> Wednesday, October 2, 2019 1:20 AM, Flavio Leitner:
> > Subject: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs
> >
> > The rte_vhost_dequeue_burst supports two ways of dequeuing data. If the
> > data fits into a buffer, then all data is copied and a single linear buffer is
> > returned. Otherwise it allocates additional mbufs and chains them together
> > to return a multiple segments mbuf.
> >
> > While that covers most use cases, it forces applications that need to work
> > with larger data sizes to support multiple segments mbufs.
> > The non-linear characteristic brings complexity and performance implications
> > to the application.
> >
> > To resolve the issue, change the API so that the application can optionally
> > provide a second mempool containing larger mbufs. If that is not provided
> > (NULL), the behavior remains as before the change.
> > Otherwise, the data size is checked and the corresponding mempool is used
> > to return linear mbufs.
>
> I understand the motivation.
> However, providing a static pool w/ large buffers is not so efficient in terms of memory footprint. You will need to prepare to worst case (all packet are large) w/ max size of 64KB.
> Also, the two mempools are quite restrictive as the memory fill of the mbufs might be very sparse. E.g. mempool1 mbuf.size = 1.5K , mempool2 mbuf.size = 64K, packet size 4KB.
>
> Instead, how about using the mbuf external buffer feature?
> The flow will be:
> 1. vhost PMD always receive a single mempool (like today)
> 2. on dequeue, PMD looks on the virtio packet size. If smaller than the mbuf size use the mbuf as is (like today)
> 3. otherwise, allocate a new buffer (inside the PMD) and link it to the mbuf as external buffer (rte_pktmbuf_attach_extbuf)

I am missing some piece here.
Which pool would the PMD take those external buffers from?

If it is from an additional mempool passed to the vhost pmd, I can't
see the difference from Flavio's proposal.


> The pros of this approach is that you have full flexibility on the memory allocation, and therefore a lower footprint.
> The cons is the OVS will need to know how to handle mbuf w/ external buffers (not too complex IMO).


-- 
David Marchand

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs
  2019-10-02  8:04   ` David Marchand
@ 2019-10-02  9:00     ` Shahaf Shuler
  2019-10-02 12:58       ` Flavio Leitner
  0 siblings, 1 reply; 32+ messages in thread
From: Shahaf Shuler @ 2019-10-02  9:00 UTC (permalink / raw)
  To: David Marchand
  Cc: Flavio Leitner, dev, Maxime Coquelin, Tiwei Bie, Zhihong Wang,
	Obrembski MichalX, Stokes Ian

Wednesday, October 2, 2019 11:05 AM, David Marchand:
> Subject: Re: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs
> 
> Hello Shahaf,
> 
> On Wed, Oct 2, 2019 at 6:46 AM Shahaf Shuler <shahafs@mellanox.com>
> wrote:
> >
> > Wednesday, October 2, 2019 1:20 AM, Flavio Leitner:
> > > Subject: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs
> > >
> > > The rte_vhost_dequeue_burst supports two ways of dequeuing data. If
> > > the data fits into a buffer, then all data is copied and a single
> > > linear buffer is returned. Otherwise it allocates additional mbufs
> > > and chains them together to return a multiple segments mbuf.
> > >
> > > While that covers most use cases, it forces applications that need
> > > to work with larger data sizes to support multiple segments mbufs.
> > > The non-linear characteristic brings complexity and performance
> > > implications to the application.
> > >
> > > To resolve the issue, change the API so that the application can
> > > optionally provide a second mempool containing larger mbufs. If that
> > > is not provided (NULL), the behavior remains as before the change.
> > > Otherwise, the data size is checked and the corresponding mempool is
> > > used to return linear mbufs.
> >
> > I understand the motivation.
> > However, providing a static pool w/ large buffers is not so efficient in terms
> of memory footprint. You will need to prepare to worst case (all packet are
> large) w/ max size of 64KB.
> > Also, the two mempools are quite restrictive as the memory fill of the
> mbufs might be very sparse. E.g. mempool1 mbuf.size = 1.5K , mempool2
> mbuf.size = 64K, packet size 4KB.
> >
> > Instead, how about using the mbuf external buffer feature?
> > The flow will be:
> > 1. vhost PMD always receive a single mempool (like today) 2. on
> > dequeue, PMD looks on the virtio packet size. If smaller than the mbuf
> > size use the mbuf as is (like today) 3. otherwise, allocate a new
> > buffer (inside the PMD) and link it to the mbuf as external buffer
> > (rte_pktmbuf_attach_extbuf)
> 
> I am missing some piece here.
> Which pool would the PMD take those external buffers from?

The mbuf is always taken from the single mempool associated w/ the rxq.
The buffer for the mbuf may be allocated (in case the virtio payload is bigger than the current mbuf size) from DPDK hugepages or any other system memory and be attached to the mbuf.

You can see an example implementation of it in the mlx5 PMD (check the rte_pktmbuf_attach_extbuf call)

> 
> If it is from an additional mempool passed to the vhost pmd, I can't see the
> difference with Flavio proposal.
> 
> 
> > The pros of this approach is that you have full flexibility on the memory
> allocation, and therefore a lower footprint.
> > The cons is the OVS will need to know how to handle mbuf w/ external
> buffers (not too complex IMO).
> 
> 
> --
> David Marchand

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs
  2019-10-02  9:00     ` Shahaf Shuler
@ 2019-10-02 12:58       ` Flavio Leitner
  2019-10-02 17:50         ` Shahaf Shuler
  0 siblings, 1 reply; 32+ messages in thread
From: Flavio Leitner @ 2019-10-02 12:58 UTC (permalink / raw)
  To: Shahaf Shuler
  Cc: David Marchand, dev, Maxime Coquelin, Tiwei Bie, Zhihong Wang,
	Obrembski MichalX, Stokes Ian


Hi Shahaf,

Thanks for looking into this, see my inline comments.

On Wed, 2 Oct 2019 09:00:11 +0000
Shahaf Shuler <shahafs@mellanox.com> wrote:

> Wednesday, October 2, 2019 11:05 AM, David Marchand:
> > Subject: Re: [dpdk-dev] [PATCH] vhost: add support to large linear
> > mbufs
> > 
> > Hello Shahaf,
> > 
> > On Wed, Oct 2, 2019 at 6:46 AM Shahaf Shuler <shahafs@mellanox.com>
> > wrote:  
> > >
> > > Wednesday, October 2, 2019 1:20 AM, Flavio Leitner:  
> > > > Subject: [dpdk-dev] [PATCH] vhost: add support to large linear
> > > > mbufs
> > > >
> > > > The rte_vhost_dequeue_burst supports two ways of dequeuing
> > > > data. If the data fits into a buffer, then all data is copied
> > > > and a single linear buffer is returned. Otherwise it allocates
> > > > additional mbufs and chains them together to return a multiple
> > > > segments mbuf.
> > > >
> > > > While that covers most use cases, it forces applications that
> > > > need to work with larger data sizes to support multiple
> > > > segments mbufs. The non-linear characteristic brings complexity
> > > > and performance implications to the application.
> > > >
> > > > To resolve the issue, change the API so that the application can
> > > > optionally provide a second mempool containing larger mbufs. If
> > > > that is not provided (NULL), the behavior remains as before the
> > > > change. Otherwise, the data size is checked and the
> > > > corresponding mempool is used to return linear mbufs.  
> > >
> > > I understand the motivation.
> > > However, providing a static pool w/ large buffers is not so
> > > efficient in terms  
> > of memory footprint. You will need to prepare to worst case (all
> > packet are large) w/ max size of 64KB.  
> > > Also, the two mempools are quite restrictive as the memory fill
> > > of the  
> > mbufs might be very sparse. E.g. mempool1 mbuf.size = 1.5K ,
> > mempool2 mbuf.size = 64K, packet size 4KB.  
> > >
> > > Instead, how about using the mbuf external buffer feature?
> > > The flow will be:
> > > 1. vhost PMD always receive a single mempool (like today) 2. on
> > > dequeue, PMD looks on the virtio packet size. If smaller than the
> > > mbuf size use the mbuf as is (like today) 3. otherwise, allocate
> > > a new buffer (inside the PMD) and link it to the mbuf as external
> > > buffer (rte_pktmbuf_attach_extbuf)  
> > 
> > I am missing some piece here.
> > Which pool would the PMD take those external buffers from?  
> 
> The mbuf is always taken from the single mempool associated w/ the
> rxq. The buffer for the mbuf may be allocated (in case virtio payload
> is bigger than current mbuf size) from DPDK hugepages or any other
> system memory and be attached to the mbuf.
> 
> You can see example implementation of it in mlx5 PMD (checkout
> rte_pktmbuf_attach_extbuf call)

Thanks, I wasn't aware of external buffers.

I see that attaching external buffers of the correct size would be more
efficient in terms of saving memory/avoiding sparseness.

However, we still need to be prepared for the worst-case scenario (all
packets being 64K), so that doesn't help with the total memory required.
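(As a rough illustration, with a pool size picked only for the sake of
the math: 4096 mbufs * 64KB of data room each = 256MB reserved up front
per pool, no matter how many large packets actually show up.)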

The current patch pushes the decision to the application, which knows
the workload better. If more memory is available, it can optionally use
large buffers, otherwise it just doesn't pass the extra mempool. It can
even decide whether to share the same 64K mempool between multiple
vhost ports or use one mempool per port.

Perhaps I missed something, but managing memory with a mempool still
requires us to have buffers of 64K regardless of whether the data
consumes less space. Otherwise the application or the PMD will have to
manage memory itself.

If we let the PMD manage the memory, what happens if a port/queue
is closed and one or more buffers are still in use (switching)? I don't
see how to solve this cleanly.

fbl

> 
> > 
> > If it is from an additional mempool passed to the vhost pmd, I
> > can't see the difference with Flavio proposal.
> > 
> >   
> > > The pros of this approach is that you have full flexibility on
> > > the memory  
> > allocation, and therefore a lower footprint.  
> > > The cons is the OVS will need to know how to handle mbuf w/
> > > external  
> > buffers (not too complex IMO).
> > 
> > 
> > --
> > David Marchand  


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs
  2019-10-02 12:58       ` Flavio Leitner
@ 2019-10-02 17:50         ` Shahaf Shuler
  2019-10-02 18:15           ` Flavio Leitner
  0 siblings, 1 reply; 32+ messages in thread
From: Shahaf Shuler @ 2019-10-02 17:50 UTC (permalink / raw)
  To: Flavio Leitner
  Cc: David Marchand, dev, Maxime Coquelin, Tiwei Bie, Zhihong Wang,
	Obrembski MichalX, Stokes Ian

Wednesday, October 2, 2019 3:59 PM, Flavio Leitner:
> Obrembski MichalX <michalx.obrembski@intel.com>; Stokes Ian
> <ian.stokes@intel.com>
> Subject: Re: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs
> 
> 
> Hi Shahaf,
> 
> Thanks for looking into this, see my inline comments.
> 
> On Wed, 2 Oct 2019 09:00:11 +0000
> Shahaf Shuler <shahafs@mellanox.com> wrote:
> 
> > Wednesday, October 2, 2019 11:05 AM, David Marchand:
> > > Subject: Re: [dpdk-dev] [PATCH] vhost: add support to large linear
> > > mbufs
> > >
> > > Hello Shahaf,
> > >
> > > On Wed, Oct 2, 2019 at 6:46 AM Shahaf Shuler <shahafs@mellanox.com>
> > > wrote:
> > > >

[...]

> > >
> > > I am missing some piece here.
> > > Which pool would the PMD take those external buffers from?
> >
> > The mbuf is always taken from the single mempool associated w/ the
> > rxq. The buffer for the mbuf may be allocated (in case virtio payload
> > is bigger than current mbuf size) from DPDK hugepages or any other
> > system memory and be attached to the mbuf.
> >
> > You can see example implementation of it in mlx5 PMD (checkout
> > rte_pktmbuf_attach_extbuf call)
> 
> Thanks, I wasn't aware of external buffers.
> 
> I see that attaching external buffers of the correct size would be more
> efficient in terms of saving memory/avoiding sparsing.
> 
> However, we still need to be prepared to the worse case scenario (all
> packets 64K), so that doesn't help with the total memory required.

I am not sure why.
The allocation can be done on demand, that is, only when you encounter a large buffer.

Having the buffer allocated in advance only saves the cost of the rte_*malloc. However, with such big buffers, and furthermore w/ device offloads like TSO, I am not sure that is an issue.

> 
> The current patch pushes the decision to the application which knows better
> the workload. If more memory is available, it can optionally use large buffers,
> otherwise just don't pass that. Or even decide whether to share the same
> 64K mempool between multiple vhost ports or use one mempool per port.
> 
> Perhaps I missed something, but managing memory with mempool still
> require us to have buffers of 64K regardless if the data consumes less space.
> Otherwise the application or the PMD will have to manage memory itself.
> 
> If we let the PMD manages the memory, what happens if a port/queue is
> closed and one or more buffers are still in use (switching)? I don't see how to
> solve this cleanly.

Closing the dev should return EBUSY till all buffers are free.
What is the use case for closing a port while still having packets pending on another port of the switch? And why can't we wait for them to complete transmission?

> 
> fbl
> 
> >
> > >
> > > If it is from an additional mempool passed to the vhost pmd, I can't
> > > see the difference with Flavio proposal.
> > >
> > >
> > > > The pros of this approach is that you have full flexibility on the
> > > > memory
> > > allocation, and therefore a lower footprint.
> > > > The cons is the OVS will need to know how to handle mbuf w/
> > > > external
> > > buffers (not too complex IMO).
> > >
> > >
> > > --
> > > David Marchand


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs
  2019-10-02 17:50         ` Shahaf Shuler
@ 2019-10-02 18:15           ` Flavio Leitner
  2019-10-03 16:57             ` Ilya Maximets
  0 siblings, 1 reply; 32+ messages in thread
From: Flavio Leitner @ 2019-10-02 18:15 UTC (permalink / raw)
  To: Shahaf Shuler
  Cc: David Marchand, dev, Maxime Coquelin, Tiwei Bie, Zhihong Wang,
	Obrembski MichalX, Stokes Ian

On Wed, 2 Oct 2019 17:50:41 +0000
Shahaf Shuler <shahafs@mellanox.com> wrote:

> Wednesday, October 2, 2019 3:59 PM, Flavio Leitner:
> > Obrembski MichalX <michalx.obrembski@intel.com>; Stokes Ian
> > <ian.stokes@intel.com>
> > Subject: Re: [dpdk-dev] [PATCH] vhost: add support to large linear
> > mbufs
> > 
> > 
> > Hi Shahaf,
> > 
> > Thanks for looking into this, see my inline comments.
> > 
> > On Wed, 2 Oct 2019 09:00:11 +0000
> > Shahaf Shuler <shahafs@mellanox.com> wrote:
> >   
> > > Wednesday, October 2, 2019 11:05 AM, David Marchand:  
> > > > Subject: Re: [dpdk-dev] [PATCH] vhost: add support to large
> > > > linear mbufs
> > > >
> > > > Hello Shahaf,
> > > >
> > > > On Wed, Oct 2, 2019 at 6:46 AM Shahaf Shuler
> > > > <shahafs@mellanox.com> wrote:  
> > > > >  
> 
> [...]
> 
> > > >
> > > > I am missing some piece here.
> > > > Which pool would the PMD take those external buffers from?  
> > >
> > > The mbuf is always taken from the single mempool associated w/ the
> > > rxq. The buffer for the mbuf may be allocated (in case virtio
> > > payload is bigger than current mbuf size) from DPDK hugepages or
> > > any other system memory and be attached to the mbuf.
> > >
> > > You can see example implementation of it in mlx5 PMD (checkout
> > > rte_pktmbuf_attach_extbuf call)  
> > 
> > Thanks, I wasn't aware of external buffers.
> > 
> > I see that attaching external buffers of the correct size would be
> > more efficient in terms of saving memory/avoiding sparsing.
> > 
> > However, we still need to be prepared to the worse case scenario
> > (all packets 64K), so that doesn't help with the total memory
> > required.  
> 
> Am not sure why. 
> The allocation can be per demand. That is - only when you encounter a
> large buffer. 
> 
> Having buffer allocated in advance will benefit only from removing
> the cost of the rte_*malloc. However on such big buffers, and further
> more w/ device offloads like TSO, am not sure that is an issue. 

Now I see what you're saying. I was thinking we had to reserve the
memory up front, like a mempool does, and then get the buffers as needed.

OK, I can give it a try with rte_*malloc and see how it goes.

> > The current patch pushes the decision to the application which
> > knows better the workload. If more memory is available, it can
> > optionally use large buffers, otherwise just don't pass that. Or
> > even decide whether to share the same 64K mempool between multiple
> > vhost ports or use one mempool per port.
> > 
> > Perhaps I missed something, but managing memory with mempool still
> > require us to have buffers of 64K regardless if the data consumes
> > less space. Otherwise the application or the PMD will have to
> > manage memory itself.
> > 
> > If we let the PMD manages the memory, what happens if a port/queue
> > is closed and one or more buffers are still in use (switching)? I
> > don't see how to solve this cleanly.  
> 
> Closing of the dev should return EBUSY till all buffers are free. 
> What is the use case of closing a port while still having packet
> pending on other port of the switch? And why we cannot wait for them
> to complete transmission?

The vswitch gets the request from outside and the assumption is that
the command will succeed. AFAIK, there is no retry mechanism.

Thanks Shahaf!
fbl

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs
  2019-10-02 18:15           ` Flavio Leitner
@ 2019-10-03 16:57             ` Ilya Maximets
  2019-10-03 21:25               ` Flavio Leitner
  0 siblings, 1 reply; 32+ messages in thread
From: Ilya Maximets @ 2019-10-03 16:57 UTC (permalink / raw)
  To: Flavio Leitner, Maxime Coquelin
  Cc: Shahaf Shuler, David Marchand, dev, Tiwei Bie, Zhihong Wang,
	Obrembski MichalX, Stokes Ian, Ilya Maximets

On 02.10.2019 20:15, Flavio Leitner wrote:
> On Wed, 2 Oct 2019 17:50:41 +0000
> Shahaf Shuler <shahafs@mellanox.com> wrote:
> 
>> Wednesday, October 2, 2019 3:59 PM, Flavio Leitner:
>>> Obrembski MichalX <michalx.obrembski@intel.com>; Stokes Ian
>>> <ian.stokes@intel.com>
>>> Subject: Re: [dpdk-dev] [PATCH] vhost: add support to large linear
>>> mbufs
>>>
>>>
>>> Hi Shahaf,
>>>
>>> Thanks for looking into this, see my inline comments.
>>>
>>> On Wed, 2 Oct 2019 09:00:11 +0000
>>> Shahaf Shuler <shahafs@mellanox.com> wrote:
>>>    
>>>> Wednesday, October 2, 2019 11:05 AM, David Marchand:
>>>>> Subject: Re: [dpdk-dev] [PATCH] vhost: add support to large
>>>>> linear mbufs
>>>>>
>>>>> Hello Shahaf,
>>>>>
>>>>> On Wed, Oct 2, 2019 at 6:46 AM Shahaf Shuler
>>>>> <shahafs@mellanox.com> wrote:
>>>>>>   
>>
>> [...]
>>
>>>>>
>>>>> I am missing some piece here.
>>>>> Which pool would the PMD take those external buffers from?
>>>>
>>>> The mbuf is always taken from the single mempool associated w/ the
>>>> rxq. The buffer for the mbuf may be allocated (in case virtio
>>>> payload is bigger than current mbuf size) from DPDK hugepages or
>>>> any other system memory and be attached to the mbuf.
>>>>
>>>> You can see example implementation of it in mlx5 PMD (checkout
>>>> rte_pktmbuf_attach_extbuf call)
>>>
>>> Thanks, I wasn't aware of external buffers.
>>>
>>> I see that attaching external buffers of the correct size would be
>>> more efficient in terms of saving memory/avoiding sparsing.
>>>
>>> However, we still need to be prepared to the worse case scenario
>>> (all packets 64K), so that doesn't help with the total memory
>>> required.
>>
>> Am not sure why.
>> The allocation can be per demand. That is - only when you encounter a
>> large buffer.
>>
>> Having buffer allocated in advance will benefit only from removing
>> the cost of the rte_*malloc. However on such big buffers, and further
>> more w/ device offloads like TSO, am not sure that is an issue.
> 
> Now I see what you're saying. I was thinking we had to reserve the
> memory before, like mempool does, then get the buffers as needed.
> 
> OK, I can give a try with rte_*malloc and see how it goes.

This way we could actually have a nice API.  For example, by
introducing some new flag RTE_VHOST_USER_NO_CHAINED_MBUFS (there
might be a better name) which could be passed to driver_register().
On receive, depending on this flag, the function would either create
chained mbufs or allocate a new contiguous memory chunk and attach
it as an external buffer if the data cannot be stored in a single
mbuf from the registered memory pool.
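
On the application side that would just be one more flag at
registration time, e.g. (sketch only; the flag name is the proposed
one and does not exist yet, path/ret are placeholders):

	ret = rte_vhost_driver_register(path, RTE_VHOST_USER_CLIENT |
					RTE_VHOST_USER_NO_CHAINED_MBUFS);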

Supporting external memory in mbufs will require some additional
work on the OVS side (e.g. better handling of ol_flags), but
we'll have to do it anyway for the upgrade to DPDK 19.11.

Best regards, Ilya Maximets.

> 
>>> The current patch pushes the decision to the application which
>>> knows better the workload. If more memory is available, it can
>>> optionally use large buffers, otherwise just don't pass that. Or
>>> even decide whether to share the same 64K mempool between multiple
>>> vhost ports or use one mempool per port.
>>>
>>> Perhaps I missed something, but managing memory with mempool still
>>> require us to have buffers of 64K regardless if the data consumes
>>> less space. Otherwise the application or the PMD will have to
>>> manage memory itself.
>>>
>>> If we let the PMD manages the memory, what happens if a port/queue
>>> is closed and one or more buffers are still in use (switching)? I
>>> don't see how to solve this cleanly.
>>
>> Closing of the dev should return EBUSY till all buffers are free.
>> What is the use case of closing a port while still having packet
>> pending on other port of the switch? And why we cannot wait for them
>> to complete transmission?
> 
> The vswitch gets the request from outside and the assumption is that
> the command will succeed. AFAIK, there is no retry mechanism.
> 
> Thanks Shahaf!
> fbl

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs
  2019-10-03 16:57             ` Ilya Maximets
@ 2019-10-03 21:25               ` Flavio Leitner
  0 siblings, 0 replies; 32+ messages in thread
From: Flavio Leitner @ 2019-10-03 21:25 UTC (permalink / raw)
  To: Ilya Maximets
  Cc: Maxime Coquelin, Shahaf Shuler, David Marchand, dev, Tiwei Bie,
	Zhihong Wang, Obrembski MichalX, Stokes Ian

On Thu, 3 Oct 2019 18:57:32 +0200
Ilya Maximets <i.maximets@ovn.org> wrote:

> On 02.10.2019 20:15, Flavio Leitner wrote:
> > On Wed, 2 Oct 2019 17:50:41 +0000
> > Shahaf Shuler <shahafs@mellanox.com> wrote:
> >   
> >> Wednesday, October 2, 2019 3:59 PM, Flavio Leitner:  
> >>> Obrembski MichalX <michalx.obrembski@intel.com>; Stokes Ian
> >>> <ian.stokes@intel.com>
> >>> Subject: Re: [dpdk-dev] [PATCH] vhost: add support to large linear
> >>> mbufs
> >>>
> >>>
> >>> Hi Shahaf,
> >>>
> >>> Thanks for looking into this, see my inline comments.
> >>>
> >>> On Wed, 2 Oct 2019 09:00:11 +0000
> >>> Shahaf Shuler <shahafs@mellanox.com> wrote:
> >>>      
> >>>> Wednesday, October 2, 2019 11:05 AM, David Marchand:  
> >>>>> Subject: Re: [dpdk-dev] [PATCH] vhost: add support to large
> >>>>> linear mbufs
> >>>>>
> >>>>> Hello Shahaf,
> >>>>>
> >>>>> On Wed, Oct 2, 2019 at 6:46 AM Shahaf Shuler
> >>>>> <shahafs@mellanox.com> wrote:  
> >>>>>>     
> >>
> >> [...]
> >>  
> >>>>>
> >>>>> I am missing some piece here.
> >>>>> Which pool would the PMD take those external buffers from?  
> >>>>
> >>>> The mbuf is always taken from the single mempool associated w/
> >>>> the rxq. The buffer for the mbuf may be allocated (in case virtio
> >>>> payload is bigger than current mbuf size) from DPDK hugepages or
> >>>> any other system memory and be attached to the mbuf.
> >>>>
> >>>> You can see example implementation of it in mlx5 PMD (checkout
> >>>> rte_pktmbuf_attach_extbuf call)  
> >>>
> >>> Thanks, I wasn't aware of external buffers.
> >>>
> >>> I see that attaching external buffers of the correct size would be
> >>> more efficient in terms of saving memory/avoiding sparsing.
> >>>
> >>> However, we still need to be prepared to the worse case scenario
> >>> (all packets 64K), so that doesn't help with the total memory
> >>> required.  
> >>
> >> Am not sure why.
> >> The allocation can be per demand. That is - only when you
> >> encounter a large buffer.
> >>
> >> Having buffer allocated in advance will benefit only from removing
> >> the cost of the rte_*malloc. However on such big buffers, and
> >> further more w/ device offloads like TSO, am not sure that is an
> >> issue.  
> > 
> > Now I see what you're saying. I was thinking we had to reserve the
> > memory before, like mempool does, then get the buffers as needed.
> > 
> > OK, I can give a try with rte_*malloc and see how it goes.  
> 
> This way we actually could have a nice API.  For example, by
> introducing some new flag RTE_VHOST_USER_NO_CHAINED_MBUFS (there
> might be better name) which could be passed to driver_register().
> On receive, depending on this flag, function will create chained
> mbufs or allocate new contiguous memory chunk and attach it as
> an external buffer if the data could not be stored in a single
> mbuf from the registered memory pool.
> 
> Supporting external memory in mbufs will require some additional
> work from the OVS side (e.g. better work with ol_flags), but
> we'll have to do it anyway for upgrade to DPDK 19.11.

Agreed. It looks like rte_malloc is fast enough after all. I have a PoC
running iperf3 from a VM to another bare-metal host using a vhost-user
client port with TSO enabled:
[...]
[  5] 140.00-141.00 sec  4.60 GBytes  39.5 Gbits/sec    0   1.26 MBytes
[  5] 141.00-142.00 sec  4.65 GBytes  39.9 Gbits/sec    0   1.26 MBytes
[  5] 142.00-143.00 sec  4.65 GBytes  40.0 Gbits/sec    0   1.26 MBytes
[  5] 143.00-144.00 sec  4.65 GBytes  39.9 Gbits/sec    9   1.04 MBytes
[  5] 144.00-145.00 sec  4.59 GBytes  39.4 Gbits/sec    0   1.16 MBytes
[  5] 145.00-146.00 sec  4.58 GBytes  39.3 Gbits/sec    0   1.26 MBytes
[  5] 146.00-147.00 sec  4.48 GBytes  38.5 Gbits/sec  700    973 KBytes
[...]
(The physical link is 40Gbps)

I will clean that, test more and post the patches soon.
Thanks!
fbl

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [dpdk-dev] [PATCH v2] vhost: add support for large buffers.
  2019-10-01 22:19 [dpdk-dev] [PATCH] vhost: add support to large linear mbufs Flavio Leitner
                   ` (2 preceding siblings ...)
  2019-10-02  7:51 ` Maxime Coquelin
@ 2019-10-04 20:10 ` Flavio Leitner
  2019-10-06  4:47   ` Shahaf Shuler
                     ` (2 more replies)
  3 siblings, 3 replies; 32+ messages in thread
From: Flavio Leitner @ 2019-10-04 20:10 UTC (permalink / raw)
  To: dev
  Cc: Ilya Maximets, Maxime Coquelin, Shahaf Shuler, David Marchand,
	Tiwei Bie, Obrembski MichalX, Stokes Ian

The rte_vhost_dequeue_burst supports two ways of dequeuing data.
If the data fits into a buffer, then all data is copied and a
single linear buffer is returned. Otherwise it allocates
additional mbufs and chains them together to return a
multi-segment mbuf.

While that covers most use cases, it forces applications that
need to work with larger data sizes to support multi-segment
mbufs. The non-linear characteristic brings complexity and
performance implications to the application.

To resolve the issue, let the host indicate during registration
whether attaching an external buffer to a pktmbuf is supported
and whether only linear buffers are supported.

Signed-off-by: Flavio Leitner <fbl@sysclose.org>
---
 doc/guides/prog_guide/vhost_lib.rst |  35 ++++++++++
 lib/librte_vhost/rte_vhost.h        |   4 ++
 lib/librte_vhost/socket.c           |  10 +++
 lib/librte_vhost/vhost.c            |  22 ++++++
 lib/librte_vhost/vhost.h            |   4 ++
 lib/librte_vhost/virtio_net.c       | 103 ++++++++++++++++++++++++++--
 6 files changed, 172 insertions(+), 6 deletions(-)

- Changelog:
  V2:
    - Used rte_malloc() instead of another mempool as suggested by Shahaf.
    - Added the documentation section.
    - Using driver registration to negotiate the features.
    - OvS PoC code:
      https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7
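
A usage sketch on the application side (path here is a placeholder; only
the two new flags come from this patch):

	uint64_t flags = RTE_VHOST_USER_CLIENT |
			 RTE_VHOST_USER_EXTBUF_SUPPORT |
			 RTE_VHOST_USER_LINEARBUF_SUPPORT;

	if (rte_vhost_driver_register(path, flags) != 0)
		rte_exit(EXIT_FAILURE, "vhost driver registration failed\n");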

diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
index fc3ee4353..07e40e3c5 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -117,6 +117,41 @@ The following is an overview of some key Vhost API functions:
     Enabling this flag should only be done when the calling application does
     not pre-fault the guest shared memory, otherwise migration would fail.
 
+  - ``RTE_VHOST_USER_LINEARBUF_SUPPORT``
+
+    Enabling this flag forces vhost dequeue function to only provide linear
+    pktmbuf (no multi-segmented pktmbuf).
+
+    The vhost library by default provides a single pktmbuf for a given
+    packet, but if for some reason the data doesn't fit into a single
+    pktmbuf (e.g., TSO is enabled), the library will allocate additional
+    pktmbufs from the same mempool and chain them together to create a
+    multi-segmented pktmbuf.
+
+    However, the vhost application needs to support the multi-segmented format.
+    If the vhost application does not support that format and requires large
+    buffers to be dequeued, this flag should be enabled to force only linear
+    buffers (see RTE_VHOST_USER_EXTBUF_SUPPORT) or drop the packet.
+
+    It is disabled by default.
+
+  - ``RTE_VHOST_USER_EXTBUF_SUPPORT``
+
+    Enabling this flag allows vhost dequeue function to allocate and attach
+    an external buffer to a pktmbuf if the pktmbuf doesn't provide enough
+    space to store all data.
+
+    This is useful when the vhost application wants to support large packets
+    but doesn't want to increase the default mempool object size nor to
+    support multi-segmented mbufs (non-linear). In this case, a fresh buffer
+    is allocated using rte_malloc() which gets attached to a pktmbuf using
+    rte_pktmbuf_attach_extbuf().
+
+    See RTE_VHOST_USER_LINEARBUF_SUPPORT as well to disable multi-segmented
+    mbufs for applications that don't support chained mbufs.
+
+    It is disabled by default.
+
 * ``rte_vhost_driver_set_features(path, features)``
 
   This function sets the feature bits the vhost-user driver supports. The
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 19474bca0..b821b5df4 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -30,6 +30,10 @@ extern "C" {
 #define RTE_VHOST_USER_DEQUEUE_ZERO_COPY	(1ULL << 2)
 #define RTE_VHOST_USER_IOMMU_SUPPORT	(1ULL << 3)
 #define RTE_VHOST_USER_POSTCOPY_SUPPORT		(1ULL << 4)
+/* support mbuf with external buffer attached */
+#define RTE_VHOST_USER_EXTBUF_SUPPORT	(1ULL << 5)
+/* support only linear buffers (no chained mbufs) */
+#define RTE_VHOST_USER_LINEARBUF_SUPPORT	(1ULL << 6)
 
 /** Protocol features. */
 #ifndef VHOST_USER_PROTOCOL_F_MQ
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index 274988c4d..0ba610fda 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -40,6 +40,8 @@ struct vhost_user_socket {
 	bool dequeue_zero_copy;
 	bool iommu_support;
 	bool use_builtin_virtio_net;
+	bool extbuf;
+	bool linearbuf;
 
 	/*
 	 * The "supported_features" indicates the feature bits the
@@ -232,6 +234,12 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
 	if (vsocket->dequeue_zero_copy)
 		vhost_enable_dequeue_zero_copy(vid);
 
+	if (vsocket->extbuf)
+		vhost_enable_extbuf(vid);
+
+	if (vsocket->linearbuf)
+		vhost_enable_linearbuf(vid);
+
 	RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid);
 
 	if (vsocket->notify_ops->new_connection) {
@@ -870,6 +878,8 @@ rte_vhost_driver_register(const char *path, uint64_t flags)
 		goto out_free;
 	}
 	vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
+	vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
+	vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;
 
 	/*
 	 * Set the supported features correctly for the builtin vhost-user
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index cea44df8c..77457f538 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -605,6 +605,28 @@ vhost_set_builtin_virtio_net(int vid, bool enable)
 		dev->flags &= ~VIRTIO_DEV_BUILTIN_VIRTIO_NET;
 }
 
+void
+vhost_enable_extbuf(int vid)
+{
+	struct virtio_net *dev = get_device(vid);
+
+	if (dev == NULL)
+		return;
+
+	dev->extbuf = 1;
+}
+
+void
+vhost_enable_linearbuf(int vid)
+{
+	struct virtio_net *dev = get_device(vid);
+
+	if (dev == NULL)
+		return;
+
+	dev->linearbuf = 1;
+}
+
 int
 rte_vhost_get_mtu(int vid, uint16_t *mtu)
 {
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 5131a97a3..0346bd118 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -302,6 +302,8 @@ struct virtio_net {
 	rte_atomic16_t		broadcast_rarp;
 	uint32_t		nr_vring;
 	int			dequeue_zero_copy;
+	int			extbuf;
+	int			linearbuf;
 	struct vhost_virtqueue	*virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
 #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
 	char			ifname[IF_NAME_SZ];
@@ -476,6 +478,8 @@ void vhost_attach_vdpa_device(int vid, int did);
 void vhost_set_ifname(int, const char *if_name, unsigned int if_len);
 void vhost_enable_dequeue_zero_copy(int vid);
 void vhost_set_builtin_virtio_net(int vid, bool enable);
+void vhost_enable_extbuf(int vid);
+void vhost_enable_linearbuf(int vid);
 
 struct vhost_device_ops const *vhost_driver_callback_get(const char *path);
 
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 5b85b832d..fca75161d 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -2,6 +2,7 @@
  * Copyright(c) 2010-2016 Intel Corporation
  */
 
+#include <assert.h>
 #include <stdint.h>
 #include <stdbool.h>
 #include <linux/virtio_net.h>
@@ -1289,6 +1290,96 @@ get_zmbuf(struct vhost_virtqueue *vq)
 	return NULL;
 }
 
+static void
+virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque)
+{
+	rte_free(opaque);
+}
+
+static int
+virtio_dev_extbuf_alloc(struct rte_mbuf *pkt, uint16_t size)
+{
+	struct rte_mbuf_ext_shared_info *shinfo;
+	uint16_t buf_len;
+	rte_iova_t iova;
+	void *buf;
+
+	shinfo = NULL;
+	buf_len = size + RTE_PKTMBUF_HEADROOM;
+
+	/* Try to use pkt buffer to store shinfo to reduce the amount of memory
+	 * required, otherwise store shinfo in the new buffer.
+	 */
+	if (rte_pktmbuf_tailroom(pkt) > sizeof(*shinfo))
+		shinfo = rte_pktmbuf_mtod(pkt,
+					  struct rte_mbuf_ext_shared_info *);
+	else {
+		if (unlikely(buf_len + sizeof(shinfo) > UINT16_MAX)) {
+			RTE_LOG(ERR, VHOST_DATA,
+				"buffer size exceeded maximum.\n");
+			return -ENOSPC;
+		}
+
+		buf_len += sizeof(shinfo);
+	}
+
+	buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
+	if (unlikely(buf == NULL)) {
+		RTE_LOG(ERR, VHOST_DATA,
+				"Failed to allocate memory for mbuf.\n");
+		return -ENOMEM;
+	}
+
+	/* initialize shinfo */
+	if (shinfo) {
+		shinfo->free_cb = virtio_dev_extbuf_free;
+		shinfo->fcb_opaque = buf;
+		rte_mbuf_ext_refcnt_set(shinfo, 1);
+	} else {
+		shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
+					      virtio_dev_extbuf_free, buf);
+		assert(shinfo);
+	}
+
+	iova = rte_mempool_virt2iova(buf);
+	rte_pktmbuf_attach_extbuf(pkt, buf, iova, buf_len, shinfo);
+	rte_pktmbuf_reset_headroom(pkt);
+	assert(pkt->ol_flags == EXT_ATTACHED_MBUF);
+
+	return 0;
+}
+
+/*
+ * Allocate a host supported pktmbuf.
+ */
+static __rte_always_inline struct rte_mbuf *
+virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
+			 uint16_t data_len)
+{
+	struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
+
+	if (unlikely(pkt == NULL))
+		return NULL;
+
+	if (rte_pktmbuf_tailroom(pkt) >= data_len)
+		return pkt;
+
+	/* attach an external buffer if supported */
+	if (dev->extbuf && !virtio_dev_extbuf_alloc(pkt, data_len))
+		return pkt;
+
+	/* check if chained buffers are allowed */
+	if (!dev->linearbuf)
+		return pkt;
+
+	/* Data doesn't fit into the buffer and the host supports
+	 * only linear buffers
+	 */
+	rte_pktmbuf_free(pkt);
+
+	return NULL;
+}
+
 static __rte_noinline uint16_t
 virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
@@ -1343,21 +1434,21 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	for (i = 0; i < count; i++) {
 		struct buf_vector buf_vec[BUF_VECTOR_MAX];
 		uint16_t head_idx;
-		uint32_t dummy_len;
+		uint32_t buf_len;
 		uint16_t nr_vec = 0;
 		int err;
 
 		if (unlikely(fill_vec_buf_split(dev, vq,
 						vq->last_avail_idx + i,
 						&nr_vec, buf_vec,
-						&head_idx, &dummy_len,
+						&head_idx, &buf_len,
 						VHOST_ACCESS_RO) < 0))
 			break;
 
 		if (likely(dev->dequeue_zero_copy == 0))
 			update_shadow_used_ring_split(vq, head_idx, 0);
 
-		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
+		pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
 		if (unlikely(pkts[i] == NULL)) {
 			RTE_LOG(ERR, VHOST_DATA,
 				"Failed to allocate memory for mbuf.\n");
@@ -1451,14 +1542,14 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	for (i = 0; i < count; i++) {
 		struct buf_vector buf_vec[BUF_VECTOR_MAX];
 		uint16_t buf_id;
-		uint32_t dummy_len;
+		uint32_t buf_len;
 		uint16_t desc_count, nr_vec = 0;
 		int err;
 
 		if (unlikely(fill_vec_buf_packed(dev, vq,
 						vq->last_avail_idx, &desc_count,
 						buf_vec, &nr_vec,
-						&buf_id, &dummy_len,
+						&buf_id, &buf_len,
 						VHOST_ACCESS_RO) < 0))
 			break;
 
@@ -1466,7 +1557,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			update_shadow_used_ring_packed(vq, buf_id, 0,
 					desc_count);
 
-		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
+		pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
 		if (unlikely(pkts[i] == NULL)) {
 			RTE_LOG(ERR, VHOST_DATA,
 				"Failed to allocate memory for mbuf.\n");
-- 
2.20.1


^ permalink raw reply	[flat|nested] 32+ messages in thread
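
For reference, the two flags added by this patch are consumed at socket
registration time. The snippet below is only a minimal sketch of how an
application could opt in; it is not part of the patch, the socket path and
function name are invented, and it assumes nothing beyond the existing
rte_vhost_driver_register()/rte_vhost_driver_start() API plus the two new
flags:

#include <rte_vhost.h>

static int
register_vhost_port(const char *path)
{
	/* Ask for linear mbufs only and allow the library to attach an
	 * rte_malloc()'ed external buffer whenever a packet (e.g. a TSO
	 * frame) does not fit into a single mbuf from the mempool.
	 */
	uint64_t flags = RTE_VHOST_USER_EXTBUF_SUPPORT |
			 RTE_VHOST_USER_LINEARBUF_SUPPORT;

	if (rte_vhost_driver_register(path, flags) != 0)
		return -1;

	return rte_vhost_driver_start(path);
}

With only RTE_VHOST_USER_LINEARBUF_SUPPORT set, packets that do not fit a
single mbuf are dropped; with only RTE_VHOST_USER_EXTBUF_SUPPORT set, the
library can still fall back to chained mbufs if the external buffer
allocation fails.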

* Re: [dpdk-dev] [PATCH v2] vhost: add support for large buffers.
  2019-10-04 20:10 ` [dpdk-dev] [PATCH v2] vhost: add support for large buffers Flavio Leitner
@ 2019-10-06  4:47   ` Shahaf Shuler
  2019-10-10  5:12   ` Tiwei Bie
  2019-10-11 17:09   ` [dpdk-dev] [PATCH v3] " Flavio Leitner
  2 siblings, 0 replies; 32+ messages in thread
From: Shahaf Shuler @ 2019-10-06  4:47 UTC (permalink / raw)
  To: Flavio Leitner, dev
  Cc: Ilya Maximets, Maxime Coquelin, David Marchand, Tiwei Bie,
	Obrembski MichalX, Stokes Ian

Friday, October 4, 2019 11:10 PM, Flavio Leitner:
> Subject: [dpdk-dev] [PATCH v2] vhost: add support for large buffers.
> 
> The rte_vhost_dequeue_burst supports two ways of dequeuing data.
> If the data fits into a buffer, then all data is copied and a single linear buffer is
> returned. Otherwise it allocates additional mbufs and chains them together
> to return a multiple segments mbuf.
> 
> While that covers most use cases, it forces applications that need to work
> with larger data sizes to support multiple segments mbufs. The non-linear
> characteristic brings complexity and performance implications to the
> application.
> 
> To resolve the issue, let the host provide during registration if attaching an
> external buffer to pktmbuf is supported and if only linear buffer are
> supported.
> 
> Signed-off-by: Flavio Leitner <fbl@sysclose.org>

Haven't reviewed the code in detail, but for the high-level direction:
Acked-by: Shahaf Shuler <shahafs@mellanox.com>

> ---
>  doc/guides/prog_guide/vhost_lib.rst |  35 ++++++++++
>  lib/librte_vhost/rte_vhost.h        |   4 ++
>  lib/librte_vhost/socket.c           |  10 +++
>  lib/librte_vhost/vhost.c            |  22 ++++++
>  lib/librte_vhost/vhost.h            |   4 ++
>  lib/librte_vhost/virtio_net.c       | 103 ++++++++++++++++++++++++++--
>  6 files changed, 172 insertions(+), 6 deletions(-)
> 
> - Changelog:
>   V2:
>     - Used rte_malloc() instead of another mempool as suggested by Shahaf.
>     - Added the documentation section.
>     - Using driver registration to negotiate the features.
>     - OvS PoC code:
> 
> https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7
> 
> diff --git a/doc/guides/prog_guide/vhost_lib.rst
> b/doc/guides/prog_guide/vhost_lib.rst
> index fc3ee4353..07e40e3c5 100644
> --- a/doc/guides/prog_guide/vhost_lib.rst
> +++ b/doc/guides/prog_guide/vhost_lib.rst
> @@ -117,6 +117,41 @@ The following is an overview of some key Vhost API
> functions:
>      Enabling this flag should only be done when the calling application does
>      not pre-fault the guest shared memory, otherwise migration would fail.
> 
> +  - ``RTE_VHOST_USER_LINEARBUF_SUPPORT``
> +
> +    Enabling this flag forces vhost dequeue function to only provide linear
> +    pktmbuf (no multi-segmented pktmbuf).
> +
> +    The vhost library by default provides a single pktmbuf for given a
> +    packet, but if for some reason the data doesn't fit into a single
> +    pktmbuf (e.g., TSO is enabled), the library will allocate additional
> +    pktmbufs from the same mempool and chain them together to create a
> +    multi-segmented pktmbuf.
> +
> +    However, the vhost application needs to support multi-segmented
> format.
> +    If the vhost application does not support that format and requires large
> +    buffers to be dequeue, this flag should be enabled to force only linear
> +    buffers (see RTE_VHOST_USER_EXTBUF_SUPPORT) or drop the packet.
> +
> +    It is disabled by default.
> +
> +  - ``RTE_VHOST_USER_EXTBUF_SUPPORT``
> +
> +    Enabling this flag allows vhost dequeue function to allocate and attach
> +    an external buffer to a pktmbuf if the pkmbuf doesn't provide enough
> +    space to store all data.
> +
> +    This is useful when the vhost application wants to support large packets
> +    but doesn't want to increase the default mempool object size nor to
> +    support multi-segmented mbufs (non-linear). In this case, a fresh buffer
> +    is allocated using rte_malloc() which gets attached to a pktmbuf using
> +    rte_pktmbuf_attach_extbuf().
> +
> +    See RTE_VHOST_USER_LINEARBUF_SUPPORT as well to disable multi-
> segmented
> +    mbufs for application that doesn't support chained mbufs.
> +
> +    It is disabled by default.
> +
>  * ``rte_vhost_driver_set_features(path, features)``
> 
>    This function sets the feature bits the vhost-user driver supports. The diff --
> git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h index
> 19474bca0..b821b5df4 100644
> --- a/lib/librte_vhost/rte_vhost.h
> +++ b/lib/librte_vhost/rte_vhost.h
> @@ -30,6 +30,10 @@ extern "C" {
>  #define RTE_VHOST_USER_DEQUEUE_ZERO_COPY	(1ULL << 2)
>  #define RTE_VHOST_USER_IOMMU_SUPPORT	(1ULL << 3)
>  #define RTE_VHOST_USER_POSTCOPY_SUPPORT		(1ULL << 4)
> +/* support mbuf with external buffer attached */
> +#define RTE_VHOST_USER_EXTBUF_SUPPORT	(1ULL << 5)
> +/* support only linear buffers (no chained mbufs) */
> +#define RTE_VHOST_USER_LINEARBUF_SUPPORT	(1ULL << 6)
> 
>  /** Protocol features. */
>  #ifndef VHOST_USER_PROTOCOL_F_MQ
> diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c index
> 274988c4d..0ba610fda 100644
> --- a/lib/librte_vhost/socket.c
> +++ b/lib/librte_vhost/socket.c
> @@ -40,6 +40,8 @@ struct vhost_user_socket {
>  	bool dequeue_zero_copy;
>  	bool iommu_support;
>  	bool use_builtin_virtio_net;
> +	bool extbuf;
> +	bool linearbuf;
> 
>  	/*
>  	 * The "supported_features" indicates the feature bits the @@ -
> 232,6 +234,12 @@ vhost_user_add_connection(int fd, struct
> vhost_user_socket *vsocket)
>  	if (vsocket->dequeue_zero_copy)
>  		vhost_enable_dequeue_zero_copy(vid);
> 
> +	if (vsocket->extbuf)
> +		vhost_enable_extbuf(vid);
> +
> +	if (vsocket->linearbuf)
> +		vhost_enable_linearbuf(vid);
> +
>  	RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid);
> 
>  	if (vsocket->notify_ops->new_connection) { @@ -870,6 +878,8 @@
> rte_vhost_driver_register(const char *path, uint64_t flags)
>  		goto out_free;
>  	}
>  	vsocket->dequeue_zero_copy = flags &
> RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
> +	vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
> +	vsocket->linearbuf = flags &
> RTE_VHOST_USER_LINEARBUF_SUPPORT;
> 
>  	/*
>  	 * Set the supported features correctly for the builtin vhost-user diff
> --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c index
> cea44df8c..77457f538 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -605,6 +605,28 @@ vhost_set_builtin_virtio_net(int vid, bool enable)
>  		dev->flags &= ~VIRTIO_DEV_BUILTIN_VIRTIO_NET;  }
> 
> +void
> +vhost_enable_extbuf(int vid)
> +{
> +	struct virtio_net *dev = get_device(vid);
> +
> +	if (dev == NULL)
> +		return;
> +
> +	dev->extbuf = 1;
> +}
> +
> +void
> +vhost_enable_linearbuf(int vid)
> +{
> +	struct virtio_net *dev = get_device(vid);
> +
> +	if (dev == NULL)
> +		return;
> +
> +	dev->linearbuf = 1;
> +}
> +
>  int
>  rte_vhost_get_mtu(int vid, uint16_t *mtu)  { diff --git
> a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index
> 5131a97a3..0346bd118 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -302,6 +302,8 @@ struct virtio_net {
>  	rte_atomic16_t		broadcast_rarp;
>  	uint32_t		nr_vring;
>  	int			dequeue_zero_copy;
> +	int			extbuf;
> +	int			linearbuf;
>  	struct vhost_virtqueue	*virtqueue[VHOST_MAX_QUEUE_PAIRS *
> 2];
>  #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
>  	char			ifname[IF_NAME_SZ];
> @@ -476,6 +478,8 @@ void vhost_attach_vdpa_device(int vid, int did);  void
> vhost_set_ifname(int, const char *if_name, unsigned int if_len);  void
> vhost_enable_dequeue_zero_copy(int vid);  void
> vhost_set_builtin_virtio_net(int vid, bool enable);
> +void vhost_enable_extbuf(int vid);
> +void vhost_enable_linearbuf(int vid);
> 
>  struct vhost_device_ops const *vhost_driver_callback_get(const char
> *path);
> 
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index
> 5b85b832d..fca75161d 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -2,6 +2,7 @@
>   * Copyright(c) 2010-2016 Intel Corporation
>   */
> 
> +#include <assert.h>
>  #include <stdint.h>
>  #include <stdbool.h>
>  #include <linux/virtio_net.h>
> @@ -1289,6 +1290,96 @@ get_zmbuf(struct vhost_virtqueue *vq)
>  	return NULL;
>  }
> 
> +static void
> +virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque) {
> +	rte_free(opaque);
> +}
> +
> +static int
> +virtio_dev_extbuf_alloc(struct rte_mbuf *pkt, uint16_t size) {
> +	struct rte_mbuf_ext_shared_info *shinfo;
> +	uint16_t buf_len;
> +	rte_iova_t iova;
> +	void *buf;
> +
> +	shinfo = NULL;
> +	buf_len = size + RTE_PKTMBUF_HEADROOM;
> +
> +	/* Try to use pkt buffer to store shinfo to reduce the amount of
> memory
> +	 * required, otherwise store shinfo in the new buffer.
> +	 */
> +	if (rte_pktmbuf_tailroom(pkt) > sizeof(*shinfo))
> +		shinfo = rte_pktmbuf_mtod(pkt,
> +					  struct rte_mbuf_ext_shared_info
> *);
> +	else {
> +		if (unlikely(buf_len + sizeof(shinfo) > UINT16_MAX)) {
> +			RTE_LOG(ERR, VHOST_DATA,
> +				"buffer size exceeded maximum.\n");
> +			return -ENOSPC;
> +		}
> +
> +		buf_len += sizeof(shinfo);
> +	}
> +
> +	buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
> +	if (unlikely(buf == NULL)) {
> +		RTE_LOG(ERR, VHOST_DATA,
> +				"Failed to allocate memory for mbuf.\n");
> +		return -ENOMEM;
> +	}
> +
> +	/* initialize shinfo */
> +	if (shinfo) {
> +		shinfo->free_cb = virtio_dev_extbuf_free;
> +		shinfo->fcb_opaque = buf;
> +		rte_mbuf_ext_refcnt_set(shinfo, 1);
> +	} else {
> +		shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf,
> &buf_len,
> +					      virtio_dev_extbuf_free, buf);
> +		assert(shinfo);
> +	}
> +
> +	iova = rte_mempool_virt2iova(buf);
> +	rte_pktmbuf_attach_extbuf(pkt, buf, iova, buf_len, shinfo);
> +	rte_pktmbuf_reset_headroom(pkt);
> +	assert(pkt->ol_flags == EXT_ATTACHED_MBUF);
> +
> +	return 0;
> +}
> +
> +/*
> + * Allocate a host supported pktmbuf.
> + */
> +static __rte_always_inline struct rte_mbuf *
> +virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
> +			 uint16_t data_len)
> +{
> +	struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
> +
> +	if (unlikely(pkt == NULL))
> +		return NULL;
> +
> +	if (rte_pktmbuf_tailroom(pkt) >= data_len)
> +		return pkt;
> +
> +	/* attach an external buffer if supported */
> +	if (dev->extbuf && !virtio_dev_extbuf_alloc(pkt, data_len))
> +		return pkt;
> +
> +	/* check if chained buffers are allowed */
> +	if (!dev->linearbuf)
> +		return pkt;
> +
> +	/* Data doesn't fit into the buffer and the host supports
> +	 * only linear buffers
> +	 */
> +	rte_pktmbuf_free(pkt);
> +
> +	return NULL;
> +}
> +
>  static __rte_noinline uint16_t
>  virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t
> count) @@ -1343,21 +1434,21 @@ virtio_dev_tx_split(struct virtio_net *dev,
> struct vhost_virtqueue *vq,
>  	for (i = 0; i < count; i++) {
>  		struct buf_vector buf_vec[BUF_VECTOR_MAX];
>  		uint16_t head_idx;
> -		uint32_t dummy_len;
> +		uint32_t buf_len;
>  		uint16_t nr_vec = 0;
>  		int err;
> 
>  		if (unlikely(fill_vec_buf_split(dev, vq,
>  						vq->last_avail_idx + i,
>  						&nr_vec, buf_vec,
> -						&head_idx, &dummy_len,
> +						&head_idx, &buf_len,
>  						VHOST_ACCESS_RO) < 0))
>  			break;
> 
>  		if (likely(dev->dequeue_zero_copy == 0))
>  			update_shadow_used_ring_split(vq, head_idx, 0);
> 
> -		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
> +		pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool,
> buf_len);
>  		if (unlikely(pkts[i] == NULL)) {
>  			RTE_LOG(ERR, VHOST_DATA,
>  				"Failed to allocate memory for mbuf.\n");
> @@ -1451,14 +1542,14 @@ virtio_dev_tx_packed(struct virtio_net *dev,
> struct vhost_virtqueue *vq,
>  	for (i = 0; i < count; i++) {
>  		struct buf_vector buf_vec[BUF_VECTOR_MAX];
>  		uint16_t buf_id;
> -		uint32_t dummy_len;
> +		uint32_t buf_len;
>  		uint16_t desc_count, nr_vec = 0;
>  		int err;
> 
>  		if (unlikely(fill_vec_buf_packed(dev, vq,
>  						vq->last_avail_idx,
> &desc_count,
>  						buf_vec, &nr_vec,
> -						&buf_id, &dummy_len,
> +						&buf_id, &buf_len,
>  						VHOST_ACCESS_RO) < 0))
>  			break;
> 
> @@ -1466,7 +1557,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct
> vhost_virtqueue *vq,
>  			update_shadow_used_ring_packed(vq, buf_id, 0,
>  					desc_count);
> 
> -		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
> +		pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool,
> buf_len);
>  		if (unlikely(pkts[i] == NULL)) {
>  			RTE_LOG(ERR, VHOST_DATA,
>  				"Failed to allocate memory for mbuf.\n");
> --
> 2.20.1


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH v2] vhost: add support for large buffers.
  2019-10-04 20:10 ` [dpdk-dev] [PATCH v2] vhost: add support for large buffers Flavio Leitner
  2019-10-06  4:47   ` Shahaf Shuler
@ 2019-10-10  5:12   ` Tiwei Bie
  2019-10-10 12:12     ` Flavio Leitner
  2019-10-11 17:09   ` [dpdk-dev] [PATCH v3] " Flavio Leitner
  2 siblings, 1 reply; 32+ messages in thread
From: Tiwei Bie @ 2019-10-10  5:12 UTC (permalink / raw)
  To: Flavio Leitner
  Cc: dev, Ilya Maximets, Maxime Coquelin, Shahaf Shuler,
	David Marchand, Obrembski MichalX, Stokes Ian


[PATCH v2] vhost: add support for large buffers.

There is a warning reported by devtools/check-git-log.sh:
the '.' at the end of the title should be dropped.

On Fri, Oct 04, 2019 at 05:10:08PM -0300, Flavio Leitner wrote:
> The rte_vhost_dequeue_burst supports two ways of dequeuing data.
> If the data fits into a buffer, then all data is copied and a
> single linear buffer is returned. Otherwise it allocates
> additional mbufs and chains them together to return a multiple
> segments mbuf.
> 
> While that covers most use cases, it forces applications that
> need to work with larger data sizes to support multiple segments
> mbufs. The non-linear characteristic brings complexity and
> performance implications to the application.
> 
> To resolve the issue, let the host provide during registration
> if attaching an external buffer to pktmbuf is supported and if
> only linear buffer are supported.
> 
> Signed-off-by: Flavio Leitner <fbl@sysclose.org>
> ---
>  doc/guides/prog_guide/vhost_lib.rst |  35 ++++++++++
>  lib/librte_vhost/rte_vhost.h        |   4 ++
>  lib/librte_vhost/socket.c           |  10 +++
>  lib/librte_vhost/vhost.c            |  22 ++++++
>  lib/librte_vhost/vhost.h            |   4 ++
>  lib/librte_vhost/virtio_net.c       | 103 ++++++++++++++++++++++++++--
>  6 files changed, 172 insertions(+), 6 deletions(-)
> 
> - Changelog:
>   V2:
>     - Used rte_malloc() instead of another mempool as suggested by Shahaf.
>     - Added the documentation section.
>     - Using driver registration to negotiate the features.
>     - OvS PoC code:
>       https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7
> 
> diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
> index fc3ee4353..07e40e3c5 100644
> --- a/doc/guides/prog_guide/vhost_lib.rst
> +++ b/doc/guides/prog_guide/vhost_lib.rst
> @@ -117,6 +117,41 @@ The following is an overview of some key Vhost API functions:
>      Enabling this flag should only be done when the calling application does
>      not pre-fault the guest shared memory, otherwise migration would fail.
>  
> +  - ``RTE_VHOST_USER_LINEARBUF_SUPPORT``
> +
> +    Enabling this flag forces vhost dequeue function to only provide linear
> +    pktmbuf (no multi-segmented pktmbuf).
> +
> +    The vhost library by default provides a single pktmbuf for given a
> +    packet, but if for some reason the data doesn't fit into a single
> +    pktmbuf (e.g., TSO is enabled), the library will allocate additional
> +    pktmbufs from the same mempool and chain them together to create a
> +    multi-segmented pktmbuf.
> +
> +    However, the vhost application needs to support multi-segmented format.
> +    If the vhost application does not support that format and requires large
> +    buffers to be dequeue, this flag should be enabled to force only linear
> +    buffers (see RTE_VHOST_USER_EXTBUF_SUPPORT) or drop the packet.
> +
> +    It is disabled by default.
> +
> +  - ``RTE_VHOST_USER_EXTBUF_SUPPORT``
> +
> +    Enabling this flag allows vhost dequeue function to allocate and attach
> +    an external buffer to a pktmbuf if the pkmbuf doesn't provide enough
> +    space to store all data.
> +
> +    This is useful when the vhost application wants to support large packets
> +    but doesn't want to increase the default mempool object size nor to
> +    support multi-segmented mbufs (non-linear). In this case, a fresh buffer
> +    is allocated using rte_malloc() which gets attached to a pktmbuf using
> +    rte_pktmbuf_attach_extbuf().
> +
> +    See RTE_VHOST_USER_LINEARBUF_SUPPORT as well to disable multi-segmented
> +    mbufs for application that doesn't support chained mbufs.
> +
> +    It is disabled by default.
> +
>  * ``rte_vhost_driver_set_features(path, features)``
>  
>    This function sets the feature bits the vhost-user driver supports. The
> diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
> index 19474bca0..b821b5df4 100644
> --- a/lib/librte_vhost/rte_vhost.h
> +++ b/lib/librte_vhost/rte_vhost.h
> @@ -30,6 +30,10 @@ extern "C" {
>  #define RTE_VHOST_USER_DEQUEUE_ZERO_COPY	(1ULL << 2)
>  #define RTE_VHOST_USER_IOMMU_SUPPORT	(1ULL << 3)
>  #define RTE_VHOST_USER_POSTCOPY_SUPPORT		(1ULL << 4)
> +/* support mbuf with external buffer attached */
> +#define RTE_VHOST_USER_EXTBUF_SUPPORT	(1ULL << 5)
> +/* support only linear buffers (no chained mbufs) */
> +#define RTE_VHOST_USER_LINEARBUF_SUPPORT	(1ULL << 6)
>  
>  /** Protocol features. */
>  #ifndef VHOST_USER_PROTOCOL_F_MQ
> diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
> index 274988c4d..0ba610fda 100644
> --- a/lib/librte_vhost/socket.c
> +++ b/lib/librte_vhost/socket.c
> @@ -40,6 +40,8 @@ struct vhost_user_socket {
>  	bool dequeue_zero_copy;
>  	bool iommu_support;
>  	bool use_builtin_virtio_net;
> +	bool extbuf;
> +	bool linearbuf;
>  
>  	/*
>  	 * The "supported_features" indicates the feature bits the
> @@ -232,6 +234,12 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
>  	if (vsocket->dequeue_zero_copy)
>  		vhost_enable_dequeue_zero_copy(vid);
>  
> +	if (vsocket->extbuf)
> +		vhost_enable_extbuf(vid);
> +
> +	if (vsocket->linearbuf)
> +		vhost_enable_linearbuf(vid);
> +
>  	RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid);
>  
>  	if (vsocket->notify_ops->new_connection) {
> @@ -870,6 +878,8 @@ rte_vhost_driver_register(const char *path, uint64_t flags)
>  		goto out_free;
>  	}
>  	vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
> +	vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
> +	vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;

It could be problematic to use dequeue_zero_copy with the
above two features at the same time.
We may want to reject such a combination.

>  
>  	/*
>  	 * Set the supported features correctly for the builtin vhost-user
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index cea44df8c..77457f538 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -605,6 +605,28 @@ vhost_set_builtin_virtio_net(int vid, bool enable)
>  		dev->flags &= ~VIRTIO_DEV_BUILTIN_VIRTIO_NET;
>  }
>  
> +void
> +vhost_enable_extbuf(int vid)
> +{
> +	struct virtio_net *dev = get_device(vid);
> +
> +	if (dev == NULL)
> +		return;
> +
> +	dev->extbuf = 1;
> +}
> +
> +void
> +vhost_enable_linearbuf(int vid)
> +{
> +	struct virtio_net *dev = get_device(vid);
> +
> +	if (dev == NULL)
> +		return;
> +
> +	dev->linearbuf = 1;
> +}
> +
>  int
>  rte_vhost_get_mtu(int vid, uint16_t *mtu)
>  {
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 5131a97a3..0346bd118 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -302,6 +302,8 @@ struct virtio_net {
>  	rte_atomic16_t		broadcast_rarp;
>  	uint32_t		nr_vring;
>  	int			dequeue_zero_copy;
> +	int			extbuf;
> +	int			linearbuf;
>  	struct vhost_virtqueue	*virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
>  #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
>  	char			ifname[IF_NAME_SZ];
> @@ -476,6 +478,8 @@ void vhost_attach_vdpa_device(int vid, int did);
>  void vhost_set_ifname(int, const char *if_name, unsigned int if_len);
>  void vhost_enable_dequeue_zero_copy(int vid);
>  void vhost_set_builtin_virtio_net(int vid, bool enable);
> +void vhost_enable_extbuf(int vid);
> +void vhost_enable_linearbuf(int vid);
>  
>  struct vhost_device_ops const *vhost_driver_callback_get(const char *path);
>  
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 5b85b832d..fca75161d 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -2,6 +2,7 @@
>   * Copyright(c) 2010-2016 Intel Corporation
>   */
>  
> +#include <assert.h>
>  #include <stdint.h>
>  #include <stdbool.h>
>  #include <linux/virtio_net.h>
> @@ -1289,6 +1290,96 @@ get_zmbuf(struct vhost_virtqueue *vq)
>  	return NULL;
>  }
>  
> +static void
> +virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque)
> +{
> +	rte_free(opaque);
> +}
> +
> +static int
> +virtio_dev_extbuf_alloc(struct rte_mbuf *pkt, uint16_t size)
> +{
> +	struct rte_mbuf_ext_shared_info *shinfo;
> +	uint16_t buf_len;

It also includes RTE_PKTMBUF_HEADROOM and sizeof(*shinfo).
Is uint16_t big enough?

> +	rte_iova_t iova;
> +	void *buf;
> +
> +	shinfo = NULL;
> +	buf_len = size + RTE_PKTMBUF_HEADROOM;
> +
> +	/* Try to use pkt buffer to store shinfo to reduce the amount of memory
> +	 * required, otherwise store shinfo in the new buffer.
> +	 */
> +	if (rte_pktmbuf_tailroom(pkt) > sizeof(*shinfo))
> +		shinfo = rte_pktmbuf_mtod(pkt,
> +					  struct rte_mbuf_ext_shared_info *);
> +	else {
> +		if (unlikely(buf_len + sizeof(shinfo) > UINT16_MAX)) {

Should be sizeof(*shinfo)

> +			RTE_LOG(ERR, VHOST_DATA,
> +				"buffer size exceeded maximum.\n");
> +			return -ENOSPC;
> +		}
> +
> +		buf_len += sizeof(shinfo);

Ditto.

> +	}
> +
> +	buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
> +	if (unlikely(buf == NULL)) {
> +		RTE_LOG(ERR, VHOST_DATA,
> +				"Failed to allocate memory for mbuf.\n");

Looks better to just have 3 tabs before the string.

> +		return -ENOMEM;
> +	}
> +
> +	/* initialize shinfo */
> +	if (shinfo) {
> +		shinfo->free_cb = virtio_dev_extbuf_free;
> +		shinfo->fcb_opaque = buf;
> +		rte_mbuf_ext_refcnt_set(shinfo, 1);
> +	} else {
> +		shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
> +					      virtio_dev_extbuf_free, buf);
> +		assert(shinfo);

As a library, we shouldn't abort the process.
We can just return the error.

> +	}
> +
> +	iova = rte_mempool_virt2iova(buf);

Should be rte_malloc_virt2iova().

> +	rte_pktmbuf_attach_extbuf(pkt, buf, iova, buf_len, shinfo);
> +	rte_pktmbuf_reset_headroom(pkt);
> +	assert(pkt->ol_flags == EXT_ATTACHED_MBUF);

Ditto.

> +
> +	return 0;
> +}
> +
> +/*
> + * Allocate a host supported pktmbuf.
> + */
> +static __rte_always_inline struct rte_mbuf *
> +virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
> +			 uint16_t data_len)
> +{
> +	struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
> +
> +	if (unlikely(pkt == NULL))
> +		return NULL;
> +
> +	if (rte_pktmbuf_tailroom(pkt) >= data_len)
> +		return pkt;
> +
> +	/* attach an external buffer if supported */
> +	if (dev->extbuf && !virtio_dev_extbuf_alloc(pkt, data_len))
> +		return pkt;
> +
> +	/* check if chained buffers are allowed */
> +	if (!dev->linearbuf)
> +		return pkt;
> +
> +	/* Data doesn't fit into the buffer and the host supports
> +	 * only linear buffers
> +	 */
> +	rte_pktmbuf_free(pkt);
> +
> +	return NULL;
> +}
> +
>  static __rte_noinline uint16_t
>  virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
> @@ -1343,21 +1434,21 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	for (i = 0; i < count; i++) {
>  		struct buf_vector buf_vec[BUF_VECTOR_MAX];
>  		uint16_t head_idx;
> -		uint32_t dummy_len;
> +		uint32_t buf_len;
>  		uint16_t nr_vec = 0;
>  		int err;
>  
>  		if (unlikely(fill_vec_buf_split(dev, vq,
>  						vq->last_avail_idx + i,
>  						&nr_vec, buf_vec,
> -						&head_idx, &dummy_len,
> +						&head_idx, &buf_len,
>  						VHOST_ACCESS_RO) < 0))
>  			break;
>  
>  		if (likely(dev->dequeue_zero_copy == 0))
>  			update_shadow_used_ring_split(vq, head_idx, 0);
>  
> -		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
> +		pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
>  		if (unlikely(pkts[i] == NULL)) {
>  			RTE_LOG(ERR, VHOST_DATA,
>  				"Failed to allocate memory for mbuf.\n");
> @@ -1451,14 +1542,14 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	for (i = 0; i < count; i++) {
>  		struct buf_vector buf_vec[BUF_VECTOR_MAX];
>  		uint16_t buf_id;
> -		uint32_t dummy_len;
> +		uint32_t buf_len;
>  		uint16_t desc_count, nr_vec = 0;
>  		int err;
>  
>  		if (unlikely(fill_vec_buf_packed(dev, vq,
>  						vq->last_avail_idx, &desc_count,
>  						buf_vec, &nr_vec,
> -						&buf_id, &dummy_len,
> +						&buf_id, &buf_len,
>  						VHOST_ACCESS_RO) < 0))
>  			break;
>  
> @@ -1466,7 +1557,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  			update_shadow_used_ring_packed(vq, buf_id, 0,
>  					desc_count);
>  
> -		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
> +		pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
>  		if (unlikely(pkts[i] == NULL)) {
>  			RTE_LOG(ERR, VHOST_DATA,
>  				"Failed to allocate memory for mbuf.\n");
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread
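
To make the uint16_t concern above concrete, here is a small standalone
sketch (illustration only, not DPDK code): the headroom and shinfo sizes
merely stand in for RTE_PKTMBUF_HEADROOM and sizeof(*shinfo), and the packet
size is picked close to the 64K TSO limit.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t size = 65460;      /* close to a 64K TSO frame */
	uint16_t headroom = 128;    /* stand-in for RTE_PKTMBUF_HEADROOM */
	uint16_t shinfo_sz = 24;    /* stand-in for sizeof(*shinfo) */

	/* Storing the sum in uint16_t first silently wraps
	 * (65460 + 128 = 65588 -> 52), so a later "> UINT16_MAX"
	 * comparison on buf_len can no longer catch the overflow.
	 */
	uint16_t buf_len = size + headroom;
	printf("wrapped buf_len = %u\n", (unsigned)buf_len);

	/* Doing the check in a wider type before truncating catches it. */
	uint32_t total = (uint32_t)size + headroom + shinfo_sz;
	if (total > UINT16_MAX)
		printf("reject: %u bytes do not fit in uint16_t\n",
		       (unsigned)total);

	return 0;
}

This is essentially the ordering the v3 patch later in the thread adopts:
validate size + RTE_PKTMBUF_HEADROOM (plus the shinfo size when it cannot
live in the mbuf tailroom) against UINT16_MAX before folding it into the
16-bit buf_len.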

* Re: [dpdk-dev] [PATCH v2] vhost: add support for large buffers.
  2019-10-10  5:12   ` Tiwei Bie
@ 2019-10-10 12:12     ` Flavio Leitner
  0 siblings, 0 replies; 32+ messages in thread
From: Flavio Leitner @ 2019-10-10 12:12 UTC (permalink / raw)
  To: Tiwei Bie
  Cc: dev, Ilya Maximets, Maxime Coquelin, Shahaf Shuler,
	David Marchand, Obrembski MichalX, Stokes Ian


Hi Tiwei,

Thanks for the review. I will follow up with an improved patch.
fbl

On Thu, 10 Oct 2019 13:12:57 +0800
Tiwei Bie <tiwei.bie@intel.com> wrote:

> [PATCH v2] vhost: add support for large buffers.
> 
> There is a warning reported by devtools/check-git-log.sh
> The '.' at the end of the title should be dropped.
> 
> On Fri, Oct 04, 2019 at 05:10:08PM -0300, Flavio Leitner wrote:
> > The rte_vhost_dequeue_burst supports two ways of dequeuing data.
> > If the data fits into a buffer, then all data is copied and a
> > single linear buffer is returned. Otherwise it allocates
> > additional mbufs and chains them together to return a multiple
> > segments mbuf.
> > 
> > While that covers most use cases, it forces applications that
> > need to work with larger data sizes to support multiple segments
> > mbufs. The non-linear characteristic brings complexity and
> > performance implications to the application.
> > 
> > To resolve the issue, let the host provide during registration
> > if attaching an external buffer to pktmbuf is supported and if
> > only linear buffer are supported.
> > 
> > Signed-off-by: Flavio Leitner <fbl@sysclose.org>
> > ---
> >  doc/guides/prog_guide/vhost_lib.rst |  35 ++++++++++
> >  lib/librte_vhost/rte_vhost.h        |   4 ++
> >  lib/librte_vhost/socket.c           |  10 +++
> >  lib/librte_vhost/vhost.c            |  22 ++++++
> >  lib/librte_vhost/vhost.h            |   4 ++
> >  lib/librte_vhost/virtio_net.c       | 103
> > ++++++++++++++++++++++++++-- 6 files changed, 172 insertions(+), 6
> > deletions(-)
> > 
> > - Changelog:
> >   V2:
> >     - Used rte_malloc() instead of another mempool as suggested by
> > Shahaf.
> >     - Added the documentation section.
> >     - Using driver registration to negotiate the features.
> >     - OvS PoC code:
> >       https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7
> > 
> > diff --git a/doc/guides/prog_guide/vhost_lib.rst
> > b/doc/guides/prog_guide/vhost_lib.rst index fc3ee4353..07e40e3c5
> > 100644 --- a/doc/guides/prog_guide/vhost_lib.rst
> > +++ b/doc/guides/prog_guide/vhost_lib.rst
> > @@ -117,6 +117,41 @@ The following is an overview of some key Vhost
> > API functions: Enabling this flag should only be done when the
> > calling application does not pre-fault the guest shared memory,
> > otherwise migration would fail. 
> > +  - ``RTE_VHOST_USER_LINEARBUF_SUPPORT``
> > +
> > +    Enabling this flag forces vhost dequeue function to only
> > provide linear
> > +    pktmbuf (no multi-segmented pktmbuf).
> > +
> > +    The vhost library by default provides a single pktmbuf for
> > given a
> > +    packet, but if for some reason the data doesn't fit into a
> > single
> > +    pktmbuf (e.g., TSO is enabled), the library will allocate
> > additional
> > +    pktmbufs from the same mempool and chain them together to
> > create a
> > +    multi-segmented pktmbuf.
> > +
> > +    However, the vhost application needs to support
> > multi-segmented format.
> > +    If the vhost application does not support that format and
> > requires large
> > +    buffers to be dequeue, this flag should be enabled to force
> > only linear
> > +    buffers (see RTE_VHOST_USER_EXTBUF_SUPPORT) or drop the packet.
> > +
> > +    It is disabled by default.
> > +
> > +  - ``RTE_VHOST_USER_EXTBUF_SUPPORT``
> > +
> > +    Enabling this flag allows vhost dequeue function to allocate
> > and attach
> > +    an external buffer to a pktmbuf if the pkmbuf doesn't provide
> > enough
> > +    space to store all data.
> > +
> > +    This is useful when the vhost application wants to support
> > large packets
> > +    but doesn't want to increase the default mempool object size
> > nor to
> > +    support multi-segmented mbufs (non-linear). In this case, a
> > fresh buffer
> > +    is allocated using rte_malloc() which gets attached to a
> > pktmbuf using
> > +    rte_pktmbuf_attach_extbuf().
> > +
> > +    See RTE_VHOST_USER_LINEARBUF_SUPPORT as well to disable
> > multi-segmented
> > +    mbufs for application that doesn't support chained mbufs.
> > +
> > +    It is disabled by default.
> > +
> >  * ``rte_vhost_driver_set_features(path, features)``
> >  
> >    This function sets the feature bits the vhost-user driver
> > supports. The diff --git a/lib/librte_vhost/rte_vhost.h
> > b/lib/librte_vhost/rte_vhost.h index 19474bca0..b821b5df4 100644
> > --- a/lib/librte_vhost/rte_vhost.h
> > +++ b/lib/librte_vhost/rte_vhost.h
> > @@ -30,6 +30,10 @@ extern "C" {
> >  #define RTE_VHOST_USER_DEQUEUE_ZERO_COPY	(1ULL << 2)
> >  #define RTE_VHOST_USER_IOMMU_SUPPORT	(1ULL << 3)
> >  #define RTE_VHOST_USER_POSTCOPY_SUPPORT		(1ULL << 4)
> > +/* support mbuf with external buffer attached */
> > +#define RTE_VHOST_USER_EXTBUF_SUPPORT	(1ULL << 5)
> > +/* support only linear buffers (no chained mbufs) */
> > +#define RTE_VHOST_USER_LINEARBUF_SUPPORT	(1ULL << 6)
> >  
> >  /** Protocol features. */
> >  #ifndef VHOST_USER_PROTOCOL_F_MQ
> > diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
> > index 274988c4d..0ba610fda 100644
> > --- a/lib/librte_vhost/socket.c
> > +++ b/lib/librte_vhost/socket.c
> > @@ -40,6 +40,8 @@ struct vhost_user_socket {
> >  	bool dequeue_zero_copy;
> >  	bool iommu_support;
> >  	bool use_builtin_virtio_net;
> > +	bool extbuf;
> > +	bool linearbuf;
> >  
> >  	/*
> >  	 * The "supported_features" indicates the feature bits the
> > @@ -232,6 +234,12 @@ vhost_user_add_connection(int fd, struct
> > vhost_user_socket *vsocket) if (vsocket->dequeue_zero_copy)
> >  		vhost_enable_dequeue_zero_copy(vid);
> >  
> > +	if (vsocket->extbuf)
> > +		vhost_enable_extbuf(vid);
> > +
> > +	if (vsocket->linearbuf)
> > +		vhost_enable_linearbuf(vid);
> > +
> >  	RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n",
> > vid); 
> >  	if (vsocket->notify_ops->new_connection) {
> > @@ -870,6 +878,8 @@ rte_vhost_driver_register(const char *path,
> > uint64_t flags) goto out_free;
> >  	}
> >  	vsocket->dequeue_zero_copy = flags &
> > RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
> > +	vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
> > +	vsocket->linearbuf = flags &
> > RTE_VHOST_USER_LINEARBUF_SUPPORT;  
> 
> It could be problematic to use dequeue_zero_copy with
> above two features at the same time.
> We may want to reject such combination.
> 
> >  
> >  	/*
> >  	 * Set the supported features correctly for the builtin
> > vhost-user diff --git a/lib/librte_vhost/vhost.c
> > b/lib/librte_vhost/vhost.c index cea44df8c..77457f538 100644
> > --- a/lib/librte_vhost/vhost.c
> > +++ b/lib/librte_vhost/vhost.c
> > @@ -605,6 +605,28 @@ vhost_set_builtin_virtio_net(int vid, bool
> > enable) dev->flags &= ~VIRTIO_DEV_BUILTIN_VIRTIO_NET;
> >  }
> >  
> > +void
> > +vhost_enable_extbuf(int vid)
> > +{
> > +	struct virtio_net *dev = get_device(vid);
> > +
> > +	if (dev == NULL)
> > +		return;
> > +
> > +	dev->extbuf = 1;
> > +}
> > +
> > +void
> > +vhost_enable_linearbuf(int vid)
> > +{
> > +	struct virtio_net *dev = get_device(vid);
> > +
> > +	if (dev == NULL)
> > +		return;
> > +
> > +	dev->linearbuf = 1;
> > +}
> > +
> >  int
> >  rte_vhost_get_mtu(int vid, uint16_t *mtu)
> >  {
> > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> > index 5131a97a3..0346bd118 100644
> > --- a/lib/librte_vhost/vhost.h
> > +++ b/lib/librte_vhost/vhost.h
> > @@ -302,6 +302,8 @@ struct virtio_net {
> >  	rte_atomic16_t		broadcast_rarp;
> >  	uint32_t		nr_vring;
> >  	int			dequeue_zero_copy;
> > +	int			extbuf;
> > +	int			linearbuf;
> >  	struct vhost_virtqueue
> > *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2]; #define IF_NAME_SZ (PATH_MAX
> > > IFNAMSIZ ? PATH_MAX : IFNAMSIZ) char
> > > ifname[IF_NAME_SZ];
> > @@ -476,6 +478,8 @@ void vhost_attach_vdpa_device(int vid, int did);
> >  void vhost_set_ifname(int, const char *if_name, unsigned int
> > if_len); void vhost_enable_dequeue_zero_copy(int vid);
> >  void vhost_set_builtin_virtio_net(int vid, bool enable);
> > +void vhost_enable_extbuf(int vid);
> > +void vhost_enable_linearbuf(int vid);
> >  
> >  struct vhost_device_ops const *vhost_driver_callback_get(const
> > char *path); 
> > diff --git a/lib/librte_vhost/virtio_net.c
> > b/lib/librte_vhost/virtio_net.c index 5b85b832d..fca75161d 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -2,6 +2,7 @@
> >   * Copyright(c) 2010-2016 Intel Corporation
> >   */
> >  
> > +#include <assert.h>
> >  #include <stdint.h>
> >  #include <stdbool.h>
> >  #include <linux/virtio_net.h>
> > @@ -1289,6 +1290,96 @@ get_zmbuf(struct vhost_virtqueue *vq)
> >  	return NULL;
> >  }
> >  
> > +static void
> > +virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque)
> > +{
> > +	rte_free(opaque);
> > +}
> > +
> > +static int
> > +virtio_dev_extbuf_alloc(struct rte_mbuf *pkt, uint16_t size)
> > +{
> > +	struct rte_mbuf_ext_shared_info *shinfo;
> > +	uint16_t buf_len;  
> 
> It also includes RTE_PKTMBUF_HEADROOM and sizeof(*shinfo).
> Is uint16_t big enough?
> 
> > +	rte_iova_t iova;
> > +	void *buf;
> > +
> > +	shinfo = NULL;
> > +	buf_len = size + RTE_PKTMBUF_HEADROOM;
> > +
> > +	/* Try to use pkt buffer to store shinfo to reduce the
> > amount of memory
> > +	 * required, otherwise store shinfo in the new buffer.
> > +	 */
> > +	if (rte_pktmbuf_tailroom(pkt) > sizeof(*shinfo))
> > +		shinfo = rte_pktmbuf_mtod(pkt,
> > +					  struct
> > rte_mbuf_ext_shared_info *);
> > +	else {
> > +		if (unlikely(buf_len + sizeof(shinfo) >
> > UINT16_MAX)) {  
> 
> Should be sizeof(*shinfo)
> 
> > +			RTE_LOG(ERR, VHOST_DATA,
> > +				"buffer size exceeded maximum.\n");
> > +			return -ENOSPC;
> > +		}
> > +
> > +		buf_len += sizeof(shinfo);  
> 
> Ditto.
> 
> > +	}
> > +
> > +	buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
> > +	if (unlikely(buf == NULL)) {
> > +		RTE_LOG(ERR, VHOST_DATA,
> > +				"Failed to allocate memory for
> > mbuf.\n");  
> 
> Looks better to just have 3 tabs before the string.
> 
> > +		return -ENOMEM;
> > +	}
> > +
> > +	/* initialize shinfo */
> > +	if (shinfo) {
> > +		shinfo->free_cb = virtio_dev_extbuf_free;
> > +		shinfo->fcb_opaque = buf;
> > +		rte_mbuf_ext_refcnt_set(shinfo, 1);
> > +	} else {
> > +		shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf,
> > &buf_len,
> > +
> > virtio_dev_extbuf_free, buf);
> > +		assert(shinfo);  
> 
> As a library, we shouldn't abort the process.
> We can just return the error.
> 
> > +	}
> > +
> > +	iova = rte_mempool_virt2iova(buf);  
> 
> Should be rte_malloc_virt2iova().
> 
> > +	rte_pktmbuf_attach_extbuf(pkt, buf, iova, buf_len, shinfo);
> > +	rte_pktmbuf_reset_headroom(pkt);
> > +	assert(pkt->ol_flags == EXT_ATTACHED_MBUF);  
> 
> Ditto.
> 
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Allocate a host supported pktmbuf.
> > + */
> > +static __rte_always_inline struct rte_mbuf *
> > +virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct
> > rte_mempool *mp,
> > +			 uint16_t data_len)
> > +{
> > +	struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
> > +
> > +	if (unlikely(pkt == NULL))
> > +		return NULL;
> > +
> > +	if (rte_pktmbuf_tailroom(pkt) >= data_len)
> > +		return pkt;
> > +
> > +	/* attach an external buffer if supported */
> > +	if (dev->extbuf && !virtio_dev_extbuf_alloc(pkt, data_len))
> > +		return pkt;
> > +
> > +	/* check if chained buffers are allowed */
> > +	if (!dev->linearbuf)
> > +		return pkt;
> > +
> > +	/* Data doesn't fit into the buffer and the host supports
> > +	 * only linear buffers
> > +	 */
> > +	rte_pktmbuf_free(pkt);
> > +
> > +	return NULL;
> > +}
> > +
> >  static __rte_noinline uint16_t
> >  virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue
> > *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
> > uint16_t count) @@ -1343,21 +1434,21 @@ virtio_dev_tx_split(struct
> > virtio_net *dev, struct vhost_virtqueue *vq, for (i = 0; i < count;
> > i++) { struct buf_vector buf_vec[BUF_VECTOR_MAX];
> >  		uint16_t head_idx;
> > -		uint32_t dummy_len;
> > +		uint32_t buf_len;
> >  		uint16_t nr_vec = 0;
> >  		int err;
> >  
> >  		if (unlikely(fill_vec_buf_split(dev, vq,
> >  						vq->last_avail_idx
> > + i, &nr_vec, buf_vec,
> > -						&head_idx,
> > &dummy_len,
> > +						&head_idx,
> > &buf_len, VHOST_ACCESS_RO) < 0))
> >  			break;
> >  
> >  		if (likely(dev->dequeue_zero_copy == 0))
> >  			update_shadow_used_ring_split(vq,
> > head_idx, 0); 
> > -		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
> > +		pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool,
> > buf_len); if (unlikely(pkts[i] == NULL)) {
> >  			RTE_LOG(ERR, VHOST_DATA,
> >  				"Failed to allocate memory for
> > mbuf.\n"); @@ -1451,14 +1542,14 @@ virtio_dev_tx_packed(struct
> > virtio_net *dev, struct vhost_virtqueue *vq, for (i = 0; i < count;
> > i++) { struct buf_vector buf_vec[BUF_VECTOR_MAX];
> >  		uint16_t buf_id;
> > -		uint32_t dummy_len;
> > +		uint32_t buf_len;
> >  		uint16_t desc_count, nr_vec = 0;
> >  		int err;
> >  
> >  		if (unlikely(fill_vec_buf_packed(dev, vq,
> >  						vq->last_avail_idx,
> > &desc_count, buf_vec, &nr_vec,
> > -						&buf_id,
> > &dummy_len,
> > +						&buf_id, &buf_len,
> >  						VHOST_ACCESS_RO) <
> > 0)) break;
> >  
> > @@ -1466,7 +1557,7 @@ virtio_dev_tx_packed(struct virtio_net *dev,
> > struct vhost_virtqueue *vq, update_shadow_used_ring_packed(vq,
> > buf_id, 0, desc_count);
> >  
> > -		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
> > +		pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool,
> > buf_len); if (unlikely(pkts[i] == NULL)) {
> >  			RTE_LOG(ERR, VHOST_DATA,
> >  				"Failed to allocate memory for
> > mbuf.\n"); -- 
> > 2.20.1
> >   


^ permalink raw reply	[flat|nested] 32+ messages in thread
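
Since v2 this series leaves the rte_vhost_dequeue_burst() prototype
untouched, so an application's receive path does not change; only the shape
of the returned mbufs does. The sketch below is illustrative only (function
name, queue index and burst size are invented) and simply shows that freeing
works as usual, because rte_pktmbuf_free() releases an attached external
buffer through its registered callback:

#include <rte_mbuf.h>
#include <rte_vhost.h>

#define BURST 32

static void
drain_guest_tx(int vid, struct rte_mempool *mp)
{
	struct rte_mbuf *pkts[BURST];
	uint16_t i, n;

	/* Queue 1 is the guest TX / host RX direction (VIRTIO_TXQ in the
	 * DPDK vhost examples).
	 */
	n = rte_vhost_dequeue_burst(vid, 1, mp, pkts, BURST);

	for (i = 0; i < n; i++) {
		/* Real code would process the packet here. With
		 * RTE_VHOST_USER_LINEARBUF_SUPPORT each mbuf is a single
		 * segment; a large packet may carry an external
		 * rte_malloc()'ed buffer, which this call frees via the
		 * external buffer's free callback.
		 */
		rte_pktmbuf_free(pkts[i]);
	}
}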

* [dpdk-dev] [PATCH v3] vhost: add support for large buffers
  2019-10-04 20:10 ` [dpdk-dev] [PATCH v2] vhost: add support for large buffers Flavio Leitner
  2019-10-06  4:47   ` Shahaf Shuler
  2019-10-10  5:12   ` Tiwei Bie
@ 2019-10-11 17:09   ` Flavio Leitner
  2019-10-14  2:44     ` Tiwei Bie
  2019-10-15 16:17     ` [dpdk-dev] [PATCH v4] " Flavio Leitner
  2 siblings, 2 replies; 32+ messages in thread
From: Flavio Leitner @ 2019-10-11 17:09 UTC (permalink / raw)
  To: dev
  Cc: Ilya Maximets, Maxime Coquelin, Shahaf Shuler, David Marchand,
	Tiwei Bie, Obrembski MichalX, Stokes Ian

The rte_vhost_dequeue_burst supports two ways of dequeuing data.
If the data fits into a buffer, then all data is copied and a
single linear buffer is returned. Otherwise it allocates
additional mbufs and chains them together to return a multiple
segments mbuf.

While that covers most use cases, it forces applications that
need to work with larger data sizes to support multiple segments
mbufs. The non-linear characteristic brings complexity and
performance implications to the application.

To resolve the issue, add support for attaching an external buffer
to a pktmbuf and let the host indicate during registration whether
attaching an external buffer to a pktmbuf is supported and whether
only linear buffers are supported.

Signed-off-by: Flavio Leitner <fbl@sysclose.org>
---
 doc/guides/prog_guide/vhost_lib.rst |  35 +++++++++
 lib/librte_vhost/rte_vhost.h        |   4 ++
 lib/librte_vhost/socket.c           |  22 ++++++
 lib/librte_vhost/vhost.c            |  22 ++++++
 lib/librte_vhost/vhost.h            |   4 ++
 lib/librte_vhost/virtio_net.c       | 108 ++++++++++++++++++++++++----
 6 files changed, 181 insertions(+), 14 deletions(-)

- Changelog:
  V3:
    - prevent the new features to be used with zero copy
    - fixed sizeof() usage
    - fixed log msg indentation
    - removed/replaced asserts
    - used the correct virt2iova function
    - fixed the patch's title
    - OvS PoC code:
      https://github.com/fleitner/ovs/tree/rte_malloc-v3
  V2:
    - Used rte_malloc() instead of another mempool as suggested by Shahaf.
    - Added the documentation section.
    - Using driver registration to negotiate the features.
    - OvS PoC code:
      https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7


diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
index fc3ee4353..07e40e3c5 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -117,6 +117,41 @@ The following is an overview of some key Vhost API functions:
     Enabling this flag should only be done when the calling application does
     not pre-fault the guest shared memory, otherwise migration would fail.
 
+  - ``RTE_VHOST_USER_LINEARBUF_SUPPORT``
+
+    Enabling this flag forces vhost dequeue function to only provide linear
+    pktmbuf (no multi-segmented pktmbuf).
+
+    The vhost library by default provides a single pktmbuf for a given
+    packet, but if for some reason the data doesn't fit into a single
+    pktmbuf (e.g., TSO is enabled), the library will allocate additional
+    pktmbufs from the same mempool and chain them together to create a
+    multi-segmented pktmbuf.
+
+    However, the vhost application needs to support multi-segmented format.
+    If the vhost application does not support that format and requires large
+    buffers to be dequeued, this flag should be enabled to force only linear
+    buffers (see RTE_VHOST_USER_EXTBUF_SUPPORT) or drop the packet.
+
+    It is disabled by default.
+
+  - ``RTE_VHOST_USER_EXTBUF_SUPPORT``
+
+    Enabling this flag allows vhost dequeue function to allocate and attach
+    an external buffer to a pktmbuf if the pktmbuf doesn't provide enough
+    space to store all data.
+
+    This is useful when the vhost application wants to support large packets
+    but doesn't want to increase the default mempool object size nor to
+    support multi-segmented mbufs (non-linear). In this case, a fresh buffer
+    is allocated using rte_malloc() which gets attached to a pktmbuf using
+    rte_pktmbuf_attach_extbuf().
+
+    See RTE_VHOST_USER_LINEARBUF_SUPPORT as well to disable multi-segmented
+    mbufs for applications that don't support chained mbufs.
+
+    It is disabled by default.
+
 * ``rte_vhost_driver_set_features(path, features)``
 
   This function sets the feature bits the vhost-user driver supports. The
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 19474bca0..b821b5df4 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -30,6 +30,10 @@ extern "C" {
 #define RTE_VHOST_USER_DEQUEUE_ZERO_COPY	(1ULL << 2)
 #define RTE_VHOST_USER_IOMMU_SUPPORT	(1ULL << 3)
 #define RTE_VHOST_USER_POSTCOPY_SUPPORT		(1ULL << 4)
+/* support mbuf with external buffer attached */
+#define RTE_VHOST_USER_EXTBUF_SUPPORT	(1ULL << 5)
+/* support only linear buffers (no chained mbufs) */
+#define RTE_VHOST_USER_LINEARBUF_SUPPORT	(1ULL << 6)
 
 /** Protocol features. */
 #ifndef VHOST_USER_PROTOCOL_F_MQ
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index 274988c4d..e546be2a8 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -40,6 +40,8 @@ struct vhost_user_socket {
 	bool dequeue_zero_copy;
 	bool iommu_support;
 	bool use_builtin_virtio_net;
+	bool extbuf;
+	bool linearbuf;
 
 	/*
 	 * The "supported_features" indicates the feature bits the
@@ -232,6 +234,12 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
 	if (vsocket->dequeue_zero_copy)
 		vhost_enable_dequeue_zero_copy(vid);
 
+	if (vsocket->extbuf)
+		vhost_enable_extbuf(vid);
+
+	if (vsocket->linearbuf)
+		vhost_enable_linearbuf(vid);
+
 	RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid);
 
 	if (vsocket->notify_ops->new_connection) {
@@ -870,6 +878,8 @@ rte_vhost_driver_register(const char *path, uint64_t flags)
 		goto out_free;
 	}
 	vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
+	vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
+	vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;
 
 	/*
 	 * Set the supported features correctly for the builtin vhost-user
@@ -894,6 +904,18 @@ rte_vhost_driver_register(const char *path, uint64_t flags)
 	 * not compatible with postcopy.
 	 */
 	if (vsocket->dequeue_zero_copy) {
+		if (vsocket->extbuf) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+			"error: zero copy is incompatible with external buffers\n");
+			ret = -1;
+			goto out_free;
+		}
+		if (vsocket->linearbuf) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+			"error: zero copy is incompatible with linear buffers\n");
+			ret = -1;
+			goto out_free;
+		}
 		vsocket->supported_features &= ~(1ULL << VIRTIO_F_IN_ORDER);
 		vsocket->features &= ~(1ULL << VIRTIO_F_IN_ORDER);
 
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index cea44df8c..77457f538 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -605,6 +605,28 @@ vhost_set_builtin_virtio_net(int vid, bool enable)
 		dev->flags &= ~VIRTIO_DEV_BUILTIN_VIRTIO_NET;
 }
 
+void
+vhost_enable_extbuf(int vid)
+{
+	struct virtio_net *dev = get_device(vid);
+
+	if (dev == NULL)
+		return;
+
+	dev->extbuf = 1;
+}
+
+void
+vhost_enable_linearbuf(int vid)
+{
+	struct virtio_net *dev = get_device(vid);
+
+	if (dev == NULL)
+		return;
+
+	dev->linearbuf = 1;
+}
+
 int
 rte_vhost_get_mtu(int vid, uint16_t *mtu)
 {
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 5131a97a3..0346bd118 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -302,6 +302,8 @@ struct virtio_net {
 	rte_atomic16_t		broadcast_rarp;
 	uint32_t		nr_vring;
 	int			dequeue_zero_copy;
+	int			extbuf;
+	int			linearbuf;
 	struct vhost_virtqueue	*virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
 #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
 	char			ifname[IF_NAME_SZ];
@@ -476,6 +478,8 @@ void vhost_attach_vdpa_device(int vid, int did);
 void vhost_set_ifname(int, const char *if_name, unsigned int if_len);
 void vhost_enable_dequeue_zero_copy(int vid);
 void vhost_set_builtin_virtio_net(int vid, bool enable);
+void vhost_enable_extbuf(int vid);
+void vhost_enable_linearbuf(int vid);
 
 struct vhost_device_ops const *vhost_driver_callback_get(const char *path);
 
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 5b85b832d..b84a72748 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1289,6 +1289,92 @@ get_zmbuf(struct vhost_virtqueue *vq)
 	return NULL;
 }
 
+static void
+virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque)
+{
+	rte_free(opaque);
+}
+
+static int
+virtio_dev_extbuf_alloc(struct rte_mbuf *pkt, uint16_t size)
+{
+	struct rte_mbuf_ext_shared_info *shinfo = NULL;
+	uint16_t buf_len = 0;
+	rte_iova_t iova;
+	void *buf;
+
+	/* Try to use pkt buffer to store shinfo to reduce the amount of memory
+	 * required, otherwise store shinfo in the new buffer.
+	 */
+	if (rte_pktmbuf_tailroom(pkt) > sizeof(*shinfo))
+		shinfo = rte_pktmbuf_mtod(pkt,
+					  struct rte_mbuf_ext_shared_info *);
+	else
+		buf_len += sizeof(*shinfo);
+
+	if (unlikely(buf_len + size + RTE_PKTMBUF_HEADROOM > UINT16_MAX)) {
+		RTE_LOG(ERR, VHOST_DATA,
+			"buffer size %d exceeded maximum.\n", buf_len);
+		return -ENOSPC;
+	}
+
+	buf_len += size + RTE_PKTMBUF_HEADROOM;
+	buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
+	if (unlikely(buf == NULL))
+		return -ENOMEM;
+
+	/* initialize shinfo */
+	if (shinfo) {
+		shinfo->free_cb = virtio_dev_extbuf_free;
+		shinfo->fcb_opaque = buf;
+		rte_mbuf_ext_refcnt_set(shinfo, 1);
+	} else {
+		shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
+					      virtio_dev_extbuf_free, buf);
+		if (unlikely(shinfo == NULL)) {
+			RTE_LOG(ERR, VHOST_DATA, "Failed to init shinfo\n");
+			return -1;
+		}
+	}
+
+	iova = rte_malloc_virt2iova(buf);
+	rte_pktmbuf_attach_extbuf(pkt, buf, iova, buf_len, shinfo);
+	rte_pktmbuf_reset_headroom(pkt);
+
+	return 0;
+}
+
+/*
+ * Allocate a host supported pktmbuf.
+ */
+static __rte_always_inline struct rte_mbuf *
+virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
+			 uint16_t data_len)
+{
+	struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
+
+	if (unlikely(pkt == NULL))
+		return NULL;
+
+	if (rte_pktmbuf_tailroom(pkt) >= data_len)
+		return pkt;
+
+	/* attach an external buffer if supported */
+	if (dev->extbuf && !virtio_dev_extbuf_alloc(pkt, data_len))
+		return pkt;
+
+	/* check if chained buffers are allowed */
+	if (!dev->linearbuf)
+		return pkt;
+
+	/* Data doesn't fit into the buffer and the host supports
+	 * only linear buffers
+	 */
+	rte_pktmbuf_free(pkt);
+
+	return NULL;
+}
+
 static __rte_noinline uint16_t
 virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
@@ -1343,26 +1429,23 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	for (i = 0; i < count; i++) {
 		struct buf_vector buf_vec[BUF_VECTOR_MAX];
 		uint16_t head_idx;
-		uint32_t dummy_len;
+		uint32_t buf_len;
 		uint16_t nr_vec = 0;
 		int err;
 
 		if (unlikely(fill_vec_buf_split(dev, vq,
 						vq->last_avail_idx + i,
 						&nr_vec, buf_vec,
-						&head_idx, &dummy_len,
+						&head_idx, &buf_len,
 						VHOST_ACCESS_RO) < 0))
 			break;
 
 		if (likely(dev->dequeue_zero_copy == 0))
 			update_shadow_used_ring_split(vq, head_idx, 0);
 
-		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
-		if (unlikely(pkts[i] == NULL)) {
-			RTE_LOG(ERR, VHOST_DATA,
-				"Failed to allocate memory for mbuf.\n");
+		pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
+		if (unlikely(pkts[i] == NULL))
 			break;
-		}
 
 		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
 				mbuf_pool);
@@ -1451,14 +1534,14 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	for (i = 0; i < count; i++) {
 		struct buf_vector buf_vec[BUF_VECTOR_MAX];
 		uint16_t buf_id;
-		uint32_t dummy_len;
+		uint32_t buf_len;
 		uint16_t desc_count, nr_vec = 0;
 		int err;
 
 		if (unlikely(fill_vec_buf_packed(dev, vq,
 						vq->last_avail_idx, &desc_count,
 						buf_vec, &nr_vec,
-						&buf_id, &dummy_len,
+						&buf_id, &buf_len,
 						VHOST_ACCESS_RO) < 0))
 			break;
 
@@ -1466,12 +1549,9 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			update_shadow_used_ring_packed(vq, buf_id, 0,
 					desc_count);
 
-		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
-		if (unlikely(pkts[i] == NULL)) {
-			RTE_LOG(ERR, VHOST_DATA,
-				"Failed to allocate memory for mbuf.\n");
+		pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
+		if (unlikely(pkts[i] == NULL))
 			break;
-		}
 
 		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
 				mbuf_pool);
-- 
2.20.1


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH v3] vhost: add support for large buffers
  2019-10-11 17:09   ` [dpdk-dev] [PATCH v3] " Flavio Leitner
@ 2019-10-14  2:44     ` Tiwei Bie
  2019-10-15 16:17     ` [dpdk-dev] [PATCH v4] " Flavio Leitner
  1 sibling, 0 replies; 32+ messages in thread
From: Tiwei Bie @ 2019-10-14  2:44 UTC (permalink / raw)
  To: Flavio Leitner
  Cc: dev, Ilya Maximets, Maxime Coquelin, Shahaf Shuler,
	David Marchand, Obrembski MichalX, Stokes Ian

On Fri, Oct 11, 2019 at 02:09:47PM -0300, Flavio Leitner wrote:
> The rte_vhost_dequeue_burst supports two ways of dequeuing data.
> If the data fits into a buffer, then all data is copied and a
> single linear buffer is returned. Otherwise it allocates
> additional mbufs and chains them together to return a multiple
> segments mbuf.
> 
> While that covers most use cases, it forces applications that
> need to work with larger data sizes to support multiple segments
> mbufs. The non-linear characteristic brings complexity and
> performance implications to the application.
> 
> To resolve the issue, add support for attaching an external buffer
> to a pktmbuf and let the host indicate during registration whether
> attaching an external buffer to a pktmbuf is supported and whether
> only linear buffers are supported.
> 
> Signed-off-by: Flavio Leitner <fbl@sysclose.org>
> ---
>  doc/guides/prog_guide/vhost_lib.rst |  35 +++++++++
>  lib/librte_vhost/rte_vhost.h        |   4 ++
>  lib/librte_vhost/socket.c           |  22 ++++++
>  lib/librte_vhost/vhost.c            |  22 ++++++
>  lib/librte_vhost/vhost.h            |   4 ++
>  lib/librte_vhost/virtio_net.c       | 108 ++++++++++++++++++++++++----
>  6 files changed, 181 insertions(+), 14 deletions(-)

Thanks for the new version!
Overall looks good to me, few minor comments inlined.


> +static int
> +virtio_dev_extbuf_alloc(struct rte_mbuf *pkt, uint16_t size)
> +{
> +	struct rte_mbuf_ext_shared_info *shinfo = NULL;
> +	uint16_t buf_len = 0;
> +	rte_iova_t iova;
> +	void *buf;
> +
> +	/* Try to use pkt buffer to store shinfo to reduce the amount of memory
> +	 * required, otherwise store shinfo in the new buffer.
> +	 */
> +	if (rte_pktmbuf_tailroom(pkt) > sizeof(*shinfo))

How about "rte_pktmbuf_tailroom(pkt) >= sizeof(*shinfo)"?

> +		shinfo = rte_pktmbuf_mtod(pkt,
> +					  struct rte_mbuf_ext_shared_info *);
> +	else
> +		buf_len += sizeof(*shinfo);

The space taken by rte_pktmbuf_ext_shinfo_init_helper() may
be bigger than this depending on buf_end.

https://github.com/DPDK/dpdk/blob/cc8627bc6d7a7bd3ccc2653b746aac4fbaa0bc50/lib/librte_mbuf/rte_mbuf.h#L1585-L1589
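
The helper places the shinfo at the aligned-down end of the provided
buffer and shrinks *buf_len accordingly, roughly like this (paraphrasing
the code linked above, not a verbatim copy):

	void *buf_end = RTE_PTR_ADD(buf_addr, *buf_len);

	shinfo = RTE_PTR_ALIGN_FLOOR(RTE_PTR_SUB(buf_end, sizeof(*shinfo)),
				     sizeof(uintptr_t));
	*buf_len = RTE_PTR_DIFF(shinfo, buf_addr);

So the tail may lose up to sizeof(*shinfo) plus alignment padding, not
just sizeof(*shinfo).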

> +
> +	if (unlikely(buf_len + size + RTE_PKTMBUF_HEADROOM > UINT16_MAX)) {
> +		RTE_LOG(ERR, VHOST_DATA,

We may have a fallback available when extbuf allocation fails, so
logging this as an error may be too noisy.

> +			"buffer size %d exceeded maximum.\n", buf_len);
> +		return -ENOSPC;
> +	}
> +
> +	buf_len += size + RTE_PKTMBUF_HEADROOM;
> +	buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
> +	if (unlikely(buf == NULL))
> +		return -ENOMEM;
> +
> +	/* initialize shinfo */
> +	if (shinfo) {
> +		shinfo->free_cb = virtio_dev_extbuf_free;
> +		shinfo->fcb_opaque = buf;
> +		rte_mbuf_ext_refcnt_set(shinfo, 1);
> +	} else {
> +		shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
> +					      virtio_dev_extbuf_free, buf);
> +		if (unlikely(shinfo == NULL)) {
> +			RTE_LOG(ERR, VHOST_DATA, "Failed to init shinfo\n");
> +			return -1;

buf should be freed.
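
E.g. something like this in that error path (just a sketch of the fix):

		if (unlikely(shinfo == NULL)) {
			rte_free(buf);
			RTE_LOG(ERR, VHOST_DATA, "Failed to init shinfo\n");
			return -1;
		}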


> @@ -1343,26 +1429,23 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	for (i = 0; i < count; i++) {
>  		struct buf_vector buf_vec[BUF_VECTOR_MAX];
>  		uint16_t head_idx;
> -		uint32_t dummy_len;
> +		uint32_t buf_len;
>  		uint16_t nr_vec = 0;
>  		int err;
>  
>  		if (unlikely(fill_vec_buf_split(dev, vq,
>  						vq->last_avail_idx + i,
>  						&nr_vec, buf_vec,
> -						&head_idx, &dummy_len,
> +						&head_idx, &buf_len,
>  						VHOST_ACCESS_RO) < 0))
>  			break;
>  
>  		if (likely(dev->dequeue_zero_copy == 0))
>  			update_shadow_used_ring_split(vq, head_idx, 0);
>  
> -		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
> -		if (unlikely(pkts[i] == NULL)) {
> -			RTE_LOG(ERR, VHOST_DATA,
> -				"Failed to allocate memory for mbuf.\n");
> +		pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);

The len in the descriptor set by guests is 32-bit; we may not want to
trim it to 16-bit unconditionally like this.

> +		if (unlikely(pkts[i] == NULL))
>  			break;
> -		}
>  
>  		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
>  				mbuf_pool);
> @@ -1451,14 +1534,14 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	for (i = 0; i < count; i++) {
>  		struct buf_vector buf_vec[BUF_VECTOR_MAX];
>  		uint16_t buf_id;
> -		uint32_t dummy_len;
> +		uint32_t buf_len;
>  		uint16_t desc_count, nr_vec = 0;
>  		int err;
>  
>  		if (unlikely(fill_vec_buf_packed(dev, vq,
>  						vq->last_avail_idx, &desc_count,
>  						buf_vec, &nr_vec,
> -						&buf_id, &dummy_len,
> +						&buf_id, &buf_len,
>  						VHOST_ACCESS_RO) < 0))
>  			break;
>  
> @@ -1466,12 +1549,9 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  			update_shadow_used_ring_packed(vq, buf_id, 0,
>  					desc_count);
>  
> -		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
> -		if (unlikely(pkts[i] == NULL)) {
> -			RTE_LOG(ERR, VHOST_DATA,
> -				"Failed to allocate memory for mbuf.\n");
> +		pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);

Ditto.

> +		if (unlikely(pkts[i] == NULL))
>  			break;
> -		}
>  
>  		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
>  				mbuf_pool);
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [dpdk-dev] [PATCH v4] vhost: add support for large buffers
  2019-10-11 17:09   ` [dpdk-dev] [PATCH v3] " Flavio Leitner
  2019-10-14  2:44     ` Tiwei Bie
@ 2019-10-15 16:17     ` Flavio Leitner
  2019-10-15 17:41       ` Ilya Maximets
  2019-10-15 18:59       ` [dpdk-dev] [PATCH v5] " Flavio Leitner
  1 sibling, 2 replies; 32+ messages in thread
From: Flavio Leitner @ 2019-10-15 16:17 UTC (permalink / raw)
  To: dev
  Cc: Ilya Maximets, Maxime Coquelin, Shahaf Shuler, David Marchand,
	Tiwei Bie, Obrembski MichalX, Stokes Ian

The rte_vhost_dequeue_burst supports two ways of dequeuing data.
If the data fits into a buffer, then all data is copied and a
single linear buffer is returned. Otherwise it allocates
additional mbufs and chains them together to return a multiple
segments mbuf.

While that covers most use cases, it forces applications that
need to work with larger data sizes to support multiple segments
mbufs. The non-linear characteristic brings complexity and
performance implications to the application.

To resolve the issue, add support for attaching an external buffer
to a pktmbuf and let the host indicate during registration whether
attaching an external buffer to a pktmbuf is supported and whether
only linear buffers are supported.

Signed-off-by: Flavio Leitner <fbl@sysclose.org>
---
 doc/guides/prog_guide/vhost_lib.rst |  35 +++++++++
 lib/librte_vhost/rte_vhost.h        |   4 +
 lib/librte_vhost/socket.c           |  22 ++++++
 lib/librte_vhost/vhost.c            |  22 ++++++
 lib/librte_vhost/vhost.h            |   4 +
 lib/librte_vhost/virtio_net.c       | 109 ++++++++++++++++++++++++----
 6 files changed, 182 insertions(+), 14 deletions(-)


- Changelog:
  v4:
    - allow to use pktmbuf if there is exact space
    - removed log message if the buffer is too big
    - fixed the length to include align padding
    - free allocated buf if shinfo fails
  v3:
    - prevent the new features to be used with zero copy
    - fixed sizeof() usage
    - fixed log msg indentation
    - removed/replaced asserts
    - used the correct virt2iova function
    - fixed the patch's title
    - OvS PoC code:
      https://github.com/fleitner/ovs/tree/rte_malloc-v3
  v2:
    - Used rte_malloc() instead of another mempool as suggested by Shahaf.
    - Added the documentation section.
    - Using driver registration to negotiate the features.
    - OvS PoC code:
      https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7



diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
index fc3ee4353..07e40e3c5 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -117,6 +117,41 @@ The following is an overview of some key Vhost API functions:
     Enabling this flag should only be done when the calling application does
     not pre-fault the guest shared memory, otherwise migration would fail.
 
+  - ``RTE_VHOST_USER_LINEARBUF_SUPPORT``
+
+    Enabling this flag forces vhost dequeue function to only provide linear
+    pktmbuf (no multi-segmented pktmbuf).
+
+    The vhost library by default provides a single pktmbuf for a given
+    packet, but if for some reason the data doesn't fit into a single
+    pktmbuf (e.g., TSO is enabled), the library will allocate additional
+    pktmbufs from the same mempool and chain them together to create a
+    multi-segmented pktmbuf.
+
+    However, the vhost application needs to support multi-segmented format.
+    If the vhost application does not support that format and requires large
+    buffers to be dequeued, this flag should be enabled to force only linear
+    buffers (see RTE_VHOST_USER_EXTBUF_SUPPORT) or drop the packet.
+
+    It is disabled by default.
+
+  - ``RTE_VHOST_USER_EXTBUF_SUPPORT``
+
+    Enabling this flag allows vhost dequeue function to allocate and attach
+    an external buffer to a pktmbuf if the pktmbuf doesn't provide enough
+    space to store all data.
+
+    This is useful when the vhost application wants to support large packets
+    but doesn't want to increase the default mempool object size nor to
+    support multi-segmented mbufs (non-linear). In this case, a fresh buffer
+    is allocated using rte_malloc() which gets attached to a pktmbuf using
+    rte_pktmbuf_attach_extbuf().
+
+    See RTE_VHOST_USER_LINEARBUF_SUPPORT as well to disable multi-segmented
+    mbufs for applications that don't support chained mbufs.
+
+    It is disabled by default.
+
 * ``rte_vhost_driver_set_features(path, features)``
 
   This function sets the feature bits the vhost-user driver supports. The
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 19474bca0..b821b5df4 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -30,6 +30,10 @@ extern "C" {
 #define RTE_VHOST_USER_DEQUEUE_ZERO_COPY	(1ULL << 2)
 #define RTE_VHOST_USER_IOMMU_SUPPORT	(1ULL << 3)
 #define RTE_VHOST_USER_POSTCOPY_SUPPORT		(1ULL << 4)
+/* support mbuf with external buffer attached */
+#define RTE_VHOST_USER_EXTBUF_SUPPORT	(1ULL << 5)
+/* support only linear buffers (no chained mbufs) */
+#define RTE_VHOST_USER_LINEARBUF_SUPPORT	(1ULL << 6)
 
 /** Protocol features. */
 #ifndef VHOST_USER_PROTOCOL_F_MQ
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index 274988c4d..e546be2a8 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -40,6 +40,8 @@ struct vhost_user_socket {
 	bool dequeue_zero_copy;
 	bool iommu_support;
 	bool use_builtin_virtio_net;
+	bool extbuf;
+	bool linearbuf;
 
 	/*
 	 * The "supported_features" indicates the feature bits the
@@ -232,6 +234,12 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
 	if (vsocket->dequeue_zero_copy)
 		vhost_enable_dequeue_zero_copy(vid);
 
+	if (vsocket->extbuf)
+		vhost_enable_extbuf(vid);
+
+	if (vsocket->linearbuf)
+		vhost_enable_linearbuf(vid);
+
 	RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid);
 
 	if (vsocket->notify_ops->new_connection) {
@@ -870,6 +878,8 @@ rte_vhost_driver_register(const char *path, uint64_t flags)
 		goto out_free;
 	}
 	vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
+	vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
+	vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;
 
 	/*
 	 * Set the supported features correctly for the builtin vhost-user
@@ -894,6 +904,18 @@ rte_vhost_driver_register(const char *path, uint64_t flags)
 	 * not compatible with postcopy.
 	 */
 	if (vsocket->dequeue_zero_copy) {
+		if (vsocket->extbuf) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+			"error: zero copy is incompatible with external buffers\n");
+			ret = -1;
+			goto out_free;
+		}
+		if (vsocket->linearbuf) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+			"error: zero copy is incompatible with linear buffers\n");
+			ret = -1;
+			goto out_free;
+		}
 		vsocket->supported_features &= ~(1ULL << VIRTIO_F_IN_ORDER);
 		vsocket->features &= ~(1ULL << VIRTIO_F_IN_ORDER);
 
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index cea44df8c..77457f538 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -605,6 +605,28 @@ vhost_set_builtin_virtio_net(int vid, bool enable)
 		dev->flags &= ~VIRTIO_DEV_BUILTIN_VIRTIO_NET;
 }
 
+void
+vhost_enable_extbuf(int vid)
+{
+	struct virtio_net *dev = get_device(vid);
+
+	if (dev == NULL)
+		return;
+
+	dev->extbuf = 1;
+}
+
+void
+vhost_enable_linearbuf(int vid)
+{
+	struct virtio_net *dev = get_device(vid);
+
+	if (dev == NULL)
+		return;
+
+	dev->linearbuf = 1;
+}
+
 int
 rte_vhost_get_mtu(int vid, uint16_t *mtu)
 {
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 5131a97a3..0346bd118 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -302,6 +302,8 @@ struct virtio_net {
 	rte_atomic16_t		broadcast_rarp;
 	uint32_t		nr_vring;
 	int			dequeue_zero_copy;
+	int			extbuf;
+	int			linearbuf;
 	struct vhost_virtqueue	*virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
 #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
 	char			ifname[IF_NAME_SZ];
@@ -476,6 +478,8 @@ void vhost_attach_vdpa_device(int vid, int did);
 void vhost_set_ifname(int, const char *if_name, unsigned int if_len);
 void vhost_enable_dequeue_zero_copy(int vid);
 void vhost_set_builtin_virtio_net(int vid, bool enable);
+void vhost_enable_extbuf(int vid);
+void vhost_enable_linearbuf(int vid);
 
 struct vhost_device_ops const *vhost_driver_callback_get(const char *path);
 
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 5b85b832d..da69ab1db 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1289,6 +1289,93 @@ get_zmbuf(struct vhost_virtqueue *vq)
 	return NULL;
 }
 
+static void
+virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque)
+{
+	rte_free(opaque);
+}
+
+static int
+virtio_dev_extbuf_alloc(struct rte_mbuf *pkt, uint32_t size)
+{
+	struct rte_mbuf_ext_shared_info *shinfo = NULL;
+	uint32_t total_len = RTE_PKTMBUF_HEADROOM + size;
+	uint16_t buf_len;
+	rte_iova_t iova;
+	void *buf;
+
+	/* Try to use pkt buffer to store shinfo to reduce the amount of memory
+	 * required, otherwise store shinfo in the new buffer.
+	 */
+	if (rte_pktmbuf_tailroom(pkt) >= sizeof(*shinfo))
+		shinfo = rte_pktmbuf_mtod(pkt,
+					  struct rte_mbuf_ext_shared_info *);
+	else {
+		total_len += sizeof(*shinfo) + sizeof(uintptr_t);
+		total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
+	}
+
+	if (unlikely(total_len > UINT16_MAX))
+		return -ENOSPC;
+
+	buf_len = total_len;
+	buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
+	if (unlikely(buf == NULL))
+		return -ENOMEM;
+
+	/* Initialize shinfo */
+	if (shinfo) {
+		shinfo->free_cb = virtio_dev_extbuf_free;
+		shinfo->fcb_opaque = buf;
+		rte_mbuf_ext_refcnt_set(shinfo, 1);
+	} else {
+		shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
+					      virtio_dev_extbuf_free, buf);
+		if (unlikely(shinfo == NULL)) {
+			rte_free(buf);
+			RTE_LOG(ERR, VHOST_DATA, "Failed to init shinfo\n");
+			return -1;
+		}
+	}
+
+	iova = rte_malloc_virt2iova(buf);
+	rte_pktmbuf_attach_extbuf(pkt, buf, iova, buf_len, shinfo);
+	rte_pktmbuf_reset_headroom(pkt);
+
+	return 0;
+}
+
+/*
+ * Allocate a host supported pktmbuf.
+ */
+static __rte_always_inline struct rte_mbuf *
+virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
+			 uint32_t data_len)
+{
+	struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
+
+	if (unlikely(pkt == NULL))
+		return NULL;
+
+	if (rte_pktmbuf_tailroom(pkt) >= data_len)
+		return pkt;
+
+	/* attach an external buffer if supported */
+	if (dev->extbuf && !virtio_dev_extbuf_alloc(pkt, data_len))
+		return pkt;
+
+	/* check if chained buffers are allowed */
+	if (!dev->linearbuf)
+		return pkt;
+
+	/* Data doesn't fit into the buffer and the host supports
+	 * only linear buffers
+	 */
+	rte_pktmbuf_free(pkt);
+
+	return NULL;
+}
+
 static __rte_noinline uint16_t
 virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
@@ -1343,26 +1430,23 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	for (i = 0; i < count; i++) {
 		struct buf_vector buf_vec[BUF_VECTOR_MAX];
 		uint16_t head_idx;
-		uint32_t dummy_len;
+		uint32_t buf_len;
 		uint16_t nr_vec = 0;
 		int err;
 
 		if (unlikely(fill_vec_buf_split(dev, vq,
 						vq->last_avail_idx + i,
 						&nr_vec, buf_vec,
-						&head_idx, &dummy_len,
+						&head_idx, &buf_len,
 						VHOST_ACCESS_RO) < 0))
 			break;
 
 		if (likely(dev->dequeue_zero_copy == 0))
 			update_shadow_used_ring_split(vq, head_idx, 0);
 
-		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
-		if (unlikely(pkts[i] == NULL)) {
-			RTE_LOG(ERR, VHOST_DATA,
-				"Failed to allocate memory for mbuf.\n");
+		pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
+		if (unlikely(pkts[i] == NULL))
 			break;
-		}
 
 		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
 				mbuf_pool);
@@ -1451,14 +1535,14 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	for (i = 0; i < count; i++) {
 		struct buf_vector buf_vec[BUF_VECTOR_MAX];
 		uint16_t buf_id;
-		uint32_t dummy_len;
+		uint32_t buf_len;
 		uint16_t desc_count, nr_vec = 0;
 		int err;
 
 		if (unlikely(fill_vec_buf_packed(dev, vq,
 						vq->last_avail_idx, &desc_count,
 						buf_vec, &nr_vec,
-						&buf_id, &dummy_len,
+						&buf_id, &buf_len,
 						VHOST_ACCESS_RO) < 0))
 			break;
 
@@ -1466,12 +1550,9 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			update_shadow_used_ring_packed(vq, buf_id, 0,
 					desc_count);
 
-		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
-		if (unlikely(pkts[i] == NULL)) {
-			RTE_LOG(ERR, VHOST_DATA,
-				"Failed to allocate memory for mbuf.\n");
+		pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
+		if (unlikely(pkts[i] == NULL))
 			break;
-		}
 
 		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
 				mbuf_pool);
-- 
2.20.1


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH v4] vhost: add support for large buffers
  2019-10-15 16:17     ` [dpdk-dev] [PATCH v4] " Flavio Leitner
@ 2019-10-15 17:41       ` Ilya Maximets
  2019-10-15 18:44         ` Flavio Leitner
  2019-10-15 18:59       ` [dpdk-dev] [PATCH v5] " Flavio Leitner
  1 sibling, 1 reply; 32+ messages in thread
From: Ilya Maximets @ 2019-10-15 17:41 UTC (permalink / raw)
  To: Flavio Leitner, dev
  Cc: Ilya Maximets, Maxime Coquelin, Shahaf Shuler, David Marchand,
	Tiwei Bie, Obrembski MichalX, Stokes Ian

Hi.
Not a full review.  Few comments inline.

Best regards, Ilya Maximets.

On 15.10.2019 18:17, Flavio Leitner wrote:
> The rte_vhost_dequeue_burst supports two ways of dequeuing data.
> If the data fits into a buffer, then all data is copied and a
> single linear buffer is returned. Otherwise it allocates
> additional mbufs and chains them together to return a multiple
> segments mbuf.
> 
> While that covers most use cases, it forces applications that
> need to work with larger data sizes to support multiple segments
> mbufs. The non-linear characteristic brings complexity and
> performance implications to the application.
> 
> To resolve the issue, add support for attaching an external buffer
> to a pktmbuf and let the host indicate during registration whether
> attaching an external buffer to a pktmbuf is supported and whether
> only linear buffers are supported.
> 
> Signed-off-by: Flavio Leitner <fbl@sysclose.org>
> ---
>   doc/guides/prog_guide/vhost_lib.rst |  35 +++++++++
>   lib/librte_vhost/rte_vhost.h        |   4 +
>   lib/librte_vhost/socket.c           |  22 ++++++
>   lib/librte_vhost/vhost.c            |  22 ++++++
>   lib/librte_vhost/vhost.h            |   4 +
>   lib/librte_vhost/virtio_net.c       | 109 ++++++++++++++++++++++++----
>   6 files changed, 182 insertions(+), 14 deletions(-)
> 
> 
> - Changelog:
>    v4:
>      - allow to use pktmbuf if there is exact space
>      - removed log message if the buffer is too big
>      - fixed the length to include align padding
>      - free allocated buf if shinfo fails
>    v3:
>      - prevent the new features to be used with zero copy
>      - fixed sizeof() usage
>      - fixed log msg indentation
>      - removed/replaced asserts
>      - used the correct virt2iova function
>      - fixed the patch's title
>      - OvS PoC code:
>        https://github.com/fleitner/ovs/tree/rte_malloc-v3
>    v2:
>      - Used rte_malloc() instead of another mempool as suggested by Shahaf.
>      - Added the documentation section.
>      - Using driver registration to negotiate the features.
>      - OvS PoC code:
>        https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7
> 
> 
> 
> diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
> index fc3ee4353..07e40e3c5 100644
> --- a/doc/guides/prog_guide/vhost_lib.rst
> +++ b/doc/guides/prog_guide/vhost_lib.rst
> @@ -117,6 +117,41 @@ The following is an overview of some key Vhost API functions:
>       Enabling this flag should only be done when the calling application does
>       not pre-fault the guest shared memory, otherwise migration would fail.
>   
> +  - ``RTE_VHOST_USER_LINEARBUF_SUPPORT``
> +
> +    Enabling this flag forces vhost dequeue function to only provide linear
> +    pktmbuf (no multi-segmented pktmbuf).
> +
> +    The vhost library by default provides a single pktmbuf for a given
> +    packet, but if for some reason the data doesn't fit into a single
> +    pktmbuf (e.g., TSO is enabled), the library will allocate additional
> +    pktmbufs from the same mempool and chain them together to create a
> +    multi-segmented pktmbuf.
> +
> +    However, the vhost application needs to support multi-segmented format.
> +    If the vhost application does not support that format and requires large
> +    buffers to be dequeued, this flag should be enabled to force only linear
> +    buffers (see RTE_VHOST_USER_EXTBUF_SUPPORT) or drop the packet.
> +
> +    It is disabled by default.
> +
> +  - ``RTE_VHOST_USER_EXTBUF_SUPPORT``
> +
> +    Enabling this flag allows vhost dequeue function to allocate and attach
> +    an external buffer to a pktmbuf if the pktmbuf doesn't provide enough
> +    space to store all data.
> +
> +    This is useful when the vhost application wants to support large packets
> +    but doesn't want to increase the default mempool object size nor to
> +    support multi-segmented mbufs (non-linear). In this case, a fresh buffer
> +    is allocated using rte_malloc() which gets attached to a pktmbuf using
> +    rte_pktmbuf_attach_extbuf().
> +
> +    See RTE_VHOST_USER_LINEARBUF_SUPPORT as well to disable multi-segmented
> +    mbufs for applications that don't support chained mbufs.
> +
> +    It is disabled by default.
> +
>   * ``rte_vhost_driver_set_features(path, features)``
>   
>     This function sets the feature bits the vhost-user driver supports. The
> diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
> index 19474bca0..b821b5df4 100644
> --- a/lib/librte_vhost/rte_vhost.h
> +++ b/lib/librte_vhost/rte_vhost.h
> @@ -30,6 +30,10 @@ extern "C" {
>   #define RTE_VHOST_USER_DEQUEUE_ZERO_COPY	(1ULL << 2)
>   #define RTE_VHOST_USER_IOMMU_SUPPORT	(1ULL << 3)
>   #define RTE_VHOST_USER_POSTCOPY_SUPPORT		(1ULL << 4)
> +/* support mbuf with external buffer attached */
> +#define RTE_VHOST_USER_EXTBUF_SUPPORT	(1ULL << 5)
> +/* support only linear buffers (no chained mbufs) */
> +#define RTE_VHOST_USER_LINEARBUF_SUPPORT	(1ULL << 6)
>   
>   /** Protocol features. */
>   #ifndef VHOST_USER_PROTOCOL_F_MQ
> diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
> index 274988c4d..e546be2a8 100644
> --- a/lib/librte_vhost/socket.c
> +++ b/lib/librte_vhost/socket.c
> @@ -40,6 +40,8 @@ struct vhost_user_socket {
>   	bool dequeue_zero_copy;
>   	bool iommu_support;
>   	bool use_builtin_virtio_net;
> +	bool extbuf;
> +	bool linearbuf;
>   
>   	/*
>   	 * The "supported_features" indicates the feature bits the
> @@ -232,6 +234,12 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
>   	if (vsocket->dequeue_zero_copy)
>   		vhost_enable_dequeue_zero_copy(vid);
>   
> +	if (vsocket->extbuf)
> +		vhost_enable_extbuf(vid);
> +
> +	if (vsocket->linearbuf)
> +		vhost_enable_linearbuf(vid);
> +
>   	RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid);
>   
>   	if (vsocket->notify_ops->new_connection) {
> @@ -870,6 +878,8 @@ rte_vhost_driver_register(const char *path, uint64_t flags)
>   		goto out_free;
>   	}
>   	vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
> +	vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
> +	vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;
>   
>   	/*
>   	 * Set the supported features correctly for the builtin vhost-user
> @@ -894,6 +904,18 @@ rte_vhost_driver_register(const char *path, uint64_t flags)
>   	 * not compatible with postcopy.
>   	 */
>   	if (vsocket->dequeue_zero_copy) {
> +		if (vsocket->extbuf) {
> +			RTE_LOG(ERR, VHOST_CONFIG,
> +			"error: zero copy is incompatible with external buffers\n");
> +			ret = -1;
> +			goto out_free;

There should be 'out_mutex'.

> +		}
> +		if (vsocket->linearbuf) {
> +			RTE_LOG(ERR, VHOST_CONFIG,
> +			"error: zero copy is incompatible with linear buffers\n");
> +			ret = -1;
> +			goto out_free;

Ditto.

> +		}
>   		vsocket->supported_features &= ~(1ULL << VIRTIO_F_IN_ORDER);
>   		vsocket->features &= ~(1ULL << VIRTIO_F_IN_ORDER);
>   
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index cea44df8c..77457f538 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -605,6 +605,28 @@ vhost_set_builtin_virtio_net(int vid, bool enable)
>   		dev->flags &= ~VIRTIO_DEV_BUILTIN_VIRTIO_NET;
>   }
>   
> +void
> +vhost_enable_extbuf(int vid)
> +{
> +	struct virtio_net *dev = get_device(vid);
> +
> +	if (dev == NULL)
> +		return;
> +
> +	dev->extbuf = 1;
> +}
> +
> +void
> +vhost_enable_linearbuf(int vid)
> +{
> +	struct virtio_net *dev = get_device(vid);
> +
> +	if (dev == NULL)
> +		return;
> +
> +	dev->linearbuf = 1;
> +}
> +
>   int
>   rte_vhost_get_mtu(int vid, uint16_t *mtu)
>   {
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 5131a97a3..0346bd118 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -302,6 +302,8 @@ struct virtio_net {
>   	rte_atomic16_t		broadcast_rarp;
>   	uint32_t		nr_vring;
>   	int			dequeue_zero_copy;
> +	int			extbuf;
> +	int			linearbuf;
>   	struct vhost_virtqueue	*virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
>   #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
>   	char			ifname[IF_NAME_SZ];
> @@ -476,6 +478,8 @@ void vhost_attach_vdpa_device(int vid, int did);
>   void vhost_set_ifname(int, const char *if_name, unsigned int if_len);
>   void vhost_enable_dequeue_zero_copy(int vid);
>   void vhost_set_builtin_virtio_net(int vid, bool enable);
> +void vhost_enable_extbuf(int vid);
> +void vhost_enable_linearbuf(int vid);
>   
>   struct vhost_device_ops const *vhost_driver_callback_get(const char *path);
>   
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 5b85b832d..da69ab1db 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -1289,6 +1289,93 @@ get_zmbuf(struct vhost_virtqueue *vq)
>   	return NULL;
>   }
>   
> +static void
> +virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque)
> +{
> +	rte_free(opaque);
> +}
> +
> +static int
> +virtio_dev_extbuf_alloc(struct rte_mbuf *pkt, uint32_t size)
> +{
> +	struct rte_mbuf_ext_shared_info *shinfo = NULL;
> +	uint32_t total_len = RTE_PKTMBUF_HEADROOM + size;
> +	uint16_t buf_len;
> +	rte_iova_t iova;
> +	void *buf;
> +
> +	/* Try to use pkt buffer to store shinfo to reduce the amount of memory
> +	 * required, otherwise store shinfo in the new buffer.
> +	 */
> +	if (rte_pktmbuf_tailroom(pkt) >= sizeof(*shinfo))
> +		shinfo = rte_pktmbuf_mtod(pkt,
> +					  struct rte_mbuf_ext_shared_info *);
> +	else {
> +		total_len += sizeof(*shinfo) + sizeof(uintptr_t);
> +		total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
> +	}
> +
> +	if (unlikely(total_len > UINT16_MAX))
> +		return -ENOSPC;
> +
> +	buf_len = total_len;
> +	buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
> +	if (unlikely(buf == NULL))
> +		return -ENOMEM;
> +
> +	/* Initialize shinfo */
> +	if (shinfo) {
> +		shinfo->free_cb = virtio_dev_extbuf_free;
> +		shinfo->fcb_opaque = buf;
> +		rte_mbuf_ext_refcnt_set(shinfo, 1);
> +	} else {
> +		shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
> +					      virtio_dev_extbuf_free, buf);
> +		if (unlikely(shinfo == NULL)) {
> +			rte_free(buf);
> +			RTE_LOG(ERR, VHOST_DATA, "Failed to init shinfo\n");
> +			return -1;
> +		}
> +	}
> +
> +	iova = rte_malloc_virt2iova(buf);
> +	rte_pktmbuf_attach_extbuf(pkt, buf, iova, buf_len, shinfo);
> +	rte_pktmbuf_reset_headroom(pkt);
> +
> +	return 0;
> +}
> +
> +/*
> + * Allocate a host supported pktmbuf.
> + */
> +static __rte_always_inline struct rte_mbuf *
> +virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
> +			 uint32_t data_len)
> +{
> +	struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
> +
> +	if (unlikely(pkt == NULL))
> +		return NULL;
> +
> +	if (rte_pktmbuf_tailroom(pkt) >= data_len)
> +		return pkt;
> +
> +	/* attach an external buffer if supported */
> +	if (dev->extbuf && !virtio_dev_extbuf_alloc(pkt, data_len))
> +		return pkt;
> +
> +	/* check if chained buffers are allowed */
> +	if (!dev->linearbuf)
> +		return pkt;

I guess the check for 'linearbuf' should go before the check for 'extbuf'.
The use case is that allocating several buffers from the memory pool is,
probably, faster than rte_malloc() + attaching the memory.  So, if 'linearbuf'
is not requested, it might be faster to use chained mbufs.
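
I.e. roughly this ordering in virtio_dev_pktmbuf_alloc() (just a sketch
of the idea, based on the version in this patch):

	if (rte_pktmbuf_tailroom(pkt) >= data_len)
		return pkt;

	/* chaining allowed: let copy_desc_to_mbuf() chain mbufs from the pool */
	if (!dev->linearbuf)
		return pkt;

	/* linear-only: fall back to an attached external buffer, if enabled */
	if (dev->extbuf && !virtio_dev_extbuf_alloc(pkt, data_len))
		return pkt;

	rte_pktmbuf_free(pkt);
	return NULL;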

BTW, I'm not sure if we really need 2 separate options for this.
i.e. 1. +linear +extbuf --> extbuf allocated
      2. +linear -extbuf --> buffer dropped (is this case useful at all?)
      3. -linear +extbuf --> chained mbufs might be preferred (see above)
      4. -linear -extbuf --> chained mbufs

Case 4 is the default case. Case 1 is our target case for supporting large buffers.
Case 3 might not make much sense as the result is equal to case 4.
Case 2 might not be interesting for us at all, because it will lead to random
packet drops depending on their size.

But, if only cases 1 and 4 are valid and interesting to us, we could easily merge
both flags.

Thoughts?

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH v4] vhost: add support for large buffers
  2019-10-15 17:41       ` Ilya Maximets
@ 2019-10-15 18:44         ` Flavio Leitner
  0 siblings, 0 replies; 32+ messages in thread
From: Flavio Leitner @ 2019-10-15 18:44 UTC (permalink / raw)
  To: Ilya Maximets
  Cc: dev, Maxime Coquelin, Shahaf Shuler, David Marchand, Tiwei Bie,
	Obrembski MichalX, Stokes Ian

On Tue, 15 Oct 2019 19:41:52 +0200
Ilya Maximets <i.maximets@ovn.org> wrote:

> Hi.
> Not a full review.  Few comments inline.
> 
> Best regards, Ilya Maximets.

Thanks for reviewing Ilya, see below.

[...]
> > @@ -870,6 +878,8 @@ rte_vhost_driver_register(const char *path,
> > uint64_t flags) goto out_free;
> >   	}
> >   	vsocket->dequeue_zero_copy = flags &
> > RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
> > +	vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
> > +	vsocket->linearbuf = flags &
> > RTE_VHOST_USER_LINEARBUF_SUPPORT; 
> >   	/*
> >   	 * Set the supported features correctly for the builtin
> > vhost-user @@ -894,6 +904,18 @@ rte_vhost_driver_register(const
> > char *path, uint64_t flags)
> >   	 * not compatible with postcopy.
> >   	 */
> >   	if (vsocket->dequeue_zero_copy) {
> > +		if (vsocket->extbuf) {
> > +			RTE_LOG(ERR, VHOST_CONFIG,
> > +			"error: zero copy is incompatible with
> > external buffers\n");
> > +			ret = -1;
> > +			goto out_free;  
> 
> There should be 'out_mutex'.

Good catch, going to fix that.

> 
> > +		}
> > +		if (vsocket->linearbuf) {
> > +			RTE_LOG(ERR, VHOST_CONFIG,
> > +			"error: zero copy is incompatible with
> > linear buffers\n");
> > +			ret = -1;
> > +			goto out_free;  
> 
> Ditto.

Yeah

[...]
> > +/*
> > + * Allocate a host supported pktmbuf.
> > + */
> > +static __rte_always_inline struct rte_mbuf *
> > +virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct
> > rte_mempool *mp,
> > +			 uint32_t data_len)
> > +{
> > +	struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
> > +
> > +	if (unlikely(pkt == NULL))
> > +		return NULL;
> > +
> > +	if (rte_pktmbuf_tailroom(pkt) >= data_len)
> > +		return pkt;
> > +
> > +	/* attach an external buffer if supported */
> > +	if (dev->extbuf && !virtio_dev_extbuf_alloc(pkt, data_len))
> > +		return pkt;
> > +
> > +	/* check if chained buffers are allowed */
> > +	if (!dev->linearbuf)
> > +		return pkt;  
> 
> I guess the check for 'linearbuf' should go before the check for
> 'extbuf'. The use case is that allocating several buffers from the
> memory pool is, probably, faster than rte_malloc() + attaching the
> memory.  So, if 'linearbuf' is not requested, it might be
> faster to use chained mbufs.

The speed seems to depend on the length of the packet, and it is not
just the allocation but also the overall overhead of dealing with
multiple segments. If the preferred way is chained buffers, then just
don't pass any of the new flags because that's the default behavior today.
 
> BTW, I'm not sure if we really need 2 separate options for this.
> i.e. 1. +linear +extbuf --> extbuf allocated
>       2. +linear -extbuf --> buffer dropped (is this case useful
> at all?) 3. -linear +extbuf --> chained mbufs might be preferred (see
> above) 4. -linear -extbuf --> chained mbufs
> 
> Case 4 is the default case. 

Yes, in this case if the packet length fits into the pktmbuf, a single
linear mbuf is returned; otherwise it uses chained mbufs.

>  Case 1 is our target case for supporting
> large buffers. 

Right, here it enables external buffers, but we don't want chained
mbufs.

> Case 3 might not make much sense as the result is
> equal to case 4.

Well, case #3 either uses a single mbuf from the mempool or uses an
external buffer for big packets, while in case #4 there is no support
for big packets at all.

> Case 2 might not be interesting for us at all,

Case #2 says the app only supports what fits into a single mbuf and
that neither external buffers nor chained buffers are supported.

> because it will lead to random packet drops depending on their size.

Either that or wrong processing.

> But, if only cases 1 and 4 are valid and interesting to us, we could
> easily merge both flags.

For the OvS use case today, yes, but I created different flags because
they seemed like distinct useful features to me, and changing the flags
later can make the API very confusing.
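
For example, an application targeting case 1 (large packets, but only
linear mbufs) would just register with both flags; a sketch, with an
illustrative socket path:

	uint64_t flags = RTE_VHOST_USER_EXTBUF_SUPPORT |
			 RTE_VHOST_USER_LINEARBUF_SUPPORT;

	if (rte_vhost_driver_register("/tmp/vhost-user.sock", flags) != 0)
		rte_exit(EXIT_FAILURE, "vhost-user registration failed\n");

while an application that is fine with chained mbufs keeps passing
neither flag and gets today's behavior.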

Thanks!
fbl

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [dpdk-dev] [PATCH v5] vhost: add support for large buffers
  2019-10-15 16:17     ` [dpdk-dev] [PATCH v4] " Flavio Leitner
  2019-10-15 17:41       ` Ilya Maximets
@ 2019-10-15 18:59       ` Flavio Leitner
  2019-10-16 10:02         ` Maxime Coquelin
                           ` (2 more replies)
  1 sibling, 3 replies; 32+ messages in thread
From: Flavio Leitner @ 2019-10-15 18:59 UTC (permalink / raw)
  To: dev
  Cc: Ilya Maximets, Maxime Coquelin, Shahaf Shuler, David Marchand,
	Tiwei Bie, Obrembski MichalX, Stokes Ian

The rte_vhost_dequeue_burst supports two ways of dequeuing data.
If the data fits into a buffer, then all data is copied and a
single linear buffer is returned. Otherwise it allocates
additional mbufs and chains them together to return a multiple
segments mbuf.

While that covers most use cases, it forces applications that
need to work with larger data sizes to support multiple segments
mbufs. The non-linear characteristic brings complexity and
performance implications to the application.

To resolve the issue, add support for attaching an external buffer
to a pktmbuf and let the host indicate during registration whether
attaching an external buffer to a pktmbuf is supported and whether
only linear buffers are supported.

Signed-off-by: Flavio Leitner <fbl@sysclose.org>
---
 doc/guides/prog_guide/vhost_lib.rst |  35 +++++++++
 lib/librte_vhost/rte_vhost.h        |   4 +
 lib/librte_vhost/socket.c           |  22 ++++++
 lib/librte_vhost/vhost.c            |  22 ++++++
 lib/librte_vhost/vhost.h            |   4 +
 lib/librte_vhost/virtio_net.c       | 109 ++++++++++++++++++++++++----
 6 files changed, 182 insertions(+), 14 deletions(-)

- Changelog:
  v5:
    - fixed to destroy mutex if incompatible flags
  v4:
    - allow to use pktmbuf if there is exact space
    - removed log message if the buffer is too big
    - fixed the length to include align padding
    - free allocated buf if shinfo fails
  v3:
    - prevent the new features to be used with zero copy
    - fixed sizeof() usage
    - fixed log msg indentation
    - removed/replaced asserts
    - used the correct virt2iova function
    - fixed the patch's title
    - OvS PoC code:
      https://github.com/fleitner/ovs/tree/rte_malloc-v3
  v2:
    - Used rte_malloc() instead of another mempool as suggested by Shahaf.
    - Added the documentation section.
    - Using driver registration to negotiate the features.
    - OvS PoC code:
      https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7

diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
index fc3ee4353..07e40e3c5 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -117,6 +117,41 @@ The following is an overview of some key Vhost API functions:
     Enabling this flag should only be done when the calling application does
     not pre-fault the guest shared memory, otherwise migration would fail.
 
+  - ``RTE_VHOST_USER_LINEARBUF_SUPPORT``
+
+    Enabling this flag forces vhost dequeue function to only provide linear
+    pktmbuf (no multi-segmented pktmbuf).
+
+    The vhost library by default provides a single pktmbuf for a given
+    packet, but if for some reason the data doesn't fit into a single
+    pktmbuf (e.g., TSO is enabled), the library will allocate additional
+    pktmbufs from the same mempool and chain them together to create a
+    multi-segmented pktmbuf.
+
+    However, the vhost application needs to support multi-segmented format.
+    If the vhost application does not support that format and requires large
+    buffers to be dequeued, this flag should be enabled to force only linear
+    buffers (see RTE_VHOST_USER_EXTBUF_SUPPORT) or drop the packet.
+
+    It is disabled by default.
+
+  - ``RTE_VHOST_USER_EXTBUF_SUPPORT``
+
+    Enabling this flag allows vhost dequeue function to allocate and attach
+    an external buffer to a pktmbuf if the pktmbuf doesn't provide enough
+    space to store all data.
+
+    This is useful when the vhost application wants to support large packets
+    but doesn't want to increase the default mempool object size nor to
+    support multi-segmented mbufs (non-linear). In this case, a fresh buffer
+    is allocated using rte_malloc() which gets attached to a pktmbuf using
+    rte_pktmbuf_attach_extbuf().
+
+    See RTE_VHOST_USER_LINEARBUF_SUPPORT as well to disable multi-segmented
+    mbufs for applications that don't support chained mbufs.
+
+    It is disabled by default.
+
 * ``rte_vhost_driver_set_features(path, features)``
 
   This function sets the feature bits the vhost-user driver supports. The
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 19474bca0..b821b5df4 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -30,6 +30,10 @@ extern "C" {
 #define RTE_VHOST_USER_DEQUEUE_ZERO_COPY	(1ULL << 2)
 #define RTE_VHOST_USER_IOMMU_SUPPORT	(1ULL << 3)
 #define RTE_VHOST_USER_POSTCOPY_SUPPORT		(1ULL << 4)
+/* support mbuf with external buffer attached */
+#define RTE_VHOST_USER_EXTBUF_SUPPORT	(1ULL << 5)
+/* support only linear buffers (no chained mbufs) */
+#define RTE_VHOST_USER_LINEARBUF_SUPPORT	(1ULL << 6)
 
 /** Protocol features. */
 #ifndef VHOST_USER_PROTOCOL_F_MQ
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index 274988c4d..a8d433c49 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -40,6 +40,8 @@ struct vhost_user_socket {
 	bool dequeue_zero_copy;
 	bool iommu_support;
 	bool use_builtin_virtio_net;
+	bool extbuf;
+	bool linearbuf;
 
 	/*
 	 * The "supported_features" indicates the feature bits the
@@ -232,6 +234,12 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
 	if (vsocket->dequeue_zero_copy)
 		vhost_enable_dequeue_zero_copy(vid);
 
+	if (vsocket->extbuf)
+		vhost_enable_extbuf(vid);
+
+	if (vsocket->linearbuf)
+		vhost_enable_linearbuf(vid);
+
 	RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid);
 
 	if (vsocket->notify_ops->new_connection) {
@@ -870,6 +878,8 @@ rte_vhost_driver_register(const char *path, uint64_t flags)
 		goto out_free;
 	}
 	vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
+	vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
+	vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;
 
 	/*
 	 * Set the supported features correctly for the builtin vhost-user
@@ -894,6 +904,18 @@ rte_vhost_driver_register(const char *path, uint64_t flags)
 	 * not compatible with postcopy.
 	 */
 	if (vsocket->dequeue_zero_copy) {
+		if (vsocket->extbuf) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+			"error: zero copy is incompatible with external buffers\n");
+			ret = -1;
+			goto out_mutex;
+		}
+		if (vsocket->linearbuf) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+			"error: zero copy is incompatible with linear buffers\n");
+			ret = -1;
+			goto out_mutex;
+		}
 		vsocket->supported_features &= ~(1ULL << VIRTIO_F_IN_ORDER);
 		vsocket->features &= ~(1ULL << VIRTIO_F_IN_ORDER);
 
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index cea44df8c..77457f538 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -605,6 +605,28 @@ vhost_set_builtin_virtio_net(int vid, bool enable)
 		dev->flags &= ~VIRTIO_DEV_BUILTIN_VIRTIO_NET;
 }
 
+void
+vhost_enable_extbuf(int vid)
+{
+	struct virtio_net *dev = get_device(vid);
+
+	if (dev == NULL)
+		return;
+
+	dev->extbuf = 1;
+}
+
+void
+vhost_enable_linearbuf(int vid)
+{
+	struct virtio_net *dev = get_device(vid);
+
+	if (dev == NULL)
+		return;
+
+	dev->linearbuf = 1;
+}
+
 int
 rte_vhost_get_mtu(int vid, uint16_t *mtu)
 {
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 5131a97a3..0346bd118 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -302,6 +302,8 @@ struct virtio_net {
 	rte_atomic16_t		broadcast_rarp;
 	uint32_t		nr_vring;
 	int			dequeue_zero_copy;
+	int			extbuf;
+	int			linearbuf;
 	struct vhost_virtqueue	*virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
 #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
 	char			ifname[IF_NAME_SZ];
@@ -476,6 +478,8 @@ void vhost_attach_vdpa_device(int vid, int did);
 void vhost_set_ifname(int, const char *if_name, unsigned int if_len);
 void vhost_enable_dequeue_zero_copy(int vid);
 void vhost_set_builtin_virtio_net(int vid, bool enable);
+void vhost_enable_extbuf(int vid);
+void vhost_enable_linearbuf(int vid);
 
 struct vhost_device_ops const *vhost_driver_callback_get(const char *path);
 
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 5b85b832d..da69ab1db 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1289,6 +1289,93 @@ get_zmbuf(struct vhost_virtqueue *vq)
 	return NULL;
 }
 
+static void
+virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque)
+{
+	rte_free(opaque);
+}
+
+static int
+virtio_dev_extbuf_alloc(struct rte_mbuf *pkt, uint32_t size)
+{
+	struct rte_mbuf_ext_shared_info *shinfo = NULL;
+	uint32_t total_len = RTE_PKTMBUF_HEADROOM + size;
+	uint16_t buf_len;
+	rte_iova_t iova;
+	void *buf;
+
+	/* Try to use pkt buffer to store shinfo to reduce the amount of memory
+	 * required, otherwise store shinfo in the new buffer.
+	 */
+	if (rte_pktmbuf_tailroom(pkt) >= sizeof(*shinfo))
+		shinfo = rte_pktmbuf_mtod(pkt,
+					  struct rte_mbuf_ext_shared_info *);
+	else {
+		total_len += sizeof(*shinfo) + sizeof(uintptr_t);
+		total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
+	}
+
+	if (unlikely(total_len > UINT16_MAX))
+		return -ENOSPC;
+
+	buf_len = total_len;
+	buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
+	if (unlikely(buf == NULL))
+		return -ENOMEM;
+
+	/* Initialize shinfo */
+	if (shinfo) {
+		shinfo->free_cb = virtio_dev_extbuf_free;
+		shinfo->fcb_opaque = buf;
+		rte_mbuf_ext_refcnt_set(shinfo, 1);
+	} else {
+		shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
+					      virtio_dev_extbuf_free, buf);
+		if (unlikely(shinfo == NULL)) {
+			rte_free(buf);
+			RTE_LOG(ERR, VHOST_DATA, "Failed to init shinfo\n");
+			return -1;
+		}
+	}
+
+	iova = rte_malloc_virt2iova(buf);
+	rte_pktmbuf_attach_extbuf(pkt, buf, iova, buf_len, shinfo);
+	rte_pktmbuf_reset_headroom(pkt);
+
+	return 0;
+}
+
+/*
+ * Allocate a host supported pktmbuf.
+ */
+static __rte_always_inline struct rte_mbuf *
+virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
+			 uint32_t data_len)
+{
+	struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
+
+	if (unlikely(pkt == NULL))
+		return NULL;
+
+	if (rte_pktmbuf_tailroom(pkt) >= data_len)
+		return pkt;
+
+	/* attach an external buffer if supported */
+	if (dev->extbuf && !virtio_dev_extbuf_alloc(pkt, data_len))
+		return pkt;
+
+	/* check if chained buffers are allowed */
+	if (!dev->linearbuf)
+		return pkt;
+
+	/* Data doesn't fit into the buffer and the host supports
+	 * only linear buffers
+	 */
+	rte_pktmbuf_free(pkt);
+
+	return NULL;
+}
+
 static __rte_noinline uint16_t
 virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
@@ -1343,26 +1430,23 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	for (i = 0; i < count; i++) {
 		struct buf_vector buf_vec[BUF_VECTOR_MAX];
 		uint16_t head_idx;
-		uint32_t dummy_len;
+		uint32_t buf_len;
 		uint16_t nr_vec = 0;
 		int err;
 
 		if (unlikely(fill_vec_buf_split(dev, vq,
 						vq->last_avail_idx + i,
 						&nr_vec, buf_vec,
-						&head_idx, &dummy_len,
+						&head_idx, &buf_len,
 						VHOST_ACCESS_RO) < 0))
 			break;
 
 		if (likely(dev->dequeue_zero_copy == 0))
 			update_shadow_used_ring_split(vq, head_idx, 0);
 
-		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
-		if (unlikely(pkts[i] == NULL)) {
-			RTE_LOG(ERR, VHOST_DATA,
-				"Failed to allocate memory for mbuf.\n");
+		pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
+		if (unlikely(pkts[i] == NULL))
 			break;
-		}
 
 		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
 				mbuf_pool);
@@ -1451,14 +1535,14 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	for (i = 0; i < count; i++) {
 		struct buf_vector buf_vec[BUF_VECTOR_MAX];
 		uint16_t buf_id;
-		uint32_t dummy_len;
+		uint32_t buf_len;
 		uint16_t desc_count, nr_vec = 0;
 		int err;
 
 		if (unlikely(fill_vec_buf_packed(dev, vq,
 						vq->last_avail_idx, &desc_count,
 						buf_vec, &nr_vec,
-						&buf_id, &dummy_len,
+						&buf_id, &buf_len,
 						VHOST_ACCESS_RO) < 0))
 			break;
 
@@ -1466,12 +1550,9 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			update_shadow_used_ring_packed(vq, buf_id, 0,
 					desc_count);
 
-		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
-		if (unlikely(pkts[i] == NULL)) {
-			RTE_LOG(ERR, VHOST_DATA,
-				"Failed to allocate memory for mbuf.\n");
+		pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
+		if (unlikely(pkts[i] == NULL))
 			break;
-		}
 
 		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
 				mbuf_pool);
-- 
2.20.1


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH v5] vhost: add support for large buffers
  2019-10-15 18:59       ` [dpdk-dev] [PATCH v5] " Flavio Leitner
@ 2019-10-16 10:02         ` Maxime Coquelin
  2019-10-16 11:13         ` Maxime Coquelin
  2019-10-29  9:02         ` David Marchand
  2 siblings, 0 replies; 32+ messages in thread
From: Maxime Coquelin @ 2019-10-16 10:02 UTC (permalink / raw)
  To: Flavio Leitner, dev
  Cc: Ilya Maximets, Shahaf Shuler, David Marchand, Tiwei Bie,
	Obrembski MichalX, Stokes Ian

Hi Flavio,

On 10/15/19 8:59 PM, Flavio Leitner wrote:
> The rte_vhost_dequeue_burst supports two ways of dequeuing data.
> If the data fits into a buffer, then all data is copied and a
> single linear buffer is returned. Otherwise it allocates
> additional mbufs and chains them together to return a multiple
> segments mbuf.
> 
> While that covers most use cases, it forces applications that
> need to work with larger data sizes to support multiple segments
> mbufs. The non-linear characteristic brings complexity and
> performance implications to the application.
> 
> To resolve the issue, add support for attaching an external buffer
> to a pktmbuf and let the host indicate during registration whether
> attaching an external buffer to a pktmbuf is supported and whether
> only linear buffers are supported.
> 
> Signed-off-by: Flavio Leitner <fbl@sysclose.org>
> ---
>  doc/guides/prog_guide/vhost_lib.rst |  35 +++++++++
>  lib/librte_vhost/rte_vhost.h        |   4 +
>  lib/librte_vhost/socket.c           |  22 ++++++
>  lib/librte_vhost/vhost.c            |  22 ++++++
>  lib/librte_vhost/vhost.h            |   4 +
>  lib/librte_vhost/virtio_net.c       | 109 ++++++++++++++++++++++++----
>  6 files changed, 182 insertions(+), 14 deletions(-)
> 
> - Changelog:
>   v5:
>     - fixed to destroy mutex if incompatible flags
>   v4:
>     - allow to use pktmbuf if there is exact space
>     - removed log message if the buffer is too big
>     - fixed the length to include align padding
>     - free allocated buf if shinfo fails
>   v3:
>     - prevent the new features to be used with zero copy
>     - fixed sizeof() usage
>     - fixed log msg indentation
>     - removed/replaced asserts
>     - used the correct virt2iova function
>     - fixed the patch's title
>     - OvS PoC code:
>       https://github.com/fleitner/ovs/tree/rte_malloc-v3
>   v2:
>     - Used rte_malloc() instead of another mempool as suggested by Shahaf.
>     - Added the documentation section.
>     - Using driver registration to negotiate the features.
>     - OvS PoC code:
>       https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7
> 

I went through the patch, and it looks good to me.
I appreciate the fact that there is no rte_vhost_dequeue_burst API
change.

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH v5] vhost: add support for large buffers
  2019-10-15 18:59       ` [dpdk-dev] [PATCH v5] " Flavio Leitner
  2019-10-16 10:02         ` Maxime Coquelin
@ 2019-10-16 11:13         ` Maxime Coquelin
  2019-10-16 13:32           ` Ilya Maximets
  2019-10-29  9:02         ` David Marchand
  2 siblings, 1 reply; 32+ messages in thread
From: Maxime Coquelin @ 2019-10-16 11:13 UTC (permalink / raw)
  To: Flavio Leitner, dev
  Cc: Ilya Maximets, Shahaf Shuler, David Marchand, Tiwei Bie,
	Obrembski MichalX, Stokes Ian



On 10/15/19 8:59 PM, Flavio Leitner wrote:
> The rte_vhost_dequeue_burst supports two ways of dequeuing data.
> If the data fits into a buffer, then all data is copied and a
> single linear buffer is returned. Otherwise it allocates
> additional mbufs and chains them together to return a multiple
> segments mbuf.
> 
> While that covers most use cases, it forces applications that
> need to work with larger data sizes to support multiple segments
> mbufs. The non-linear characteristic brings complexity and
> performance implications to the application.
> 
> To resolve the issue, add support to attach external buffer
> to a pktmbuf and let the host provide during registration if
> attaching an external buffer to pktmbuf is supported and if
> only linear buffer are supported.
> 
> Signed-off-by: Flavio Leitner <fbl@sysclose.org>
> ---
>  doc/guides/prog_guide/vhost_lib.rst |  35 +++++++++
>  lib/librte_vhost/rte_vhost.h        |   4 +
>  lib/librte_vhost/socket.c           |  22 ++++++
>  lib/librte_vhost/vhost.c            |  22 ++++++
>  lib/librte_vhost/vhost.h            |   4 +
>  lib/librte_vhost/virtio_net.c       | 109 ++++++++++++++++++++++++----
>  6 files changed, 182 insertions(+), 14 deletions(-)
> 
> - Changelog:
>   v5:
>     - fixed to destroy mutex if incompatible flags
>   v4:
>     - allow to use pktmbuf if there is exact space
>     - removed log message if the buffer is too big
>     - fixed the length to include align padding
>     - free allocated buf if shinfo fails
>   v3:
>     - prevent the new features to be used with zero copy
>     - fixed sizeof() usage
>     - fixed log msg indentation
>     - removed/replaced asserts
>     - used the correct virt2iova function
>     - fixed the patch's title
>     - OvS PoC code:
>       https://github.com/fleitner/ovs/tree/rte_malloc-v3
>   v2:
>     - Used rte_malloc() instead of another mempool as suggested by Shahaf.
>     - Added the documentation section.
>     - Using driver registration to negotiate the features.
>     - OvS PoC code:
>       https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7

Applied to dpdk-next-virtio/master.

Thanks,
Maxime

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH v5] vhost: add support for large buffers
  2019-10-16 11:13         ` Maxime Coquelin
@ 2019-10-16 13:32           ` Ilya Maximets
  2019-10-16 13:46             ` Maxime Coquelin
  0 siblings, 1 reply; 32+ messages in thread
From: Ilya Maximets @ 2019-10-16 13:32 UTC (permalink / raw)
  To: Maxime Coquelin, Flavio Leitner, dev
  Cc: Ilya Maximets, Shahaf Shuler, David Marchand, Tiwei Bie,
	Obrembski MichalX, Stokes Ian

On 16.10.2019 13:13, Maxime Coquelin wrote:
> 
> 
> On 10/15/19 8:59 PM, Flavio Leitner wrote:
>> The rte_vhost_dequeue_burst supports two ways of dequeuing data.
>> If the data fits into a buffer, then all data is copied and a
>> single linear buffer is returned. Otherwise it allocates
>> additional mbufs and chains them together to return a multiple
>> segments mbuf.
>>
>> While that covers most use cases, it forces applications that
>> need to work with larger data sizes to support multiple segments
>> mbufs. The non-linear characteristic brings complexity and
>> performance implications to the application.
>>
>> To resolve the issue, add support to attach external buffer
>> to a pktmbuf and let the host provide during registration if
>> attaching an external buffer to pktmbuf is supported and if
>> only linear buffer are supported.
>>
>> Signed-off-by: Flavio Leitner <fbl@sysclose.org>
>> ---
>>   doc/guides/prog_guide/vhost_lib.rst |  35 +++++++++
>>   lib/librte_vhost/rte_vhost.h        |   4 +
>>   lib/librte_vhost/socket.c           |  22 ++++++
>>   lib/librte_vhost/vhost.c            |  22 ++++++
>>   lib/librte_vhost/vhost.h            |   4 +
>>   lib/librte_vhost/virtio_net.c       | 109 ++++++++++++++++++++++++----
>>   6 files changed, 182 insertions(+), 14 deletions(-)
>>
>> - Changelog:
>>    v5:
>>      - fixed to destroy mutex if incompatible flags
>>    v4:
>>      - allow to use pktmbuf if there is exact space
>>      - removed log message if the buffer is too big
>>      - fixed the length to include align padding
>>      - free allocated buf if shinfo fails
>>    v3:
>>      - prevent the new features to be used with zero copy
>>      - fixed sizeof() usage
>>      - fixed log msg indentation
>>      - removed/replaced asserts
>>      - used the correct virt2iova function
>>      - fixed the patch's title
>>      - OvS PoC code:
>>        https://github.com/fleitner/ovs/tree/rte_malloc-v3
>>    v2:
>>      - Used rte_malloc() instead of another mempool as suggested by Shahaf.
>>      - Added the documentation section.
>>      - Using driver registration to negotiate the features.
>>      - OvS PoC code:
>>        https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7
> 
> Applied to dpdk-next-virtio/master.

Thanks Maxime,

But can we return back the mbuf allocation failure message?

I mean:

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 66f0c7206..f8af4e0b3 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1354,8 +1354,11 @@ virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
  {
  	struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
  
-	if (unlikely(pkt == NULL))
+	if (unlikely(pkt == NULL)) {
+		RTE_LOG(ERR, VHOST_DATA,
+			"Failed to allocate memory for mbuf.\n");
  		return NULL;
+	}
  
  	if (rte_pktmbuf_tailroom(pkt) >= data_len)
  		return pkt;
---

It's a hard failure that highlights some significant issues with
memory pool size or a mbuf leak.

We still have the message for subsequent chained mbufs, but not
for the first one.  Without this error message we could never
notice that something is wrong with our memory pool.  Only the
network traffic will stop flowing.
The message was very useful previously while catching the root
cause of the mbuf leak in OVS.

I could send a separate patch for this if needed.

Best regards, Ilya Maximets.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH v5] vhost: add support for large buffers
  2019-10-16 13:32           ` Ilya Maximets
@ 2019-10-16 13:46             ` Maxime Coquelin
  2019-10-16 14:02               ` Flavio Leitner
  2019-10-16 14:05               ` Ilya Maximets
  0 siblings, 2 replies; 32+ messages in thread
From: Maxime Coquelin @ 2019-10-16 13:46 UTC (permalink / raw)
  To: Ilya Maximets, Flavio Leitner, dev
  Cc: Shahaf Shuler, David Marchand, Tiwei Bie, Obrembski MichalX, Stokes Ian



On 10/16/19 3:32 PM, Ilya Maximets wrote:
> On 16.10.2019 13:13, Maxime Coquelin wrote:
>>
>>
>> On 10/15/19 8:59 PM, Flavio Leitner wrote:
>>> The rte_vhost_dequeue_burst supports two ways of dequeuing data.
>>> If the data fits into a buffer, then all data is copied and a
>>> single linear buffer is returned. Otherwise it allocates
>>> additional mbufs and chains them together to return a multiple
>>> segments mbuf.
>>>
>>> While that covers most use cases, it forces applications that
>>> need to work with larger data sizes to support multiple segments
>>> mbufs. The non-linear characteristic brings complexity and
>>> performance implications to the application.
>>>
>>> To resolve the issue, add support to attach external buffer
>>> to a pktmbuf and let the host provide during registration if
>>> attaching an external buffer to pktmbuf is supported and if
>>> only linear buffer are supported.
>>>
>>> Signed-off-by: Flavio Leitner <fbl@sysclose.org>
>>> ---
>>>   doc/guides/prog_guide/vhost_lib.rst |  35 +++++++++
>>>   lib/librte_vhost/rte_vhost.h        |   4 +
>>>   lib/librte_vhost/socket.c           |  22 ++++++
>>>   lib/librte_vhost/vhost.c            |  22 ++++++
>>>   lib/librte_vhost/vhost.h            |   4 +
>>>   lib/librte_vhost/virtio_net.c       | 109 ++++++++++++++++++++++++----
>>>   6 files changed, 182 insertions(+), 14 deletions(-)
>>>
>>> - Changelog:
>>>    v5:
>>>      - fixed to destroy mutex if incompatible flags
>>>    v4:
>>>      - allow to use pktmbuf if there is exact space
>>>      - removed log message if the buffer is too big
>>>      - fixed the length to include align padding
>>>      - free allocated buf if shinfo fails
>>>    v3:
>>>      - prevent the new features to be used with zero copy
>>>      - fixed sizeof() usage
>>>      - fixed log msg indentation
>>>      - removed/replaced asserts
>>>      - used the correct virt2iova function
>>>      - fixed the patch's title
>>>      - OvS PoC code:
>>>        https://github.com/fleitner/ovs/tree/rte_malloc-v3
>>>    v2:
>>>      - Used rte_malloc() instead of another mempool as suggested by
>>> Shahaf.
>>>      - Added the documentation section.
>>>      - Using driver registration to negotiate the features.
>>>      - OvS PoC code:
>>>       
>>> https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7
>>>
>>
>> Applied to dpdk-next-virtio/master.
> 
> Thanks Maxime,
> 
> But can we return back the mbuf allocation failure message?

Good point, I missed it.

> I mean:
> 
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 66f0c7206..f8af4e0b3 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -1354,8 +1354,11 @@ virtio_dev_pktmbuf_alloc(struct virtio_net *dev,
> struct rte_mempool *mp,
>  {
>      struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
>  
> -    if (unlikely(pkt == NULL))
> +    if (unlikely(pkt == NULL)) {
> +        RTE_LOG(ERR, VHOST_DATA,
> +            "Failed to allocate memory for mbuf.\n");
>          return NULL;
> +    }
>  
>      if (rte_pktmbuf_tailroom(pkt) >= data_len)
>          return pkt;
> ---
> 
> It's a hard failure that highlights some significant issues with
> memory pool size or a mbuf leak.

I agree, we need this error message.

> We still have the message for subsequent chained mbufs, but not
> for the first one.  Without this error message we could never
> notice that something is wrong with our memory pool.  Only the
> network traffic will stop flowing.
> The message was very useful previously while catching the root
> cause of the mbuf leak in OVS.
> 
> I could send a separate patch for this if needed.

We need a separate patch.
If you want to do it, please do! Otherwise I'll do it.

Thanks!
Maxime
> Best regards, Ilya Maximets.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH v5] vhost: add support for large buffers
  2019-10-16 13:46             ` Maxime Coquelin
@ 2019-10-16 14:02               ` Flavio Leitner
  2019-10-16 14:08                 ` Ilya Maximets
  2019-10-16 14:05               ` Ilya Maximets
  1 sibling, 1 reply; 32+ messages in thread
From: Flavio Leitner @ 2019-10-16 14:02 UTC (permalink / raw)
  To: Maxime Coquelin
  Cc: Ilya Maximets, dev, Shahaf Shuler, David Marchand, Tiwei Bie,
	Obrembski MichalX, Stokes Ian

On Wed, 16 Oct 2019 15:46:15 +0200
Maxime Coquelin <maxime.coquelin@redhat.com> wrote:

> On 10/16/19 3:32 PM, Ilya Maximets wrote:
> > On 16.10.2019 13:13, Maxime Coquelin wrote:  
> >>
> >>
> >> On 10/15/19 8:59 PM, Flavio Leitner wrote:  
> >>> The rte_vhost_dequeue_burst supports two ways of dequeuing data.
> >>> If the data fits into a buffer, then all data is copied and a
> >>> single linear buffer is returned. Otherwise it allocates
> >>> additional mbufs and chains them together to return a multiple
> >>> segments mbuf.
> >>>
> >>> While that covers most use cases, it forces applications that
> >>> need to work with larger data sizes to support multiple segments
> >>> mbufs. The non-linear characteristic brings complexity and
> >>> performance implications to the application.
> >>>
> >>> To resolve the issue, add support to attach external buffer
> >>> to a pktmbuf and let the host provide during registration if
> >>> attaching an external buffer to pktmbuf is supported and if
> >>> only linear buffer are supported.
> >>>
> >>> Signed-off-by: Flavio Leitner <fbl@sysclose.org>
> >>> ---
> >>>   doc/guides/prog_guide/vhost_lib.rst |  35 +++++++++
> >>>   lib/librte_vhost/rte_vhost.h        |   4 +
> >>>   lib/librte_vhost/socket.c           |  22 ++++++
> >>>   lib/librte_vhost/vhost.c            |  22 ++++++
> >>>   lib/librte_vhost/vhost.h            |   4 +
> >>>   lib/librte_vhost/virtio_net.c       | 109
> >>> ++++++++++++++++++++++++---- 6 files changed, 182 insertions(+),
> >>> 14 deletions(-)
> >>>
> >>> - Changelog:
> >>>    v5:
> >>>      - fixed to destroy mutex if incompatible flags
> >>>    v4:
> >>>      - allow to use pktmbuf if there is exact space
> >>>      - removed log message if the buffer is too big
> >>>      - fixed the length to include align padding
> >>>      - free allocated buf if shinfo fails
> >>>    v3:
> >>>      - prevent the new features to be used with zero copy
> >>>      - fixed sizeof() usage
> >>>      - fixed log msg indentation
> >>>      - removed/replaced asserts
> >>>      - used the correct virt2iova function
> >>>      - fixed the patch's title
> >>>      - OvS PoC code:
> >>>        https://github.com/fleitner/ovs/tree/rte_malloc-v3
> >>>    v2:
> >>>      - Used rte_malloc() instead of another mempool as suggested
> >>> by Shahaf.
> >>>      - Added the documentation section.
> >>>      - Using driver registration to negotiate the features.
> >>>      - OvS PoC code:
> >>>       
> >>> https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7
> >>>  
> >>
> >> Applied to dpdk-next-virtio/master.  
> > 
> > Thanks Maxime,
> > 
> > But can we return back the mbuf allocation failure message?  
> 
> Good point, I missed it.
> 
> > I mean:
> > 
> > diff --git a/lib/librte_vhost/virtio_net.c
> > b/lib/librte_vhost/virtio_net.c index 66f0c7206..f8af4e0b3 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -1354,8 +1354,11 @@ virtio_dev_pktmbuf_alloc(struct virtio_net
> > *dev, struct rte_mempool *mp,
> >  {
> >      struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
> >  
> > -    if (unlikely(pkt == NULL))
> > +    if (unlikely(pkt == NULL)) {
> > +        RTE_LOG(ERR, VHOST_DATA,
> > +            "Failed to allocate memory for mbuf.\n");
> >          return NULL;
> > +    }
> >  
> >      if (rte_pktmbuf_tailroom(pkt) >= data_len)
> >          return pkt;
> > ---
> > 
> > It's a hard failure that highlights some significant issues with
> > memory pool size or a mbuf leak.  
> 
> I agree, we need this error message.

If it runs out of memory and there are many packets coming, then it
will flood the logs. Maybe we could use something to report in a more
moderated way, for example:

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index da69ab1db..b1572b3a4 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1354,8 +1354,14 @@ virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
 {
 	struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
 
-	if (unlikely(pkt == NULL))
+	if (unlikely(pkt == NULL)) {
+		static int rate_lim = 0;
+
+		if (rate_lim++ % 1000 == 0)
+			RTE_LOG...
+
 		return NULL;
+	}
 
 	if (rte_pktmbuf_tailroom(pkt) >= data_len)
 		return pkt;


Or track this at mempool level and keep stats of failed requests and
report that in OvS. That would cover other points too.
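
As an illustration of the rate-limiting idea sketched above, the limiter could also be made time-based and per-lcore, which avoids both the log flood and several polling cores racing on one shared counter. This is only a sketch, not code from the patch: the helper name, the one-second interval and the USER1 logtype are placeholders (inside virtio_net.c the existing VHOST_DATA logtype would be used instead).

#include <stdint.h>

#include <rte_cycles.h>
#include <rte_log.h>
#include <rte_per_lcore.h>

/* Emit at most one error per second per lcore so that a sustained
 * allocation failure does not flood the logs. */
static inline void
log_mbuf_alloc_failure(void)
{
	static RTE_DEFINE_PER_LCORE(uint64_t, next_log_tsc);
	uint64_t now = rte_get_timer_cycles();

	if (now >= RTE_PER_LCORE(next_log_tsc)) {
		RTE_PER_LCORE(next_log_tsc) = now + rte_get_timer_hz();
		RTE_LOG(ERR, USER1,
			"Failed to allocate memory for mbuf.\n");
	}
}

virtio_dev_pktmbuf_alloc() would then call log_mbuf_alloc_failure() in the
pkt == NULL branch instead of logging unconditionally.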

fbl

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH v5] vhost: add support for large buffers
  2019-10-16 13:46             ` Maxime Coquelin
  2019-10-16 14:02               ` Flavio Leitner
@ 2019-10-16 14:05               ` Ilya Maximets
  1 sibling, 0 replies; 32+ messages in thread
From: Ilya Maximets @ 2019-10-16 14:05 UTC (permalink / raw)
  To: Maxime Coquelin, Ilya Maximets, Flavio Leitner, dev
  Cc: Shahaf Shuler, David Marchand, Tiwei Bie, Obrembski MichalX, Stokes Ian

On 16.10.2019 15:46, Maxime Coquelin wrote:
> 
> 
> On 10/16/19 3:32 PM, Ilya Maximets wrote:
>> On 16.10.2019 13:13, Maxime Coquelin wrote:
>>>
>>>
>>> On 10/15/19 8:59 PM, Flavio Leitner wrote:
>>>> The rte_vhost_dequeue_burst supports two ways of dequeuing data.
>>>> If the data fits into a buffer, then all data is copied and a
>>>> single linear buffer is returned. Otherwise it allocates
>>>> additional mbufs and chains them together to return a multiple
>>>> segments mbuf.
>>>>
>>>> While that covers most use cases, it forces applications that
>>>> need to work with larger data sizes to support multiple segments
>>>> mbufs. The non-linear characteristic brings complexity and
>>>> performance implications to the application.
>>>>
>>>> To resolve the issue, add support to attach external buffer
>>>> to a pktmbuf and let the host provide during registration if
>>>> attaching an external buffer to pktmbuf is supported and if
>>>> only linear buffer are supported.
>>>>
>>>> Signed-off-by: Flavio Leitner <fbl@sysclose.org>
>>>> ---
>>>>    doc/guides/prog_guide/vhost_lib.rst |  35 +++++++++
>>>>    lib/librte_vhost/rte_vhost.h        |   4 +
>>>>    lib/librte_vhost/socket.c           |  22 ++++++
>>>>    lib/librte_vhost/vhost.c            |  22 ++++++
>>>>    lib/librte_vhost/vhost.h            |   4 +
>>>>    lib/librte_vhost/virtio_net.c       | 109 ++++++++++++++++++++++++----
>>>>    6 files changed, 182 insertions(+), 14 deletions(-)
>>>>
>>>> - Changelog:
>>>>     v5:
>>>>       - fixed to destroy mutex if incompatible flags
>>>>     v4:
>>>>       - allow to use pktmbuf if there is exact space
>>>>       - removed log message if the buffer is too big
>>>>       - fixed the length to include align padding
>>>>       - free allocated buf if shinfo fails
>>>>     v3:
>>>>       - prevent the new features to be used with zero copy
>>>>       - fixed sizeof() usage
>>>>       - fixed log msg indentation
>>>>       - removed/replaced asserts
>>>>       - used the correct virt2iova function
>>>>       - fixed the patch's title
>>>>       - OvS PoC code:
>>>>         https://github.com/fleitner/ovs/tree/rte_malloc-v3
>>>>     v2:
>>>>       - Used rte_malloc() instead of another mempool as suggested by
>>>> Shahaf.
>>>>       - Added the documentation section.
>>>>       - Using driver registration to negotiate the features.
>>>>       - OvS PoC code:
>>>>        
>>>> https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7
>>>>
>>>
>>> Applied to dpdk-next-virtio/master.
>>
>> Thanks Maxime,
>>
>> But can we return back the mbuf allocation failure message?
> 
> Good point, I missed it.
> 
>> I mean:
>>
>> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
>> index 66f0c7206..f8af4e0b3 100644
>> --- a/lib/librte_vhost/virtio_net.c
>> +++ b/lib/librte_vhost/virtio_net.c
>> @@ -1354,8 +1354,11 @@ virtio_dev_pktmbuf_alloc(struct virtio_net *dev,
>> struct rte_mempool *mp,
>>   {
>>       struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
>>   
>> -    if (unlikely(pkt == NULL))
>> +    if (unlikely(pkt == NULL)) {
>> +        RTE_LOG(ERR, VHOST_DATA,
>> +            "Failed to allocate memory for mbuf.\n");
>>           return NULL;
>> +    }
>>   
>>       if (rte_pktmbuf_tailroom(pkt) >= data_len)
>>           return pkt;
>> ---
>>
>> It's a hard failure that highlights some significant issues with
>> memory pool size or a mbuf leak.
> 
> I agree, we need this error message.
> 
>> We still have the message for subsequent chained mbufs, but not
>> for the first one.  Without this error message we could never
>> notice that something is wrong with our memory pool.  Only the
>> network traffic will stop flowing.
>> The message was very useful previously while catching the root
>> cause of the mbuf leak in OVS.
>>
>> I could send a separate patch for this if needed.
> 
> We need a separate patch.
> If you want to do it, please do! Otherwise I'll do it.

OK. I'll send a patch soon.

Best regards, Ilya Maximets.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH v5] vhost: add support for large buffers
  2019-10-16 14:02               ` Flavio Leitner
@ 2019-10-16 14:08                 ` Ilya Maximets
  2019-10-16 14:14                   ` Flavio Leitner
  0 siblings, 1 reply; 32+ messages in thread
From: Ilya Maximets @ 2019-10-16 14:08 UTC (permalink / raw)
  To: Flavio Leitner, Maxime Coquelin
  Cc: Ilya Maximets, dev, Shahaf Shuler, David Marchand, Tiwei Bie,
	Obrembski MichalX, Stokes Ian

On 16.10.2019 16:02, Flavio Leitner wrote:
> On Wed, 16 Oct 2019 15:46:15 +0200
> Maxime Coquelin <maxime.coquelin@redhat.com> wrote:
> 
>> On 10/16/19 3:32 PM, Ilya Maximets wrote:
>>> On 16.10.2019 13:13, Maxime Coquelin wrote:
>>>>
>>>>
>>>> On 10/15/19 8:59 PM, Flavio Leitner wrote:
>>>>> The rte_vhost_dequeue_burst supports two ways of dequeuing data.
>>>>> If the data fits into a buffer, then all data is copied and a
>>>>> single linear buffer is returned. Otherwise it allocates
>>>>> additional mbufs and chains them together to return a multiple
>>>>> segments mbuf.
>>>>>
>>>>> While that covers most use cases, it forces applications that
>>>>> need to work with larger data sizes to support multiple segments
>>>>> mbufs. The non-linear characteristic brings complexity and
>>>>> performance implications to the application.
>>>>>
>>>>> To resolve the issue, add support to attach external buffer
>>>>> to a pktmbuf and let the host provide during registration if
>>>>> attaching an external buffer to pktmbuf is supported and if
>>>>> only linear buffer are supported.
>>>>>
>>>>> Signed-off-by: Flavio Leitner <fbl@sysclose.org>
>>>>> ---
>>>>>    doc/guides/prog_guide/vhost_lib.rst |  35 +++++++++
>>>>>    lib/librte_vhost/rte_vhost.h        |   4 +
>>>>>    lib/librte_vhost/socket.c           |  22 ++++++
>>>>>    lib/librte_vhost/vhost.c            |  22 ++++++
>>>>>    lib/librte_vhost/vhost.h            |   4 +
>>>>>    lib/librte_vhost/virtio_net.c       | 109
>>>>> ++++++++++++++++++++++++---- 6 files changed, 182 insertions(+),
>>>>> 14 deletions(-)
>>>>>
>>>>> - Changelog:
>>>>>     v5:
>>>>>       - fixed to destroy mutex if incompatible flags
>>>>>     v4:
>>>>>       - allow to use pktmbuf if there is exact space
>>>>>       - removed log message if the buffer is too big
>>>>>       - fixed the length to include align padding
>>>>>       - free allocated buf if shinfo fails
>>>>>     v3:
>>>>>       - prevent the new features to be used with zero copy
>>>>>       - fixed sizeof() usage
>>>>>       - fixed log msg indentation
>>>>>       - removed/replaced asserts
>>>>>       - used the correct virt2iova function
>>>>>       - fixed the patch's title
>>>>>       - OvS PoC code:
>>>>>         https://github.com/fleitner/ovs/tree/rte_malloc-v3
>>>>>     v2:
>>>>>       - Used rte_malloc() instead of another mempool as suggested
>>>>> by Shahaf.
>>>>>       - Added the documentation section.
>>>>>       - Using driver registration to negotiate the features.
>>>>>       - OvS PoC code:
>>>>>        
>>>>> https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7
>>>>>   
>>>>
>>>> Applied to dpdk-next-virtio/master.
>>>
>>> Thanks Maxime,
>>>
>>> But can we return back the mbuf allocation failure message?
>>
>> Good point, I missed it.
>>
>>> I mean:
>>>
>>> diff --git a/lib/librte_vhost/virtio_net.c
>>> b/lib/librte_vhost/virtio_net.c index 66f0c7206..f8af4e0b3 100644
>>> --- a/lib/librte_vhost/virtio_net.c
>>> +++ b/lib/librte_vhost/virtio_net.c
>>> @@ -1354,8 +1354,11 @@ virtio_dev_pktmbuf_alloc(struct virtio_net
>>> *dev, struct rte_mempool *mp,
>>>   {
>>>       struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
>>>   
>>> -    if (unlikely(pkt == NULL))
>>> +    if (unlikely(pkt == NULL)) {
>>> +        RTE_LOG(ERR, VHOST_DATA,
>>> +            "Failed to allocate memory for mbuf.\n");
>>>           return NULL;
>>> +    }
>>>   
>>>       if (rte_pktmbuf_tailroom(pkt) >= data_len)
>>>           return pkt;
>>> ---
>>>
>>> It's a hard failure that highlights some significant issues with
>>> memory pool size or a mbuf leak.
>>
>> I agree, we need this error message.
> 
> If it runs out of memory and there are many packets coming, then it
> will flood the logs. Maybe we could use something to report in a more
> moderated way, for example:
> 
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index da69ab1db..b1572b3a4 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -1354,8 +1354,14 @@ virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
>   {
>   	struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
>   
> -	if (unlikely(pkt == NULL))
> +	if (unlikely(pkt == NULL)) {
> +		static int rate_lim = 0;
> +
> +		if (rate_lim++ % 1000 == 0)
> +			RTE_LOG...
> +
>   		return NULL;
> +	}
>   
>   	if (rte_pktmbuf_tailroom(pkt) >= data_len)
>   		return pkt;
> 
> 
> Or track this at mempool level and keep stats of failed requests and
> report that in OvS. That would cover other points too.

OVS is already rate-limiting DPDK logs.  Basically, I introduced this
functionality to limit exactly this message.
See: 9fd38f6867df ("netdev-dpdk: Limit rate of DPDK logs.")
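
For context, one common way an application can do this is to point DPDK's log stream at a cookie stream via rte_openlog_stream() and apply the rate limit in the write callback. The sketch below only illustrates that technique; it is not the OVS code referenced above (which uses OVS's own VLOG rate limiters rather than the crude counter shown here), and it assumes glibc's fopencookie().

#define _GNU_SOURCE
#include <stdio.h>

#include <rte_common.h>
#include <rte_log.h>

/* Crude limiter: pass one message out of every 1000 through to stderr.
 * Not thread-safe; a real implementation would use a proper rate
 * limiter. */
static ssize_t
limited_log_write(void *cookie __rte_unused, const char *buf, size_t size)
{
	static unsigned int count;

	if (count++ % 1000 == 0)
		fwrite(buf, 1, size, stderr);
	return size;	/* Report the whole chunk as written. */
}

/* Redirect DPDK logging to the rate-limited cookie stream. */
static void
redirect_dpdk_logs(void)
{
	cookie_io_functions_t funcs = { .write = limited_log_write };
	FILE *stream = fopencookie(NULL, "w", funcs);

	if (stream != NULL) {
		setvbuf(stream, NULL, _IOLBF, 0);
		rte_openlog_stream(stream);
	}
}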

Best regards, Ilya Maximets.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH v5] vhost: add support for large buffers
  2019-10-16 14:08                 ` Ilya Maximets
@ 2019-10-16 14:14                   ` Flavio Leitner
  0 siblings, 0 replies; 32+ messages in thread
From: Flavio Leitner @ 2019-10-16 14:14 UTC (permalink / raw)
  To: Ilya Maximets
  Cc: Maxime Coquelin, dev, Shahaf Shuler, David Marchand, Tiwei Bie,
	Obrembski MichalX, Stokes Ian

On Wed, 16 Oct 2019 16:08:54 +0200
Ilya Maximets <i.maximets@ovn.org> wrote:

> On 16.10.2019 16:02, Flavio Leitner wrote:
> > On Wed, 16 Oct 2019 15:46:15 +0200
> > Maxime Coquelin <maxime.coquelin@redhat.com> wrote:
> >   
> >> On 10/16/19 3:32 PM, Ilya Maximets wrote:  
> >>> On 16.10.2019 13:13, Maxime Coquelin wrote:  
> >>>>
> >>>>
> >>>> On 10/15/19 8:59 PM, Flavio Leitner wrote:  
> >>>>> The rte_vhost_dequeue_burst supports two ways of dequeuing data.
> >>>>> If the data fits into a buffer, then all data is copied and a
> >>>>> single linear buffer is returned. Otherwise it allocates
> >>>>> additional mbufs and chains them together to return a multiple
> >>>>> segments mbuf.
> >>>>>
> >>>>> While that covers most use cases, it forces applications that
> >>>>> need to work with larger data sizes to support multiple segments
> >>>>> mbufs. The non-linear characteristic brings complexity and
> >>>>> performance implications to the application.
> >>>>>
> >>>>> To resolve the issue, add support to attach external buffer
> >>>>> to a pktmbuf and let the host provide during registration if
> >>>>> attaching an external buffer to pktmbuf is supported and if
> >>>>> only linear buffer are supported.
> >>>>>
> >>>>> Signed-off-by: Flavio Leitner <fbl@sysclose.org>
> >>>>> ---
> >>>>>    doc/guides/prog_guide/vhost_lib.rst |  35 +++++++++
> >>>>>    lib/librte_vhost/rte_vhost.h        |   4 +
> >>>>>    lib/librte_vhost/socket.c           |  22 ++++++
> >>>>>    lib/librte_vhost/vhost.c            |  22 ++++++
> >>>>>    lib/librte_vhost/vhost.h            |   4 +
> >>>>>    lib/librte_vhost/virtio_net.c       | 109
> >>>>> ++++++++++++++++++++++++---- 6 files changed, 182 insertions(+),
> >>>>> 14 deletions(-)
> >>>>>
> >>>>> - Changelog:
> >>>>>     v5:
> >>>>>       - fixed to destroy mutex if incompatible flags
> >>>>>     v4:
> >>>>>       - allow to use pktmbuf if there is exact space
> >>>>>       - removed log message if the buffer is too big
> >>>>>       - fixed the length to include align padding
> >>>>>       - free allocated buf if shinfo fails
> >>>>>     v3:
> >>>>>       - prevent the new features to be used with zero copy
> >>>>>       - fixed sizeof() usage
> >>>>>       - fixed log msg indentation
> >>>>>       - removed/replaced asserts
> >>>>>       - used the correct virt2iova function
> >>>>>       - fixed the patch's title
> >>>>>       - OvS PoC code:
> >>>>>         https://github.com/fleitner/ovs/tree/rte_malloc-v3
> >>>>>     v2:
> >>>>>       - Used rte_malloc() instead of another mempool as
> >>>>> suggested by Shahaf.
> >>>>>       - Added the documentation section.
> >>>>>       - Using driver registration to negotiate the features.
> >>>>>       - OvS PoC code:
> >>>>>        
> >>>>> https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7
> >>>>>     
> >>>>
> >>>> Applied to dpdk-next-virtio/master.  
> >>>
> >>> Thanks Maxime,
> >>>
> >>> But can we return back the mbuf allocation failure message?  
> >>
> >> Good point, I missed it.
> >>  
> >>> I mean:
> >>>
> >>> diff --git a/lib/librte_vhost/virtio_net.c
> >>> b/lib/librte_vhost/virtio_net.c index 66f0c7206..f8af4e0b3 100644
> >>> --- a/lib/librte_vhost/virtio_net.c
> >>> +++ b/lib/librte_vhost/virtio_net.c
> >>> @@ -1354,8 +1354,11 @@ virtio_dev_pktmbuf_alloc(struct virtio_net
> >>> *dev, struct rte_mempool *mp,
> >>>   {
> >>>       struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
> >>>   
> >>> -    if (unlikely(pkt == NULL))
> >>> +    if (unlikely(pkt == NULL)) {
> >>> +        RTE_LOG(ERR, VHOST_DATA,
> >>> +            "Failed to allocate memory for mbuf.\n");
> >>>           return NULL;
> >>> +    }
> >>>   
> >>>       if (rte_pktmbuf_tailroom(pkt) >= data_len)
> >>>           return pkt;
> >>> ---
> >>>
> >>> It's a hard failure that highlights some significant issues with
> >>> memory pool size or a mbuf leak.  
> >>
> >> I agree, we need this error message.  
> > 
> > If it runs out of memory and there are many packets coming, then it
> > will flood the logs. Maybe we could use something to report in a
> > more moderated way, for example:
> > 
> > diff --git a/lib/librte_vhost/virtio_net.c
> > b/lib/librte_vhost/virtio_net.c index da69ab1db..b1572b3a4 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -1354,8 +1354,14 @@ virtio_dev_pktmbuf_alloc(struct virtio_net
> > *dev, struct rte_mempool *mp, {
> >   	struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
> >   
> > -	if (unlikely(pkt == NULL))
> > +	if (unlikely(pkt == NULL)) {
> > +		static int rate_lim = 0;
> > +
> > +		if (rate_lim++ % 1000 == 0)
> > +			RTE_LOG...
> > +
> >   		return NULL;
> > +	}
> >   
> >   	if (rte_pktmbuf_tailroom(pkt) >= data_len)
> >   		return pkt;
> > 
> > 
> > Or track this at mempool level and keep stats of failed requests and
> > report that in OvS. That would cover other points too.  
> 
> OVS is already rate-limiting DPDK logs.  Basically, I introduced this
> functionality to limit exactly this message.
> See: 9fd38f6867df ("netdev-dpdk: Limit rate of DPDK logs.")

Thanks, I missed that commit.
fbl

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH v5] vhost: add support for large buffers
  2019-10-15 18:59       ` [dpdk-dev] [PATCH v5] " Flavio Leitner
  2019-10-16 10:02         ` Maxime Coquelin
  2019-10-16 11:13         ` Maxime Coquelin
@ 2019-10-29  9:02         ` David Marchand
  2019-10-29 12:21           ` Flavio Leitner
  2 siblings, 1 reply; 32+ messages in thread
From: David Marchand @ 2019-10-29  9:02 UTC (permalink / raw)
  To: Flavio Leitner
  Cc: dev, Ilya Maximets, Maxime Coquelin, Shahaf Shuler, Tiwei Bie,
	Obrembski MichalX, Stokes Ian

On Tue, Oct 15, 2019 at 9:00 PM Flavio Leitner <fbl@sysclose.org> wrote:
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 5b85b832d..da69ab1db 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -1289,6 +1289,93 @@ get_zmbuf(struct vhost_virtqueue *vq)
>         return NULL;
>  }
>
> +static void
> +virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque)
> +{
> +       rte_free(opaque);
> +}
> +
> +static int
> +virtio_dev_extbuf_alloc(struct rte_mbuf *pkt, uint32_t size)
> +{
> +       struct rte_mbuf_ext_shared_info *shinfo = NULL;
> +       uint32_t total_len = RTE_PKTMBUF_HEADROOM + size;
> +       uint16_t buf_len;
> +       rte_iova_t iova;
> +       void *buf;
> +
> +       /* Try to use pkt buffer to store shinfo to reduce the amount of memory
> +        * required, otherwise store shinfo in the new buffer.
> +        */
> +       if (rte_pktmbuf_tailroom(pkt) >= sizeof(*shinfo))
> +               shinfo = rte_pktmbuf_mtod(pkt,
> +                                         struct rte_mbuf_ext_shared_info *);
> +       else {
> +               total_len += sizeof(*shinfo) + sizeof(uintptr_t);
> +               total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
> +       }
> +
> +       if (unlikely(total_len > UINT16_MAX))
> +               return -ENOSPC;
> +
> +       buf_len = total_len;
> +       buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);

Using rte_malloc() means that the allocation can end up on any numa node.
This external buffer might end up on a different node than the mbuf
(which resides on mp->socket_id node).
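
For illustration, the NUMA concern could be addressed by allocating the external buffer on the mbuf pool's socket with rte_malloc_socket(). This is a hypothetical variant, not part of the applied patch; the helper name and the SOCKET_ID_ANY fallback are assumptions made for the sketch.

#include <rte_malloc.h>
#include <rte_mbuf.h>

/* Allocate the external buffer on the same NUMA node as the mbuf's
 * mempool, falling back to any node rather than failing outright. */
static void *
extbuf_alloc_on_mbuf_socket(struct rte_mbuf *pkt, size_t buf_len)
{
	void *buf;

	buf = rte_malloc_socket(NULL, buf_len, RTE_CACHE_LINE_SIZE,
				pkt->pool->socket_id);
	if (buf == NULL)
		buf = rte_malloc_socket(NULL, buf_len, RTE_CACHE_LINE_SIZE,
					SOCKET_ID_ANY);
	return buf;
}

Whether falling back to another node is acceptable is exactly the policy
question discussed in the replies below.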


-- 
David Marchand

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH v5] vhost: add support for large buffers
  2019-10-29  9:02         ` David Marchand
@ 2019-10-29 12:21           ` Flavio Leitner
  2019-10-29 16:19             ` David Marchand
  0 siblings, 1 reply; 32+ messages in thread
From: Flavio Leitner @ 2019-10-29 12:21 UTC (permalink / raw)
  To: David Marchand
  Cc: dev, Ilya Maximets, Maxime Coquelin, Shahaf Shuler, Tiwei Bie,
	Obrembski MichalX, Stokes Ian

On Tue, 29 Oct 2019 10:02:57 +0100
David Marchand <david.marchand@redhat.com> wrote:

> On Tue, Oct 15, 2019 at 9:00 PM Flavio Leitner <fbl@sysclose.org>
> wrote:
> > diff --git a/lib/librte_vhost/virtio_net.c
> > b/lib/librte_vhost/virtio_net.c index 5b85b832d..da69ab1db 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -1289,6 +1289,93 @@ get_zmbuf(struct vhost_virtqueue *vq)
> >         return NULL;
> >  }
> >
> > +static void
> > +virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque)
> > +{
> > +       rte_free(opaque);
> > +}
> > +
> > +static int
> > +virtio_dev_extbuf_alloc(struct rte_mbuf *pkt, uint32_t size)
> > +{
> > +       struct rte_mbuf_ext_shared_info *shinfo = NULL;
> > +       uint32_t total_len = RTE_PKTMBUF_HEADROOM + size;
> > +       uint16_t buf_len;
> > +       rte_iova_t iova;
> > +       void *buf;
> > +
> > +       /* Try to use pkt buffer to store shinfo to reduce the
> > amount of memory
> > +        * required, otherwise store shinfo in the new buffer.
> > +        */
> > +       if (rte_pktmbuf_tailroom(pkt) >= sizeof(*shinfo))
> > +               shinfo = rte_pktmbuf_mtod(pkt,
> > +                                         struct
> > rte_mbuf_ext_shared_info *);
> > +       else {
> > +               total_len += sizeof(*shinfo) + sizeof(uintptr_t);
> > +               total_len = RTE_ALIGN_CEIL(total_len,
> > sizeof(uintptr_t));
> > +       }
> > +
> > +       if (unlikely(total_len > UINT16_MAX))
> > +               return -ENOSPC;
> > +
> > +       buf_len = total_len;
> > +       buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);  
> 
> Using rte_malloc() means that the allocation can end up on any numa
> node. This external buffer might end up on a different node than the
> mbuf (which resides on mp->socket_id node).

Only if there is no memory in the local socket. It seems better than a
failure to me.

fbl

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH v5] vhost: add support for large buffers
  2019-10-29 12:21           ` Flavio Leitner
@ 2019-10-29 16:19             ` David Marchand
  0 siblings, 0 replies; 32+ messages in thread
From: David Marchand @ 2019-10-29 16:19 UTC (permalink / raw)
  To: Flavio Leitner
  Cc: dev, Ilya Maximets, Maxime Coquelin, Shahaf Shuler, Tiwei Bie,
	Obrembski MichalX, Stokes Ian

On Tue, Oct 29, 2019 at 1:21 PM Flavio Leitner <fbl@sysclose.org> wrote:
> On Tue, 29 Oct 2019 10:02:57 +0100
> David Marchand <david.marchand@redhat.com> wrote:
> > Using rte_malloc() means that the allocation can end up on any numa
> > node. This external buffer might end up on a different node than the
> > mbuf (which resides on mp->socket_id node).
>
> Only if there is no memory in the local socket. It seems better than a
> failure to me.

This won't fail, but you don't know how it will impact the application.

We are moving from a model where the vhost library used a mempool with
a fixed number of buffers on a known socket to a model where we can't
guarantee how much memory vhost will eat, and where the allocations
happen.
I am just scared we will get bitten by random allocation failures on a
system that "used to run fine".


--
David Marchand

^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2019-10-29 16:19 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-01 22:19 [dpdk-dev] [PATCH] vhost: add support to large linear mbufs Flavio Leitner
2019-10-01 23:10 ` Flavio Leitner
2019-10-02  4:45 ` Shahaf Shuler
2019-10-02  8:04   ` David Marchand
2019-10-02  9:00     ` Shahaf Shuler
2019-10-02 12:58       ` Flavio Leitner
2019-10-02 17:50         ` Shahaf Shuler
2019-10-02 18:15           ` Flavio Leitner
2019-10-03 16:57             ` Ilya Maximets
2019-10-03 21:25               ` Flavio Leitner
2019-10-02  7:51 ` Maxime Coquelin
2019-10-04 20:10 ` [dpdk-dev] [PATCH v2] vhost: add support for large buffers Flavio Leitner
2019-10-06  4:47   ` Shahaf Shuler
2019-10-10  5:12   ` Tiwei Bie
2019-10-10 12:12     ` Flavio Leitner
2019-10-11 17:09   ` [dpdk-dev] [PATCH v3] " Flavio Leitner
2019-10-14  2:44     ` Tiwei Bie
2019-10-15 16:17     ` [dpdk-dev] [PATCH v4] " Flavio Leitner
2019-10-15 17:41       ` Ilya Maximets
2019-10-15 18:44         ` Flavio Leitner
2019-10-15 18:59       ` [dpdk-dev] [PATCH v5] " Flavio Leitner
2019-10-16 10:02         ` Maxime Coquelin
2019-10-16 11:13         ` Maxime Coquelin
2019-10-16 13:32           ` Ilya Maximets
2019-10-16 13:46             ` Maxime Coquelin
2019-10-16 14:02               ` Flavio Leitner
2019-10-16 14:08                 ` Ilya Maximets
2019-10-16 14:14                   ` Flavio Leitner
2019-10-16 14:05               ` Ilya Maximets
2019-10-29  9:02         ` David Marchand
2019-10-29 12:21           ` Flavio Leitner
2019-10-29 16:19             ` David Marchand

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).