patches for DPDK stable branches
* [dpdk-stable] [PATCH v16.11 LTS] vhost: improve dirty pages logging performance
From: Maxime Coquelin @ 2018-05-23  7:53 UTC (permalink / raw)
  To: stable; +Cc: Maxime Coquelin

[ upstream commit c16915b8710911a75f0fbdb1aa5243f4cdfaf26a ]

This patch caches all dirty page log updates until the used ring
index is updated.

The goal of this optimization is to fix a performance regression
introduced when the vhost library started to use atomic operations
to set bits in the shared dirty log map. That fix was valid, as the
previous implementation wasn't safe against concurrent accesses,
but it induced contention.

With this patch, during migration, we have:
1. Fewer atomic operations: a single atomic OR operation now covers
32 or 64 pages (depending on the CPU), as illustrated in the sketch
below.
2. Fewer atomic operations: during a burst, the same page is marked
dirty only once.
3. Fewer write memory barriers.
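
To make the scheme concrete outside the diff below, here is a minimal
standalone sketch (not part of the patch; LOG_CACHE_NR, BITS_PER_WORD
and the demo_* names are illustrative only): dirty bits are coalesced
per bitmap word in a small private cache, then flushed with one atomic
OR per word when the used ring index is published.

#include <stdint.h>

#define LOG_CACHE_NR 32
#define BITS_PER_WORD (sizeof(unsigned long) * 8)

struct demo_cache_entry {
	uint32_t offset;       /* word index into the dirty log bitmap */
	unsigned long val;     /* dirty bits accumulated for that word */
};

static struct demo_cache_entry cache[LOG_CACHE_NR];
static uint16_t cache_nb_elem;

/* Mark one page dirty, coalescing into an existing entry if any. */
static void
demo_log_cache_page(unsigned long *log_base, uint64_t page)
{
	uint32_t bit_nr = page % BITS_PER_WORD;
	uint32_t offset = page / BITS_PER_WORD;
	int i;

	for (i = 0; i < cache_nb_elem; i++) {
		if (cache[i].offset == offset) {
			/* The cache is private, so a plain OR is enough. */
			cache[i].val |= 1UL << bit_nr;
			return;
		}
	}

	if (i == LOG_CACHE_NR) {
		/* Cache full: write the shared log map directly. */
		__sync_fetch_and_or(log_base + offset, 1UL << bit_nr);
		return;
	}

	cache[i].offset = offset;
	cache[i].val = 1UL << bit_nr;
	cache_nb_elem++;
}

/* Flush: one atomic OR per cached word instead of one per page. */
static void
demo_log_cache_sync(unsigned long *log_base)
{
	int i;

	for (i = 0; i < cache_nb_elem; i++)
		__sync_fetch_and_or(log_base + cache[i].offset, cache[i].val);

	cache_nb_elem = 0;
}

The real patch keeps one such cache per virtqueue and issues a write
memory barrier before flushing, but the batching logic is the same.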

Fixes: 897f13a1f726 ("vhost: make page logging atomic")
Cc: stable@dpdk.org

Suggested-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Reviewed-by: Tiwei Bie <tiwei.bie@intel.com>
---

Hi,

This backport had some conflicts, mainly because the dirty page
logging functions moved from virtio_net.c to vhost.h after 16.11.

The other main difference from upstream is that GCC_VERSION isn't
defined in 16.11, so I removed the optimization done for GCC >= 7.1,
as such a recent compiler is unlikely to be used with the v16.11 LTS;
a reconstruction of the dropped variant is sketched below.
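
For reference, the upstream flush loop guards the atomic OR with a
compiler-version check, roughly as follows (reconstructed from the
upstream commit, not verbatim):

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 70100)
	/*
	 * The '__sync' builtins are deprecated, but the '__atomic'
	 * ones are sub-optimal in older GCC versions.
	 */
	__atomic_fetch_or(log_base + elem->offset, elem->val,
			__ATOMIC_RELAXED);
#else
	__sync_fetch_and_or(log_base + elem->offset, elem->val);
#endif

Since GCC_VERSION isn't available in 16.11, only the '__sync' branch
is kept in this backport.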

To test it, I had to revert 357f27736c79d, as vhost is currently
broken on the 16.11 branch (see separate mail).

 lib/librte_vhost/vhost.h      |  13 +++++
 lib/librte_vhost/virtio_net.c | 129 +++++++++++++++++++++++++++++++++++++-----
 2 files changed, 127 insertions(+), 15 deletions(-)

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 0dd665292..abc5a908f 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -52,6 +52,8 @@
 
 #define BUF_VECTOR_MAX 256
 
+#define VHOST_LOG_CACHE_NR 32
+
 /**
  * Structure contains buffer address, length and descriptor index
  * from vring to do scatter RX.
@@ -75,6 +77,14 @@ struct zcopy_mbuf {
 };
 TAILQ_HEAD(zcopy_mbuf_list, zcopy_mbuf);
 
+/*
+ * Structure that contains the info for batched dirty logging.
+ */
+struct log_cache_entry {
+	uint32_t offset;
+	unsigned long val;
+};
+
 /**
  * Structure contains variables relevant to RX/TX virtqueues.
  */
@@ -110,6 +120,9 @@ struct vhost_virtqueue {
 
 	struct vring_used_elem  *shadow_used_ring;
 	uint16_t                shadow_used_idx;
+
+	struct log_cache_entry log_cache[VHOST_LOG_CACHE_NR];
+	uint16_t log_cache_nb_elem;
 } __rte_cache_aligned;
 
 /* Old kernels have no such macros defined */
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 5f0150827..2932bade1 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -58,7 +58,7 @@
 static inline void __attribute__((always_inline))
 vhost_set_bit(unsigned int nr, volatile uint8_t *addr)
 {
-	__sync_fetch_and_or_8(addr, (1U << nr));
+	__sync_fetch_and_or_1(addr, (1U << nr));
 }
 
 static inline void __attribute__((always_inline))
@@ -89,6 +89,94 @@ vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
 	}
 }
 
+static inline void __attribute__((always_inline))
+vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+	unsigned long *log_base;
+	int i;
+
+	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
+				!dev->log_base))
+		return;
+
+	log_base = (unsigned long *)(uintptr_t)dev->log_base;
+
+	/*
+	 * It is expected a write memory barrier has been issued
+	 * before this function is called.
+	 */
+
+	for (i = 0; i < vq->log_cache_nb_elem; i++) {
+		struct log_cache_entry *elem = vq->log_cache + i;
+
+		__sync_fetch_and_or(log_base + elem->offset, elem->val);
+	}
+
+	rte_smp_wmb();
+
+	vq->log_cache_nb_elem = 0;
+}
+
+static inline void __attribute__((always_inline))
+vhost_log_cache_page(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint64_t page)
+{
+	uint32_t bit_nr = page % (sizeof(unsigned long) << 3);
+	uint32_t offset = page / (sizeof(unsigned long) << 3);
+	int i;
+
+	for (i = 0; i < vq->log_cache_nb_elem; i++) {
+		struct log_cache_entry *elem = vq->log_cache + i;
+
+		if (elem->offset == offset) {
+			elem->val |= (1UL << bit_nr);
+			return;
+		}
+	}
+
+	if (unlikely(i >= VHOST_LOG_CACHE_NR)) {
+		/*
+		 * No more room for a new log cache entry,
+		 * so write the dirty log map directly.
+		 */
+		rte_smp_wmb();
+		vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
+
+		return;
+	}
+
+	vq->log_cache[i].offset = offset;
+	vq->log_cache[i].val = (1UL << bit_nr);
+	vq->log_cache_nb_elem++;
+}
+
+static inline void __attribute__((always_inline))
+vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint64_t addr, uint64_t len)
+{
+	uint64_t page;
+
+	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
+				!dev->log_base || !len))
+		return;
+
+	if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
+		return;
+
+	page = addr / VHOST_LOG_PAGE;
+	while (page * VHOST_LOG_PAGE < addr + len) {
+		vhost_log_cache_page(dev, vq, page);
+		page += 1;
+	}
+}
+
+static inline void __attribute__((always_inline))
+vhost_log_cache_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint64_t offset, uint64_t len)
+{
+	vhost_log_cache_write(dev, vq, vq->log_guest_addr + offset, len);
+}
+
 static inline void __attribute__((always_inline))
 vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		     uint64_t offset, uint64_t len)
@@ -147,7 +234,7 @@ do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	rte_memcpy(&vq->used->ring[to],
 			&vq->shadow_used_ring[from],
 			size * sizeof(struct vring_used_elem));
-	vhost_log_used_vring(dev, vq,
+	vhost_log_cache_used_vring(dev, vq,
 			offsetof(struct vring_used, ring[to]),
 			size * sizeof(struct vring_used_elem));
 }
@@ -175,6 +262,8 @@ flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
 
 	rte_smp_wmb();
 
+	vhost_log_cache_sync(dev, vq);
+
 	*(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
 	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
 		sizeof(vq->used->idx));
@@ -249,8 +338,9 @@ copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
 }
 
 static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs,
-		  struct rte_mbuf *m, uint16_t desc_idx, uint32_t size)
+copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		struct vring_desc *descs, struct rte_mbuf *m,
+		uint16_t desc_idx, uint32_t size)
 {
 	uint32_t desc_avail, desc_offset;
 	uint32_t mbuf_avail, mbuf_offset;
@@ -305,7 +395,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs,
 		}
 	}
 
-	vhost_log_write(dev, desc_gaddr, dev->vhost_hlen);
+	vhost_log_cache_write(dev, vq, desc_gaddr, dev->vhost_hlen);
 
 	desc_avail  = desc->len - dev->vhost_hlen;
 	if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
@@ -368,7 +458,8 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs,
 		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
 			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
 			cpy_len);
-		vhost_log_write(dev, desc_gaddr + desc_offset, cpy_len);
+		vhost_log_cache_write(dev, vq, desc_gaddr + desc_offset,
+				cpy_len);
 		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
 			     cpy_len, 0);
 
@@ -434,7 +525,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 		vq->used->ring[used_idx].id = desc_indexes[i];
 		vq->used->ring[used_idx].len = pkts[i]->pkt_len +
 					       dev->vhost_hlen;
-		vhost_log_used_vring(dev, vq,
+		vhost_log_cache_used_vring(dev, vq,
 			offsetof(struct vring_used, ring[used_idx]),
 			sizeof(vq->used->ring[used_idx]));
 	}
@@ -474,11 +565,11 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 			sz = vq->size;
 		}
 
-		err = copy_mbuf_to_desc(dev, descs, pkts[i], desc_idx, sz);
+		err = copy_mbuf_to_desc(dev, vq, descs, pkts[i], desc_idx, sz);
 		if (unlikely(err)) {
 			used_idx = (start_idx + i) & (vq->size - 1);
 			vq->used->ring[used_idx].len = dev->vhost_hlen;
-			vhost_log_used_vring(dev, vq,
+			vhost_log_cache_used_vring(dev, vq,
 				offsetof(struct vring_used, ring[used_idx]),
 				sizeof(vq->used->ring[used_idx]));
 		}
@@ -492,6 +583,8 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 
 	rte_smp_wmb();
 
+	vhost_log_cache_sync(dev, vq);
+
 	*(volatile uint16_t *)&vq->used->idx += count;
 	vq->last_used_idx += count;
 	vhost_log_used_vring(dev, vq,
@@ -623,8 +716,9 @@ reserve_avail_buf_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 }
 
 static inline int __attribute__((always_inline))
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
-			    struct buf_vector *buf_vec, uint16_t num_buffers)
+copy_mbuf_to_desc_mergeable(struct virtio_net *dev,
+			struct vhost_virtqueue *vq, struct rte_mbuf *m,
+			struct buf_vector *buf_vec, uint16_t num_buffers)
 {
 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
 	struct virtio_net_hdr_mrg_rxbuf *hdr;
@@ -743,7 +837,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
 					dst += len;
 				}
 			}
-			vhost_log_write(dev, hdr_phys_addr, dev->vhost_hlen);
+			vhost_log_cache_write(dev, vq, hdr_phys_addr,
+					dev->vhost_hlen);
 			PRINT_PACKET(dev, (uintptr_t)hdr_addr,
 				     dev->vhost_hlen, 0);
 
@@ -754,7 +849,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
 		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
 			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
 			cpy_len);
-		vhost_log_write(dev, desc_gaddr + desc_offset, cpy_len);
+		vhost_log_cache_write(dev, vq, desc_gaddr + desc_offset,
+				cpy_len);
 		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
 			cpy_len, 0);
 
@@ -817,7 +913,7 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
 			dev->vid, vq->last_avail_idx,
 			vq->last_avail_idx + num_buffers);
 
-		if (copy_mbuf_to_desc_mergeable(dev, pkts[pkt_idx],
+		if (copy_mbuf_to_desc_mergeable(dev, vq, pkts[pkt_idx],
 						buf_vec, num_buffers) < 0) {
 			vq->shadow_used_idx -= num_buffers;
 			break;
@@ -1237,7 +1333,7 @@ update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
 {
 	vq->used->ring[used_idx].id  = desc_idx;
 	vq->used->ring[used_idx].len = 0;
-	vhost_log_used_vring(dev, vq,
+	vhost_log_cache_used_vring(dev, vq,
 			offsetof(struct vring_used, ring[used_idx]),
 			sizeof(vq->used->ring[used_idx]));
 }
@@ -1252,6 +1348,8 @@ update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	rte_smp_wmb();
 	rte_smp_rmb();
 
+	vhost_log_cache_sync(dev, vq);
+
 	vq->used->idx += count;
 	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
 			sizeof(vq->used->idx));
-- 
2.14.3


* Re: [dpdk-stable] [PATCH v16.11 LTS] vhost: improve dirty pages logging performance
From: Luca Boccassi @ 2018-05-23  9:21 UTC (permalink / raw)
  To: Maxime Coquelin, stable

On Wed, 2018-05-23 at 09:53 +0200, Maxime Coquelin wrote:
> [ upstream commit c16915b8710911a75f0fbdb1aa5243f4cdfaf26a ]
> 
> This patch caches all dirty page log updates until the used ring
> index is updated.
> 
> [...]

Thanks, applied and pushed to 16.11.

-- 
Kind regards,
Luca Boccassi

