DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH v2] vhost: improve dirty pages logging performance
@ 2018-05-15 17:30 Maxime Coquelin
  2018-05-16  6:21 ` Tiwei Bie
  0 siblings, 1 reply; 4+ messages in thread
From: Maxime Coquelin @ 2018-05-15 17:30 UTC (permalink / raw)
  To: dev, tiwei.bie, mst; +Cc: stable, Maxime Coquelin

This patch caches all dirty pages logging until the used ring index
is updated. These dirty pages won't be accessed by the guest as
long as the host doesn't give them back to it by updating the
index.

The goal of this optimization is to fix a performance regression
introduced when the vhost library started to use atomic operations
to set bits in the shared dirty log map. While the fix was valid
as previous implementation wasn't safe against concurent accesses,
contention was induced.

With this patch, during migration, we have:
1. Less atomic operations as only a single atomic OR operation
per 32 or 64 (depending on CPU)  pages.
2. Less atomic operations as during a burst, the same page will
be marked dirty only once.
3. Less write memory barriers.

Fixes: 897f13a1f726 ("vhost: make page logging atomic")

Cc: stable@dpdk.org

Cc: Tiwei Bie <tiwei.bie@intel.com>
Suggested-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
 lib/librte_vhost/vhost.h      | 113 ++++++++++++++++++++++++++++++++++++++++++
 lib/librte_vhost/virtio_net.c |  29 +++++++----
 2 files changed, 132 insertions(+), 10 deletions(-)

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 891978131..8f6a41d7e 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -36,6 +36,8 @@
 
 #define BUF_VECTOR_MAX 256
 
+#define VHOST_LOG_CACHE_NR 32
+
 /**
  * Structure contains buffer address, length and descriptor index
  * from vring to do scatter RX.
@@ -69,6 +71,14 @@ struct batch_copy_elem {
 	uint64_t log_addr;
 };
 
+/*
+ * Structure that contains the info for batched dirty logging.
+ */
+struct log_cache_entry {
+	uint32_t offset;
+	unsigned long val;
+};
+
 /**
  * Structure contains variables relevant to RX/TX virtqueues.
  */
@@ -112,6 +122,9 @@ struct vhost_virtqueue {
 	struct batch_copy_elem	*batch_copy_elems;
 	uint16_t		batch_copy_nb_elems;
 
+	struct log_cache_entry log_cache[VHOST_LOG_CACHE_NR];
+	uint16_t log_cache_nb_elem;
+
 	rte_rwlock_t	iotlb_lock;
 	rte_rwlock_t	iotlb_pending_lock;
 	struct rte_mempool *iotlb_pool;
@@ -309,7 +322,15 @@ struct virtio_net {
 static __rte_always_inline void
 vhost_set_bit(unsigned int nr, volatile uint8_t *addr)
 {
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)
+		/*
+		 * __sync_ built-ins are deprecated, but __atomic_ ones
+		 * are sub-optimized in older GCC versions.
+		 */
 	__sync_fetch_and_or_8(addr, (1U << nr));
+#else
+	__atomic_fetch_or(addr, (1U << nr), __ATOMIC_RELAXED);
+#endif
 }
 
 static __rte_always_inline void
@@ -340,6 +361,98 @@ vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
 	}
 }
 
+static __rte_always_inline void
+vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+	unsigned long *log_base;
+	int i;
+
+	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
+		   !dev->log_base))
+		return;
+
+	log_base = (unsigned long *)(uintptr_t)dev->log_base;
+
+	/* To make sure guest memory updates are committed before logging */
+	rte_smp_wmb();
+
+	for (i = 0; i < vq->log_cache_nb_elem; i++) {
+		struct log_cache_entry *elem = vq->log_cache + i;
+
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)
+		/*
+		 * '__sync' builtins are deprecated, but '__atomic' ones
+		 * are sub-optimized in older GCC versions.
+		 */
+		__sync_fetch_and_or(log_base + elem->offset, elem->val);
+#else
+		__atomic_fetch_or(log_base + elem->offset, elem->val,
+				__ATOMIC_RELAXED);
+#endif
+	}
+
+	vq->log_cache_nb_elem = 0;
+}
+
+static __rte_always_inline void
+vhost_log_cache_page(struct virtio_net *dev, struct vhost_virtqueue *vq,
+					 uint64_t page)
+{
+	uint32_t bit_nr = page % (sizeof(unsigned long) << 3);
+	uint32_t offset = page / (sizeof(unsigned long) << 3);
+	int i;
+
+	for (i = 0; i < vq->log_cache_nb_elem; i++) {
+		struct log_cache_entry *elem = vq->log_cache + i;
+
+		if (elem->offset == offset) {
+			elem->val |= (1U << bit_nr);
+			return;
+		}
+	}
+
+	if (unlikely(i >= VHOST_LOG_CACHE_NR)) {
+		/*
+		 * No more room for a new log cache entry,
+		 * so write the dirty log map directly.
+		 */
+		rte_smp_wmb();
+		vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
+
+		return;
+	}
+
+	vq->log_cache[i].offset = offset;
+	vq->log_cache[i].val = (1U << bit_nr);
+}
+
+static __rte_always_inline void
+vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq,
+					  uint64_t addr, uint64_t len)
+{
+	uint64_t page;
+
+	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
+		   !dev->log_base || !len))
+		return;
+
+	if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
+		return;
+
+	page = addr / VHOST_LOG_PAGE;
+	while (page * VHOST_LOG_PAGE < addr + len) {
+		vhost_log_cache_page(dev, vq, page);
+		page += 1;
+	}
+}
+
+static __rte_always_inline void
+vhost_log_cache_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		     uint64_t offset, uint64_t len)
+{
+	vhost_log_cache_write(dev, vq, vq->log_guest_addr + offset, len);
+}
+
 static __rte_always_inline void
 vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		     uint64_t offset, uint64_t len)
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index eed6b0227..76ec5f089 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -78,7 +78,7 @@ do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	rte_memcpy(&vq->used->ring[to],
 			&vq->shadow_used_ring[from],
 			size * sizeof(struct vring_used_elem));
-	vhost_log_used_vring(dev, vq,
+	vhost_log_cache_used_vring(dev, vq,
 			offsetof(struct vring_used, ring[to]),
 			size * sizeof(struct vring_used_elem));
 }
@@ -106,6 +106,8 @@ flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
 
 	rte_smp_wmb();
 
+	vhost_log_cache_sync(dev, vq);
+
 	*(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
 	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
 		sizeof(vq->used->idx));
@@ -130,7 +132,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
 
 	for (i = 0; i < count; i++) {
 		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
-		vhost_log_write(dev, elem[i].log_addr, elem[i].len);
+		vhost_log_cache_write(dev, vq, elem[i].log_addr, elem[i].len);
 		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
 	}
 }
@@ -251,7 +253,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		virtio_enqueue_offload(m,
 				(struct virtio_net_hdr *)(uintptr_t)desc_addr);
 		PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-		vhost_log_write(dev, desc_gaddr, dev->vhost_hlen);
+		vhost_log_cache_write(dev, vq, desc_gaddr, dev->vhost_hlen);
 	} else {
 		struct virtio_net_hdr vnet_hdr;
 		uint64_t remain = dev->vhost_hlen;
@@ -274,7 +276,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 					(void *)(uintptr_t)src, len);
 
 			PRINT_PACKET(dev, (uintptr_t)dst, (uint32_t)len, 0);
-			vhost_log_write(dev, guest_addr, len);
+			vhost_log_cache_write(dev, vq, guest_addr, len);
 			remain -= len;
 			guest_addr += len;
 			src += len;
@@ -355,7 +357,8 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 							desc_offset)),
 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
 				cpy_len);
-			vhost_log_write(dev, desc_gaddr + desc_offset, cpy_len);
+			vhost_log_cache_write(dev, vq, desc_gaddr + desc_offset,
+					cpy_len);
 			PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
 				     cpy_len, 0);
 		} else {
@@ -444,7 +447,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 		vq->used->ring[used_idx].id = desc_indexes[i];
 		vq->used->ring[used_idx].len = pkts[i]->pkt_len +
 					       dev->vhost_hlen;
-		vhost_log_used_vring(dev, vq,
+		vhost_log_cache_used_vring(dev, vq,
 			offsetof(struct vring_used, ring[used_idx]),
 			sizeof(vq->used->ring[used_idx]));
 	}
@@ -504,6 +507,8 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 
 	rte_smp_wmb();
 
+	vhost_log_cache_sync(dev, vq);
+
 	*(volatile uint16_t *)&vq->used->idx += count;
 	vq->last_used_idx += count;
 	vhost_log_used_vring(dev, vq,
@@ -767,7 +772,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 					PRINT_PACKET(dev, (uintptr_t)dst,
 							(uint32_t)len, 0);
-					vhost_log_write(dev, guest_addr, len);
+					vhost_log_cache_write(dev, vq,
+							guest_addr, len);
 
 					remain -= len;
 					guest_addr += len;
@@ -776,7 +782,7 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			} else {
 				PRINT_PACKET(dev, (uintptr_t)hdr_addr,
 						dev->vhost_hlen, 0);
-				vhost_log_write(dev, hdr_phys_addr,
+				vhost_log_cache_write(dev, vq, hdr_phys_addr,
 						dev->vhost_hlen);
 			}
 
@@ -790,7 +796,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 							desc_offset)),
 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
 				cpy_len);
-			vhost_log_write(dev, desc_gaddr + desc_offset, cpy_len);
+			vhost_log_cache_write(dev, vq, desc_gaddr + desc_offset,
+					cpy_len);
 			PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
 				cpy_len, 0);
 		} else {
@@ -1320,7 +1327,7 @@ update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
 {
 	vq->used->ring[used_idx].id  = desc_idx;
 	vq->used->ring[used_idx].len = 0;
-	vhost_log_used_vring(dev, vq,
+	vhost_log_cache_used_vring(dev, vq,
 			offsetof(struct vring_used, ring[used_idx]),
 			sizeof(vq->used->ring[used_idx]));
 }
@@ -1335,6 +1342,8 @@ update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	rte_smp_wmb();
 	rte_smp_rmb();
 
+	vhost_log_cache_sync(dev, vq);
+
 	vq->used->idx += count;
 	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
 			sizeof(vq->used->idx));
-- 
2.14.3

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [dpdk-dev] [PATCH v2] vhost: improve dirty pages logging performance
  2018-05-15 17:30 [dpdk-dev] [PATCH v2] vhost: improve dirty pages logging performance Maxime Coquelin
@ 2018-05-16  6:21 ` Tiwei Bie
  2018-05-16  7:49   ` Maxime Coquelin
  0 siblings, 1 reply; 4+ messages in thread
From: Tiwei Bie @ 2018-05-16  6:21 UTC (permalink / raw)
  To: Maxime Coquelin; +Cc: dev, mst, stable

On Tue, May 15, 2018 at 07:30:21PM +0200, Maxime Coquelin wrote:
> This patch caches all dirty pages logging until the used ring index
> is updated. These dirty pages won't be accessed by the guest as
> long as the host doesn't give them back to it by updating the
> index.
> 
> The goal of this optimization is to fix a performance regression
> introduced when the vhost library started to use atomic operations
> to set bits in the shared dirty log map. While the fix was valid
> as previous implementation wasn't safe against concurent accesses,

Typo: concurent

> contention was induced.
> 
> With this patch, during migration, we have:
> 1. Less atomic operations as only a single atomic OR operation
> per 32 or 64 (depending on CPU)  pages.
> 2. Less atomic operations as during a burst, the same page will
> be marked dirty only once.
> 3. Less write memory barriers.
> 
> Fixes: 897f13a1f726 ("vhost: make page logging atomic")
> 
> Cc: stable@dpdk.org
> 
> Cc: Tiwei Bie <tiwei.bie@intel.com>
> Suggested-by: Michael S. Tsirkin <mst@redhat.com>
> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>

This is a nice approach! Thanks for the work!

> ---
>  lib/librte_vhost/vhost.h      | 113 ++++++++++++++++++++++++++++++++++++++++++
>  lib/librte_vhost/virtio_net.c |  29 +++++++----
>  2 files changed, 132 insertions(+), 10 deletions(-)
> 
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 891978131..8f6a41d7e 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -36,6 +36,8 @@
>  
>  #define BUF_VECTOR_MAX 256
>  
> +#define VHOST_LOG_CACHE_NR 32
> +
>  /**
>   * Structure contains buffer address, length and descriptor index
>   * from vring to do scatter RX.
> @@ -69,6 +71,14 @@ struct batch_copy_elem {
>  	uint64_t log_addr;
>  };
>  
> +/*
> + * Structure that contains the info for batched dirty logging.
> + */
> +struct log_cache_entry {
> +	uint32_t offset;
> +	unsigned long val;
> +};
> +
>  /**
>   * Structure contains variables relevant to RX/TX virtqueues.
>   */
> @@ -112,6 +122,9 @@ struct vhost_virtqueue {
>  	struct batch_copy_elem	*batch_copy_elems;
>  	uint16_t		batch_copy_nb_elems;
>  
> +	struct log_cache_entry log_cache[VHOST_LOG_CACHE_NR];
> +	uint16_t log_cache_nb_elem;
> +
>  	rte_rwlock_t	iotlb_lock;
>  	rte_rwlock_t	iotlb_pending_lock;
>  	struct rte_mempool *iotlb_pool;
> @@ -309,7 +322,15 @@ struct virtio_net {
>  static __rte_always_inline void
>  vhost_set_bit(unsigned int nr, volatile uint8_t *addr)
>  {
> +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)

Just curious, is there any reference about why
this version was chosen? Thanks!

> +		/*
> +		 * __sync_ built-ins are deprecated, but __atomic_ ones
> +		 * are sub-optimized in older GCC versions.
> +		 */

The indent isn't right (just need one tab here).

>  	__sync_fetch_and_or_8(addr, (1U << nr));
> +#else
> +	__atomic_fetch_or(addr, (1U << nr), __ATOMIC_RELAXED);
> +#endif
>  }
>  
>  static __rte_always_inline void
> @@ -340,6 +361,98 @@ vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
>  	}
>  }
>  
> +static __rte_always_inline void
> +vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq)
> +{
> +	unsigned long *log_base;
> +	int i;
> +
> +	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
> +		   !dev->log_base))
> +		return;
> +
> +	log_base = (unsigned long *)(uintptr_t)dev->log_base;
> +
> +	/* To make sure guest memory updates are committed before logging */
> +	rte_smp_wmb();
> +
> +	for (i = 0; i < vq->log_cache_nb_elem; i++) {
> +		struct log_cache_entry *elem = vq->log_cache + i;
> +
> +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)
> +		/*
> +		 * '__sync' builtins are deprecated, but '__atomic' ones
> +		 * are sub-optimized in older GCC versions.
> +		 */
> +		__sync_fetch_and_or(log_base + elem->offset, elem->val);
> +#else
> +		__atomic_fetch_or(log_base + elem->offset, elem->val,
> +				__ATOMIC_RELAXED);
> +#endif
> +	}
> +
> +	vq->log_cache_nb_elem = 0;
> +}
> +
> +static __rte_always_inline void
> +vhost_log_cache_page(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +					 uint64_t page)
> +{
> +	uint32_t bit_nr = page % (sizeof(unsigned long) << 3);
> +	uint32_t offset = page / (sizeof(unsigned long) << 3);
> +	int i;
> +
> +	for (i = 0; i < vq->log_cache_nb_elem; i++) {
> +		struct log_cache_entry *elem = vq->log_cache + i;
> +
> +		if (elem->offset == offset) {
> +			elem->val |= (1U << bit_nr);

The val is unsigned long now, we need to use 1UL.

> +			return;
> +		}
> +	}
> +
> +	if (unlikely(i >= VHOST_LOG_CACHE_NR)) {
> +		/*
> +		 * No more room for a new log cache entry,
> +		 * so write the dirty log map directly.
> +		 */
> +		rte_smp_wmb();
> +		vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
> +
> +		return;
> +	}
> +
> +	vq->log_cache[i].offset = offset;
> +	vq->log_cache[i].val = (1U << bit_nr);

Ditto.

> +}
> +
> +static __rte_always_inline void
> +vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +					  uint64_t addr, uint64_t len)

The 8 spaces width tabs are more widely used in DPDK.
And in below coding style document,

https://github.com/DPDK/dpdk/blob/master/doc/guides/contributing/coding_style.rst

The width of each level indent in most examples is 8
spaces. So maybe it's better to keep using 8 spaces
width tabs.

> +{
> +	uint64_t page;
> +
> +	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
> +		   !dev->log_base || !len))
> +		return;
> +
> +	if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
> +		return;
> +
> +	page = addr / VHOST_LOG_PAGE;
> +	while (page * VHOST_LOG_PAGE < addr + len) {
> +		vhost_log_cache_page(dev, vq, page);
> +		page += 1;
> +	}
> +}
> +
> +static __rte_always_inline void
> +vhost_log_cache_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +		     uint64_t offset, uint64_t len)
> +{
> +	vhost_log_cache_write(dev, vq, vq->log_guest_addr + offset, len);
> +}
> +
>  static __rte_always_inline void
>  vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  		     uint64_t offset, uint64_t len)
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index eed6b0227..76ec5f089 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -78,7 +78,7 @@ do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	rte_memcpy(&vq->used->ring[to],
>  			&vq->shadow_used_ring[from],
>  			size * sizeof(struct vring_used_elem));
> -	vhost_log_used_vring(dev, vq,
> +	vhost_log_cache_used_vring(dev, vq,
>  			offsetof(struct vring_used, ring[to]),
>  			size * sizeof(struct vring_used_elem));
>  }
> @@ -106,6 +106,8 @@ flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
>  
>  	rte_smp_wmb();
>  
> +	vhost_log_cache_sync(dev, vq);

Each time we call vhost_log_cache_sync(), there
is already a rte_smp_wmb() which is to protect
the used->idx update. So maybe there is no need
to call rte_smp_wmb() in vhost_log_cache_sync().

Best regards,
Tiwei Bie

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [dpdk-dev] [PATCH v2] vhost: improve dirty pages logging performance
  2018-05-16  6:21 ` Tiwei Bie
@ 2018-05-16  7:49   ` Maxime Coquelin
  2018-05-16  8:08     ` Tiwei Bie
  0 siblings, 1 reply; 4+ messages in thread
From: Maxime Coquelin @ 2018-05-16  7:49 UTC (permalink / raw)
  To: Tiwei Bie; +Cc: dev, mst, stable



On 05/16/2018 08:21 AM, Tiwei Bie wrote:
> On Tue, May 15, 2018 at 07:30:21PM +0200, Maxime Coquelin wrote:
>> This patch caches all dirty pages logging until the used ring index
>> is updated. These dirty pages won't be accessed by the guest as
>> long as the host doesn't give them back to it by updating the
>> index.
>>
>> The goal of this optimization is to fix a performance regression
>> introduced when the vhost library started to use atomic operations
>> to set bits in the shared dirty log map. While the fix was valid
>> as previous implementation wasn't safe against concurent accesses,
> 
> Typo: concurent

Right.

>> contention was induced.
>>
>> With this patch, during migration, we have:
>> 1. Less atomic operations as only a single atomic OR operation
>> per 32 or 64 (depending on CPU)  pages.
>> 2. Less atomic operations as during a burst, the same page will
>> be marked dirty only once.
>> 3. Less write memory barriers.
>>
>> Fixes: 897f13a1f726 ("vhost: make page logging atomic")
>>
>> Cc: stable@dpdk.org
>>
>> Cc: Tiwei Bie <tiwei.bie@intel.com>
>> Suggested-by: Michael S. Tsirkin <mst@redhat.com>
>> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> 
> This is a nice approach! Thanks for the work!

Thanks for the review, it is really appreciated.

>> ---
>>   lib/librte_vhost/vhost.h      | 113 ++++++++++++++++++++++++++++++++++++++++++
>>   lib/librte_vhost/virtio_net.c |  29 +++++++----
>>   2 files changed, 132 insertions(+), 10 deletions(-)
>>
>> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
>> index 891978131..8f6a41d7e 100644
>> --- a/lib/librte_vhost/vhost.h
>> +++ b/lib/librte_vhost/vhost.h
>> @@ -36,6 +36,8 @@
>>   
>>   #define BUF_VECTOR_MAX 256
>>   
>> +#define VHOST_LOG_CACHE_NR 32
>> +
>>   /**
>>    * Structure contains buffer address, length and descriptor index
>>    * from vring to do scatter RX.
>> @@ -69,6 +71,14 @@ struct batch_copy_elem {
>>   	uint64_t log_addr;
>>   };
>>   
>> +/*
>> + * Structure that contains the info for batched dirty logging.
>> + */
>> +struct log_cache_entry {
>> +	uint32_t offset;
>> +	unsigned long val;
>> +};
>> +
>>   /**
>>    * Structure contains variables relevant to RX/TX virtqueues.
>>    */
>> @@ -112,6 +122,9 @@ struct vhost_virtqueue {
>>   	struct batch_copy_elem	*batch_copy_elems;
>>   	uint16_t		batch_copy_nb_elems;
>>   
>> +	struct log_cache_entry log_cache[VHOST_LOG_CACHE_NR];
>> +	uint16_t log_cache_nb_elem;
>> +
>>   	rte_rwlock_t	iotlb_lock;
>>   	rte_rwlock_t	iotlb_pending_lock;
>>   	struct rte_mempool *iotlb_pool;
>> @@ -309,7 +322,15 @@ struct virtio_net {
>>   static __rte_always_inline void
>>   vhost_set_bit(unsigned int nr, volatile uint8_t *addr)
>>   {
>> +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)
> 
> Just curious, is there any reference about why
> this version was chosen? Thanks!

I googled Michael reference to the LWN article [0], and they mention GCC
7.1.

I haven't checked by myself though whether generated code is different 
in GCC >= 7.1.

[0]: https://lwn.net/Articles/691128/
> 
>> +		/*
>> +		 * __sync_ built-ins are deprecated, but __atomic_ ones
>> +		 * are sub-optimized in older GCC versions.
>> +		 */
> 
> The indent isn't right (just need one tab here).
Right, will fix.

> 
>>   	__sync_fetch_and_or_8(addr, (1U << nr));

This is unrelated to this patch set, but from GCC doc [1], shouldn't
we use __sync_fetch_and_or_1 as the size is in bytes?

[1]: https://gcc.gnu.org/onlinedocs/gcc/_005f_005fsync-Builtins.html

>> +#else
>> +	__atomic_fetch_or(addr, (1U << nr), __ATOMIC_RELAXED);
>> +#endif
>>   }
>>   
>>   static __rte_always_inline void
>> @@ -340,6 +361,98 @@ vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
>>   	}
>>   }
>>   
>> +static __rte_always_inline void
>> +vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq)
>> +{
>> +	unsigned long *log_base;
>> +	int i;
>> +
>> +	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
>> +		   !dev->log_base))
>> +		return;
>> +
>> +	log_base = (unsigned long *)(uintptr_t)dev->log_base;
>> +
>> +	/* To make sure guest memory updates are committed before logging */
>> +	rte_smp_wmb();
>> +
>> +	for (i = 0; i < vq->log_cache_nb_elem; i++) {
>> +		struct log_cache_entry *elem = vq->log_cache + i;
>> +
>> +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)
>> +		/*
>> +		 * '__sync' builtins are deprecated, but '__atomic' ones
>> +		 * are sub-optimized in older GCC versions.
>> +		 */
>> +		__sync_fetch_and_or(log_base + elem->offset, elem->val);
>> +#else
>> +		__atomic_fetch_or(log_base + elem->offset, elem->val,
>> +				__ATOMIC_RELAXED);
>> +#endif
>> +	}
>> +
>> +	vq->log_cache_nb_elem = 0;
>> +}
>> +
>> +static __rte_always_inline void
>> +vhost_log_cache_page(struct virtio_net *dev, struct vhost_virtqueue *vq,
>> +					 uint64_t page)
>> +{
>> +	uint32_t bit_nr = page % (sizeof(unsigned long) << 3);
>> +	uint32_t offset = page / (sizeof(unsigned long) << 3);
>> +	int i;
>> +
>> +	for (i = 0; i < vq->log_cache_nb_elem; i++) {
>> +		struct log_cache_entry *elem = vq->log_cache + i;
>> +
>> +		if (elem->offset == offset) {
>> +			elem->val |= (1U << bit_nr);
> 
> The val is unsigned long now, we need to use 1UL.

Good catch!

>> +			return;
>> +		}
>> +	}
>> +
>> +	if (unlikely(i >= VHOST_LOG_CACHE_NR)) {
>> +		/*
>> +		 * No more room for a new log cache entry,
>> +		 * so write the dirty log map directly.
>> +		 */
>> +		rte_smp_wmb();
>> +		vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
>> +
>> +		return;
>> +	}
>> +
>> +	vq->log_cache[i].offset = offset;
>> +	vq->log_cache[i].val = (1U << bit_nr);
> 
> Ditto.
> 
>> +}
>> +
>> +static __rte_always_inline void
>> +vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq,
>> +					  uint64_t addr, uint64_t len)
> 
> The 8 spaces width tabs are more widely used in DPDK.
> And in below coding style document,
> 
> https://github.com/DPDK/dpdk/blob/master/doc/guides/contributing/coding_style.rst
> 
> The width of each level indent in most examples is 8
> spaces. So maybe it's better to keep using 8 spaces
> width tabs.

Right, will fix this in v3.

>> +{
>> +	uint64_t page;
>> +
>> +	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
>> +		   !dev->log_base || !len))
>> +		return;
>> +
>> +	if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
>> +		return;
>> +
>> +	page = addr / VHOST_LOG_PAGE;
>> +	while (page * VHOST_LOG_PAGE < addr + len) {
>> +		vhost_log_cache_page(dev, vq, page);
>> +		page += 1;
>> +	}
>> +}
>> +
>> +static __rte_always_inline void
>> +vhost_log_cache_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>> +		     uint64_t offset, uint64_t len)
>> +{
>> +	vhost_log_cache_write(dev, vq, vq->log_guest_addr + offset, len);
>> +}
>> +
>>   static __rte_always_inline void
>>   vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>   		     uint64_t offset, uint64_t len)
>> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
>> index eed6b0227..76ec5f089 100644
>> --- a/lib/librte_vhost/virtio_net.c
>> +++ b/lib/librte_vhost/virtio_net.c
>> @@ -78,7 +78,7 @@ do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>   	rte_memcpy(&vq->used->ring[to],
>>   			&vq->shadow_used_ring[from],
>>   			size * sizeof(struct vring_used_elem));
>> -	vhost_log_used_vring(dev, vq,
>> +	vhost_log_cache_used_vring(dev, vq,
>>   			offsetof(struct vring_used, ring[to]),
>>   			size * sizeof(struct vring_used_elem));
>>   }
>> @@ -106,6 +106,8 @@ flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
>>   
>>   	rte_smp_wmb();
>>   
>> +	vhost_log_cache_sync(dev, vq);
> 
> Each time we call vhost_log_cache_sync(), there
> is already a rte_smp_wmb() which is to protect
> the used->idx update. So maybe there is no need
> to call rte_smp_wmb() in vhost_log_cache_sync().

Right, I can remove it in vhost_log_cache_sync(), and
maybe add a comment there stating that a write barrier
before calling the function is expected.

Thanks,
Maxime
> Best regards,
> Tiwei Bie
> 

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [dpdk-dev] [PATCH v2] vhost: improve dirty pages logging performance
  2018-05-16  7:49   ` Maxime Coquelin
@ 2018-05-16  8:08     ` Tiwei Bie
  0 siblings, 0 replies; 4+ messages in thread
From: Tiwei Bie @ 2018-05-16  8:08 UTC (permalink / raw)
  To: Maxime Coquelin; +Cc: dev, mst, stable

On Wed, May 16, 2018 at 09:49:45AM +0200, Maxime Coquelin wrote:
> On 05/16/2018 08:21 AM, Tiwei Bie wrote:
> > On Tue, May 15, 2018 at 07:30:21PM +0200, Maxime Coquelin wrote:
[...]
> > > @@ -309,7 +322,15 @@ struct virtio_net {
> > >   static __rte_always_inline void
> > >   vhost_set_bit(unsigned int nr, volatile uint8_t *addr)
> > >   {
> > > +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)
> > 
> > Just curious, is there any reference about why
> > this version was chosen? Thanks!
> 
> I googled Michael reference to the LWN article [0], and they mention GCC
> 7.1.
> 
> I haven't checked by myself though whether generated code is different in
> GCC >= 7.1.
> 
> [0]: https://lwn.net/Articles/691128/

Thanks for the link!

> > 
> > > +		/*
> > > +		 * __sync_ built-ins are deprecated, but __atomic_ ones
> > > +		 * are sub-optimized in older GCC versions.
> > > +		 */
> > 
> > The indent isn't right (just need one tab here).
> Right, will fix.
> 
> > 
> > >   	__sync_fetch_and_or_8(addr, (1U << nr));
> 
> This is unrelated to this patch set, but from GCC doc [1], shouldn't
> we use __sync_fetch_and_or_1 as the size is in bytes?
> 
> [1]: https://gcc.gnu.org/onlinedocs/gcc/_005f_005fsync-Builtins.html

You are right. I didn't notice it.. Please also fix this.

> 
> > > +#else
> > > +	__atomic_fetch_or(addr, (1U << nr), __ATOMIC_RELAXED);
[...]
> > > @@ -106,6 +106,8 @@ flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
> > >   	rte_smp_wmb();
> > > +	vhost_log_cache_sync(dev, vq);
> > 
> > Each time we call vhost_log_cache_sync(), there
> > is already a rte_smp_wmb() which is to protect
> > the used->idx update. So maybe there is no need
> > to call rte_smp_wmb() in vhost_log_cache_sync().
> 
> Right, I can remove it in vhost_log_cache_sync(), and
> maybe add a comment there stating that a write barrier
> before calling the function is expected.

Good idea.

Best regards,
Tiwei Bie

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2018-05-16  8:08 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-05-15 17:30 [dpdk-dev] [PATCH v2] vhost: improve dirty pages logging performance Maxime Coquelin
2018-05-16  6:21 ` Tiwei Bie
2018-05-16  7:49   ` Maxime Coquelin
2018-05-16  8:08     ` Tiwei Bie

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).