patches for DPDK stable branches
 help / color / mirror / Atom feed
From: Maxime Coquelin <maxime.coquelin@redhat.com>
To: Tiwei Bie <tiwei.bie@intel.com>
Cc: dev@dpdk.org, mst@redhat.com, stable@dpdk.org
Subject: Re: [dpdk-stable] [PATCH v2] vhost: improve dirty pages logging performance
Date: Wed, 16 May 2018 09:49:45 +0200	[thread overview]
Message-ID: <465b4530-f19e-66e0-fc32-437595a3819e@redhat.com> (raw)
In-Reply-To: <20180516062107.GA26229@debian>



On 05/16/2018 08:21 AM, Tiwei Bie wrote:
> On Tue, May 15, 2018 at 07:30:21PM +0200, Maxime Coquelin wrote:
>> This patch caches all dirty pages logging until the used ring index
>> is updated. These dirty pages won't be accessed by the guest as
>> long as the host doesn't give them back to it by updating the
>> index.
>>
>> The goal of this optimization is to fix a performance regression
>> introduced when the vhost library started to use atomic operations
>> to set bits in the shared dirty log map. While the fix was valid
>> as previous implementation wasn't safe against concurent accesses,
> 
> Typo: concurent

Right.

>> contention was induced.
>>
>> With this patch, during migration, we have:
>> 1. Less atomic operations as only a single atomic OR operation
>> per 32 or 64 (depending on CPU)  pages.
>> 2. Less atomic operations as during a burst, the same page will
>> be marked dirty only once.
>> 3. Less write memory barriers.
>>
>> Fixes: 897f13a1f726 ("vhost: make page logging atomic")
>>
>> Cc: stable@dpdk.org
>>
>> Cc: Tiwei Bie <tiwei.bie@intel.com>
>> Suggested-by: Michael S. Tsirkin <mst@redhat.com>
>> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> 
> This is a nice approach! Thanks for the work!

Thanks for the review, it is really appreciated.

>> ---
>>   lib/librte_vhost/vhost.h      | 113 ++++++++++++++++++++++++++++++++++++++++++
>>   lib/librte_vhost/virtio_net.c |  29 +++++++----
>>   2 files changed, 132 insertions(+), 10 deletions(-)
>>
>> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
>> index 891978131..8f6a41d7e 100644
>> --- a/lib/librte_vhost/vhost.h
>> +++ b/lib/librte_vhost/vhost.h
>> @@ -36,6 +36,8 @@
>>   
>>   #define BUF_VECTOR_MAX 256
>>   
>> +#define VHOST_LOG_CACHE_NR 32
>> +
>>   /**
>>    * Structure contains buffer address, length and descriptor index
>>    * from vring to do scatter RX.
>> @@ -69,6 +71,14 @@ struct batch_copy_elem {
>>   	uint64_t log_addr;
>>   };
>>   
>> +/*
>> + * Structure that contains the info for batched dirty logging.
>> + */
>> +struct log_cache_entry {
>> +	uint32_t offset;
>> +	unsigned long val;
>> +};
>> +
>>   /**
>>    * Structure contains variables relevant to RX/TX virtqueues.
>>    */
>> @@ -112,6 +122,9 @@ struct vhost_virtqueue {
>>   	struct batch_copy_elem	*batch_copy_elems;
>>   	uint16_t		batch_copy_nb_elems;
>>   
>> +	struct log_cache_entry log_cache[VHOST_LOG_CACHE_NR];
>> +	uint16_t log_cache_nb_elem;
>> +
>>   	rte_rwlock_t	iotlb_lock;
>>   	rte_rwlock_t	iotlb_pending_lock;
>>   	struct rte_mempool *iotlb_pool;
>> @@ -309,7 +322,15 @@ struct virtio_net {
>>   static __rte_always_inline void
>>   vhost_set_bit(unsigned int nr, volatile uint8_t *addr)
>>   {
>> +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)
> 
> Just curious, is there any reference about why
> this version was chosen? Thanks!

I googled Michael's reference to the LWN article [0], and it mentions GCC
7.1.

I haven't checked myself, though, whether the generated code is different
in GCC >= 7.1.

[0]: https://lwn.net/Articles/691128/
> 
>> +		/*
>> +		 * __sync_ built-ins are deprecated, but __atomic_ ones
>> +		 * are sub-optimized in older GCC versions.
>> +		 */
> 
> The indent isn't right (just need one tab here).
Right, will fix.

> 
>>   	__sync_fetch_and_or_8(addr, (1U << nr));

This is unrelated to this patch set, but according to the GCC doc [1],
shouldn't we use __sync_fetch_and_or_1, as the suffix is the operand size
in bytes?

[1]: https://gcc.gnu.org/onlinedocs/gcc/_005f_005fsync-Builtins.html

>> +#else
>> +	__atomic_fetch_or(addr, (1U << nr), __ATOMIC_RELAXED);
>> +#endif
>>   }
>>   
>>   static __rte_always_inline void
>> @@ -340,6 +361,98 @@ vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
>>   	}
>>   }
>>   
>> +static __rte_always_inline void
>> +vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq)
>> +{
>> +	unsigned long *log_base;
>> +	int i;
>> +
>> +	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
>> +		   !dev->log_base))
>> +		return;
>> +
>> +	log_base = (unsigned long *)(uintptr_t)dev->log_base;
>> +
>> +	/* To make sure guest memory updates are committed before logging */
>> +	rte_smp_wmb();
>> +
>> +	for (i = 0; i < vq->log_cache_nb_elem; i++) {
>> +		struct log_cache_entry *elem = vq->log_cache + i;
>> +
>> +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)
>> +		/*
>> +		 * '__sync' builtins are deprecated, but '__atomic' ones
>> +		 * are sub-optimized in older GCC versions.
>> +		 */
>> +		__sync_fetch_and_or(log_base + elem->offset, elem->val);
>> +#else
>> +		__atomic_fetch_or(log_base + elem->offset, elem->val,
>> +				__ATOMIC_RELAXED);
>> +#endif
>> +	}
>> +
>> +	vq->log_cache_nb_elem = 0;
>> +}
>> +
>> +static __rte_always_inline void
>> +vhost_log_cache_page(struct virtio_net *dev, struct vhost_virtqueue *vq,
>> +					 uint64_t page)
>> +{
>> +	uint32_t bit_nr = page % (sizeof(unsigned long) << 3);
>> +	uint32_t offset = page / (sizeof(unsigned long) << 3);
>> +	int i;
>> +
>> +	for (i = 0; i < vq->log_cache_nb_elem; i++) {
>> +		struct log_cache_entry *elem = vq->log_cache + i;
>> +
>> +		if (elem->offset == offset) {
>> +			elem->val |= (1U << bit_nr);
> 
> The val is unsigned long now, we need to use 1UL.

Good catch!

>> +			return;
>> +		}
>> +	}
>> +
>> +	if (unlikely(i >= VHOST_LOG_CACHE_NR)) {
>> +		/*
>> +		 * No more room for a new log cache entry,
>> +		 * so write the dirty log map directly.
>> +		 */
>> +		rte_smp_wmb();
>> +		vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
>> +
>> +		return;
>> +	}
>> +
>> +	vq->log_cache[i].offset = offset;
>> +	vq->log_cache[i].val = (1U << bit_nr);
> 
> Ditto.
> 
>> +}
>> +
>> +static __rte_always_inline void
>> +vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq,
>> +					  uint64_t addr, uint64_t len)
> 
> The 8 spaces width tabs are more widely used in DPDK.
> And in below coding style document,
> 
> https://github.com/DPDK/dpdk/blob/master/doc/guides/contributing/coding_style.rst
> 
> The width of each level indent in most examples is 8
> spaces. So maybe it's better to keep using 8 spaces
> width tabs.

Right, will fix this in v3.

>> +{
>> +	uint64_t page;
>> +
>> +	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
>> +		   !dev->log_base || !len))
>> +		return;
>> +
>> +	if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
>> +		return;
>> +
>> +	page = addr / VHOST_LOG_PAGE;
>> +	while (page * VHOST_LOG_PAGE < addr + len) {
>> +		vhost_log_cache_page(dev, vq, page);
>> +		page += 1;
>> +	}
>> +}
>> +
>> +static __rte_always_inline void
>> +vhost_log_cache_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>> +		     uint64_t offset, uint64_t len)
>> +{
>> +	vhost_log_cache_write(dev, vq, vq->log_guest_addr + offset, len);
>> +}
>> +
>>   static __rte_always_inline void
>>   vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>   		     uint64_t offset, uint64_t len)
>> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
>> index eed6b0227..76ec5f089 100644
>> --- a/lib/librte_vhost/virtio_net.c
>> +++ b/lib/librte_vhost/virtio_net.c
>> @@ -78,7 +78,7 @@ do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>   	rte_memcpy(&vq->used->ring[to],
>>   			&vq->shadow_used_ring[from],
>>   			size * sizeof(struct vring_used_elem));
>> -	vhost_log_used_vring(dev, vq,
>> +	vhost_log_cache_used_vring(dev, vq,
>>   			offsetof(struct vring_used, ring[to]),
>>   			size * sizeof(struct vring_used_elem));
>>   }
>> @@ -106,6 +106,8 @@ flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
>>   
>>   	rte_smp_wmb();
>>   
>> +	vhost_log_cache_sync(dev, vq);
> 
> Each time we call vhost_log_cache_sync(), there
> is already a rte_smp_wmb() which is to protect
> the used->idx update. So maybe there is no need
> to call rte_smp_wmb() in vhost_log_cache_sync().

Right, I can remove it in vhost_log_cache_sync(), and
maybe add a comment there stating that a write barrier
is expected before calling the function.

Thanks,
Maxime
> Best regards,
> Tiwei Bie
> 

  reply	other threads:[~2018-05-16  7:49 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-05-15 17:30 Maxime Coquelin
2018-05-16  6:21 ` Tiwei Bie
2018-05-16  7:49   ` Maxime Coquelin [this message]
2018-05-16  8:08     ` Tiwei Bie

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=465b4530-f19e-66e0-fc32-437595a3819e@redhat.com \
    --to=maxime.coquelin@redhat.com \
    --cc=dev@dpdk.org \
    --cc=mst@redhat.com \
    --cc=stable@dpdk.org \
    --cc=tiwei.bie@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).