Re: [PATCH v2] vhost: fix madvise arguments alignment

DPDK patches and discussions
 help / color / mirror / Atom feed

From: Maxime Coquelin <maxime.coquelin@redhat.com>
To: Mike Pattrick <mkp@redhat.com>, dev@dpdk.org
Cc: david.marchand@redhat.com, chenbo.xia@intel.com
Subject: Re: [PATCH v2] vhost: fix madvise arguments alignment
Date: Thu, 23 Feb 2023 17:12:14 +0100	[thread overview]
Message-ID: <e347d80e-9a8f-b2da-9d54-d87bf98d9cc5@redhat.com> (raw)
In-Reply-To: <20230223043551.1108541-1-mkp@redhat.com>

Hi Mike,

Thanks for  looking into this issue.

On 2/23/23 05:35, Mike Pattrick wrote:
> The arguments passed to madvise should be aligned to the alignment of
> the backing memory. Now we keep track of each regions alignment and use
> then when setting coredump preferences. To facilitate this, a new member
> was added to rte_vhost_mem_region. A new function was added to easily
> translate memory address back to region alignment. Unneeded calls to
> madvise were reduced, as the cache removal case should already be
> covered by the cache insertion case. The previously inline function
> mem_set_dump was removed from a header file and made not inline.
> 
> Fixes: 338ad77c9ed3 ("vhost: exclude VM hugepages from coredumps")
> 
> Signed-off-by: Mike Pattrick <mkp@redhat.com>
> ---
> Since v1:
>   - Corrected a cast for 32bit compiles
> ---
>   lib/vhost/iotlb.c      |  9 +++---
>   lib/vhost/rte_vhost.h  |  1 +
>   lib/vhost/vhost.h      | 12 ++------
>   lib/vhost/vhost_user.c | 63 +++++++++++++++++++++++++++++++++++-------
>   4 files changed, 60 insertions(+), 25 deletions(-)
> 
> diff --git a/lib/vhost/iotlb.c b/lib/vhost/iotlb.c
> index a0b8fd7302..5293507b63 100644
> --- a/lib/vhost/iotlb.c
> +++ b/lib/vhost/iotlb.c
> @@ -149,7 +149,6 @@ vhost_user_iotlb_cache_remove_all(struct vhost_virtqueue *vq)
>   	rte_rwlock_write_lock(&vq->iotlb_lock);
>   
>   	RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) {
> -		mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true);

Hmm, it should have been called with enable=false here since we are
removing the entry from the IOTLB cache. It should be kept in order to
"DONTDUMP" pages evicted from the cache.

>   		TAILQ_REMOVE(&vq->iotlb_list, node, next);
>   		vhost_user_iotlb_pool_put(vq, node);
>   	}
> @@ -171,7 +170,6 @@ vhost_user_iotlb_cache_random_evict(struct vhost_virtqueue *vq)
>   
>   	RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) {
>   		if (!entry_idx) {
> -			mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true);

Same here.

>   			TAILQ_REMOVE(&vq->iotlb_list, node, next);
>   			vhost_user_iotlb_pool_put(vq, node);
>   			vq->iotlb_cache_nr--;
> @@ -224,14 +222,16 @@ vhost_user_iotlb_cache_insert(struct virtio_net *dev, struct vhost_virtqueue *vq
>   			vhost_user_iotlb_pool_put(vq, new_node);
>   			goto unlock;
>   		} else if (node->iova > new_node->iova) {
> -			mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true);
> +			mem_set_dump((void *)(uintptr_t)new_node->uaddr, new_node->size, true,
> +				hua_to_alignment(dev->mem, (void *)(uintptr_t)node->uaddr));
>   			TAILQ_INSERT_BEFORE(node, new_node, next);
>   			vq->iotlb_cache_nr++;
>   			goto unlock;
>   		}
>   	}
>   
> -	mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true);
> +	mem_set_dump((void *)(uintptr_t)new_node->uaddr, new_node->size, true,
> +		hua_to_alignment(dev->mem, (void *)(uintptr_t)new_node->uaddr));
>   	TAILQ_INSERT_TAIL(&vq->iotlb_list, new_node, next);
>   	vq->iotlb_cache_nr++;
>   
> @@ -259,7 +259,6 @@ vhost_user_iotlb_cache_remove(struct vhost_virtqueue *vq,
>   			break;
>   
>   		if (iova < node->iova + node->size) {
> -			mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true);
>   			TAILQ_REMOVE(&vq->iotlb_list, node, next);
>   			vhost_user_iotlb_pool_put(vq, node);
>   			vq->iotlb_cache_nr--;
> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
> index a395843fe9..c5c97ea67e 100644
> --- a/lib/vhost/rte_vhost.h
> +++ b/lib/vhost/rte_vhost.h
> @@ -136,6 +136,7 @@ struct rte_vhost_mem_region {
>   	void	 *mmap_addr;
>   	uint64_t mmap_size;
>   	int fd;
> +	uint64_t alignment;

This is not possible to do this as it breaks the ABI.
You have to store the information somewhere else, or simply call
get_blk_size() in hua_to_alignment() since the fd is not closed.

>   };
>   
>   /**
> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
> index 5750f0c005..a2467ba509 100644
> --- a/lib/vhost/vhost.h
> +++ b/lib/vhost/vhost.h
> @@ -1009,14 +1009,6 @@ mbuf_is_consumed(struct rte_mbuf *m)
>   	return true;
>   }
>   
> -static __rte_always_inline void
> -mem_set_dump(__rte_unused void *ptr, __rte_unused size_t size, __rte_unused bool enable)
> -{
> -#ifdef MADV_DONTDUMP
> -	if (madvise(ptr, size, enable ? MADV_DODUMP : MADV_DONTDUMP) == -1) {
> -		rte_log(RTE_LOG_INFO, vhost_config_log_level,
> -			"VHOST_CONFIG: could not set coredump preference (%s).\n", strerror(errno));
> -	}
> -#endif
> -}
> +uint64_t hua_to_alignment(struct rte_vhost_memory *mem, void *ptr);
> +void mem_set_dump(void *ptr, size_t size, bool enable, uint64_t alignment);
>   #endif /* _VHOST_NET_CDEV_H_ */
> diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
> index d702d082dd..6d09597fbe 100644
> --- a/lib/vhost/vhost_user.c
> +++ b/lib/vhost/vhost_user.c
> @@ -737,6 +737,40 @@ log_addr_to_gpa(struct virtio_net *dev, struct vhost_virtqueue *vq)
>   	return log_gpa;
>   }
>   
> +uint64_t
> +hua_to_alignment(struct rte_vhost_memory *mem, void *ptr)
> +{
> +	struct rte_vhost_mem_region *r;
> +	uint32_t i;
> +	uintptr_t hua = (uintptr_t)ptr;
> +
> +	for (i = 0; i < mem->nregions; i++) {
> +		r = &mem->regions[i];
> +		if (hua >= r->host_user_addr &&
> +			hua < r->host_user_addr + r->size) {
> +			return r->alignment;
> +		}
> +	}
> +
> +	/* If region isn't found, don't align at all */
> +	return 1;
> +}
> +
> +void
> +mem_set_dump(void *ptr, size_t size, bool enable, uint64_t pagesz)
> +{
> +#ifdef MADV_DONTDUMP
> +	void *start = RTE_PTR_ALIGN_FLOOR(ptr, pagesz);
> +	uintptr_t end = RTE_ALIGN_CEIL((uintptr_t)ptr + size, pagesz);
> +	size_t len = end - (uintptr_t)start;
> +
> +	if (madvise(start, len, enable ? MADV_DODUMP : MADV_DONTDUMP) == -1) {
> +		rte_log(RTE_LOG_INFO, vhost_config_log_level,
> +			"VHOST_CONFIG: could not set coredump preference (%s).\n", strerror(errno));
> +	}
> +#endif
> +}
> +
>   static void
>   translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq)
>   {
> @@ -767,6 +801,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq)
>   			return;
>   		}
>   
> +		mem_set_dump(vq->desc_packed, len, true,
> +			hua_to_alignment(dev->mem, vq->desc_packed));
>   		numa_realloc(&dev, &vq);
>   		*pdev = dev;
>   		*pvq = vq;
> @@ -782,6 +818,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq)
>   			return;
>   		}
>   
> +		mem_set_dump(vq->driver_event, len, true,
> +			hua_to_alignment(dev->mem, vq->driver_event));
>   		len = sizeof(struct vring_packed_desc_event);
>   		vq->device_event = (struct vring_packed_desc_event *)
>   					(uintptr_t)ring_addr_to_vva(dev,
> @@ -793,9 +831,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq)
>   			return;
>   		}
>   
> -		mem_set_dump(vq->desc_packed, len, true);
> -		mem_set_dump(vq->driver_event, len, true);
> -		mem_set_dump(vq->device_event, len, true);
> +		mem_set_dump(vq->device_event, len, true,
> +			hua_to_alignment(dev->mem, vq->device_event));
>   		vq->access_ok = true;
>   		return;
>   	}
> @@ -812,6 +849,7 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq)
>   		return;
>   	}
>   
> +	mem_set_dump(vq->desc, len, true, hua_to_alignment(dev->mem, vq->desc));
>   	numa_realloc(&dev, &vq);
>   	*pdev = dev;
>   	*pvq = vq;
> @@ -827,6 +865,7 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq)
>   		return;
>   	}
>   
> +	mem_set_dump(vq->avail, len, true, hua_to_alignment(dev->mem, vq->avail));
>   	len = sizeof(struct vring_used) +
>   		sizeof(struct vring_used_elem) * vq->size;
>   	if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))
> @@ -839,6 +878,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq)
>   		return;
>   	}
>   
> +	mem_set_dump(vq->used, len, true, hua_to_alignment(dev->mem, vq->used));
> +
>   	if (vq->last_used_idx != vq->used->idx) {
>   		VHOST_LOG_CONFIG(dev->ifname, WARNING,
>   			"last_used_idx (%u) and vq->used->idx (%u) mismatches;\n",
> @@ -849,9 +890,6 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq)
>   			"some packets maybe resent for Tx and dropped for Rx\n");
>   	}
>   
> -	mem_set_dump(vq->desc, len, true);
> -	mem_set_dump(vq->avail, len, true);
> -	mem_set_dump(vq->used, len, true);
>   	vq->access_ok = true;
>   
>   	VHOST_LOG_CONFIG(dev->ifname, DEBUG, "mapped address desc: %p\n", vq->desc);
> @@ -1230,7 +1268,8 @@ vhost_user_mmap_region(struct virtio_net *dev,
>   	region->mmap_addr = mmap_addr;
>   	region->mmap_size = mmap_size;
>   	region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
> -	mem_set_dump(mmap_addr, mmap_size, false);
> +	region->alignment = alignment;
> +	mem_set_dump(mmap_addr, mmap_size, false, alignment);
>   
>   	if (dev->async_copy) {
>   		if (add_guest_pages(dev, region, alignment) < 0) {
> @@ -1535,7 +1574,6 @@ inflight_mem_alloc(struct virtio_net *dev, const char *name, size_t size, int *f
>   		return NULL;
>   	}
>   
> -	mem_set_dump(ptr, size, false);
>   	*fd = mfd;
>   	return ptr;
>   }
> @@ -1566,6 +1604,7 @@ vhost_user_get_inflight_fd(struct virtio_net **pdev,
>   	uint64_t pervq_inflight_size, mmap_size;
>   	uint16_t num_queues, queue_size;
>   	struct virtio_net *dev = *pdev;
> +	uint64_t alignment;
>   	int fd, i, j;
>   	int numa_node = SOCKET_ID_ANY;
>   	void *addr;
> @@ -1628,6 +1667,8 @@ vhost_user_get_inflight_fd(struct virtio_net **pdev,
>   		dev->inflight_info->fd = -1;
>   	}
>   
> +	alignment = get_blk_size(fd);
> +	mem_set_dump(addr, mmap_size, false, alignment);
>   	dev->inflight_info->addr = addr;
>   	dev->inflight_info->size = ctx->msg.payload.inflight.mmap_size = mmap_size;
>   	dev->inflight_info->fd = ctx->fds[0] = fd;
> @@ -1744,10 +1785,10 @@ vhost_user_set_inflight_fd(struct virtio_net **pdev,
>   		dev->inflight_info->fd = -1;
>   	}
>   
> -	mem_set_dump(addr, mmap_size, false);
>   	dev->inflight_info->fd = fd;
>   	dev->inflight_info->addr = addr;
>   	dev->inflight_info->size = mmap_size;
> +	mem_set_dump(addr, mmap_size, false, get_blk_size(fd));
>   
>   	for (i = 0; i < num_queues; i++) {
>   		vq = dev->virtqueue[i];
> @@ -2242,6 +2283,7 @@ vhost_user_set_log_base(struct virtio_net **pdev,
>   	struct virtio_net *dev = *pdev;
>   	int fd = ctx->fds[0];
>   	uint64_t size, off;
> +	uint64_t alignment;
>   	void *addr;
>   	uint32_t i;
>   
> @@ -2280,6 +2322,7 @@ vhost_user_set_log_base(struct virtio_net **pdev,
>   	 * fail when offset is not page size aligned.
>   	 */
>   	addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
> +	alignment = get_blk_size(fd);
>   	close(fd);
>   	if (addr == MAP_FAILED) {
>   		VHOST_LOG_CONFIG(dev->ifname, ERR, "mmap log base failed!\n");
> @@ -2296,7 +2339,7 @@ vhost_user_set_log_base(struct virtio_net **pdev,
>   	dev->log_addr = (uint64_t)(uintptr_t)addr;
>   	dev->log_base = dev->log_addr + off;
>   	dev->log_size = size;
> -	mem_set_dump(addr, size, false);
> +	mem_set_dump(addr, size + off, false, alignment);
>   
>   	for (i = 0; i < dev->nr_vring; i++) {
>   		struct vhost_virtqueue *vq = dev->virtqueue[i];

next prev parent reply	other threads:[~2023-02-23 16:12 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-12-06 15:05 [PATCH v2] vhost: exclude VM hugepages from coredumps Mike Pattrick
2022-12-07 16:54 ` [PATCH v3] " Mike Pattrick
2022-12-20 12:43   ` Maxime Coquelin
2023-02-03 14:50   ` Maxime Coquelin
2023-02-10 15:53   ` David Marchand
2023-02-10 15:59     ` Maxime Coquelin
2023-02-10 21:12     ` Mike Pattrick
2023-02-21 16:26       ` Maxime Coquelin
2023-02-22 13:59         ` Mike Pattrick
2023-02-22 21:43         ` [PATCH] vhost: fix madvise arguments alignment Mike Pattrick
2023-02-23  4:35           ` [PATCH v2] " Mike Pattrick
2023-02-23 16:12             ` Maxime Coquelin [this message]
2023-02-23 16:57               ` Mike Pattrick
2023-02-24 15:05                 ` Patrick Robb
2023-03-01 20:02             ` [PATCH v3] " Mike Pattrick
2023-03-02 10:19               ` Maxime Coquelin
2023-03-06 14:22               ` Maxime Coquelin
2023-02-03 17:45 ` [PATCH v2] vhost: exclude VM hugepages from coredumps Stephen Hemminger
2023-02-06  1:13   ` Mike Pattrick

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=e347d80e-9a8f-b2da-9d54-d87bf98d9cc5@redhat.com \
    --to=maxime.coquelin@redhat.com \
    --cc=chenbo.xia@intel.com \
    --cc=david.marchand@redhat.com \
    --cc=dev@dpdk.org \
    --cc=mkp@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).