DPDK patches and discussions
 help / color / mirror / Atom feed
From: "Burakov, Anatoly" <anatoly.burakov@intel.com>
To: Tiwei Bie <tiwei.bie@intel.com>,
	maxime.coquelin@redhat.com, zhihong.wang@intel.com, dev@dpdk.org
Subject: Re: [dpdk-dev] [RFC] net/virtio-user: avoid parsing process mappings
Date: Tue, 28 Aug 2018 09:12:38 +0100	[thread overview]
Message-ID: <986b3012-552b-6673-d716-9acacb5e9528@intel.com> (raw)
In-Reply-To: <20180828075327.18842-1-tiwei.bie@intel.com>

On 28-Aug-18 8:53 AM, Tiwei Bie wrote:
> Recently some memory APIs were introduced to allow users
> to get the file descriptors for each memory segment. We
> can leverage those APIs to get rid of the /proc magic on
> memory table preparation in vhost-user backend.
> 
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> ---
> This patch depends on below patch set:
> https://patches.dpdk.org/project/dpdk/list/?series=1040

Great to see this done so quickly!

> 
>   drivers/net/virtio/virtio_user/vhost_user.c   | 214 ++++++++----------
>   .../net/virtio/virtio_user/virtio_user_dev.c  |  19 ++
>   2 files changed, 112 insertions(+), 121 deletions(-)
> 
> diff --git a/drivers/net/virtio/virtio_user/vhost_user.c b/drivers/net/virtio/virtio_user/vhost_user.c
> index ef6e43df8..770cfdc9a 100644
> --- a/drivers/net/virtio/virtio_user/vhost_user.c
> +++ b/drivers/net/virtio/virtio_user/vhost_user.c
> @@ -11,6 +11,9 @@
>   #include <string.h>
>   #include <errno.h>
>   
> +#include <rte_fbarray.h>
> +#include <rte_eal_memconfig.h>
> +
>   #include "vhost.h"
>   #include "virtio_user_dev.h"
>   
> @@ -121,133 +124,106 @@ vhost_user_read(int fd, struct vhost_user_msg *msg)
>   	return -1;
>   }
>   
> -struct hugepage_file_info {
> -	uint64_t addr;            /**< virtual addr */
> -	size_t   size;            /**< the file size */
> -	char     path[PATH_MAX];  /**< path to backing file */
> +struct walk_arg {
> +	struct vhost_memory *vm;
> +	int *fds;
> +	int region_nr;
>   };
>   
> -/* Two possible options:
> - * 1. Match HUGEPAGE_INFO_FMT to find the file storing struct hugepage_file
> - * array. This is simple but cannot be used in secondary process because
> - * secondary process will close and munmap that file.
> - * 2. Match HUGEFILE_FMT to find hugepage files directly.
> - *
> - * We choose option 2.
> - */
>   static int
> -get_hugepage_file_info(struct hugepage_file_info huges[], int max)
> +update_memory_region(const struct rte_memseg_list *msl __rte_unused,
> +		const struct rte_memseg *ms, void *arg)
>   {
> -	int idx, k, exist;
> -	FILE *f;
> -	char buf[BUFSIZ], *tmp, *tail;
> -	char *str_underline, *str_start;
> -	int huge_index;
> -	uint64_t v_start, v_end;
> -	struct stat stats;
> -
> -	f = fopen("/proc/self/maps", "r");
> -	if (!f) {
> -		PMD_DRV_LOG(ERR, "cannot open /proc/self/maps");
> -		return -1;
> -	}
> -
> -	idx = 0;
> -	while (fgets(buf, sizeof(buf), f) != NULL) {
> -		if (sscanf(buf, "%" PRIx64 "-%" PRIx64, &v_start, &v_end) < 2) {
> -			PMD_DRV_LOG(ERR, "Failed to parse address");
> -			goto error;
> -		}
> -
> -		tmp = strchr(buf, ' ') + 1; /** skip address */
> -		tmp = strchr(tmp, ' ') + 1; /** skip perm */
> -		tmp = strchr(tmp, ' ') + 1; /** skip offset */
> -		tmp = strchr(tmp, ' ') + 1; /** skip dev */
> -		tmp = strchr(tmp, ' ') + 1; /** skip inode */
> -		while (*tmp == ' ')         /** skip spaces */
> -			tmp++;
> -		tail = strrchr(tmp, '\n');  /** remove newline if exists */
> -		if (tail)
> -			*tail = '\0';
> -
> -		/* Match HUGEFILE_FMT, aka "%s/%smap_%d",
> -		 * which is defined in eal_filesystem.h
> -		 */
> -		str_underline = strrchr(tmp, '_');
> -		if (!str_underline)
> -			continue;
> -
> -		str_start = str_underline - strlen("map");
> -		if (str_start < tmp)
> -			continue;
> -
> -		if (sscanf(str_start, "map_%d", &huge_index) != 1)
> -			continue;
> -
> -		/* skip duplicated file which is mapped to different regions */
> -		for (k = 0, exist = -1; k < idx; ++k) {
> -			if (!strcmp(huges[k].path, tmp)) {
> -				exist = k;
> -				break;
> -			}
> -		}
> -		if (exist >= 0)
> -			continue;
> -
> -		if (idx >= max) {
> -			PMD_DRV_LOG(ERR, "Exceed maximum of %d", max);
> -			goto error;
> -		}
> -
> -		huges[idx].addr = v_start;
> -		huges[idx].size = v_end - v_start; /* To be corrected later */
> -		snprintf(huges[idx].path, PATH_MAX, "%s", tmp);
> -		idx++;
> -	}
> -
> -	/* correct the size for files who have many regions */
> -	for (k = 0; k < idx; ++k) {
> -		if (stat(huges[k].path, &stats) < 0) {
> -			PMD_DRV_LOG(ERR, "Failed to stat %s, %s\n",
> -				    huges[k].path, strerror(errno));
> -			continue;
> -		}
> -		huges[k].size = stats.st_size;
> -		PMD_DRV_LOG(INFO, "file %s, size %zx\n",
> -			    huges[k].path, huges[k].size);
> -	}
> -
> -	fclose(f);
> -	return idx;
> -
> -error:
> -	fclose(f);
> -	return -1;
> -}
> -
> -static int
> -prepare_vhost_memory_user(struct vhost_user_msg *msg, int fds[])
> -{
> -	int i, num;
> -	struct hugepage_file_info huges[VHOST_MEMORY_MAX_NREGIONS];
> +	struct walk_arg *wa = arg;
>   	struct vhost_memory_region *mr;
> +	uint64_t start_addr, end_addr, offset;
> +	int i, fd;
>   
> -	num = get_hugepage_file_info(huges, VHOST_MEMORY_MAX_NREGIONS);
> -	if (num < 0) {
> -		PMD_INIT_LOG(ERR, "Failed to prepare memory for vhost-user");
> +	fd = rte_memseg_get_fd_thread_unsafe(ms);
> +	if (fd < 0) {
> +		PMD_DRV_LOG(ERR, "Failed to get fd, ms=%p", ms);
>   		return -1;
>   	}
>   
> -	for (i = 0; i < num; ++i) {
> -		mr = &msg->payload.memory.regions[i];
> -		mr->guest_phys_addr = huges[i].addr; /* use vaddr! */
> -		mr->userspace_addr = huges[i].addr;
> -		mr->memory_size = huges[i].size;
> -		mr->mmap_offset = 0;
> -		fds[i] = open(huges[i].path, O_RDWR);
> +	start_addr = (uint64_t)(uintptr_t)ms->addr;
> +	end_addr = start_addr + ms->len;
> +
> +	/*
> +	 * XXX workaround!
> +	 *
> +	 * For --single-file-segments case, offset should be:
> +	 * offset = rte_fbarray_find_idx(&msl->memseg_arr, ms) * msl->page_sz;
> +	 *
> +	 * But it's not true for non-single-file-segments case.
> +	 *
> +	 * This is a temporary workaround which assumes the file will
> +	 * also be mapped from the beginning in --single-file-segments
> +	 * case. It should be replaced when we have a memory API to
> +	 * get the offset.

Yes, this is an unfortunate consequence of having split personalities in 
the memory subsystem. A good solution would be to just deprecate 
non-single-file segments mode and always use it, but we cannot do that 
because we support kernel versions that do not support fallocate() on 
hugetlbfs (and it still leaves legacy mem mode, which effectively 
behaves like non-single-file segments as far as virtio is concerned).

How about we add this API in v2 of the seg fd patchset?

> +	 */
> +	offset = 0;
> +
> +	for (i = 0; i < wa->region_nr; i++) {
> +		if (wa->fds[i] != fd)
> +			continue;
> +
> +		mr = &wa->vm->regions[i];
> +
> +		if (mr->userspace_addr + mr->memory_size < end_addr)
> +			mr->memory_size = end_addr - mr->userspace_addr;
> +
> +		if (mr->userspace_addr > start_addr) {
> +			mr->userspace_addr = start_addr;
> +			mr->guest_phys_addr = start_addr;
> +		}
> +
> +		if (mr->mmap_offset > offset)
> +			mr->mmap_offset = offset;
> +
> +		PMD_DRV_LOG(DEBUG, "index=%d fd=%d offset=0x%" PRIx64
> +			" addr=0x%" PRIx64 " len=%" PRIu64, i, fd,
> +			mr->mmap_offset, mr->userspace_addr,
> +			mr->memory_size);
> +
> +		return 0;
> +	}
> +
> +	if (i >= VHOST_MEMORY_MAX_NREGIONS) {
> +		PMD_DRV_LOG(ERR, "Too many memory regions");
> +		return -1;
>   	}
>   
> -	msg->payload.memory.nregions = num;
> +	mr = &wa->vm->regions[i];
> +	wa->fds[i] = fd;
> +
> +	mr->guest_phys_addr = start_addr;
> +	mr->userspace_addr = start_addr;
> +	mr->memory_size = ms->len;
> +	mr->mmap_offset = offset;
> +
> +	PMD_DRV_LOG(DEBUG, "index=%d fd=%d offset=0x%" PRIx64
> +		" addr=0x%" PRIx64 " len=%" PRIu64, i, fd,
> +		mr->mmap_offset, mr->userspace_addr,
> +		mr->memory_size);
> +
> +	wa->region_nr++;
> +
> +	return 0;
> +}
> +
> +static int
> +prepare_vhost_memory_user(struct vhost_user_msg *msg, int fds[])
> +{
> +	struct walk_arg wa;
> +
> +	wa.region_nr = 0;
> +	wa.vm = &msg->payload.memory;
> +	wa.fds = fds;
> +
> +	if (rte_memseg_walk_thread_unsafe(update_memory_region, &wa) < 0)
> +		return -1;
> +
> +	msg->payload.memory.nregions = wa.region_nr;
>   	msg->payload.memory.padding = 0;
>   
>   	return 0;
> @@ -280,7 +256,7 @@ vhost_user_sock(struct virtio_user_dev *dev,
>   	int need_reply = 0;
>   	int fds[VHOST_MEMORY_MAX_NREGIONS];
>   	int fd_num = 0;
> -	int i, len;
> +	int len;
>   	int vhostfd = dev->vhostfd;
>   
>   	RTE_SET_USED(m);
> @@ -364,10 +340,6 @@ vhost_user_sock(struct virtio_user_dev *dev,
>   		return -1;
>   	}
>   
> -	if (req == VHOST_USER_SET_MEM_TABLE)
> -		for (i = 0; i < fd_num; ++i)
> -			close(fds[i]);
> -
>   	if (need_reply) {
>   		if (vhost_user_read(vhostfd, &msg) < 0) {
>   			PMD_DRV_LOG(ERR, "Received msg failed: %s",
> diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c
> index 7df600b02..869e96f87 100644
> --- a/drivers/net/virtio/virtio_user/virtio_user_dev.c
> +++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c
> @@ -13,6 +13,8 @@
>   #include <sys/types.h>
>   #include <sys/stat.h>
>   
> +#include <rte_eal_memconfig.h>
> +
>   #include "vhost.h"
>   #include "virtio_user_dev.h"
>   #include "../virtio_ethdev.h"
> @@ -109,9 +111,24 @@ is_vhost_user_by_type(const char *path)
>   int
>   virtio_user_start_device(struct virtio_user_dev *dev)
>   {
> +	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
>   	uint64_t features;
>   	int ret;
>   
> +	/*
> +	 * XXX workaround!
> +	 *
> +	 * We need to make sure that the locks will be
> +	 * taken in the correct order to avoid deadlocks.
> +	 *
> +	 * Before releasing this lock, this thread should
> +	 * not trigger any memory hotplug events.
> +	 *
> +	 * This is a temporary workaround, and should be
> +	 * replaced when we get proper supports from the
> +	 * memory subsystem in the future.
> +	 */
> +	rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);

This feels like it should be a separate patch, because it fixes an 
issue that exists independently of whether we have the segment fd API or not.

>   	pthread_mutex_lock(&dev->mutex);
>   
>   	if (is_vhost_user_by_type(dev->path) && dev->vhostfd < 0)
> @@ -152,10 +169,12 @@ virtio_user_start_device(struct virtio_user_dev *dev)
>   
>   	dev->started = true;
>   	pthread_mutex_unlock(&dev->mutex);
> +	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
>   
>   	return 0;
>   error:
>   	pthread_mutex_unlock(&dev->mutex);
> +	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
>   	/* TODO: free resource here or caller to check */
>   	return -1;
>   }
> 


-- 
Thanks,
Anatoly

  reply	other threads:[~2018-08-28  8:12 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-08-28  7:53 Tiwei Bie
2018-08-28  8:12 ` Burakov, Anatoly [this message]
2018-08-28  8:42   ` Tiwei Bie
2018-08-28 12:56     ` Burakov, Anatoly

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=986b3012-552b-6673-d716-9acacb5e9528@intel.com \
    --to=anatoly.burakov@intel.com \
    --cc=dev@dpdk.org \
    --cc=maxime.coquelin@redhat.com \
    --cc=tiwei.bie@intel.com \
    --cc=zhihong.wang@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).