From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mga14.intel.com (mga14.intel.com [192.55.52.115]) by dpdk.org (Postfix) with ESMTP id E1572DED for ; Tue, 28 Aug 2018 10:12:41 +0200 (CEST) X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga006.jf.intel.com ([10.7.209.51]) by fmsmga103.fm.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 28 Aug 2018 01:12:41 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.53,298,1531810800"; d="scan'208";a="69672615" Received: from soeby-mobl.ger.corp.intel.com (HELO [10.252.19.107]) ([10.252.19.107]) by orsmga006.jf.intel.com with ESMTP; 28 Aug 2018 01:12:39 -0700 To: Tiwei Bie , maxime.coquelin@redhat.com, zhihong.wang@intel.com, dev@dpdk.org References: <20180828075327.18842-1-tiwei.bie@intel.com> From: "Burakov, Anatoly" Message-ID: <986b3012-552b-6673-d716-9acacb5e9528@intel.com> Date: Tue, 28 Aug 2018 09:12:38 +0100 User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Thunderbird/52.9.1 MIME-Version: 1.0 In-Reply-To: <20180828075327.18842-1-tiwei.bie@intel.com> Content-Type: text/plain; charset=utf-8; format=flowed Content-Language: en-US Content-Transfer-Encoding: 7bit Subject: Re: [dpdk-dev] [RFC] net/virtio-user: avoid parsing process mappings X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 28 Aug 2018 08:12:42 -0000 On 28-Aug-18 8:53 AM, Tiwei Bie wrote: > Recently some memory APIs were introduced to allow users > to get the file descriptors for each memory segments. We > can leverage those APIs to get rid of the /proc magic on > memory table preparation in vhost-user backend. > > Signed-off-by: Tiwei Bie > --- > This patch depends on below patch set: > https://patches.dpdk.org/project/dpdk/list/?series=1040 Great to see this done so quickly! > > drivers/net/virtio/virtio_user/vhost_user.c | 214 ++++++++---------- > .../net/virtio/virtio_user/virtio_user_dev.c | 19 ++ > 2 files changed, 112 insertions(+), 121 deletions(-) > > diff --git a/drivers/net/virtio/virtio_user/vhost_user.c b/drivers/net/virtio/virtio_user/vhost_user.c > index ef6e43df8..770cfdc9a 100644 > --- a/drivers/net/virtio/virtio_user/vhost_user.c > +++ b/drivers/net/virtio/virtio_user/vhost_user.c > @@ -11,6 +11,9 @@ > #include > #include > > +#include > +#include > + > #include "vhost.h" > #include "virtio_user_dev.h" > > @@ -121,133 +124,106 @@ vhost_user_read(int fd, struct vhost_user_msg *msg) > return -1; > } > > -struct hugepage_file_info { > - uint64_t addr; /**< virtual addr */ > - size_t size; /**< the file size */ > - char path[PATH_MAX]; /**< path to backing file */ > +struct walk_arg { > + struct vhost_memory *vm; > + int *fds; > + int region_nr; > }; > > -/* Two possible options: > - * 1. Match HUGEPAGE_INFO_FMT to find the file storing struct hugepage_file > - * array. This is simple but cannot be used in secondary process because > - * secondary process will close and munmap that file. > - * 2. Match HUGEFILE_FMT to find hugepage files directly. > - * > - * We choose option 2. > - */ > static int > -get_hugepage_file_info(struct hugepage_file_info huges[], int max) > +update_memory_region(const struct rte_memseg_list *msl __rte_unused, > + const struct rte_memseg *ms, void *arg) > { > - int idx, k, exist; > - FILE *f; > - char buf[BUFSIZ], *tmp, *tail; > - char *str_underline, *str_start; > - int huge_index; > - uint64_t v_start, v_end; > - struct stat stats; > - > - f = fopen("/proc/self/maps", "r"); > - if (!f) { > - PMD_DRV_LOG(ERR, "cannot open /proc/self/maps"); > - return -1; > - } > - > - idx = 0; > - while (fgets(buf, sizeof(buf), f) != NULL) { > - if (sscanf(buf, "%" PRIx64 "-%" PRIx64, &v_start, &v_end) < 2) { > - PMD_DRV_LOG(ERR, "Failed to parse address"); > - goto error; > - } > - > - tmp = strchr(buf, ' ') + 1; /** skip address */ > - tmp = strchr(tmp, ' ') + 1; /** skip perm */ > - tmp = strchr(tmp, ' ') + 1; /** skip offset */ > - tmp = strchr(tmp, ' ') + 1; /** skip dev */ > - tmp = strchr(tmp, ' ') + 1; /** skip inode */ > - while (*tmp == ' ') /** skip spaces */ > - tmp++; > - tail = strrchr(tmp, '\n'); /** remove newline if exists */ > - if (tail) > - *tail = '\0'; > - > - /* Match HUGEFILE_FMT, aka "%s/%smap_%d", > - * which is defined in eal_filesystem.h > - */ > - str_underline = strrchr(tmp, '_'); > - if (!str_underline) > - continue; > - > - str_start = str_underline - strlen("map"); > - if (str_start < tmp) > - continue; > - > - if (sscanf(str_start, "map_%d", &huge_index) != 1) > - continue; > - > - /* skip duplicated file which is mapped to different regions */ > - for (k = 0, exist = -1; k < idx; ++k) { > - if (!strcmp(huges[k].path, tmp)) { > - exist = k; > - break; > - } > - } > - if (exist >= 0) > - continue; > - > - if (idx >= max) { > - PMD_DRV_LOG(ERR, "Exceed maximum of %d", max); > - goto error; > - } > - > - huges[idx].addr = v_start; > - huges[idx].size = v_end - v_start; /* To be corrected later */ > - snprintf(huges[idx].path, PATH_MAX, "%s", tmp); > - idx++; > - } > - > - /* correct the size for files who have many regions */ > - for (k = 0; k < idx; ++k) { > - if (stat(huges[k].path, &stats) < 0) { > - PMD_DRV_LOG(ERR, "Failed to stat %s, %s\n", > - huges[k].path, strerror(errno)); > - continue; > - } > - huges[k].size = stats.st_size; > - PMD_DRV_LOG(INFO, "file %s, size %zx\n", > - huges[k].path, huges[k].size); > - } > - > - fclose(f); > - return idx; > - > -error: > - fclose(f); > - return -1; > -} > - > -static int > -prepare_vhost_memory_user(struct vhost_user_msg *msg, int fds[]) > -{ > - int i, num; > - struct hugepage_file_info huges[VHOST_MEMORY_MAX_NREGIONS]; > + struct walk_arg *wa = arg; > struct vhost_memory_region *mr; > + uint64_t start_addr, end_addr, offset; > + int i, fd; > > - num = get_hugepage_file_info(huges, VHOST_MEMORY_MAX_NREGIONS); > - if (num < 0) { > - PMD_INIT_LOG(ERR, "Failed to prepare memory for vhost-user"); > + fd = rte_memseg_get_fd_thread_unsafe(ms); > + if (fd < 0) { > + PMD_DRV_LOG(ERR, "Failed to get fd, ms=%p", ms); > return -1; > } > > - for (i = 0; i < num; ++i) { > - mr = &msg->payload.memory.regions[i]; > - mr->guest_phys_addr = huges[i].addr; /* use vaddr! */ > - mr->userspace_addr = huges[i].addr; > - mr->memory_size = huges[i].size; > - mr->mmap_offset = 0; > - fds[i] = open(huges[i].path, O_RDWR); > + start_addr = (uint64_t)(uintptr_t)ms->addr; > + end_addr = start_addr + ms->len; > + > + /* > + * XXX workaround! > + * > + * For --single-file-segments case, offset should be: > + * offset = rte_fbarray_find_idx(&msl->memseg_arr, ms) * msl->page_sz; > + * > + * But it's not true for non-single-file-segments case. > + * > + * This is a temporary workaround which assumes the file will > + * also be mapped from the beginning in --single-file-segments > + * case. It should be replaced when we have a memory API to > + * get the offset. Yes, this is an unfortunate consequence of having split personalities in memory subsystem. A good solution would be to just deprecate non-single-file segments mode and always use it, but we cannot do that because we support kernel versions that do not support fallocate() on hugetlbfs (and it still leaves legacy mem mode, which effectively behaves like non-single-file segments as far as virtio is concerned). How about we add this API in v2 for seg fd patchset? > + */ > + offset = 0; > + > + for (i = 0; i < wa->region_nr; i++) { > + if (wa->fds[i] != fd) > + continue; > + > + mr = &wa->vm->regions[i]; > + > + if (mr->userspace_addr + mr->memory_size < end_addr) > + mr->memory_size = end_addr - mr->userspace_addr; > + > + if (mr->userspace_addr > start_addr) { > + mr->userspace_addr = start_addr; > + mr->guest_phys_addr = start_addr; > + } > + > + if (mr->mmap_offset > offset) > + mr->mmap_offset = offset; > + > + PMD_DRV_LOG(DEBUG, "index=%d fd=%d offset=0x%" PRIx64 > + " addr=0x%" PRIx64 " len=%" PRIu64, i, fd, > + mr->mmap_offset, mr->userspace_addr, > + mr->memory_size); > + > + return 0; > + } > + > + if (i >= VHOST_MEMORY_MAX_NREGIONS) { > + PMD_DRV_LOG(ERR, "Too many memory regions"); > + return -1; > } > > - msg->payload.memory.nregions = num; > + mr = &wa->vm->regions[i]; > + wa->fds[i] = fd; > + > + mr->guest_phys_addr = start_addr; > + mr->userspace_addr = start_addr; > + mr->memory_size = ms->len; > + mr->mmap_offset = offset; > + > + PMD_DRV_LOG(DEBUG, "index=%d fd=%d offset=0x%" PRIx64 > + " addr=0x%" PRIx64 " len=%" PRIu64, i, fd, > + mr->mmap_offset, mr->userspace_addr, > + mr->memory_size); > + > + wa->region_nr++; > + > + return 0; > +} > + > +static int > +prepare_vhost_memory_user(struct vhost_user_msg *msg, int fds[]) > +{ > + struct walk_arg wa; > + > + wa.region_nr = 0; > + wa.vm = &msg->payload.memory; > + wa.fds = fds; > + > + if (rte_memseg_walk_thread_unsafe(update_memory_region, &wa) < 0) > + return -1; > + > + msg->payload.memory.nregions = wa.region_nr; > msg->payload.memory.padding = 0; > > return 0; > @@ -280,7 +256,7 @@ vhost_user_sock(struct virtio_user_dev *dev, > int need_reply = 0; > int fds[VHOST_MEMORY_MAX_NREGIONS]; > int fd_num = 0; > - int i, len; > + int len; > int vhostfd = dev->vhostfd; > > RTE_SET_USED(m); > @@ -364,10 +340,6 @@ vhost_user_sock(struct virtio_user_dev *dev, > return -1; > } > > - if (req == VHOST_USER_SET_MEM_TABLE) > - for (i = 0; i < fd_num; ++i) > - close(fds[i]); > - > if (need_reply) { > if (vhost_user_read(vhostfd, &msg) < 0) { > PMD_DRV_LOG(ERR, "Received msg failed: %s", > diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c > index 7df600b02..869e96f87 100644 > --- a/drivers/net/virtio/virtio_user/virtio_user_dev.c > +++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c > @@ -13,6 +13,8 @@ > #include > #include > > +#include > + > #include "vhost.h" > #include "virtio_user_dev.h" > #include "../virtio_ethdev.h" > @@ -109,9 +111,24 @@ is_vhost_user_by_type(const char *path) > int > virtio_user_start_device(struct virtio_user_dev *dev) > { > + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; > uint64_t features; > int ret; > > + /* > + * XXX workaround! > + * > + * We need to make sure that the locks will be > + * taken in the correct order to avoid deadlocks. > + * > + * Before releasing this lock, this thread should > + * not trigger any memory hotplug events. > + * > + * This is a temporary workaround, and should be > + * replaced when we get proper supports from the > + * memory subsystem in the future. > + */ > + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); This feels like it should be a separate patch, because it fixes the issue that exists independently of whether we have segment fd API or not. > pthread_mutex_lock(&dev->mutex); > > if (is_vhost_user_by_type(dev->path) && dev->vhostfd < 0) > @@ -152,10 +169,12 @@ virtio_user_start_device(struct virtio_user_dev *dev) > > dev->started = true; > pthread_mutex_unlock(&dev->mutex); > + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); > > return 0; > error: > pthread_mutex_unlock(&dev->mutex); > + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); > /* TODO: free resource here or caller to check */ > return -1; > } > -- Thanks, Anatoly