From: Anatoly Burakov <anatoly.burakov@intel.com>
To: dev@dpdk.org
Cc: ray.kinsella@intel.com, kuralamudhan.ramakrishnan@intel.com,
	louise.m.daly@intel.com, bruce.richardson@intel.com,
	ferruh.yigit@intel.com, konstantin.ananyev@intel.com
Date: Thu, 31 May 2018 15:32:33 +0100
Message-Id: <8b65d1c810721ef2ffe8019ec9504eb2112bb91e.1527776837.git.anatoly.burakov@intel.com>
X-Mailer: git-send-email 1.7.0.7
Subject: [dpdk-dev] [RFC 10/10] mem: enable memfd-based hugepage allocation

This changes no-shared-files mode to use memfd-based hugepage
allocation instead of hugetlbfs mounts. Since memfd hugepages are only
supported on kernels 4.14+ with glibc 2.27+, a compile-time check is
performed, along with runtime checks for each hugepage size.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
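Note: the runtime check added below boils down to creating a throwaway
memfd with MFD_HUGETLB plus a page-size flag, and treating only EINVAL
as "unsupported". For context, here is a minimal standalone sketch of
that probe. It is not part of the patch: the probe_memfd_hugepage()
helper name is made up for illustration, the header choices are
assumptions, and it presumes a kernel 4.14+ / glibc 2.27+ build
environment.

/* standalone sketch (not part of the patch): probe whether the running
 * kernel accepts hugetlb memfds of a given page size
 */
#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/memfd.h> /* MFD_HUGETLB and MFD_HUGE_* flags */
#include <sys/mman.h>    /* memfd_create(), glibc 2.27+ */

static int
probe_memfd_hugepage(unsigned int sz_flag)
{
	int fd = memfd_create("memfd_probe", MFD_HUGETLB | sz_flag);

	if (fd >= 0) {
		/* kernel accepted the flags; discard the probe fd */
		close(fd);
		return 1;
	}
	/* EINVAL means the flags were rejected; any other errno
	 * (e.g. ENOMEM) still means the flags are understood
	 */
	return errno != EINVAL;
}

int
main(void)
{
	printf("2M memfd hugepages: %s\n",
		probe_memfd_hugepage(MFD_HUGE_2MB) ? "yes" : "no");
	printf("1G memfd hugepages: %s\n",
		probe_memfd_hugepage(MFD_HUGE_1GB) ? "yes" : "no");
	return 0;
}

On a kernel older than 4.14 both probes print "no", which is why the
patch keeps the runtime check even when MEMFD_SUPPORTED is set at
compile time.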
 .../linuxapp/eal/eal_hugepage_info.c          | 136 ++++++++++++++----
 lib/librte_eal/linuxapp/eal/eal_memalloc.c    | 105 +++++++++++++-
 lib/librte_eal/linuxapp/eal/eal_memfd.h       |  28 ++++
 lib/librte_eal/linuxapp/eal/eal_memory.c      |   4 +-
 4 files changed, 234 insertions(+), 39 deletions(-)
 create mode 100644 lib/librte_eal/linuxapp/eal/eal_memfd.h

diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 02b1c4ff1..1a80ee0ee 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -30,6 +30,7 @@
 #include "eal_internal_cfg.h"
 #include "eal_hugepages.h"
 #include "eal_filesystem.h"
+#include "eal_memfd.h"
 
 static const char sys_dir_path[] = "/sys/kernel/mm/hugepages";
 static const char sys_pages_numa_dir_path[] = "/sys/devices/system/node";
@@ -313,11 +314,85 @@ compare_hpi(const void *a, const void *b)
 	return hpi_b->hugepage_sz - hpi_a->hugepage_sz;
 }
 
+static void
+calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent)
+{
+	uint64_t total_pages = 0;
+	unsigned int i;
+
+	/*
+	 * first, try to put all hugepages into relevant sockets, but
+	 * if first attempts fails, fall back to collecting all pages
+	 * in one socket and sorting them later
+	 */
+	total_pages = 0;
+	/* we also don't want to do this for legacy init */
+	if (!internal_config.legacy_mem)
+		for (i = 0; i < rte_socket_count(); i++) {
+			int socket = rte_socket_id_by_idx(i);
+			unsigned int num_pages =
+					get_num_hugepages_on_node(
+						dirent->d_name, socket);
+			hpi->num_pages[socket] = num_pages;
+			total_pages += num_pages;
+		}
+	/*
+	 * we failed to sort memory from the get go, so fall
+	 * back to old way
+	 */
+	if (total_pages == 0) {
+		hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
+
+#ifndef RTE_ARCH_64
+		/* for 32-bit systems, limit number of hugepages to
+		 * 1GB per page size */
+		hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],
+				RTE_PGSIZE_1G / hpi->hugepage_sz);
+#endif
+	}
+}
+
+static int
+check_memfd_pagesize_supported(uint64_t page_sz)
+{
+#ifdef MEMFD_SUPPORTED
+	int sz_flag, fd;
+
+	/* first, check if this particular pagesize is supported */
+	sz_flag = eal_memalloc_get_memfd_pagesize_flag(page_sz);
+	if (sz_flag == 0) {
+		RTE_LOG(ERR, EAL, "Unexpected memfd hugepage size: %"
+			PRIu64 " bytes\n", page_sz);
+		return 0;
+	}
+
+	/* does currently running kernel support it? */
+	fd = memfd_create("memfd_test", sz_flag | MFD_HUGETLB);
+	if (fd >= 0) {
+		/* success */
+		close(fd);
+		return 1;
+	}
+	/* creating memfd failed, but if the error wasn't EINVAL, reserving
+	 * hugepages via memfd is supported by the kernel
+	 */
+	if (errno != EINVAL) {
+		return 1;
+	}
+	RTE_LOG(DEBUG, EAL, "Kernel does not support memfd hugepages of size %"
+		PRIu64 " bytes\n", page_sz);
+#else
+	RTE_LOG(DEBUG, EAL, "Memfd hugepage support not enabled at compile time\n");
+	RTE_SET_USED(page_sz);
+#endif
+	return 0;
+}
+
 static int
 hugepage_info_init(void)
 {
 	const char dirent_start_text[] = "hugepages-";
 	const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
-	unsigned int i, total_pages, num_sizes = 0;
+	unsigned int i, num_sizes = 0;
 	DIR *dir;
 	struct dirent *dirent;
@@ -343,6 +418,10 @@ hugepage_info_init(void)
 		hpi->hugepage_sz =
 			rte_str_to_size(&dirent->d_name[dirent_start_len]);
 
+		/* by default, memfd_hugepage_supported is 1 */
+		memfd_hugepage_supported &=
+			check_memfd_pagesize_supported(hpi->hugepage_sz);
+
 		/* first, check if we have a mountpoint */
 		if (get_hugepage_dir(hpi->hugepage_sz,
 			hpi->hugedir, sizeof(hpi->hugedir)) < 0) {
@@ -355,6 +434,23 @@
 				"%" PRIu64 " reserved, but no mounted "
 				"hugetlbfs found for that size\n",
 				num_pages, hpi->hugepage_sz);
+
+			/* no shared files mode may still be able to allocate
+			 * without a valid mountpoint via memfd, but we cannot
+			 * use memfd in legacy mode, because we cannot sort
+			 * pages, so only allow empty mountpoints in non-legacy
+			 * mode.
+			 */
+			if (internal_config.no_shared_files &&
+					!internal_config.legacy_mem &&
+					memfd_hugepage_supported) {
+				RTE_LOG(NOTICE, EAL, "No shared files mode enabled, "
+					"hugepages of size %" PRIu64 " bytes "
+					"will be allocated anonymously\n",
+					hpi->hugepage_sz);
+				calc_num_pages(hpi, dirent);
+				num_sizes++;
+			}
 			continue;
 		}
 
@@ -371,35 +467,14 @@
 		if (clear_hugedir(hpi->hugedir) == -1)
 			break;
 
-		/*
-		 * first, try to put all hugepages into relevant sockets, but
-		 * if first attempts fails, fall back to collecting all pages
-		 * in one socket and sorting them later
-		 */
-		total_pages = 0;
-		/* we also don't want to do this for legacy init */
-		if (!internal_config.legacy_mem)
-			for (i = 0; i < rte_socket_count(); i++) {
-				int socket = rte_socket_id_by_idx(i);
-				unsigned int num_pages =
-						get_num_hugepages_on_node(
-							dirent->d_name, socket);
-				hpi->num_pages[socket] = num_pages;
-				total_pages += num_pages;
-			}
-		/*
-		 * we failed to sort memory from the get go, so fall
-		 * back to old way
-		 */
-		if (total_pages == 0)
-			hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
+		calc_num_pages(hpi, dirent);
 
-#ifndef RTE_ARCH_64
-		/* for 32-bit systems, limit number of hugepages to
-		 * 1GB per page size */
-		hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],
-				RTE_PGSIZE_1G / hpi->hugepage_sz);
-#endif
+		if (internal_config.no_shared_files &&
+				!internal_config.legacy_mem &&
+				memfd_hugepage_supported)
+			RTE_LOG(NOTICE, EAL, "No shared files mode enabled, "
+				"hugepages of size %" PRIu64 " bytes will be "
+				"allocated anonymously\n", hpi->hugepage_sz);
 		num_sizes++;
 	}
 
@@ -423,8 +498,7 @@
 		for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
 			num_pages += hpi->num_pages[j];
 
-		if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0 &&
-				num_pages > 0)
+		if (num_pages > 0)
 			return 0;
 	}
 
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index f57d307dd..c4d57c349 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -39,6 +39,7 @@
 #include "eal_filesystem.h"
 #include "eal_internal_cfg.h"
 #include "eal_memalloc.h"
+#include "eal_memfd.h"
 
 /*
  * not all kernel version support fallocate on hugetlbfs, so fall back to
@@ -46,6 +47,11 @@
  */
 static int fallocate_supported = -1; /* unknown */
 
+/* not all kernel versions support memfd hugepages. assume supported unless
+ * shown otherwise.
+ */
+int memfd_hugepage_supported = 1;
+
 /* for single-file segments, we need some kind of mechanism to keep track of
 * which hugepages can be freed back to the system, and which cannot. we cannot
 * use flock() because they don't allow locking parts of a file, and we cannot
@@ -293,6 +299,49 @@ static int unlock_segment(int list_idx, int seg_idx)
 	return 0;
 }
 
+int
+eal_memalloc_get_memfd_pagesize_flag(uint64_t page_sz)
+{
+#ifdef MEMFD_SUPPORTED
+	switch (page_sz) {
+	case RTE_PGSIZE_1G:
+		return MFD_HUGE_1GB;
+	case RTE_PGSIZE_2M:
+		return MFD_HUGE_2MB;
+	default:
+		return 0;
+	}
+#endif
+	return 0;
+}
+
+static int
+get_memfd_seg_fd(unsigned int list_idx,
+		unsigned int seg_idx, int sz_flag)
+{
+#ifdef MEMFD_SUPPORTED
+	int flags = MFD_HUGETLB | sz_flag;
+	char name[64];
+	int fd;
+
+	snprintf(name, sizeof(name) - 1, "memseg-%d-%d", list_idx,
+		seg_idx);
+
+	fd = memfd_create(name, flags);
+	if (fd < 0) {
+		RTE_LOG(ERR, EAL, "Couldn't create memfd hugepage: %s\n",
+			strerror(errno));
+		return -1;
+	}
+	return fd;
+#else
+	RTE_SET_USED(list_idx);
+	RTE_SET_USED(seg_idx);
+	RTE_SET_USED(sz_flag);
+	return -1;
+#endif
+}
+
 static int
 get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
 		unsigned int list_idx, unsigned int seg_idx)
@@ -342,6 +391,27 @@
 	return fd;
 }
 
+static int
+get_seg_fd_no_shared(char *path, int buflen, struct hugepage_info *hi,
+		unsigned int list_idx, unsigned int seg_idx)
+{
+	int sz_flag;
+
+	/* if memfd hugepages are not supported, create regular files */
+	if (memfd_hugepage_supported == 0)
+		return get_seg_fd(path, buflen, hi, list_idx, seg_idx);
+
+	/* pick correct page size flags */
+	sz_flag = eal_memalloc_get_memfd_pagesize_flag(hi->hugepage_sz);
+	if (sz_flag == 0) {
+		RTE_LOG(ERR, EAL, "Unexpected page size: %"
+			PRIu64 "\n", hi->hugepage_sz);
+		return -1;
+	}
+
+	return get_memfd_seg_fd(list_idx, seg_idx, sz_flag);
+}
+
 static int
 resize_hugefile(int fd, char *path, int list_idx, int seg_idx,
 		uint64_t fa_offset, uint64_t page_sz, bool grow)
@@ -491,8 +561,16 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 	int fd;
 	size_t alloc_sz;
 
-	/* takes out a read lock on segment or segment list */
-	fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+	if (internal_config.no_shared_files) {
+		/* if allocating memfd hugepages is supported, do that,
+		 * otherwise fall back to regular allocation
+		 */
+		fd = get_seg_fd_no_shared(path, sizeof(path), hi, list_idx,
+				seg_idx);
+	} else {
+		/* takes out a read lock on segment or segment list */
+		fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+	}
 	if (fd < 0) {
 		RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
 		return -1;
@@ -512,7 +590,8 @@
 				__func__, strerror(errno));
 			goto resized;
 		}
-		if (internal_config.no_shared_files) {
+		if (internal_config.no_shared_files &&
+				memfd_hugepage_supported == 0) {
 			if (unlink(path)) {
 				RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
 					__func__, strerror(errno));
@@ -616,7 +695,7 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
 {
 	uint64_t map_offset;
 	char path[PATH_MAX];
-	int fd, ret;
+	int fd, ret = 0;
 
 	/* erase page data */
 	memset(ms->addr, 0, ms->len);
@@ -685,6 +764,7 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
 	size_t page_sz;
 	int cur_idx, start_idx, j, dir_fd = -1;
 	unsigned int msl_idx, need, i;
+	bool mountpoint_is_empty;
 
 	if (msl->page_sz != wa->page_sz)
 		return 0;
@@ -704,6 +784,12 @@
 		return 0;
 	start_idx = cur_idx;
 
+	/* if we're in no-shared-files mode and memfd is supported, we will
+	 * allow empty mountpoints because memfd doesn't require a mountpoint.
+	 */
+	mountpoint_is_empty =
+			strnlen(wa->hi->hugedir, sizeof(wa->hi->hugedir)) == 0;
+
 	/* do not allow any page allocations during the time we're allocating,
 	 * because file creation and locking operations are not atomic,
 	 * and we might be the first or the last ones to use a particular page,
@@ -712,7 +798,7 @@
 	 * during init, we already hold a write lock, so don't try to take out
 	 * another one.
 	 */
-	if (wa->hi->lock_descriptor == -1) {
+	if (wa->hi->lock_descriptor == -1 && !mountpoint_is_empty) {
 		dir_fd = open(wa->hi->hugedir, O_RDONLY);
 		if (dir_fd < 0) {
 			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
@@ -794,6 +880,7 @@ free_seg_walk(const struct rte_memseg_list *msl, void *arg)
 	struct free_walk_param *wa = arg;
 	uintptr_t start_addr, end_addr;
 	int msl_idx, seg_idx, ret, dir_fd = -1;
+	bool mountpoint_is_empty;
 
 	start_addr = (uintptr_t) msl->base_va;
 	end_addr = start_addr + msl->memseg_arr.len * (size_t)msl->page_sz;
@@ -802,6 +889,12 @@
 			(uintptr_t)wa->ms->addr >= end_addr)
 		return 0;
 
+	/* if we're in no shared files mode and memfd is supported, we will
+	 * allow empty mountpoints because memfd doesn't require a mountpoint.
+	 */
+	mountpoint_is_empty =
+			strnlen(wa->hi->hugedir, sizeof(wa->hi->hugedir)) == 0;
+
 	msl_idx = msl - mcfg->memsegs;
 	seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;
 
@@ -816,7 +909,7 @@
 	 * during init, we already hold a write lock, so don't try to take out
 	 * another one.
 	 */
-	if (wa->hi->lock_descriptor == -1) {
+	if (wa->hi->lock_descriptor == -1 && !mountpoint_is_empty) {
 		dir_fd = open(wa->hi->hugedir, O_RDONLY);
 		if (dir_fd < 0) {
 			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
diff --git a/lib/librte_eal/linuxapp/eal/eal_memfd.h b/lib/librte_eal/linuxapp/eal/eal_memfd.h
new file mode 100644
index 000000000..55e6dbb2c
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_memfd.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef EAL_MEMFD_H
+#define EAL_MEMFD_H
+
+#include <stdint.h>
+
+/*
+ * For memfd hugepages, both kernel and glibc version must support them. So,
+ * check for both.
+ */
+#include <features.h> /* glibc version */
+#if __GLIBC__ >= 2 && __GLIBC_MINOR__ >= 27
+#include <linux/version.h> /* linux kernel version */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0)
+#define MEMFD_SUPPORTED
+#include <linux/memfd.h>
+#endif /* linux version check */
+#endif /* glibc version check */
+
+int
+eal_memalloc_get_memfd_pagesize_flag(uint64_t page_sz);
+
+extern int memfd_hugepage_supported;
+
+#endif /* EAL_MEMFD_H */
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index d7b43b5c1..b26e21be8 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -44,6 +44,7 @@
 #include "eal_internal_cfg.h"
 #include "eal_filesystem.h"
 #include "eal_hugepages.h"
+#include "eal_memfd.h"
 
 #define PFN_MASK_SIZE	8
 
@@ -1060,8 +1061,7 @@ get_socket_mem_size(int socket)
 	for (i = 0; i < internal_config.num_hugepage_sizes; i++){
 		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
 
-		if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0)
-			size += hpi->hugepage_sz * hpi->num_pages[socket];
+		size += hpi->hugepage_sz * hpi->num_pages[socket];
 	}
 
 	return size;
-- 
2.17.0