From: Jianfeng Tan <jianfeng.tan@intel.com>
To: dev@dpdk.org
Date: Fri, 4 Mar 2016 18:58:12 +0800
Message-Id: <1457089092-4128-1-git-send-email-jianfeng.tan@intel.com>
In-Reply-To: <1453661393-85704-1-git-send-email-jianfeng.tan@intel.com>
References: <1453661393-85704-1-git-send-email-jianfeng.tan@intel.com>
Subject: [dpdk-dev] [PATCH] eal: make hugetlb initialization more robust

This patch adds an option, --huge-trybest, which enables a recovery
mechanism for the case where fewer hugepages can actually be used than
are declared in sysfs (e.g. because of a cgroup hugetlb limit). It
relies on a memory access to fault in hugepages; if the access fails
with SIGBUS, it recovers to the previously saved stack environment
with siglongjmp().

Test example:
 a. cgcreate -g hugetlb:/test-subgroup
 b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
 c. cgexec -g hugetlb:test-subgroup \
    ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-by: Neil Horman
---
 lib/librte_eal/common/eal_common_options.c |  4 ++
 lib/librte_eal/common/eal_internal_cfg.h   |  1 +
 lib/librte_eal/common/eal_options.h        |  2 +
 lib/librte_eal/linuxapp/eal/eal.c          |  1 +
 lib/librte_eal/linuxapp/eal/eal_memory.c   | 95 +++++++++++++++++++++++++++---
 5 files changed, 95 insertions(+), 8 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 29942ea..8ff6a2e 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -95,6 +95,7 @@ eal_long_options[] = {
 	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
 	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
 	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
+	{OPT_HUGE_TRYBEST,      0, NULL, OPT_HUGE_TRYBEST_NUM     },
 	{0,                     0, NULL, 0                        }
 };
 
@@ -896,6 +897,9 @@ eal_parse_common_option(int opt, const char *optarg,
 			return -1;
 		}
 		break;
+	case OPT_HUGE_TRYBEST_NUM:
+		internal_config.huge_trybest = 1;
+		break;
 
 	/* don't know what to do, leave this to caller */
 	default:
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 5f1367e..90a3533 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -64,6 +64,7 @@ struct internal_config {
 	volatile unsigned force_nchannel; /**< force number of channels */
 	volatile unsigned force_nrank;    /**< force number of ranks */
 	volatile unsigned no_hugetlbfs;   /**< true to disable hugetlbfs */
+	volatile unsigned huge_trybest;   /**< try best to allocate hugepages */
 	unsigned hugepage_unlink;         /**< true to unlink backing files */
 	volatile unsigned xen_dom0_support; /**< support app running on Xen Dom0*/
 	volatile unsigned no_pci;         /**< true to disable PCI */
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index a881c62..02397c5 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -83,6 +83,8 @@ enum {
 	OPT_VMWARE_TSC_MAP_NUM,
 #define OPT_XEN_DOM0          "xen-dom0"
 	OPT_XEN_DOM0_NUM,
+#define OPT_HUGE_TRYBEST      "huge-trybest"
+	OPT_HUGE_TRYBEST_NUM,
 	OPT_LONG_MAX_NUM
 };
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index ceac435..3e23877 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -343,6 +343,7 @@ eal_usage(const char *prgname)
 	       "  --"OPT_CREATE_UIO_DEV"    Create /dev/uioX (usually done by hotplug)\n"
 	       "  --"OPT_VFIO_INTR"         Interrupt mode for VFIO (legacy|msi|msix)\n"
 	       "  --"OPT_XEN_DOM0"          Support running on Xen dom0 without hugetlbfs\n"
+	       "  --"OPT_HUGE_TRYBEST"      Try best to accommodate hugepages\n"
 	       "\n");
 	/* Allow the application to print its usage message too if hook is set */
 	if ( rte_application_usage_hook ) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5b9132c..1766d7f 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,8 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
 
 #include <rte_log.h>
 #include <rte_memory.h>
@@ -309,6 +311,12 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
 	return addr;
 }
 
+static sigjmp_buf jmpenv;
+
+static void sigbus_handler(int signo __rte_unused)
+{
+	siglongjmp(jmpenv, 1);
+}
 /*
  * Mmap all hugepages of hugepage table: it first open a file in
  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
@@ -396,7 +404,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		if (fd < 0) {
 			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
 					strerror(errno));
-			return -1;
+			return i;
 		}
 
 		/* map the segment, and populate page tables,
@@ -407,7 +415,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
 					strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		if (orig) {
@@ -417,12 +425,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			hugepg_tbl[i].final_va = virtaddr;
 		}
 
+		if (orig && internal_config.huge_trybest) {
+			/* In linux, hugetlb limitations, like cgroup, are
+			 * enforced at fault time instead of mmap(), even
+			 * with the option of MAP_POPULATE. Kernel will send
+			 * a SIGBUS signal. To avoid to be killed, save stack
+			 * environment here, if SIGBUS happens, we can jump
+			 * back here.
+			 */
+			if (sigsetjmp(jmpenv, 0)) {
+				RTE_LOG(ERR, EAL, "SIGBUS: Cannot mmap more "
+					"hugepages of size %u MB\n",
+					(unsigned)(hugepage_sz / 0x100000));
+				munmap(virtaddr, hugepage_sz);
+				close(fd);
+				unlink(hugepg_tbl[i].filepath);
+				return i;
+			}
+			*(int *)virtaddr = 0;
+		}
+
+
 		/* set shared flock on the file. */
 		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
 			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
 				__func__, strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		close(fd);
@@ -430,7 +459,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		vma_addr = (char *)vma_addr + hugepage_sz;
 		vma_len -= hugepage_sz;
 	}
-	return 0;
+	return i;
 }
 
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
@@ -1036,6 +1065,33 @@ calc_num_pages_per_socket(uint64_t * memory,
 	return total_num_pages;
 }
 
+static struct sigaction action_old;
+static int need_recover;
+
+static void
+register_sigbus(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = 0;
+	action.sa_mask = mask;
+	action.sa_handler = sigbus_handler;
+
+	need_recover = !sigaction(SIGBUS, &action, &action_old);
+}
+
+static void
+recover_sigbus(void)
+{
+	if (need_recover) {
+		sigaction(SIGBUS, &action_old, NULL);
+		need_recover = 0;
+	}
+}
+
 /*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
@@ -1122,8 +1178,12 @@ rte_eal_hugepage_init(void)
 
 	hp_offset = 0; /* where we start the current page size entries */
 
+	if (internal_config.huge_trybest)
+		register_sigbus();
+
 	/* map all hugepages and sort them */
 	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
+		int pages_old, pages_new;
 		struct hugepage_info *hpi;
 
 		/*
@@ -1137,10 +1197,24 @@ rte_eal_hugepage_init(void)
 			continue;
 
 		/* map all hugepages available */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
-					(unsigned)(hpi->hugepage_sz / 0x100000));
-			goto fail;
+		pages_old = hpi->num_pages[0];
+		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
+		if (pages_new < pages_old) {
+			RTE_LOG(DEBUG, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
+			if (internal_config.huge_trybest) {
+				int pages = pages_old - pages_new;
+
+				internal_config.memory -=
+					hpi->hugepage_sz * pages;
+				nr_hugepages -= pages;
+				hpi->num_pages[0] = pages_new;
+				if (pages_new == 0)
+					continue;
+			} else
+				goto fail;
 		}
 
 		/* find physical addresses and sockets for each hugepage */
@@ -1187,6 +1261,9 @@ rte_eal_hugepage_init(void)
 #endif
 	}
 
+	if (internal_config.huge_trybest)
+		recover_sigbus();
+
#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
 	nr_hugefiles = 0;
 	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
@@ -1373,6 +1450,8 @@ rte_eal_hugepage_init(void)
 	return 0;
 
 fail:
+	if (internal_config.huge_trybest)
+		recover_sigbus();
 	free(tmp_hp);
 	return -1;
 }
-- 
2.1.4
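
For readers outside the DPDK tree, the fault-in probe that the patch
builds on can be shown as a small standalone C program. This is an
illustrative sketch only, not part of the patch: the hugetlbfs mount
point /mnt/huge, the backing file name "probe", and the 2 MB hugepage
size are assumptions. Unlike the patch, it passes savesigs=1 to
sigsetjmp() so that the signal mask blocked on handler entry is
restored after the jump.

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

#define HUGEPAGE_SZ (2UL * 1024 * 1024)	/* assumed 2 MB hugepages */
#define HUGEFILE "/mnt/huge/probe"	/* assumed hugetlbfs mount */

static sigjmp_buf jmpenv;

static void
sigbus_handler(int signo)
{
	(void)signo;
	/* Jump back to the sigsetjmp() call site with a nonzero value. */
	siglongjmp(jmpenv, 1);
}

int
main(void)
{
	struct sigaction action;
	void *virtaddr;
	int fd;

	/* Install the SIGBUS handler before touching the mapping. */
	memset(&action, 0, sizeof(action));
	action.sa_handler = sigbus_handler;
	sigaction(SIGBUS, &action, NULL);

	fd = open(HUGEFILE, O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* mmap() itself can succeed even when no hugepage is available. */
	virtaddr = mmap(NULL, HUGEPAGE_SZ, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, fd, 0);
	if (virtaddr == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}

	if (sigsetjmp(jmpenv, 1)) {
		/* The write below faulted: no physical hugepage could be
		 * allocated (e.g. a cgroup hugetlb limit was hit), so clean
		 * up instead of letting SIGBUS kill the process.
		 */
		printf("hugepage not available, recovered via siglongjmp\n");
		munmap(virtaddr, HUGEPAGE_SZ);
		close(fd);
		unlink(HUGEFILE);
		return 1;
	}

	/* Fault the page in: this is where the kernel enforces hugetlb
	 * limits and may raise SIGBUS.
	 */
	*(volatile int *)virtaddr = 0;

	printf("hugepage successfully faulted in\n");
	munmap(virtaddr, HUGEPAGE_SZ);
	close(fd);
	unlink(HUGEFILE);
	return 0;
}

Run under the cgroup from the test example above, the probe takes the
recovery path once the hugetlb limit is exhausted, which is exactly the
condition map_all_hugepages() now reports by returning the number of
pages it managed to map instead of -1.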