From: Jianfeng Tan <jianfeng.tan@intel.com>
To: dev@dpdk.org
Date: Fri, 29 Jan 2016 19:22:02 +0800
Message-Id: <1454066522-80045-1-git-send-email-jianfeng.tan@intel.com>
X-Mailer: git-send-email 2.1.4
In-Reply-To: <1453661393-85704-1-git-send-email-jianfeng.tan@intel.com>
References: <1453661393-85704-1-git-send-email-jianfeng.tan@intel.com>
Subject: [dpdk-dev] [PATCH] eal: make resource initialization more robust

Current issue: DPDK is not friendly to container environments, because
it pre-allocates resources such as cores and hugepages, while those
environments impose resource limitations via cgroup, rlimit, cpuset,
etc.

For cores, this patch uses pthread_getaffinity_np() to further narrow
down the detected cores before parsing coremask (-c), corelist (-l),
and coremap (--lcores).
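To make the mechanism concrete for reviewers, here is a minimal
standalone sketch of the same narrowing logic (illustrative only, not
part of the patch; it assumes glibc, whose pthread_getaffinity_np()
fills a cpu_set_t with the mask inherited from cgroup/cpuset/taskset):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t cpuset;

	/* Query the affinity mask the current thread inherited. */
	if (pthread_getaffinity_np(pthread_self(),
			sizeof(cpuset), &cpuset) != 0) {
		/* On failure, fall back to assuming every core is
		 * usable, as the hunk below does for RTE_MAX_LCORE. */
		CPU_ZERO(&cpuset);
		for (int i = 0; i < CPU_SETSIZE; i++)
			CPU_SET(i, &cpuset);
	}

	for (int i = 0; i < CPU_SETSIZE; i++)
		if (CPU_ISSET(i, &cpuset))
			printf("core %d usable\n", i);
	return 0;
}

Cores outside this set are marked ROLE_OFF during detection, so
-c/-l/--lcores can only pick from what the container actually grants.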
For hugepages, this patch adds a recovery mechanism for the case that
fewer hugepages can be used than requested. It relies on a memory
access to fault in hugepages; if that access fails with SIGBUS, it
recovers to the previously saved stack environment with siglongjmp().
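Likewise, a self-contained sketch of the fault-in/recovery idea
(try_fault_in() is a made-up helper name for illustration; the patch
itself registers the handler once in rte_eal_hugepage_init() and jumps
from within map_all_hugepages()):

#include <setjmp.h>
#include <signal.h>
#include <string.h>

static sigjmp_buf jmpenv;

static void sigbus_handler(int signo)
{
	(void)signo;
	siglongjmp(jmpenv, 1);	/* jump back to the sigsetjmp() point */
}

/* Touch len bytes at addr; return 0 if really backed by hugepages,
 * -1 if the kernel refused the pages at fault time with SIGBUS
 * (e.g. a hugetlb cgroup limit was hit). */
static int try_fault_in(void *addr, size_t len)
{
	struct sigaction action, old;
	int ret = 0;

	memset(&action, 0, sizeof(action));
	action.sa_handler = sigbus_handler;
	sigaction(SIGBUS, &action, &old);

	if (sigsetjmp(jmpenv, 1) == 0)
		memset(addr, 0, len);	/* the fault-in happens here */
	else
		ret = -1;		/* arrived via siglongjmp() */

	sigaction(SIGBUS, &old, NULL);	/* restore previous handler */
	return ret;
}

A failing fault-in corresponds to the early "return i" in the patched
map_all_hugepages(): the pages mapped so far are kept and the
shortfall is subtracted from internal_config.memory.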
Test example:
a. cgcreate -g cpuset,hugetlb:/test-subgroup
b. cgset -r cpuset.cpus=2-3 test-subgroup
c. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
d. cgexec -g cpuset,hugetlb:test-subgroup \
	./examples/l2fwd/build/l2fwd -n 4 -- -p 3

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
 lib/librte_eal/common/eal_common_lcore.c | 10 +++-
 lib/librte_eal/linuxapp/eal/eal_memory.c | 78 ++++++++++++++++++++++++++++----
 2 files changed, 79 insertions(+), 9 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_lcore.c b/lib/librte_eal/common/eal_common_lcore.c
index a4263ba..8e9c675 100644
--- a/lib/librte_eal/common/eal_common_lcore.c
+++ b/lib/librte_eal/common/eal_common_lcore.c
@@ -57,6 +57,13 @@ rte_eal_cpu_init(void)
 	struct rte_config *config = rte_eal_get_configuration();
 	unsigned lcore_id;
 	unsigned count = 0;
+	rte_cpuset_t cpuset;
+	pthread_t tid;
+
+	tid = pthread_self();
+	if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t), &cpuset) != 0)
+		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
+			CPU_SET(lcore_id, &cpuset);
 
 	/*
 	 * Parse the maximum set of logical cores, detect the subset of running
@@ -70,7 +77,8 @@ rte_eal_cpu_init(void)
 
 		/* in 1:1 mapping, record related cpu detected state */
 		lcore_config[lcore_id].detected = eal_cpu_detected(lcore_id);
-		if (lcore_config[lcore_id].detected == 0) {
+		if (lcore_config[lcore_id].detected == 0 ||
+		    !CPU_ISSET(lcore_id, &cpuset)) {
 			config->lcore_role[lcore_id] = ROLE_OFF;
 			lcore_config[lcore_id].core_index = -1;
 			continue;
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 846fd31..837fd9e 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,8 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
 
 #include <rte_log.h>
 #include <rte_memory.h>
@@ -309,6 +311,12 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
 	return addr;
 }
 
+static sigjmp_buf jmpenv;
+
+static void sigbus_handler(int signo __rte_unused)
+{
+	siglongjmp(jmpenv, 1);
+}
 /*
  * Mmap all hugepages of hugepage table: it first open a file in
  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
@@ -396,7 +404,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		if (fd < 0) {
 			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
 					strerror(errno));
-			return -1;
+			return i;
 		}
 
 		virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
@@ -405,11 +413,26 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
 					strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		if (orig) {
 			hugepg_tbl[i].orig_va = virtaddr;
+			/* In linux, hugetlb limitations, like cgroup, are
+			 * enforced at fault time instead of mmap(), even
+			 * with the option of MAP_POPULATE. Kernel will send
+			 * a SIGBUS signal. To avoid to be killed, save stack
+			 * environment here, if SIGBUS happens, we can jump
+			 * back here.
+			 */
+			if (sigsetjmp(jmpenv, 0)) {
+				RTE_LOG(ERR, EAL, "SIGBUS: Cannot mmap more "
+					"hugepages of size %u MB\n",
+					(unsigned)(hugepage_sz / 0x100000));
+				munmap(virtaddr, hugepage_sz);
+				close(fd);
+				return i;
+			}
 			memset(virtaddr, 0, hugepage_sz);
 		}
 		else {
@@ -421,7 +444,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 				RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
 					__func__, strerror(errno));
 				close(fd);
-				return -1;
+				return i;
 			}
 
 			close(fd);
@@ -429,7 +452,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		vma_addr = (char *)vma_addr + hugepage_sz;
 		vma_len -= hugepage_sz;
 	}
-	return 0;
+	return i;
 }
 
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
@@ -1075,6 +1098,31 @@ calc_num_pages_per_socket(uint64_t * memory,
 	return total_num_pages;
 }
 
+static struct sigaction action_old;
+static int need_recover = 0;
+
+static void
+register_sigbus(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = 0;
+	action.sa_mask = mask;
+	action.sa_handler = sigbus_handler;
+
+	need_recover = !sigaction(SIGBUS, &action, &action_old);
+}
+
+static void
+recover_sigbus(void)
+{
+	if (need_recover)
+		sigaction(SIGBUS, &action_old, NULL);
+}
+
 /*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
@@ -1161,8 +1209,11 @@ rte_eal_hugepage_init(void)
 
 	hp_offset = 0; /* where we start the current page size entries */
 
+	register_sigbus();
+
 	/* map all hugepages and sort them */
 	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
+		int pages_old, pages_new;
 		struct hugepage_info *hpi;
 
 		/*
@@ -1176,10 +1227,19 @@ rte_eal_hugepage_init(void)
 			continue;
 
 		/* map all hugepages available */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
-					(unsigned)(hpi->hugepage_sz / 0x100000));
-			goto fail;
+		pages_old = hpi->num_pages[0];
+		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
+		if (pages_new < pages_old) {
+			RTE_LOG(DEBUG, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
+			internal_config.memory -=
+				hpi->hugepage_sz * (pages_old - pages_new);
+			nr_hugepages -= (pages_old - pages_new);
+			hpi->num_pages[0] = pages_new;
+			if (pages_new == 0)
+				continue;
 		}
 
 		/* find physical addresses and sockets for each hugepage */
@@ -1226,6 +1286,8 @@ rte_eal_hugepage_init(void)
 #endif
 	}
 
+	recover_sigbus();
+
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
 	nr_hugefiles = 0;
 	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
-- 
2.1.4