From: Jianfeng Tan <jianfeng.tan@intel.com>
To: dev@dpdk.org
Subject: [dpdk-dev] [PATCH] eal: make hugetlb initialization more robust
Date: Fri, 4 Mar 2016 18:58:12 +0800 [thread overview]
Message-ID: <1457089092-4128-1-git-send-email-jianfeng.tan@intel.com> (raw)
In-Reply-To: <1453661393-85704-1-git-send-email-jianfeng.tan@intel.com>
This patch adds an option, --huge-trybest, which uses a recovery mechanism for
the case where fewer hugepages can actually be used than are declared in sysfs.
It relies on a memory access to fault in hugepages, and if that access fails
with SIGBUS, it recovers to the previously saved stack environment with
siglongjmp().
Test example:
a. cgcreate -g hugetlb:/test-subgroup
b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
c. cgexec -g hugetlb:test-subgroup \
./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest
Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
lib/librte_eal/common/eal_common_options.c | 4 ++
lib/librte_eal/common/eal_internal_cfg.h | 1 +
lib/librte_eal/common/eal_options.h | 2 +
lib/librte_eal/linuxapp/eal/eal.c | 1 +
lib/librte_eal/linuxapp/eal/eal_memory.c | 95 +++++++++++++++++++++++++++---
5 files changed, 95 insertions(+), 8 deletions(-)
diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 29942ea..8ff6a2e 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -95,6 +95,7 @@ eal_long_options[] = {
{OPT_VFIO_INTR, 1, NULL, OPT_VFIO_INTR_NUM },
{OPT_VMWARE_TSC_MAP, 0, NULL, OPT_VMWARE_TSC_MAP_NUM },
{OPT_XEN_DOM0, 0, NULL, OPT_XEN_DOM0_NUM },
+ {OPT_HUGE_TRYBEST, 0, NULL, OPT_HUGE_TRYBEST_NUM },
{0, 0, NULL, 0 }
};
@@ -896,6 +897,9 @@ eal_parse_common_option(int opt, const char *optarg,
return -1;
}
break;
+ case OPT_HUGE_TRYBEST_NUM:
+ internal_config.huge_trybest = 1;
+ break;
/* don't know what to do, leave this to caller */
default:
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 5f1367e..90a3533 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -64,6 +64,7 @@ struct internal_config {
volatile unsigned force_nchannel; /**< force number of channels */
volatile unsigned force_nrank; /**< force number of ranks */
volatile unsigned no_hugetlbfs; /**< true to disable hugetlbfs */
+ volatile unsigned huge_trybest; /**< try best to allocate hugepages */
unsigned hugepage_unlink; /**< true to unlink backing files */
volatile unsigned xen_dom0_support; /**< support app running on Xen Dom0*/
volatile unsigned no_pci; /**< true to disable PCI */
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index a881c62..02397c5 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -83,6 +83,8 @@ enum {
OPT_VMWARE_TSC_MAP_NUM,
#define OPT_XEN_DOM0 "xen-dom0"
OPT_XEN_DOM0_NUM,
+#define OPT_HUGE_TRYBEST "huge-trybest"
+ OPT_HUGE_TRYBEST_NUM,
OPT_LONG_MAX_NUM
};
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index ceac435..3e23877 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -343,6 +343,7 @@ eal_usage(const char *prgname)
" --"OPT_CREATE_UIO_DEV" Create /dev/uioX (usually done by hotplug)\n"
" --"OPT_VFIO_INTR" Interrupt mode for VFIO (legacy|msi|msix)\n"
" --"OPT_XEN_DOM0" Support running on Xen dom0 without hugetlbfs\n"
+ " --"OPT_HUGE_TRYBEST" Try best to accommodate hugepages\n"
"\n");
/* Allow the application to print its usage message too if hook is set */
if ( rte_application_usage_hook ) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5b9132c..1766d7f 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,8 @@
#include <errno.h>
#include <sys/ioctl.h>
#include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
#include <rte_log.h>
#include <rte_memory.h>
@@ -309,6 +311,12 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
return addr;
}
+static sigjmp_buf jmpenv;
+
+static void sigbus_handler(int signo __rte_unused)
+{
+ siglongjmp(jmpenv, 1);
+}
/*
* Mmap all hugepages of hugepage table: it first open a file in
* hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
@@ -396,7 +404,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
if (fd < 0) {
RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
strerror(errno));
- return -1;
+ return i;
}
/* map the segment, and populate page tables,
@@ -407,7 +415,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
strerror(errno));
close(fd);
- return -1;
+ return i;
}
if (orig) {
@@ -417,12 +425,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
hugepg_tbl[i].final_va = virtaddr;
}
+ if (orig && internal_config.huge_trybest) {
+ /* In Linux, hugetlb limitations, like cgroup, are
+ * enforced at fault time instead of mmap(), even
+ * with the option of MAP_POPULATE. The kernel will
+ * send a SIGBUS signal. To avoid being killed, save
+ * the stack environment here; if SIGBUS happens, we
+ * can jump back here.
+ */
+ if (sigsetjmp(jmpenv, 0)) {
+ RTE_LOG(ERR, EAL, "SIGBUS: Cannot mmap more "
+ "hugepages of size %u MB\n",
+ (unsigned)(hugepage_sz / 0x100000));
+ munmap(virtaddr, hugepage_sz);
+ close(fd);
+ unlink(hugepg_tbl[i].filepath);
+ return i;
+ }
+ *(int *)virtaddr = 0;
+ }
+
+
/* set shared flock on the file. */
if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
__func__, strerror(errno));
close(fd);
- return -1;
+ return i;
}
close(fd);
@@ -430,7 +459,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
vma_addr = (char *)vma_addr + hugepage_sz;
vma_len -= hugepage_sz;
}
- return 0;
+ return i;
}
#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
@@ -1036,6 +1065,33 @@ calc_num_pages_per_socket(uint64_t * memory,
return total_num_pages;
}
+static struct sigaction action_old;
+static int need_recover;
+
+static void
+register_sigbus(void)
+{
+ sigset_t mask;
+ struct sigaction action;
+
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGBUS);
+ action.sa_flags = 0;
+ action.sa_mask = mask;
+ action.sa_handler = sigbus_handler;
+
+ need_recover = !sigaction(SIGBUS, &action, &action_old);
+}
+
+static void
+recover_sigbus(void)
+{
+ if (need_recover) {
+ sigaction(SIGBUS, &action_old, NULL);
+ need_recover = 0;
+ }
+}
+
/*
* Prepare physical memory mapping: fill configuration structure with
* these infos, return 0 on success.
@@ -1122,8 +1178,12 @@ rte_eal_hugepage_init(void)
hp_offset = 0; /* where we start the current page size entries */
+ if (internal_config.huge_trybest)
+ register_sigbus();
+
/* map all hugepages and sort them */
for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
+ int pages_old, pages_new;
struct hugepage_info *hpi;
/*
@@ -1137,10 +1197,24 @@ rte_eal_hugepage_init(void)
continue;
/* map all hugepages available */
- if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
- RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
- (unsigned)(hpi->hugepage_sz / 0x100000));
- goto fail;
+ pages_old = hpi->num_pages[0];
+ pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
+ if (pages_new < pages_old) {
+ RTE_LOG(DEBUG, EAL,
+ "%d not %d hugepages of size %u MB allocated\n",
+ pages_new, pages_old,
+ (unsigned)(hpi->hugepage_sz / 0x100000));
+ if (internal_config.huge_trybest) {
+ int pages = pages_old - pages_new;
+
+ internal_config.memory -=
+ hpi->hugepage_sz * pages;
+ nr_hugepages -= pages;
+ hpi->num_pages[0] = pages_new;
+ if (pages_new == 0)
+ continue;
+ } else
+ goto fail;
}
/* find physical addresses and sockets for each hugepage */
@@ -1187,6 +1261,9 @@ rte_eal_hugepage_init(void)
#endif
}
+ if (internal_config.huge_trybest)
+ recover_sigbus();
+
#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
nr_hugefiles = 0;
for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
@@ -1373,6 +1450,8 @@ rte_eal_hugepage_init(void)
return 0;
fail:
+ if (internal_config.huge_trybest)
+ recover_sigbus();
free(tmp_hp);
return -1;
}
--
2.1.4
next prev parent reply other threads:[~2016-03-04 17:58 UTC|newest]
Thread overview: 63+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-01-24 18:49 [dpdk-dev] [RFC] eal: add cgroup-aware resource self discovery Jianfeng Tan
2016-01-25 13:46 ` Neil Horman
2016-01-26 2:22 ` Tan, Jianfeng
2016-01-26 14:19 ` Neil Horman
2016-01-27 12:02 ` Tan, Jianfeng
2016-01-27 17:30 ` Neil Horman
2016-01-29 11:22 ` [dpdk-dev] [PATCH] eal: make resource initialization more robust Jianfeng Tan
2016-02-01 18:08 ` Neil Horman
2016-02-22 6:08 ` Tan, Jianfeng
2016-02-22 13:18 ` Neil Horman
2016-02-28 21:12 ` Thomas Monjalon
2016-02-29 1:50 ` Tan, Jianfeng
2016-03-04 10:05 ` [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores Jianfeng Tan
2016-03-08 8:54 ` Panu Matilainen
2016-03-08 17:38 ` Tan, Jianfeng
2016-03-09 13:05 ` Panu Matilainen
2016-03-09 13:53 ` Tan, Jianfeng
2016-03-09 14:01 ` Ananyev, Konstantin
2016-03-09 14:17 ` Tan, Jianfeng
2016-03-09 14:44 ` Ananyev, Konstantin
2016-03-09 14:55 ` Tan, Jianfeng
2016-03-09 15:17 ` Ananyev, Konstantin
2016-03-09 17:45 ` Tan, Jianfeng
2016-03-09 19:33 ` Ananyev, Konstantin
2016-03-10 1:36 ` Tan, Jianfeng
2016-05-18 12:46 ` David Marchand
2016-05-19 2:25 ` Tan, Jianfeng
2016-06-30 13:43 ` Thomas Monjalon
2016-07-01 0:52 ` Tan, Jianfeng
2016-04-26 12:39 ` Tan, Jianfeng
2016-03-04 10:58 ` Jianfeng Tan [this message]
2016-03-08 1:42 ` [dpdk-dev] [PATCH v2] eal: make hugetlb initialization more robust Jianfeng Tan
2016-03-08 8:46 ` Tan, Jianfeng
2016-05-04 11:07 ` Sergio Gonzalez Monroy
2016-05-04 11:28 ` Tan, Jianfeng
2016-05-04 12:25 ` Sergio Gonzalez Monroy
2016-05-09 10:48 ` [dpdk-dev] [PATCH v3] " Jianfeng Tan
2016-05-10 8:54 ` Sergio Gonzalez Monroy
2016-05-10 9:11 ` Tan, Jianfeng
2016-05-12 0:44 ` [dpdk-dev] [PATCH v4] " Jianfeng Tan
2016-05-17 16:39 ` David Marchand
2016-05-18 7:56 ` Sergio Gonzalez Monroy
2016-05-18 9:34 ` David Marchand
2016-05-19 2:00 ` Tan, Jianfeng
2016-05-17 16:40 ` Thomas Monjalon
2016-05-18 8:06 ` Sergio Gonzalez Monroy
2016-05-18 9:38 ` David Marchand
2016-05-19 2:11 ` Tan, Jianfeng
2016-05-31 3:37 ` [dpdk-dev] [PATCH v5] eal: fix allocating all free hugepages Jianfeng Tan
2016-06-06 2:49 ` Pei, Yulong
2016-06-08 11:27 ` Sergio Gonzalez Monroy
2016-06-30 13:34 ` Thomas Monjalon
2016-08-31 3:07 ` [dpdk-dev] [PATCH v2] eal: restrict cores detection Jianfeng Tan
2016-08-31 15:30 ` Stephen Hemminger
2016-09-01 1:15 ` Tan, Jianfeng
2016-09-01 1:31 ` [dpdk-dev] [PATCH v3] " Jianfeng Tan
2016-09-02 16:53 ` Bruce Richardson
2016-09-16 14:04 ` Thomas Monjalon
2016-09-16 14:02 ` Thomas Monjalon
2016-12-02 17:48 ` [dpdk-dev] [PATCH v4] eal: restrict cores auto detection Jianfeng Tan
2016-12-08 18:19 ` Thomas Monjalon
2016-12-09 15:14 ` Bruce Richardson
2016-12-21 14:31 ` Thomas Monjalon
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1457089092-4128-1-git-send-email-jianfeng.tan@intel.com \
--to=jianfeng.tan@intel.com \
--cc=dev@dpdk.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).