From: "Tan, Jianfeng" <jianfeng.tan@intel.com>
To: dev@dpdk.org
Subject: Re: [dpdk-dev] [PATCH v2] eal: make hugetlb initialization more robust
Date: Tue, 8 Mar 2016 16:46:55 +0800 [thread overview]
Message-ID: <56DE917F.8090808@intel.com> (raw)
In-Reply-To: <1457401359-132260-1-git-send-email-jianfeng.tan@intel.com>
On 3/8/2016 9:42 AM, Jianfeng Tan wrote:
> This patch adds an option, --huge-trybest, which enables a recovery
> mechanism for the case where fewer hugepages can actually be used than
> are declared in sysfs. It relies on a memory access to fault in each
> hugepage; if that access fails with SIGBUS, execution recovers to the
> previously saved stack environment with siglongjmp().
>
> Test example:
> a. cgcreate -g hugetlb:/test-subgroup
> b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
> c. cgexec -g hugetlb:test-subgroup \
> ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest
>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Sorry, forgot to add ack from Neil.
Acked-by: Neil Horman <nhorman@tuxdriver.com>
> ---
> v2:
> - Address the compile error by moving setjmp into a wrapper function.
>
> lib/librte_eal/common/eal_common_options.c | 4 ++
> lib/librte_eal/common/eal_internal_cfg.h | 1 +
> lib/librte_eal/common/eal_options.h | 2 +
> lib/librte_eal/linuxapp/eal/eal.c | 1 +
> lib/librte_eal/linuxapp/eal/eal_memory.c | 104 ++++++++++++++++++++++++++---
> 5 files changed, 104 insertions(+), 8 deletions(-)
>
> diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
> index 29942ea..8ff6a2e 100644
> --- a/lib/librte_eal/common/eal_common_options.c
> +++ b/lib/librte_eal/common/eal_common_options.c
> @@ -95,6 +95,7 @@ eal_long_options[] = {
> {OPT_VFIO_INTR, 1, NULL, OPT_VFIO_INTR_NUM },
> {OPT_VMWARE_TSC_MAP, 0, NULL, OPT_VMWARE_TSC_MAP_NUM },
> {OPT_XEN_DOM0, 0, NULL, OPT_XEN_DOM0_NUM },
> + {OPT_HUGE_TRYBEST, 0, NULL, OPT_HUGE_TRYBEST_NUM },
> {0, 0, NULL, 0 }
> };
>
> @@ -896,6 +897,9 @@ eal_parse_common_option(int opt, const char *optarg,
> return -1;
> }
> break;
> + case OPT_HUGE_TRYBEST_NUM:
> + internal_config.huge_trybest = 1;
> + break;
>
> /* don't know what to do, leave this to caller */
> default:
> diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
> index 5f1367e..90a3533 100644
> --- a/lib/librte_eal/common/eal_internal_cfg.h
> +++ b/lib/librte_eal/common/eal_internal_cfg.h
> @@ -64,6 +64,7 @@ struct internal_config {
> volatile unsigned force_nchannel; /**< force number of channels */
> volatile unsigned force_nrank; /**< force number of ranks */
> volatile unsigned no_hugetlbfs; /**< true to disable hugetlbfs */
> + volatile unsigned huge_trybest; /**< try best to allocate hugepages */
> unsigned hugepage_unlink; /**< true to unlink backing files */
> volatile unsigned xen_dom0_support; /**< support app running on Xen Dom0*/
> volatile unsigned no_pci; /**< true to disable PCI */
> diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
> index a881c62..02397c5 100644
> --- a/lib/librte_eal/common/eal_options.h
> +++ b/lib/librte_eal/common/eal_options.h
> @@ -83,6 +83,8 @@ enum {
> OPT_VMWARE_TSC_MAP_NUM,
> #define OPT_XEN_DOM0 "xen-dom0"
> OPT_XEN_DOM0_NUM,
> +#define OPT_HUGE_TRYBEST "huge-trybest"
> + OPT_HUGE_TRYBEST_NUM,
> OPT_LONG_MAX_NUM
> };
>
> diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
> index ceac435..3e23877 100644
> --- a/lib/librte_eal/linuxapp/eal/eal.c
> +++ b/lib/librte_eal/linuxapp/eal/eal.c
> @@ -343,6 +343,7 @@ eal_usage(const char *prgname)
> " --"OPT_CREATE_UIO_DEV" Create /dev/uioX (usually done by hotplug)\n"
> " --"OPT_VFIO_INTR" Interrupt mode for VFIO (legacy|msi|msix)\n"
> " --"OPT_XEN_DOM0" Support running on Xen dom0 without hugetlbfs\n"
> + " --"OPT_HUGE_TRYBEST" Try best to accommodate hugepages\n"
> "\n");
> /* Allow the application to print its usage message too if hook is set */
> if ( rte_application_usage_hook ) {
> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index 5b9132c..e4e1f3b 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -80,6 +80,8 @@
> #include <errno.h>
> #include <sys/ioctl.h>
> #include <sys/time.h>
> +#include <signal.h>
> +#include <setjmp.h>
>
> #include <rte_log.h>
> #include <rte_memory.h>
> @@ -309,6 +311,21 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
> return addr;
> }
>
> +static sigjmp_buf jmpenv;
> +
> +static void sigbus_handler(int signo __rte_unused)
> +{
> + siglongjmp(jmpenv, 1);
> +}
> +
> +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
> + * non-static local variable in the stack frame calling setjmp might be
> + * clobbered by a call to longjmp.
> + */
> +static int wrap_setjmp(void)
> +{
> + return setjmp(jmpenv);
> +}
> /*
> * Mmap all hugepages of hugepage table: it first open a file in
> * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
> @@ -396,7 +413,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
> if (fd < 0) {
> RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
> strerror(errno));
> - return -1;
> + return i;
> }
>
> /* map the segment, and populate page tables,
> @@ -407,7 +424,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
> RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
> strerror(errno));
> close(fd);
> - return -1;
> + return i;
> }
>
> if (orig) {
> @@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
> hugepg_tbl[i].final_va = virtaddr;
> }
>
> + if (orig && internal_config.huge_trybest) {
> + /* In linux, hugetlb limitations, like cgroup, are
> + * enforced at fault time instead of mmap(), even
> + * with the option of MAP_POPULATE. Kernel will send
> + * a SIGBUS signal. To avoid to be killed, save stack
> + * environment here, if SIGBUS happens, we can jump
> + * back here.
> + */
> + if (wrap_setjmp()) {
> + RTE_LOG(ERR, EAL, "SIGBUS: Cannot mmap more "
> + "hugepages of size %u MB\n",
> + (unsigned)(hugepage_sz / 0x100000));
> + munmap(virtaddr, hugepage_sz);
> + close(fd);
> + unlink(hugepg_tbl[i].filepath);
> + return i;
> + }
> + *(int *)virtaddr = 0;
> + }
> +
> +
> /* set shared flock on the file. */
> if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
> RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
> __func__, strerror(errno));
> close(fd);
> - return -1;
> + return i;
> }
>
> close(fd);
> @@ -430,7 +468,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
> vma_addr = (char *)vma_addr + hugepage_sz;
> vma_len -= hugepage_sz;
> }
> - return 0;
> + return i;
> }
>
> #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
> @@ -1036,6 +1074,33 @@ calc_num_pages_per_socket(uint64_t * memory,
> return total_num_pages;
> }
>
> +static struct sigaction action_old;
> +static int need_recover;
> +
> +static void
> +register_sigbus(void)
> +{
> + sigset_t mask;
> + struct sigaction action;
> +
> + sigemptyset(&mask);
> + sigaddset(&mask, SIGBUS);
> + action.sa_flags = 0;
> + action.sa_mask = mask;
> + action.sa_handler = sigbus_handler;
> +
> + need_recover = !sigaction(SIGBUS, &action, &action_old);
> +}
> +
> +static void
> +recover_sigbus(void)
> +{
> + if (need_recover) {
> + sigaction(SIGBUS, &action_old, NULL);
> + need_recover = 0;
> + }
> +}
> +
> /*
> * Prepare physical memory mapping: fill configuration structure with
> * these infos, return 0 on success.
> @@ -1122,8 +1187,12 @@ rte_eal_hugepage_init(void)
>
> hp_offset = 0; /* where we start the current page size entries */
>
> + if (internal_config.huge_trybest)
> + register_sigbus();
> +
> /* map all hugepages and sort them */
> for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
> + int pages_old, pages_new;
> struct hugepage_info *hpi;
>
> /*
> @@ -1137,10 +1206,24 @@ rte_eal_hugepage_init(void)
> continue;
>
> /* map all hugepages available */
> - if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
> - RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
> - (unsigned)(hpi->hugepage_sz / 0x100000));
> - goto fail;
> + pages_old = hpi->num_pages[0];
> + pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
> + if (pages_new < pages_old) {
> + RTE_LOG(DEBUG, EAL,
> + "%d not %d hugepages of size %u MB allocated\n",
> + pages_new, pages_old,
> + (unsigned)(hpi->hugepage_sz / 0x100000));
> + if (internal_config.huge_trybest) {
> + int pages = pages_old - pages_new;
> +
> + internal_config.memory -=
> + hpi->hugepage_sz * pages;
> + nr_hugepages -= pages;
> + hpi->num_pages[0] = pages_new;
> + if (pages_new == 0)
> + continue;
> + } else
> + goto fail;
> }
>
> /* find physical addresses and sockets for each hugepage */
> @@ -1187,6 +1270,9 @@ rte_eal_hugepage_init(void)
> #endif
> }
>
> + if (internal_config.huge_trybest)
> + recover_sigbus();
> +
> #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
> nr_hugefiles = 0;
> for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
> @@ -1373,6 +1459,8 @@ rte_eal_hugepage_init(void)
> return 0;
>
> fail:
> + if (internal_config.huge_trybest)
> + recover_sigbus();
> free(tmp_hp);
> return -1;
> }
next prev parent reply other threads:[~2016-03-08 8:46 UTC|newest]
Thread overview: 63+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-01-24 18:49 [dpdk-dev] [RFC] eal: add cgroup-aware resource self discovery Jianfeng Tan
2016-01-25 13:46 ` Neil Horman
2016-01-26 2:22 ` Tan, Jianfeng
2016-01-26 14:19 ` Neil Horman
2016-01-27 12:02 ` Tan, Jianfeng
2016-01-27 17:30 ` Neil Horman
2016-01-29 11:22 ` [dpdk-dev] [PATCH] eal: make resource initialization more robust Jianfeng Tan
2016-02-01 18:08 ` Neil Horman
2016-02-22 6:08 ` Tan, Jianfeng
2016-02-22 13:18 ` Neil Horman
2016-02-28 21:12 ` Thomas Monjalon
2016-02-29 1:50 ` Tan, Jianfeng
2016-03-04 10:05 ` [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores Jianfeng Tan
2016-03-08 8:54 ` Panu Matilainen
2016-03-08 17:38 ` Tan, Jianfeng
2016-03-09 13:05 ` Panu Matilainen
2016-03-09 13:53 ` Tan, Jianfeng
2016-03-09 14:01 ` Ananyev, Konstantin
2016-03-09 14:17 ` Tan, Jianfeng
2016-03-09 14:44 ` Ananyev, Konstantin
2016-03-09 14:55 ` Tan, Jianfeng
2016-03-09 15:17 ` Ananyev, Konstantin
2016-03-09 17:45 ` Tan, Jianfeng
2016-03-09 19:33 ` Ananyev, Konstantin
2016-03-10 1:36 ` Tan, Jianfeng
2016-05-18 12:46 ` David Marchand
2016-05-19 2:25 ` Tan, Jianfeng
2016-06-30 13:43 ` Thomas Monjalon
2016-07-01 0:52 ` Tan, Jianfeng
2016-04-26 12:39 ` Tan, Jianfeng
2016-03-04 10:58 ` [dpdk-dev] [PATCH] eal: make hugetlb initialization more robust Jianfeng Tan
2016-03-08 1:42 ` [dpdk-dev] [PATCH v2] " Jianfeng Tan
2016-03-08 8:46 ` Tan, Jianfeng [this message]
2016-05-04 11:07 ` Sergio Gonzalez Monroy
2016-05-04 11:28 ` Tan, Jianfeng
2016-05-04 12:25 ` Sergio Gonzalez Monroy
2016-05-09 10:48 ` [dpdk-dev] [PATCH v3] " Jianfeng Tan
2016-05-10 8:54 ` Sergio Gonzalez Monroy
2016-05-10 9:11 ` Tan, Jianfeng
2016-05-12 0:44 ` [dpdk-dev] [PATCH v4] " Jianfeng Tan
2016-05-17 16:39 ` David Marchand
2016-05-18 7:56 ` Sergio Gonzalez Monroy
2016-05-18 9:34 ` David Marchand
2016-05-19 2:00 ` Tan, Jianfeng
2016-05-17 16:40 ` Thomas Monjalon
2016-05-18 8:06 ` Sergio Gonzalez Monroy
2016-05-18 9:38 ` David Marchand
2016-05-19 2:11 ` Tan, Jianfeng
2016-05-31 3:37 ` [dpdk-dev] [PATCH v5] eal: fix allocating all free hugepages Jianfeng Tan
2016-06-06 2:49 ` Pei, Yulong
2016-06-08 11:27 ` Sergio Gonzalez Monroy
2016-06-30 13:34 ` Thomas Monjalon
2016-08-31 3:07 ` [dpdk-dev] [PATCH v2] eal: restrict cores detection Jianfeng Tan
2016-08-31 15:30 ` Stephen Hemminger
2016-09-01 1:15 ` Tan, Jianfeng
2016-09-01 1:31 ` [dpdk-dev] [PATCH v3] " Jianfeng Tan
2016-09-02 16:53 ` Bruce Richardson
2016-09-16 14:04 ` Thomas Monjalon
2016-09-16 14:02 ` Thomas Monjalon
2016-12-02 17:48 ` [dpdk-dev] [PATCH v4] eal: restrict cores auto detection Jianfeng Tan
2016-12-08 18:19 ` Thomas Monjalon
2016-12-09 15:14 ` Bruce Richardson
2016-12-21 14:31 ` Thomas Monjalon
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=56DE917F.8090808@intel.com \
--to=jianfeng.tan@intel.com \
--cc=dev@dpdk.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).