From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mga07.intel.com (mga07.intel.com [134.134.136.100]) by dpdk.org (Postfix) with ESMTP id D45675F18 for ; Wed, 7 Mar 2018 17:57:16 +0100 (CET) X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from fmsmga005.fm.intel.com ([10.253.24.32]) by orsmga105.jf.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 07 Mar 2018 08:57:15 -0800 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.47,436,1515484800"; d="scan'208";a="209599923" Received: from irvmail001.ir.intel.com ([163.33.26.43]) by fmsmga005.fm.intel.com with ESMTP; 07 Mar 2018 08:57:12 -0800 Received: from sivswdev01.ir.intel.com (sivswdev01.ir.intel.com [10.237.217.45]) by irvmail001.ir.intel.com (8.14.3/8.13.6/MailSET/Hub) with ESMTP id w27GvBpl032392; Wed, 7 Mar 2018 16:57:11 GMT Received: from sivswdev01.ir.intel.com (localhost [127.0.0.1]) by sivswdev01.ir.intel.com with ESMTP id w27GvB7f006728; Wed, 7 Mar 2018 16:57:11 GMT Received: (from aburakov@localhost) by sivswdev01.ir.intel.com with LOCAL id w27GvB7X006724; Wed, 7 Mar 2018 16:57:11 GMT From: Anatoly Burakov To: dev@dpdk.org Cc: keith.wiles@intel.com, jianfeng.tan@intel.com, andras.kovacs@ericsson.com, laszlo.vadkeri@ericsson.com, benjamin.walker@intel.com, bruce.richardson@intel.com, thomas@monjalon.net, konstantin.ananyev@intel.com, kuralamudhan.ramakrishnan@intel.com, louise.m.daly@intel.com, nelio.laranjeiro@6wind.com, yskoh@mellanox.com, pepperjo@japf.ch, jerin.jacob@caviumnetworks.com, hemant.agrawal@nxp.com, olivier.matz@6wind.com Date: Wed, 7 Mar 2018 16:56:44 +0000 Message-Id: X-Mailer: git-send-email 1.7.0.7 In-Reply-To: References: In-Reply-To: References: Subject: [dpdk-dev] [PATCH v2 16/41] eal: make use of memory hotplug for init X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 07 Mar 2018 
16:57:17 -0000 Add a new (non-legacy) memory init path for EAL. It uses the new memory hotplug facilities, although it's only being run at startup. If no -m or --socket-mem switches were specified, the new init will not allocate anything, whereas if those switches were passed, appropriate amounts of pages would be requested, just like for legacy init. Since rte_malloc support for dynamic allocation comes in later patches, running DPDK without --socket-mem or -m switches will fail in this patch. Also, allocated pages will be physically discontiguous (or rather, they're not guaranteed to be physically contiguous - they may still be, by accident) unless IOVA_AS_VA mode is used. Since memory hotplug subsystem relies on partial file locking, replace flock() locks with fcntl() locks. Signed-off-by: Anatoly Burakov --- Notes: This commit shows "the world as it could have been". All of this other monstrous amount of code in eal_memory.c is there because of legacy init option. Do we *really* want to keep it around, and make DPDK init and memory system suffer from split personality? lib/librte_eal/linuxapp/eal/eal_hugepage_info.c | 25 ++++++++- lib/librte_eal/linuxapp/eal/eal_memory.c | 74 +++++++++++++++++++++++-- 2 files changed, 92 insertions(+), 7 deletions(-) diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c index 706b6d5..7e2475f 100644 --- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c +++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -200,6 +201,18 @@ get_hugepage_dir(uint64_t hugepage_sz) } /* + * uses fstat to report the size of a file on disk + */ +static off_t +getFileSize(int fd) +{ + struct stat st; + if (fstat(fd, &st) < 0) + return 0; + return st.st_size; +} + +/* * Clear the hugepage directory of whatever hugepage files * there are. Checks if the file is locked (i.e. * if it's in use by another DPDK process). 
@@ -229,6 +242,8 @@ clear_hugedir(const char * hugedir) } while(dirent != NULL){ + struct flock lck = {0}; + /* skip files that don't match the hugepage pattern */ if (fnmatch(filter, dirent->d_name, 0) > 0) { dirent = readdir(dir); @@ -245,11 +260,17 @@ clear_hugedir(const char * hugedir) } /* non-blocking lock */ - lck_result = flock(fd, LOCK_EX | LOCK_NB); + lck.l_type = F_RDLCK; + lck.l_whence = SEEK_SET; + lck.l_start = 0; + lck.l_len = getFileSize(fd); + + lck_result = fcntl(fd, F_SETLK, &lck); /* if lock succeeds, unlock and remove the file */ if (lck_result != -1) { - flock(fd, LOCK_UN); + lck.l_type = F_UNLCK; + fcntl(fd, F_SETLK, &lck); unlinkat(dir_fd, dirent->d_name, 0); } close (fd); diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c index 9512da9..e0b4988 100644 --- a/lib/librte_eal/linuxapp/eal/eal_memory.c +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c @@ -40,6 +40,7 @@ #include #include "eal_private.h" +#include "eal_memalloc.h" #include "eal_internal_cfg.h" #include "eal_filesystem.h" #include "eal_hugepages.h" @@ -260,6 +261,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, void *virtaddr; void *vma_addr = NULL; size_t vma_len = 0; + struct flock lck = {0}; #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES int node_id = -1; int essential_prev = 0; @@ -434,8 +436,12 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, } - /* set shared flock on the file. */ - if (flock(fd, LOCK_SH | LOCK_NB) == -1) { + /* set shared lock on the file. 
*/ + lck.l_type = F_RDLCK; + lck.l_whence = SEEK_SET; + lck.l_start = 0; + lck.l_len = hugepage_sz; + if (fcntl(fd, F_SETLK, &lck) == -1) { RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n", __func__, strerror(errno)); close(fd); @@ -1300,6 +1306,62 @@ eal_legacy_hugepage_init(void) return -1; } +static int +eal_hugepage_init(void) +{ + struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES]; + uint64_t memory[RTE_MAX_NUMA_NODES]; + int hp_sz_idx, socket_id; + + test_phys_addrs_available(); + + memset(used_hp, 0, sizeof(used_hp)); + + for (hp_sz_idx = 0; + hp_sz_idx < (int) internal_config.num_hugepage_sizes; + hp_sz_idx++) { + /* also initialize used_hp hugepage sizes in used_hp */ + struct hugepage_info *hpi; + hpi = &internal_config.hugepage_info[hp_sz_idx]; + used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz; + } + + /* make a copy of socket_mem, needed for balanced allocation. */ + for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++) + memory[hp_sz_idx] = internal_config.socket_mem[hp_sz_idx]; + + /* calculate final number of pages */ + if (calc_num_pages_per_socket(memory, + internal_config.hugepage_info, used_hp, + internal_config.num_hugepage_sizes) < 0) + return -1; + + for (hp_sz_idx = 0; + hp_sz_idx < (int) internal_config.num_hugepage_sizes; + hp_sz_idx++) { + for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES; + socket_id++) { + struct hugepage_info *hpi = &used_hp[hp_sz_idx]; + unsigned int num_pages = hpi->num_pages[socket_id]; + int num_pages_alloc; + + if (num_pages == 0) + continue; + + RTE_LOG(DEBUG, EAL, "Allocating %u pages of size %luM on socket %i\n", + num_pages, hpi->hugepage_sz >> 20, socket_id); + + num_pages_alloc = eal_memalloc_alloc_page_bulk(NULL, + num_pages, + hpi->hugepage_sz, socket_id, + true); + if (num_pages_alloc < 0) + return -1; + } + } + return 0; +} + /* * uses fstat to report the size of a file on disk */ @@ -1510,9 +1572,9 @@ eal_legacy_hugepage_attach(void) int rte_eal_hugepage_init(void) { - if 
(internal_config.legacy_mem) - return eal_legacy_hugepage_init(); - return -1; + return internal_config.legacy_mem ? + eal_legacy_hugepage_init() : + eal_hugepage_init(); } int @@ -1520,6 +1582,8 @@ rte_eal_hugepage_attach(void) { if (internal_config.legacy_mem) return eal_legacy_hugepage_attach(); + else + RTE_LOG(ERR, EAL, "Secondary processes aren't supported yet\n"); return -1; } -- 2.7.4