* [dpdk-dev] [PATCH] eal: use get_mempolicy(2) to find numa socket on Linux
@ 2017-06-18 15:52 Gregory Etelson
2017-06-21 15:49 ` Sergio Gonzalez Monroy
0 siblings, 1 reply; 5+ messages in thread
From: Gregory Etelson @ 2017-06-18 15:52 UTC (permalink / raw)
To: dev
Use get_mempolicy(2) to find numa socket on Linux
instead of parsing /proc/self/numa_maps.
When process maps around 1K hugepages
numa_maps file can miss huge records in older Linux kernels.
get_mempolicy() proved more reliable
Requires numactl dev package
Signed-off-by: Gregory Etelson <gregory@weka.io>
---
lib/librte_eal/linuxapp/eal/eal_memory.c | 85 ++++----------------------------
mk/exec-env/linuxapp/rte.vars.mk | 1 +
mk/internal/rte.build-pre.mk | 1 +
3 files changed, 11 insertions(+), 76 deletions(-)
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index e17c9cb..48b71bc 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -54,6 +54,7 @@
#include <sys/time.h>
#include <signal.h>
#include <setjmp.h>
+#include <numaif.h>
#include <rte_log.h>
#include <rte_memory.h>
@@ -495,86 +496,20 @@ unmap_all_hugepages_orig(struct hugepage_file *hugepg_tbl, struct hugepage_info
return 0;
}
-/*
- * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge
- * page.
- */
static int
find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
{
- int socket_id;
- char *end, *nodestr;
- unsigned i, hp_count = 0;
- uint64_t virt_addr;
- char buf[BUFSIZ];
- char hugedir_str[PATH_MAX];
- FILE *f;
-
- f = fopen("/proc/self/numa_maps", "r");
- if (f == NULL) {
- RTE_LOG(NOTICE, EAL, "cannot open /proc/self/numa_maps,"
- " consider that all memory is in socket_id 0\n");
- return 0;
- }
-
- snprintf(hugedir_str, sizeof(hugedir_str),
- "%s/%s", hpi->hugedir, internal_config.hugefile_prefix);
-
- /* parse numa map */
- while (fgets(buf, sizeof(buf), f) != NULL) {
-
- /* ignore non huge page */
- if (strstr(buf, " huge ") == NULL &&
- strstr(buf, hugedir_str) == NULL)
- continue;
-
- /* get zone addr */
- virt_addr = strtoull(buf, &end, 16);
- if (virt_addr == 0 || end == buf) {
- RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
- goto error;
- }
-
- /* get node id (socket id) */
- nodestr = strstr(buf, " N");
- if (nodestr == NULL) {
- RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
- goto error;
- }
- nodestr += 2;
- end = strstr(nodestr, "=");
- if (end == NULL) {
- RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
- goto error;
- }
- end[0] = '\0';
- end = NULL;
-
- socket_id = strtoul(nodestr, &end, 0);
- if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) {
- RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
- goto error;
- }
-
- /* if we find this page in our mappings, set socket_id */
- for (i = 0; i < hpi->num_pages[0]; i++) {
- void *va = (void *)(unsigned long)virt_addr;
- if (hugepg_tbl[i].orig_va == va) {
- hugepg_tbl[i].socket_id = socket_id;
- hp_count++;
- }
+ unsigned int i;
+ for (i = 0; i < hpi->num_pages[0]; i++) {
+ if (get_mempolicy(&hugepg_tbl[i].socket_id,
+ NULL, 0, hugepg_tbl[i].orig_va,
+ MPOL_F_NODE | MPOL_F_ADDR) < 0) {
+ RTE_LOG(ERR, EAL, "Failed to find NUMA socket for %p\n",
+ hugepg_tbl[i].orig_va);
+ return -1;
}
}
-
- if (hp_count < hpi->num_pages[0])
- goto error;
-
- fclose(f);
return 0;
-
-error:
- fclose(f);
- return -1;
}
static int
@@ -1051,8 +986,6 @@ rte_eal_hugepage_init(void)
}
if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
- RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
- (unsigned)(hpi->hugepage_sz / 0x100000));
goto fail;
}
diff --git a/mk/exec-env/linuxapp/rte.vars.mk b/mk/exec-env/linuxapp/rte.vars.mk
index 9a71699..0da977a 100644
--- a/mk/exec-env/linuxapp/rte.vars.mk
+++ b/mk/exec-env/linuxapp/rte.vars.mk
@@ -59,5 +59,6 @@ LINK_USING_CC := 1
EXECENV_LDFLAGS += -export-dynamic
# Add library to the group to resolve symbols
EXECENV_LDLIBS += -ldl
+EXECENV_LDLIBS += -lnuma
export EXECENV_CFLAGS EXECENV_LDFLAGS EXECENV_ASFLAGS EXECENV_LDLIBS
diff --git a/mk/internal/rte.build-pre.mk b/mk/internal/rte.build-pre.mk
index 560cf82..9bc0ffd 100644
--- a/mk/internal/rte.build-pre.mk
+++ b/mk/internal/rte.build-pre.mk
@@ -29,6 +29,7 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+LDLIBS += -lnuma
_BUILD_TARGETS := _prebuild _build _postbuild
comma := ,
--
2.9.3
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [dpdk-dev] [PATCH] eal: use get_mempolicy(2) to find numa socket on Linux
2017-06-18 15:52 [dpdk-dev] [PATCH] eal: use get_mempolicy(2) to find numa socket on Linux Gregory Etelson
@ 2017-06-21 15:49 ` Sergio Gonzalez Monroy
2017-07-01 14:18 ` Thomas Monjalon
0 siblings, 1 reply; 5+ messages in thread
From: Sergio Gonzalez Monroy @ 2017-06-21 15:49 UTC (permalink / raw)
To: dev
Cc: Thomas Monjalon, Ilya Maximets, Richardson, Bruce, Jerin Jacob,
hemant.agrawal
On 18/06/2017 16:52, Gregory Etelson wrote:
> Use get_mempolicy(2) to find numa socket on Linux
> instead of parsing /proc/self/numa_maps.
> When process maps around 1K hugepages
> numa_maps file can miss huge records in older Linux kernels.
> get_mempolicy() proved more reliable
>
> Requires numactl dev package
>
> Signed-off-by: Gregory Etelson <gregory@weka.io>
> ---
> lib/librte_eal/linuxapp/eal/eal_memory.c | 85 ++++----------------------------
> mk/exec-env/linuxapp/rte.vars.mk | 1 +
> mk/internal/rte.build-pre.mk | 1 +
> 3 files changed, 11 insertions(+), 76 deletions(-)
>
> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index e17c9cb..48b71bc 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -54,6 +54,7 @@
> #include <sys/time.h>
> #include <signal.h>
> #include <setjmp.h>
> +#include <numaif.h>
>
> #include <rte_log.h>
> #include <rte_memory.h>
> @@ -495,86 +496,20 @@ unmap_all_hugepages_orig(struct hugepage_file *hugepg_tbl, struct hugepage_info
> return 0;
> }
>
> -/*
> - * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge
> - * page.
> - */
> static int
> find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
> {
> - int socket_id;
> - char *end, *nodestr;
> - unsigned i, hp_count = 0;
> - uint64_t virt_addr;
> - char buf[BUFSIZ];
> - char hugedir_str[PATH_MAX];
> - FILE *f;
> -
> - f = fopen("/proc/self/numa_maps", "r");
> - if (f == NULL) {
> - RTE_LOG(NOTICE, EAL, "cannot open /proc/self/numa_maps,"
> - " consider that all memory is in socket_id 0\n");
> - return 0;
> - }
> -
> - snprintf(hugedir_str, sizeof(hugedir_str),
> - "%s/%s", hpi->hugedir, internal_config.hugefile_prefix);
> -
> - /* parse numa map */
> - while (fgets(buf, sizeof(buf), f) != NULL) {
> -
> - /* ignore non huge page */
> - if (strstr(buf, " huge ") == NULL &&
> - strstr(buf, hugedir_str) == NULL)
> - continue;
> -
> - /* get zone addr */
> - virt_addr = strtoull(buf, &end, 16);
> - if (virt_addr == 0 || end == buf) {
> - RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
> - goto error;
> - }
> -
> - /* get node id (socket id) */
> - nodestr = strstr(buf, " N");
> - if (nodestr == NULL) {
> - RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
> - goto error;
> - }
> - nodestr += 2;
> - end = strstr(nodestr, "=");
> - if (end == NULL) {
> - RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
> - goto error;
> - }
> - end[0] = '\0';
> - end = NULL;
> -
> - socket_id = strtoul(nodestr, &end, 0);
> - if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) {
> - RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
> - goto error;
> - }
> -
> - /* if we find this page in our mappings, set socket_id */
> - for (i = 0; i < hpi->num_pages[0]; i++) {
> - void *va = (void *)(unsigned long)virt_addr;
> - if (hugepg_tbl[i].orig_va == va) {
> - hugepg_tbl[i].socket_id = socket_id;
> - hp_count++;
> - }
> + unsigned int i;
> + for (i = 0; i < hpi->num_pages[0]; i++) {
> + if (get_mempolicy(&hugepg_tbl[i].socket_id,
> + NULL, 0, hugepg_tbl[i].orig_va,
> + MPOL_F_NODE | MPOL_F_ADDR) < 0) {
> + RTE_LOG(ERR, EAL, "Failed to find NUMA socket for %p\n",
> + hugepg_tbl[i].orig_va);
> + return -1;
> }
> }
> -
> - if (hp_count < hpi->num_pages[0])
> - goto error;
> -
> - fclose(f);
> return 0;
> -
> -error:
> - fclose(f);
> - return -1;
> }
>
> static int
> @@ -1051,8 +986,6 @@ rte_eal_hugepage_init(void)
> }
>
> if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
> - RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
> - (unsigned)(hpi->hugepage_sz / 0x100000));
> goto fail;
> }
>
> diff --git a/mk/exec-env/linuxapp/rte.vars.mk b/mk/exec-env/linuxapp/rte.vars.mk
> index 9a71699..0da977a 100644
> --- a/mk/exec-env/linuxapp/rte.vars.mk
> +++ b/mk/exec-env/linuxapp/rte.vars.mk
> @@ -59,5 +59,6 @@ LINK_USING_CC := 1
> EXECENV_LDFLAGS += -export-dynamic
> # Add library to the group to resolve symbols
> EXECENV_LDLIBS += -ldl
> +EXECENV_LDLIBS += -lnuma
>
> export EXECENV_CFLAGS EXECENV_LDFLAGS EXECENV_ASFLAGS EXECENV_LDLIBS
> diff --git a/mk/internal/rte.build-pre.mk b/mk/internal/rte.build-pre.mk
> index 560cf82..9bc0ffd 100644
> --- a/mk/internal/rte.build-pre.mk
> +++ b/mk/internal/rte.build-pre.mk
> @@ -29,6 +29,7 @@
> # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
>
> +LDLIBS += -lnuma
> _BUILD_TARGETS := _prebuild _build _postbuild
>
> comma := ,
I think following the discussion of libnuma dependency in another thread
[1], you would need to implement a similar approach and keep the old
method while providing this alternative if libnuma is present.
As Ilya mentions in the thread, this is usually the job of tools such as
autotools, cmake or meson but given that we do not have such tools in
DPDK yet, we rely in a build time config option for libnuma.
Given that we already have a libnuma config option for VHOST, we might
be adding anew one for hugepage balancing, I think it would make sense
to just have single CONFIG_RTE_LIBNUMA option instead.
Thoughts?
Thanks,
Sergio
[1] http://dpdk.org/ml/archives/dev/2017-June/067298.html
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [dpdk-dev] [PATCH] eal: use get_mempolicy(2) to find numa socket on Linux
2017-06-21 15:49 ` Sergio Gonzalez Monroy
@ 2017-07-01 14:18 ` Thomas Monjalon
2017-07-03 14:11 ` Sergio Gonzalez Monroy
0 siblings, 1 reply; 5+ messages in thread
From: Thomas Monjalon @ 2017-07-01 14:18 UTC (permalink / raw)
To: Sergio Gonzalez Monroy, Gregory Etelson
Cc: dev, Ilya Maximets, Richardson, Bruce, Jerin Jacob, hemant.agrawal
21/06/2017 17:49, Sergio Gonzalez Monroy:
> I think following the discussion of libnuma dependency in another thread
> [1], you would need to implement a similar approach and keep the old
> method while providing this alternative if libnuma is present.
Considering that libnuma becomes mandatory to build DPDK on NUMA-capable
systems, we can assume there is only one CPU socket if libnuma is
unavailable.
> As Ilya mentions in the thread, this is usually the job of tools such as
> autotools, cmake or meson but given that we do not have such tools in
> DPDK yet, we rely in a build time config option for libnuma.
>
> Given that we already have a libnuma config option for VHOST, we might
> be adding anew one for hugepage balancing, I think it would make sense
> to just have single CONFIG_RTE_LIBNUMA option instead.
>
> Thoughts?
I am not sure about merging every NUMA options into one.
It may be interesting to track different NUMA features requiring libnuma.
Anyway, this patch needs to be rebased now that Ilya's patch is applied.
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [dpdk-dev] [PATCH] eal: use get_mempolicy(2) to find numa socket on Linux
2017-07-01 14:18 ` Thomas Monjalon
@ 2017-07-03 14:11 ` Sergio Gonzalez Monroy
2017-07-03 14:20 ` Thomas Monjalon
0 siblings, 1 reply; 5+ messages in thread
From: Sergio Gonzalez Monroy @ 2017-07-03 14:11 UTC (permalink / raw)
To: Thomas Monjalon, Gregory Etelson
Cc: dev, Ilya Maximets, Richardson, Bruce, Jerin Jacob, hemant.agrawal
On 01/07/2017 15:18, Thomas Monjalon wrote:
> 21/06/2017 17:49, Sergio Gonzalez Monroy:
>> I think following the discussion of libnuma dependency in another thread
>> [1], you would need to implement a similar approach and keep the old
>> method while providing this alternative if libnuma is present.
> Considering that libnuma becomes mandatory to build DPDK on NUMA-capable
> systems, we can assume there is only one CPU socket if libnuma is
> unavailable.
>
>> As Ilya mentions in the thread, this is usually the job of tools such as
>> autotools, cmake or meson but given that we do not have such tools in
>> DPDK yet, we rely in a build time config option for libnuma.
>>
>> Given that we already have a libnuma config option for VHOST, we might
>> be adding anew one for hugepage balancing, I think it would make sense
>> to just have single CONFIG_RTE_LIBNUMA option instead.
>>
>> Thoughts?
> I am not sure about merging every NUMA options into one.
> It may be interesting to track different NUMA features requiring libnuma.
Do you see the case where a system with libnuma we want to use the
support for hugapage allocation but not for vhost?
In my opinion this looks like a manual check for libnuma where with a
different build system would have been automated.
Cheers,
Sergio
> Anyway, this patch needs to be rebased now that Ilya's patch is applied.
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [dpdk-dev] [PATCH] eal: use get_mempolicy(2) to find numa socket on Linux
2017-07-03 14:11 ` Sergio Gonzalez Monroy
@ 2017-07-03 14:20 ` Thomas Monjalon
0 siblings, 0 replies; 5+ messages in thread
From: Thomas Monjalon @ 2017-07-03 14:20 UTC (permalink / raw)
To: Sergio Gonzalez Monroy
Cc: Gregory Etelson, dev, Ilya Maximets, Richardson, Bruce,
Jerin Jacob, hemant.agrawal
03/07/2017 16:11, Sergio Gonzalez Monroy:
> On 01/07/2017 15:18, Thomas Monjalon wrote:
> > 21/06/2017 17:49, Sergio Gonzalez Monroy:
> >> I think following the discussion of libnuma dependency in another thread
> >> [1], you would need to implement a similar approach and keep the old
> >> method while providing this alternative if libnuma is present.
> > Considering that libnuma becomes mandatory to build DPDK on NUMA-capable
> > systems, we can assume there is only one CPU socket if libnuma is
> > unavailable.
> >
> >> As Ilya mentions in the thread, this is usually the job of tools such as
> >> autotools, cmake or meson but given that we do not have such tools in
> >> DPDK yet, we rely in a build time config option for libnuma.
> >>
> >> Given that we already have a libnuma config option for VHOST, we might
> >> be adding anew one for hugepage balancing, I think it would make sense
> >> to just have single CONFIG_RTE_LIBNUMA option instead.
> >>
> >> Thoughts?
> > I am not sure about merging every NUMA options into one.
> > It may be interesting to track different NUMA features requiring libnuma.
>
> Do you see the case where a system with libnuma we want to use the
> support for hugapage allocation but not for vhost?
> In my opinion this looks like a manual check for libnuma where with a
> different build system would have been automated.
I think it is not related to the build system.
We could have an automatic check for libnuma which enables several options.
I'm just advocating to keep the different uses of libnuma clearly
advertised in order to simply find what are the features enabled
with this dependency.
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2017-07-03 14:20 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-06-18 15:52 [dpdk-dev] [PATCH] eal: use get_mempolicy(2) to find numa socket on Linux Gregory Etelson
2017-06-21 15:49 ` Sergio Gonzalez Monroy
2017-07-01 14:18 ` Thomas Monjalon
2017-07-03 14:11 ` Sergio Gonzalez Monroy
2017-07-03 14:20 ` Thomas Monjalon
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).