patches for DPDK stable branches
 help / color / mirror / Atom feed
* [dpdk-stable] [PATCH] mem: Add Power9 support in rte_eal_hugepage_init
@ 2018-11-09 19:28 David Wilder
  2018-11-10  2:59 ` dwilder
  0 siblings, 1 reply; 2+ messages in thread
From: David Wilder @ 2018-11-09 19:28 UTC (permalink / raw)
  To: stable; +Cc: pradeep, chaozhu

Determine if the ppc64 platform is Power9 or Power8 and perform huge
page mapping appropriately for the selected platform.

Signed-off-by: Pradeep Satyanarayana <pradeep@us.ibm.com>
Tested-by: David Wilder <wilder@us.ibm.com>
---
On IBM Power8, when mmaping hugepage files the address hint supplied to mmap
is not always honored, therefore we let the kernel pick the address by
specifying a NULL address hint. On Power9 the address hint is honored as
expected. This patch detects the platform; if it is Power9, the address hint
is supplied to mmap and the pages are sorted appropriately. Hugepage mapping
for both primary and secondary processes now works correctly on Power9. I have
retained the original behavior and limitations on Power8. Additionally, the
flags supplied to mmap() have been corrected, eliminating the "Cannot get
a virtual area" messages previously seen during EAL init on Power.

 lib/librte_eal/linuxapp/eal/eal_memory.c | 75 +++++++++++++++++-------
 1 file changed, 54 insertions(+), 21 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index bac969a12..5b7001be8 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -50,6 +50,9 @@
 #include <limits.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#ifdef RTE_ARCH_PPC_64
+#include <sys/auxv.h>
+#endif
 #include <signal.h>
 #include <setjmp.h>
 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
@@ -107,6 +110,10 @@ static uint64_t baseaddr = 0x100000000;
 
 static bool phys_addrs_available = true;
 
+#ifdef RTE_ARCH_PPC_64
+static int p8;
+#endif
+
 #define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
 
 static void
@@ -309,12 +316,7 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
 		addr_hint = get_addr_hint();
 
 		addr = mmap(addr_hint,
-				(*size) + hugepage_sz, PROT_READ,
-#ifdef RTE_ARCH_PPC_64
-				MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-#else
-				MAP_PRIVATE,
-#endif
+				(*size) + hugepage_sz, PROT_READ, MAP_PRIVATE,
 				fd, 0);
 		if (addr == MAP_FAILED) {
 			/* map failed. Let's try with less memory */
@@ -501,6 +503,15 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
 			 * vma_len. If it fails, vma_addr is NULL, so
 			 * let the kernel provide the address. */
 			vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz);
+#ifdef RTE_ARCH_PPC_64
+			/*
+			 * On power8 the address hint is not consistently
+			 * honored, therefor we always let the
+			 * kernel provide the address.
+			 */
+			if (p8)
+				vma_addr = NULL;
+#endif
 			if (vma_addr == NULL)
 				vma_len = hugepage_sz;
 		}
@@ -1059,6 +1070,23 @@ rte_eal_hugepage_init(void)
 	int nr_hugefiles, nr_hugepages = 0;
 	void *addr;
 
+#ifdef RTE_ARCH_PPC_64
+	char *platform;
+	platform = (char *)getauxval(AT_BASE_PLATFORM);
+
+	p8 = 0;
+
+	/* Alert the user in case our assumptions are incorrect */
+	if (platform == NULL)
+		printf("Some distros on P9 do not support "
+			"getauxval(AT_BASE_PLATFORM). Assuming P9\n");
+
+	if (platform && !strncmp(platform, "power8", 6)) {
+		RTE_LOG(DEBUG, EAL, "This must be a P8\n");
+		p8 = 1;
+	} else
+		RTE_LOG(DEBUG, EAL, "This must be a P9\n");
+#endif
 	test_phys_addrs_available();
 
 	memset(used_hp, 0, sizeof(used_hp));
@@ -1305,14 +1333,22 @@ rte_eal_hugepage_init(void)
 			new_memseg = 1;
 
 #ifdef RTE_ARCH_PPC_64
-		/* On PPC64 architecture, the mmap always start from higher
-		 * virtual address to lower address. Here, both the physical
-		 * address and virtual address are in descending order */
+		/*
+		 * On power8 we let the kernel selected the virtual address
+		 * for mmaped segments, successive mmaps will start from
+		 * higher virtual address to lower address. Physical address
+		 * are in descending order for both platforms.
+		 */
 		else if ((hugepage[i-1].physaddr - hugepage[i].physaddr) !=
 		    hugepage[i].size)
 			new_memseg = 1;
-		else if (((unsigned long)hugepage[i-1].final_va -
-		    (unsigned long)hugepage[i].final_va) != hugepage[i].size)
+		else if ((((unsigned long)hugepage[i-1].final_va -
+		    (unsigned long)hugepage[i].final_va) !=
+		    hugepage[i].size) && (p8))
+			new_memseg = 1;
+		else if ((((unsigned long)hugepage[i].final_va -
+		    (unsigned long)hugepage[i-1].final_va) !=
+		    hugepage[i].size) && (!p8))
 			new_memseg = 1;
 #else
 		else if ((hugepage[i].physaddr - hugepage[i-1].physaddr) !=
@@ -1338,9 +1374,12 @@ rte_eal_hugepage_init(void)
 		else {
 #ifdef RTE_ARCH_PPC_64
 		/* Use the phy and virt address of the last page as segment
-		 * address for IBM Power architecture */
-			mcfg->memseg[j].iova = hugepage[i].physaddr;
-			mcfg->memseg[j].addr = hugepage[i].final_va;
+		 * address for IBM Power8 architecture.
+		 */
+			if (p8) {
+				mcfg->memseg[j].iova = hugepage[i].physaddr;
+				mcfg->memseg[j].addr = hugepage[i].final_va;
+			}
 #endif
 			mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz;
 		}
@@ -1437,13 +1476,7 @@ rte_eal_hugepage_attach(void)
 		 * use mmap to get identical addresses as the primary process.
 		 */
 		base_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len,
-				 PROT_READ,
-#ifdef RTE_ARCH_PPC_64
-				 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-#else
-				 MAP_PRIVATE,
-#endif
-				 fd_zero, 0);
+				 PROT_READ, MAP_PRIVATE, fd_zero, 0);
 		if (base_addr == MAP_FAILED ||
 		    base_addr != mcfg->memseg[s].addr) {
 			max_seg = s;
-- 
2.19.1

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [dpdk-stable] [PATCH] mem: Add Power9 support in rte_eal_hugepage_init
  2018-11-09 19:28 [dpdk-stable] [PATCH] mem: Add Power9 support in rte_eal_hugepage_init David Wilder
@ 2018-11-10  2:59 ` dwilder
  0 siblings, 0 replies; 2+ messages in thread
From: dwilder @ 2018-11-10  2:59 UTC (permalink / raw)
  To: David Wilder; +Cc: stable, pradeep, chaozhu

On 2018-11-09 11:28, David Wilder wrote:
> Determine if the ppc64 platform is Power9 or Power8 and perform huge
> page mapping appropriately for the selected platform.
> 
> Signed-off-by: Pradeep Satyanarayana <pradeep@us.ibm.com>
> Tested-by: David Wilder <wilder@us.ibm.com>
> ---
> On IBM Power8, when mmaping hugepage files the address hint supplied to 
> mmap
> is not always honored, therefor we let the kernel pick the address by
> specifying a NULL address hint. On Power9 the address hint is honored 
> as
> expected. This patch detects the platform, if Power9 the address hint 
> is
> supplied to mmap and the pages are sorted appropriately. Hugepage 
> mapping for
> both primary and secondary processes now work correctly on Power9. I 
> have
> retain the original behavior and limitations on Power8. Additionally 
> the flags
> supplied to mmap() have been corrected eliminating the message "Cannot 
> get
> a virtual area" messages previously seen during EAL init on Power.
> 
>  lib/librte_eal/linuxapp/eal/eal_memory.c | 75 +++++++++++++++++-------
>  1 file changed, 54 insertions(+), 21 deletions(-)
> 
> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c
> b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index bac969a12..5b7001be8 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -50,6 +50,9 @@
>  #include <limits.h>
>  #include <sys/ioctl.h>
>  #include <sys/time.h>
> +#ifdef RTE_ARCH_PPC_64
> +#include <sys/auxv.h>
> +#endif
>  #include <signal.h>
>  #include <setjmp.h>
>  #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
> @@ -107,6 +110,10 @@ static uint64_t baseaddr = 0x100000000;
> 
>  static bool phys_addrs_available = true;
> 
> +#ifdef RTE_ARCH_PPC_64
> +static int p8;
> +#endif
> +
>  #define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
> 
>  static void
> @@ -309,12 +316,7 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
>  		addr_hint = get_addr_hint();
> 
>  		addr = mmap(addr_hint,
> -				(*size) + hugepage_sz, PROT_READ,
> -#ifdef RTE_ARCH_PPC_64
> -				MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
> -#else
> -				MAP_PRIVATE,
> -#endif
> +				(*size) + hugepage_sz, PROT_READ, MAP_PRIVATE,
>  				fd, 0);
>  		if (addr == MAP_FAILED) {
>  			/* map failed. Let's try with less memory */
> @@ -501,6 +503,15 @@ map_all_hugepages(struct hugepage_file
> *hugepg_tbl, struct hugepage_info *hpi,
>  			 * vma_len. If it fails, vma_addr is NULL, so
>  			 * let the kernel provide the address. */
>  			vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz);
> +#ifdef RTE_ARCH_PPC_64
> +			/*
> +			 * On power8 the address hint is not consistently
> +			 * honored, therefor we always let the
> +			 * kernel provide the address.
> +			 */
> +			if (p8)
> +				vma_addr = NULL;
> +#endif
>  			if (vma_addr == NULL)
>  				vma_len = hugepage_sz;
>  		}
> @@ -1059,6 +1070,23 @@ rte_eal_hugepage_init(void)
>  	int nr_hugefiles, nr_hugepages = 0;
>  	void *addr;
> 
> +#ifdef RTE_ARCH_PPC_64
> +	char *platform;
> +	platform = (char *)getauxval(AT_BASE_PLATFORM);
> +
> +	p8 = 0;
> +
> +	/* Alert the user in case our assumptions are incorrect */
> +	if (platform == NULL)
> +		printf("Some distros on P9 do not support "
> +			"getauxval(AT_BASE_PLATFORM). Assuming P9\n");
> +
> +	if (platform && !strncmp(platform, "power8", 6)) {
> +		RTE_LOG(DEBUG, EAL, "This must be a P8\n");
> +		p8 = 1;
> +	} else
> +		RTE_LOG(DEBUG, EAL, "This must be a P9\n");
> +#endif
>  	test_phys_addrs_available();
> 
>  	memset(used_hp, 0, sizeof(used_hp));
> @@ -1305,14 +1333,22 @@ rte_eal_hugepage_init(void)
>  			new_memseg = 1;
> 
>  #ifdef RTE_ARCH_PPC_64
> -		/* On PPC64 architecture, the mmap always start from higher
> -		 * virtual address to lower address. Here, both the physical
> -		 * address and virtual address are in descending order */
> +		/*
> +		 * On power8 we let the kernel selected the virtual address
> +		 * for mmaped segments, successive mmaps will start from
> +		 * higher virtual address to lower address. Physical address
> +		 * are in descending order for both platforms.
> +		 */
>  		else if ((hugepage[i-1].physaddr - hugepage[i].physaddr) !=
>  		    hugepage[i].size)
>  			new_memseg = 1;
> -		else if (((unsigned long)hugepage[i-1].final_va -
> -		    (unsigned long)hugepage[i].final_va) != hugepage[i].size)
> +		else if ((((unsigned long)hugepage[i-1].final_va -
> +		    (unsigned long)hugepage[i].final_va) !=
> +		    hugepage[i].size) && (p8))
> +			new_memseg = 1;
> +		else if ((((unsigned long)hugepage[i].final_va -
> +		    (unsigned long)hugepage[i-1].final_va) !=
> +		    hugepage[i].size) && (!p8))
>  			new_memseg = 1;
>  #else
>  		else if ((hugepage[i].physaddr - hugepage[i-1].physaddr) !=
> @@ -1338,9 +1374,12 @@ rte_eal_hugepage_init(void)
>  		else {
>  #ifdef RTE_ARCH_PPC_64
>  		/* Use the phy and virt address of the last page as segment
> -		 * address for IBM Power architecture */
> -			mcfg->memseg[j].iova = hugepage[i].physaddr;
> -			mcfg->memseg[j].addr = hugepage[i].final_va;
> +		 * address for IBM Power8 architecture.
> +		 */
> +			if (p8) {
> +				mcfg->memseg[j].iova = hugepage[i].physaddr;
> +				mcfg->memseg[j].addr = hugepage[i].final_va;
> +			}
>  #endif
>  			mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz;
>  		}
> @@ -1437,13 +1476,7 @@ rte_eal_hugepage_attach(void)
>  		 * use mmap to get identical addresses as the primary process.
>  		 */
>  		base_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len,
> -				 PROT_READ,
> -#ifdef RTE_ARCH_PPC_64
> -				 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
> -#else
> -				 MAP_PRIVATE,
> -#endif
> -				 fd_zero, 0);
> +				 PROT_READ, MAP_PRIVATE, fd_zero, 0);
>  		if (base_addr == MAP_FAILED ||
>  		    base_addr != mcfg->memseg[s].addr) {
>  			max_seg = s;

Sorry, this breaks Chao's workaround that makes memory initialization 
for the second process work on power8 (setting nr_hugepages and 
nr_overcommit_hugepages). I need to make the mmap flags change 
conditional on power8/9. I am working on a v2 patch.

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2018-11-10  2:57 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-11-09 19:28 [dpdk-stable] [PATCH] mem: Add Power9 support in rte_eal_hugepage_init David Wilder
2018-11-10  2:59 ` dwilder

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).